From a94d2cdf80a55d3bc50f52af3a0c4a86c0bbff7f Mon Sep 17 00:00:00 2001
From: juanboy <2980526980@qq.com>
Date: Tue, 2 Jul 2024 17:49:46 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90cutComments.py=E3=80=91=E5=88=86?=
 =?UTF-8?q?=E8=AF=8D=E7=BB=9F=E8=AE=A1=E8=AF=8D=E9=A2=91=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E5=AE=9A=E4=B9=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 utils/cipingTotal.py | 31 +++++++++++++++++++++++++++++++
 utils/cutComments.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 utils/cipingTotal.py
 create mode 100644 utils/cutComments.py

diff --git a/utils/cipingTotal.py b/utils/cipingTotal.py
new file mode 100644
index 0000000..6cf81f4
--- /dev/null
+++ b/utils/cipingTotal.py
@@ -0,0 +1,31 @@
+import jieba
+import re
+
+def main():
+    """Write the 100 most frequent words of ./cutComments.txt to ./cipingTotal.csv.
+
+    The text is segmented with jieba in full mode. Tokens that are shorter
+    than two characters, or that contain a digit or a non-word character,
+    are skipped. Each output line is the printed ``(word, count)`` tuple,
+    most frequent first; ties keep the order of first appearance.
+    """
+    from collections import Counter  # function-scope import; file-level imports unchanged
+
+    # Read everything up front so the input handle is closed before counting.
+    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
+        strs = reader.read()
+
+    # Raw-string pattern: a single digit or non-word character rejects the token.
+    unwanted = re.compile(r"[\d\W]")
+    word_count = Counter(
+        word for word in jieba.cut(strs, cut_all=True)
+        if len(word) > 1 and not unwanted.search(word)
+    )
+
+    # most_common(100) yields at most 100 pairs, so short inputs cannot overrun the list.
+    with open('./cipingTotal.csv', 'w', encoding='utf8') as result:
+        for pair in word_count.most_common(100):
+            print(pair, file=result)
+
+if __name__ == '__main__':
+    main()  # run the word-frequency export only when executed as a script
\ No newline at end of file
diff --git a/utils/cutComments.py b/utils/cutComments.py
new file mode 100644
index 0000000..ab3ee3c
--- /dev/null
+++ b/utils/cutComments.py
@@ -0,0 +1,29 @@
+from utils.getPublicData import getAllCommentsData
+import jieba
+targetTxt = 'cutComments.txt'  # path that writer_comments_cuts() opens in append mode
+
+def stopWordList():
+    """Return the lines of ./stopWords.txt (UTF-8), each stripped of surrounding whitespace."""
+    with open('./stopWords.txt', encoding='utf8') as stop_file:  # closed on exit
+        return [line.strip() for line in stop_file]
+
+def seg_depart(sentence):
+    """Segment the text at index 4 of every row in `sentence` and drop stop words.
+
+    The kept tokens are concatenated without a separator and returned as one string.
+    """
+    sentence_depart = jieba.cut(" ".join(x[4] for x in sentence).strip())
+    stopWords = set(stopWordList())  # set lookup instead of scanning the list per token
+    return ''.join(word for word in sentence_depart if word not in stopWords and word != '\t')
+
+def writer_comments_cuts():
+    """Append the full-mode jieba tokens of all comments to targetTxt as one spaced line."""
+    with open(targetTxt, 'a+', encoding='utf-8') as out_file:
+        cleaned = seg_depart(getAllCommentsData())
+        tokens = jieba.cut(cleaned, cut_all=True)
+        out_file.write(' '.join(tokens) + '\n')
+        print('写入成功')
+
+
+if __name__ == '__main__':
+    writer_comments_cuts()  # append the segmented comments only when run as a script
\ No newline at end of file