【cutComments.py】分词统计词频函数定义

2024-07-02 17:49:46 +08:00
parent 50781187c2
commit a94d2cdf80
2 changed files with 60 additions and 0 deletions
@@ -0,0 +1,29 @@
+from utils.getPublicData import getAllCommentsData
+import jieba
+targetTxt = 'cutComments.txt'  # path of the text file that writer_comments_cuts() appends to
+
+def stopWordList():
+    stopWords = [line.strip() for line in open('./stopWords.txt',encoding='utf8').readlines()]
+    return stopWords
+
+def seg_depart(sentence):
+    sentence_depart = jieba.cut(" ".join([x[4] for x in sentence]).strip())
+    stopWords = stopWordList()
+    outStr = ''
+    for word in sentence_depart:
+        if word not in stopWords:
+            if word != '\t':
+                outStr += word
+    return outStr
+
+def writer_comments_cuts():
+    with open(targetTxt,'a+',encoding='utf-8') as targetFile:
+        seg = jieba.cut(seg_depart(getAllCommentsData()),cut_all=True)
+        output = ' '.join(seg)
+        targetFile.write(output)
+        targetFile.write('\n')
+        print('写入成功')
+
+
+if __name__ == '__main__':  # run the export only when executed as a script, not on import
+    writer_comments_cuts()