【cutComments.py】分词统计词频函数定义

2024-07-02 17:49:46 +08:00
parent 50781187c2
commit a94d2cdf80
2 changed files with 60 additions and 0 deletions
@@ -0,0 +1,31 @@
+import jieba
+import re
+
+def main():
+    reader = open('./cutComments.txt','r',encoding='utf8')
+    strs = reader.read()
+    result = open('./cipingTotal.csv','w',encoding='utf8')
+
+    # 分词，去重，列表
+    word_list = jieba.cut(strs,cut_all=True)
+
+    new_words = []
+    for i in word_list:
+        m = re.search("\d+",i)
+        n = re.search("\W+",i)
+        if not m and not n and len(i) > 1:
+            new_words.append(i)
+
+    # 统计词频
+    word_count = {}
+    for i in set(new_words):
+        word_count[i] = new_words.count(i)
+
+    # 格式整理
+    list_count = sorted(word_count.items(),key=lambda x:x[1],reverse=True)
+
+    for i in range(100):
+        print(list_count[i],file=result)
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()