【大修bug】添加csv表格原始数据，修改词频统计函数bug

2024-07-03 15:45:08 +08:00
parent d93da880cf
commit f98d111c32
6 changed files with 2773 additions and 49 deletions
@@ -0,0 +1,100 @@
+('哈哈', 1236)
+('哈哈哈', 537)
+('哈哈哈哈', 157)
+('真的', 154)
+('期待', 89)
+('喜欢', 89)
+('doge', 88)
+('宝宝', 87)
+('可爱', 79)
+('第一', 73)
+('演唱', 71)
+('亲亲', 71)
+('苦涩', 70)
+('啊啊啊', 68)
+('抱抱', 64)
+('cry', 64)
+('宝贝', 62)
+('姐姐', 51)
+('花花', 50)
+('送花', 48)
+('开心', 47)
+('加油', 47)
+('老师', 46)
+('call', 45)
+('特别', 42)
+('一个', 42)
+('抓狂', 40)
+('嘻嘻', 39)
+('心心', 38)
+('悲伤', 38)
+('世界', 37)
+('感觉', 35)
+('孩子', 35)
+('朋友', 34)
+('鲜花', 34)
+('开学', 34)
+('好好', 34)
+('演唱会', 33)
+('感谢', 32)
+('憧憬', 31)
+('学季', 31)
+('快乐', 30)
+('漂亮', 30)
+('中国', 30)
+('音乐', 29)
+('电影', 28)
+('莲花', 28)
+('骄阳', 28)
+('视频', 27)
+('老公', 27)
+('老婆', 27)
+('值得', 26)
+('好看', 26)
+('消失', 26)
+('希望', 25)
+('呜呜', 25)
+('少年', 25)
+('东西', 25)
+('实力', 24)
+('评论', 24)
+('舞台', 24)
+('生活', 24)
+('单身', 24)
+('努力', 23)
+('唯一', 23)
+('幸福', 23)
+('时间', 23)
+('超级', 23)
+('辈子', 22)
+('童年', 22)
+('时代', 22)
+('可怜', 21)
+('不见', 21)
+('工作', 21)
+('有人', 21)
+('终于', 21)
+('粉丝', 21)
+('国家', 21)
+('callcallcall', 21)
+('永远', 21)
+('太阳', 20)
+('直播', 20)
+('小时', 20)
+('星期', 20)
+('安全', 20)
+('代言', 19)
+('支持', 19)
+('彩虹', 19)
+('妈妈', 18)
+('华为', 18)
+('优秀', 18)
+('好像', 18)
+('越来', 18)
+('大人', 18)
+('父母', 18)
+('害怕', 18)
+('安哥', 18)
+('加班', 18)
+('一点', 18)
+('一场', 17)
@@ -0,0 +1,31 @@
+import jieba
+import re
+
+def main():
+    reader = open('./cutComments.txt','r',encoding='utf8')
+    strs = reader.read()
+    result = open('./cipingTotal.csv','w',encoding='utf8')
+
+    # 分词，去重，列表
+    word_list = jieba.cut(strs,cut_all=True)
+
+    new_words = []
+    for i in word_list:
+        m = re.search("\d+",i)
+        n = re.search("\W+",i)
+        if not m and not n and len(i) > 1:
+            new_words.append(i)
+
+    # 统计词频
+    word_count = {}
+    for i in set(new_words):
+        word_count[i] = new_words.count(i)
+
+    # 格式整理
+    list_count = sorted(word_count.items(),key=lambda x:x[1],reverse=True)
+
+    for i in range(100):
+        print(list_count[i],file=result)
+
+# Run the word-frequency count only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,29 @@
+from utils.getPublicData import getAllCommentsData
+import jieba
+# Path of the text file that writer_comments_cuts() appends segmented comments to.
+targetTxt = 'cutComments.txt'
+
+def stopWordList():
+    stopWords = [line.strip() for line in open('./stopWords.txt',encoding='utf8').readlines()]
+    return stopWords
+
+def seg_depart(sentence):
+    sentence_depart = jieba.cut(" ".join([x[4] for x in sentence]).strip())
+    stopWords = stopWordList()
+    outStr = ''
+    for word in sentence_depart:
+        if word not in stopWords:
+            if word != '\t':
+                outStr += word
+    return outStr
+
+def writer_comments_cuts():
+    with open(targetTxt,'a+',encoding='utf-8') as targetFile:
+        seg = jieba.cut(seg_depart(getAllCommentsData()),cut_all=True)
+        output = ' '.join(seg)
+        targetFile.write(output)
+        targetFile.write('\n')
+        print('写入成功')
+
+
+# Write the segmented comments only when this file is executed as a script.
+if __name__ == '__main__':
+    writer_comments_cuts()