bug修改

2024-07-03 19:01:59 +08:00
parent 510f09af59
commit ba9fe57784
22 changed files with 162 additions and 135 deletions
@@ -0,0 +1,100 @@
+('宝宝', 142)
+('祝福', 80)
+('期待', 77)
+('喜欢', 73)
+('恭喜', 73)
+('接接', 71)
+('真的', 62)
+('第一', 50)
+('快乐', 49)
+('祖国', 34)
+('舞台', 33)
+('朋友', 33)
+('老公', 32)
+('毕业', 32)
+('谢谢', 28)
+('好好', 27)
+('开心', 27)
+('维维', 26)
+('加油', 25)
+('哥哥', 25)
+('视频', 24)
+('世界', 24)
+('永远', 23)
+('好听', 23)
+('香港', 23)
+('希望', 22)
+('孩子', 21)
+('七月', 20)
+('朋友圈', 19)
+('敦豪', 19)
+('生活', 18)
+('宝贝', 18)
+('合作', 18)
+('day', 18)
+('好看', 18)
+('可爱', 17)
+('老师', 17)
+('涂山', 17)
+('致敬', 17)
+('中国', 17)
+('感觉', 16)
+('生日', 16)
+('幸福', 16)
+('记得', 16)
+('追风', 16)
+('蟑螂', 16)
+('终于', 16)
+('评论', 15)
+('厉害', 15)
+('下次', 15)
+('一点', 15)
+('双人', 15)
+('见面', 15)
+('关注', 15)
+('实至名归', 14)
+('妹妹', 14)
+('打开', 14)
+('热巴', 14)
+('流水', 14)
+('任何', 13)
+('手机', 13)
+('活动', 13)
+('呜呜', 13)
+('何人', 13)
+('电影', 13)
+('你好', 13)
+('任何人', 13)
+('北京', 13)
+('粉丝', 13)
+('顺利', 13)
+('太棒', 12)
+('支持', 12)
+('奥运', 12)
+('人气', 12)
+('by', 12)
+('漂亮', 12)
+('大哥', 12)
+('生日快乐', 12)
+('老婆', 12)
+('精彩', 12)
+('工作', 12)
+('照顾', 12)
+('迢迢', 12)
+('时间', 12)
+('初心', 12)
+('更好', 11)
+('早安', 11)
+('未来', 11)
+('美好', 11)
+('造型', 11)
+('晚上', 11)
+('满满', 11)
+('火炬', 10)
+('明天', 10)
+('魅力', 10)
+('实况', 10)
+('爷爷', 10)
+('骄傲', 10)
+('有没有', 10)
+('火炬手', 10)
@@ -0,0 +1,31 @@
+import jieba
+import re
+
+def main():
+    """Count word frequencies in ``./cutComments.txt`` and save the top 100.
+
+    The input text is segmented with jieba in full mode. Tokens that are a
+    single character long, contain a digit, or contain a non-word character
+    are discarded. The most frequent remaining tokens (at most 100) are
+    written to ``cipingTotal.csv``, one ``('word', count)`` tuple repr per
+    line, most frequent first. Tokens with equal counts keep the order in
+    which they first appeared in the input.
+    """
+    from collections import Counter
+
+    # Context managers guarantee both files are closed, even on error.
+    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
+        strs = reader.read()
+
+    # Segment, then keep only multi-character tokens made purely of
+    # non-digit word characters. Raw strings avoid invalid-escape warnings.
+    new_words = [
+        word for word in jieba.cut(strs, cut_all=True)
+        if len(word) > 1
+        and not re.search(r"\d", word)
+        and not re.search(r"\W", word)
+    ]
+
+    # Counter tallies in a single pass instead of list.count() per word.
+    word_count = Counter(new_words)
+
+    with open('cipingTotal.csv', 'w', encoding='utf8') as result:
+        # most_common(100) simply returns fewer entries when fewer than 100
+        # distinct tokens exist, so short inputs no longer raise IndexError.
+        for entry in word_count.most_common(100):
+            print(entry, file=result)
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,44 @@
+from utils.getPublicData import getAllCommentsData
+import jieba
+import re
+targetTxt = 'cutComments.txt'
+
+def stopWordList():
+    """Load the stop-word list from ``./stopWords.txt``.
+
+    Returns:
+        list[str]: one entry per line of the file, in file order, with
+        leading and trailing whitespace removed.
+    """
+    # The context manager closes the handle as soon as the list is built;
+    # a bare open() would leave it open until garbage collection.
+    with open('./stopWords.txt', encoding='utf8') as stop_file:
+        return [line.strip() for line in stop_file]
+
+def seg_depart(sentence):
+    """Clean, segment and stop-word-filter a batch of comment rows.
+
+    Args:
+        sentence: iterable of rows; element 4 of each row is passed to
+            ``clean`` as the text (presumably the comment body -- confirm
+            against ``getAllCommentsData``).
+
+    Returns:
+        str: the surviving tokens concatenated with no separator. Tokens
+        found in the stop-word list and tab tokens are dropped.
+    """
+    joined_text = " ".join([clean(x[4]) for x in sentence]).strip()
+    # A set gives constant-time membership tests; the list form rescanned
+    # the whole stop-word file for every token.
+    stopWords = set(stopWordList())
+    # str.join builds the result in one pass instead of repeated +=.
+    return ''.join(
+        word for word in jieba.cut(joined_text)
+        if word not in stopWords and word != '\t'
+    )
+
+def writer_comments_cuts():
+    """Segment all comments and write them, space-separated, to ``targetTxt``.
+
+    Rows from ``getAllCommentsData`` are filtered through ``seg_depart``,
+    the resulting string is segmented again with jieba, and the tokens are
+    written joined by single spaces and terminated by a newline. A
+    confirmation message is printed once the write has completed.
+    """
+    with open(targetTxt,'w+',encoding='utf-8') as out_file:
+        filtered_text = seg_depart(getAllCommentsData())
+        tokens = jieba.cut(filtered_text)
+        out_file.write(' '.join(tokens))
+        out_file.write('\n')
+        print('写入成功')
+
+def clean(text):
+    """Strip Weibo-specific noise from a piece of text.
+
+    Applied in order: @-mentions and reply/repost user prefixes become a
+    space; bracketed emoticon codes such as ``[xx]`` are removed; characters
+    outside the Basic Multilingual Plane (emoji) are removed; URLs are
+    removed; the literal "转发微博" is removed; runs of whitespace collapse
+    to one space. Hashtag topics (``#...#``) are intentionally kept.
+
+    Args:
+        text: the raw text to clean.
+
+    Returns:
+        str: the cleaned text with leading/trailing whitespace stripped.
+    """
+    url_pattern = re.compile(
+        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
+        re.IGNORECASE)
+    # (pattern, replacement) pairs; the order matters and is preserved.
+    substitutions = (
+        (r"(回复)?(//)?\s*@\S*?\s*(:| |$)", " "),  # @-mentions and reply/repost user names
+        (r"\[\S+\]", ""),                          # bracketed emoticon codes
+        (u'[\U00010000-\U0010ffff]', ''),          # emoji / astral-plane characters
+        (url_pattern, ""),                         # URLs
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+    # Drop the boilerplate repost marker, which carries no meaning.
+    text = text.replace("转发微博", "")
+    # Collapse whitespace runs, then trim both ends.
+    return re.sub(r"\s+", " ", text).strip()
+
+if __name__ == '__main__':
+    writer_comments_cuts()
+    # print(clean("想到一次我也看到了这样的，我把我的外套（喷了淡茉莉香水的）递过去了，我当时觉得她可能是因为地铁空调有点冷一直环抱着，我借给她说冷的话可以披一下，我坐到终点站的，然后她说不用了，我好尴尬哇"))
@@ -1,4 +1,3 @@
 from flask import render_template
-
 def errorResponse(errorMsg):
    """Render the ``error.html`` template, passing ``errorMsg`` to it as ``errorMsg``."""
    return render_template('error.html',errorMsg=errorMsg)
@@ -79,7 +79,7 @@ def getAllArticleData():

 def getAllHotWords():
    data = []
-    df = pd.read_csv('./model/cipingTotal.csv',encoding='utf8')
+    df = pd.read_csv('./utils/cipingTotal.csv',encoding='utf8')
    for i in df.values:
        try:
            data.append([
@@ -1,5 +1,5 @@
 from pymysql import *
-conn = connect(host='10.92.35.13',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
+conn = connect(host='localhost',port=3306,user='root',password='123456',database='weiboarticles')
 cursor = conn.cursor()
 def query(sql,params,type="no_select"):
    params = tuple(params)
@@ -719,6 +719,8 @@ sup
 哇
 哈
 哈哈
+哈哈哈
+哈哈哈哈
 哉
 哎
 哎呀
@@ -742,7 +744,13 @@ sup
 哼唷
 唉
 唯有
+特别
+超级
+越来
+越来越
 啊
+啊啊
+啊啊啊
 啊呀
 啊哈
 啊哟