bug修改

2024-07-03 19:01:59 +08:00
parent 510f09af59
commit ba9fe57784
22 changed files with 162 additions and 135 deletions
@@ -0,0 +1 @@
+app.py
@@ -5,7 +5,7 @@
  </component>
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TemplatesService">
@@ -3,5 +3,8 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.12 (Desktop)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (Desktop)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+  <component name="PyCharmProfessionalAdvertiser">
+    <option name="shown" value="true" />
+  </component>
 </project>
@@ -1,100 +0,0 @@
-('哈哈', 1236)
-('哈哈哈', 537)
-('哈哈哈哈', 157)
-('真的', 154)
-('期待', 89)
-('喜欢', 89)
-('doge', 88)
-('宝宝', 87)
-('可爱', 79)
-('第一', 73)
-('演唱', 71)
-('亲亲', 71)
-('苦涩', 70)
-('啊啊啊', 68)
-('抱抱', 64)
-('cry', 64)
-('宝贝', 62)
-('姐姐', 51)
-('花花', 50)
-('送花', 48)
-('开心', 47)
-('加油', 47)
-('老师', 46)
-('call', 45)
-('特别', 42)
-('一个', 42)
-('抓狂', 40)
-('嘻嘻', 39)
-('心心', 38)
-('悲伤', 38)
-('世界', 37)
-('感觉', 35)
-('孩子', 35)
-('朋友', 34)
-('鲜花', 34)
-('开学', 34)
-('好好', 34)
-('演唱会', 33)
-('感谢', 32)
-('憧憬', 31)
-('学季', 31)
-('快乐', 30)
-('漂亮', 30)
-('中国', 30)
-('音乐', 29)
-('电影', 28)
-('莲花', 28)
-('骄阳', 28)
-('视频', 27)
-('老公', 27)
-('老婆', 27)
-('值得', 26)
-('好看', 26)
-('消失', 26)
-('希望', 25)
-('呜呜', 25)
-('少年', 25)
-('东西', 25)
-('实力', 24)
-('评论', 24)
-('舞台', 24)
-('生活', 24)
-('单身', 24)
-('努力', 23)
-('唯一', 23)
-('幸福', 23)
-('时间', 23)
-('超级', 23)
-('辈子', 22)
-('童年', 22)
-('时代', 22)
-('可怜', 21)
-('不见', 21)
-('工作', 21)
-('有人', 21)
-('终于', 21)
-('粉丝', 21)
-('国家', 21)
-('callcallcall', 21)
-('永远', 21)
-('太阳', 20)
-('直播', 20)
-('小时', 20)
-('星期', 20)
-('安全', 20)
-('代言', 19)
-('支持', 19)
-('彩虹', 19)
-('妈妈', 18)
-('华为', 18)
-('优秀', 18)
-('好像', 18)
-('越来', 18)
-('大人', 18)
-('父母', 18)
-('害怕', 18)
-('安哥', 18)
-('加班', 18)
-('一点', 18)
-('一场', 17)
@@ -1,29 +0,0 @@
-from utils.getPublicData import getAllCommentsData
-import jieba
-targetTxt = 'cutComments.txt'
-
-def stopWordList():
-    stopWords = [line.strip() for line in open('./stopWords.txt',encoding='utf8').readlines()]
-    return stopWords
-
-def seg_depart(sentence):
-    sentence_depart = jieba.cut(" ".join([x[4] for x in sentence]).strip())
-    stopWords = stopWordList()
-    outStr = ''
-    for word in sentence_depart:
-        if word not in stopWords:
-            if word != '\t':
-                outStr += word
-    return outStr
-
-def writer_comments_cuts():
-    with open(targetTxt,'a+',encoding='utf-8') as targetFile:
-        seg = jieba.cut(seg_depart(getAllCommentsData()),cut_all=True)
-        output = ' '.join(seg)
-        targetFile.write(output)
-        targetFile.write('\n')
-        print('写入成功')
-
-
-if __name__ == '__main__':
-    writer_comments_cuts()
@@ -0,0 +1,100 @@
+('宝宝', 142)
+('祝福', 80)
+('期待', 77)
+('喜欢', 73)
+('恭喜', 73)
+('接接', 71)
+('真的', 62)
+('第一', 50)
+('快乐', 49)
+('祖国', 34)
+('舞台', 33)
+('朋友', 33)
+('老公', 32)
+('毕业', 32)
+('谢谢', 28)
+('好好', 27)
+('开心', 27)
+('维维', 26)
+('加油', 25)
+('哥哥', 25)
+('视频', 24)
+('世界', 24)
+('永远', 23)
+('好听', 23)
+('香港', 23)
+('希望', 22)
+('孩子', 21)
+('七月', 20)
+('朋友圈', 19)
+('敦豪', 19)
+('生活', 18)
+('宝贝', 18)
+('合作', 18)
+('day', 18)
+('好看', 18)
+('可爱', 17)
+('老师', 17)
+('涂山', 17)
+('致敬', 17)
+('中国', 17)
+('感觉', 16)
+('生日', 16)
+('幸福', 16)
+('记得', 16)
+('追风', 16)
+('蟑螂', 16)
+('终于', 16)
+('评论', 15)
+('厉害', 15)
+('下次', 15)
+('一点', 15)
+('双人', 15)
+('见面', 15)
+('关注', 15)
+('实至名归', 14)
+('妹妹', 14)
+('打开', 14)
+('热巴', 14)
+('流水', 14)
+('任何', 13)
+('手机', 13)
+('活动', 13)
+('呜呜', 13)
+('何人', 13)
+('电影', 13)
+('你好', 13)
+('任何人', 13)
+('北京', 13)
+('粉丝', 13)
+('顺利', 13)
+('太棒', 12)
+('支持', 12)
+('奥运', 12)
+('人气', 12)
+('by', 12)
+('漂亮', 12)
+('大哥', 12)
+('生日快乐', 12)
+('老婆', 12)
+('精彩', 12)
+('工作', 12)
+('照顾', 12)
+('迢迢', 12)
+('时间', 12)
+('初心', 12)
+('更好', 11)
+('早安', 11)
+('未来', 11)
+('美好', 11)
+('造型', 11)
+('晚上', 11)
+('满满', 11)
+('火炬', 10)
+('明天', 10)
+('魅力', 10)
+('实况', 10)
+('爷爷', 10)
+('骄傲', 10)
+('有没有', 10)
+('火炬手', 10)
@@ -4,7 +4,7 @@ import re
 def main():
    reader = open('./cutComments.txt','r',encoding='utf8')
    strs = reader.read()
-    result = open('./cipingTotal.csv','w',encoding='utf8')
+    result = open('cipingTotal.csv', 'w', encoding='utf8')

    # 分词，去重，列表
    word_list = jieba.cut(strs,cut_all=True)
@@ -0,0 +1,44 @@
+from utils.getPublicData import getAllCommentsData
+import jieba
+import re
+targetTxt = 'cutComments.txt'
+
+def stopWordList():
+    stopWords = [line.strip() for line in open('./stopWords.txt',encoding='utf8').readlines()]
+    return stopWords
+
+def seg_depart(sentence):
+    sentence_depart = jieba.cut(" ".join([clean(x[4]) for x in sentence]).strip())
+    stopWords = stopWordList()
+    outStr = ''
+    for word in sentence_depart:
+        if word not in stopWords:
+            if word != '\t':
+                outStr += word
+    return outStr
+
+def writer_comments_cuts():
+    with open(targetTxt,'w+',encoding='utf-8') as targetFile:
+        seg = jieba.cut(seg_depart(getAllCommentsData()))
+        output = ' '.join(seg)
+        targetFile.write(output)
+        targetFile.write('\n')
+        print('写入成功')
+
+def clean(text):
+    text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:| |$)", " ", text)  # 去除正文中的@和回复/转发中的用户名
+    text = re.sub(r"\[\S+\]", "", text)  # 去除表情符号
+    # text = re.sub(r"#\S+#", "", text)      # 保留话题内容
+    # 去除emoji表情的正则表达式
+    text = re.compile(u'[\U00010000-\U0010ffff]').sub('',text)
+    URL_REGEX = re.compile(
+        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
+        re.IGNORECASE)
+    text = re.sub(URL_REGEX, "", text)  # 去除网址
+    text = text.replace("转发微博", "")  # 去除无意义的词语
+    text = re.sub(r"\s+", " ", text)  # 合并正文中过多的空格
+    return text.strip()
+
+if __name__ == '__main__':
+    writer_comments_cuts()
+    # print(clean("想到一次我也看到了这样的，我把我的外套（喷了淡茉莉香水的）递过去了，我当时觉得她可能是因为地铁空调有点冷一直环抱着，我借给她说冷的话可以披一下，我坐到终点站的，然后她说不用了，我好尴尬哇"))
@@ -1,4 +1,3 @@
 from flask import render_template
-
 def errorResponse(errorMsg):
    return render_template('error.html',errorMsg=errorMsg)
@@ -79,7 +79,7 @@ def getAllArticleData():

 def getAllHotWords():
    data = []
-    df = pd.read_csv('./model/cipingTotal.csv',encoding='utf8')
+    df = pd.read_csv('./utils/cipingTotal.csv',encoding='utf8')
    for i in df.values:
        try:
            data.append([
@@ -1,5 +1,5 @@
 from pymysql import *
-conn = connect(host='10.92.35.13',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
+conn = connect(host='localhost',port=3306,user='root',password='123456',database='weiboarticles')
 cursor = conn.cursor()
 def query(sql,params,type="no_select"):
    params = tuple(params)
@@ -719,6 +719,8 @@ sup
 哇
 哈
 哈哈
+哈哈哈
+哈哈哈哈
 哉
 哎
 哎呀
@@ -742,7 +744,13 @@ sup
 哼唷
 唉
 唯有
+特别
+超级
+越来
+越来越
 啊
+啊啊
+啊啊啊
 啊呀
 啊哈
 啊哟