bug修改

This commit is contained in:
juanboy
2024-07-03 19:01:59 +08:00
parent 510f09af59
commit ba9fe57784
22 changed files with 162 additions and 135 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+100
View File
@@ -0,0 +1,100 @@
('宝宝', 142)
('祝福', 80)
('期待', 77)
('喜欢', 73)
('恭喜', 73)
('接接', 71)
('真的', 62)
('第一', 50)
('快乐', 49)
('祖国', 34)
('舞台', 33)
('朋友', 33)
('老公', 32)
('毕业', 32)
('谢谢', 28)
('好好', 27)
('开心', 27)
('维维', 26)
('加油', 25)
('哥哥', 25)
('视频', 24)
('世界', 24)
('永远', 23)
('好听', 23)
('香港', 23)
('希望', 22)
('孩子', 21)
('七月', 20)
('朋友圈', 19)
('敦豪', 19)
('生活', 18)
('宝贝', 18)
('合作', 18)
('day', 18)
('好看', 18)
('可爱', 17)
('老师', 17)
('涂山', 17)
('致敬', 17)
('中国', 17)
('感觉', 16)
('生日', 16)
('幸福', 16)
('记得', 16)
('追风', 16)
('蟑螂', 16)
('终于', 16)
('评论', 15)
('厉害', 15)
('下次', 15)
('一点', 15)
('双人', 15)
('见面', 15)
('关注', 15)
('实至名归', 14)
('妹妹', 14)
('打开', 14)
('热巴', 14)
('流水', 14)
('任何', 13)
('手机', 13)
('活动', 13)
('呜呜', 13)
('何人', 13)
('电影', 13)
('你好', 13)
('任何人', 13)
('北京', 13)
('粉丝', 13)
('顺利', 13)
('太棒', 12)
('支持', 12)
('奥运', 12)
('人气', 12)
('by', 12)
('漂亮', 12)
('大哥', 12)
('生日快乐', 12)
('老婆', 12)
('精彩', 12)
('工作', 12)
('照顾', 12)
('迢迢', 12)
('时间', 12)
('初心', 12)
('更好', 11)
('早安', 11)
('未来', 11)
('美好', 11)
('造型', 11)
('晚上', 11)
('满满', 11)
('火炬', 10)
('明天', 10)
('魅力', 10)
('实况', 10)
('爷爷', 10)
('骄傲', 10)
('有没有', 10)
('火炬手', 10)
1 ('宝宝' 142)
2 ('祝福' 80)
3 ('期待' 77)
4 ('喜欢' 73)
5 ('恭喜' 73)
6 ('接接' 71)
7 ('真的' 62)
8 ('第一' 50)
9 ('快乐' 49)
10 ('祖国' 34)
11 ('舞台' 33)
12 ('朋友' 33)
13 ('老公' 32)
14 ('毕业' 32)
15 ('谢谢' 28)
16 ('好好' 27)
17 ('开心' 27)
18 ('维维' 26)
19 ('加油' 25)
20 ('哥哥' 25)
21 ('视频' 24)
22 ('世界' 24)
23 ('永远' 23)
24 ('好听' 23)
25 ('香港' 23)
26 ('希望' 22)
27 ('孩子' 21)
28 ('七月' 20)
29 ('朋友圈' 19)
30 ('敦豪' 19)
31 ('生活' 18)
32 ('宝贝' 18)
33 ('合作' 18)
34 ('day' 18)
35 ('好看' 18)
36 ('可爱' 17)
37 ('老师' 17)
38 ('涂山' 17)
39 ('致敬' 17)
40 ('中国' 17)
41 ('感觉' 16)
42 ('生日' 16)
43 ('幸福' 16)
44 ('记得' 16)
45 ('追风' 16)
46 ('蟑螂' 16)
47 ('终于' 16)
48 ('评论' 15)
49 ('厉害' 15)
50 ('下次' 15)
51 ('一点' 15)
52 ('双人' 15)
53 ('见面' 15)
54 ('关注' 15)
55 ('实至名归' 14)
56 ('妹妹' 14)
57 ('打开' 14)
58 ('热巴' 14)
59 ('流水' 14)
60 ('任何' 13)
61 ('手机' 13)
62 ('活动' 13)
63 ('呜呜' 13)
64 ('何人' 13)
65 ('电影' 13)
66 ('你好' 13)
67 ('任何人' 13)
68 ('北京' 13)
69 ('粉丝' 13)
70 ('顺利' 13)
71 ('太棒' 12)
72 ('支持' 12)
73 ('奥运' 12)
74 ('人气' 12)
75 ('by' 12)
76 ('漂亮' 12)
77 ('大哥' 12)
78 ('生日快乐' 12)
79 ('老婆' 12)
80 ('精彩' 12)
81 ('工作' 12)
82 ('照顾' 12)
83 ('迢迢' 12)
84 ('时间' 12)
85 ('初心' 12)
86 ('更好' 11)
87 ('早安' 11)
88 ('未来' 11)
89 ('美好' 11)
90 ('造型' 11)
91 ('晚上' 11)
92 ('满满' 11)
93 ('火炬' 10)
94 ('明天' 10)
95 ('魅力' 10)
96 ('实况' 10)
97 ('爷爷' 10)
98 ('骄傲' 10)
99 ('有没有' 10)
100 ('火炬手' 10)
+31
View File
@@ -0,0 +1,31 @@
import jieba
import re
def main():
    """Count word frequencies in ``./cutComments.txt`` and save the top 100.

    The input text is segmented with jieba (``cut_all=True``). A token is
    kept only if it is longer than one character and contains neither a
    digit nor a non-word character. The (up to) 100 most frequent tokens are
    written to ``cipingTotal.csv``, one per line, each line being the printed
    form of a ``(word, count)`` tuple, e.g. ``('word', 142)``.

    Raises:
        OSError: if the input file cannot be read or the output file cannot
            be written.
    """
    # Imported here so the function does not depend on a file-level import
    # that this script did not previously have.
    from collections import Counter

    # Context managers guarantee both files are closed, even on error.
    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        strs = reader.read()

    # A token is rejected as soon as it contains one digit or one non-word
    # character; a single character class expresses both checks.
    unwanted = re.compile(r"[\d\W]")
    new_words = [
        word
        for word in jieba.cut(strs, cut_all=True)
        if len(word) > 1 and not unwanted.search(word)
    ]

    # Counter tallies in a single pass instead of calling list.count() once
    # per distinct word.
    word_count = Counter(new_words)

    with open('cipingTotal.csv', 'w', encoding='utf8') as result:
        # most_common(100) simply returns fewer entries when the corpus has
        # fewer than 100 distinct words, so no IndexError can occur.
        for entry in word_count.most_common(100):
            print(entry, file=result)


if __name__ == '__main__':
    main()
+44
View File
@@ -0,0 +1,44 @@
from utils.getPublicData import getAllCommentsData
import jieba
import re
# Path of the file that writer_comments_cuts() writes the segmented text to.
targetTxt = 'cutComments.txt'


def stopWordList():
    """Load the stop words from ``./stopWords.txt``.

    Returns:
        list[str]: one entry per line of the file, with surrounding
        whitespace stripped. Blank lines yield empty strings.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    # The context manager closes the file handle once the list is built.
    with open('./stopWords.txt', encoding='utf8') as stop_file:
        return [line.strip() for line in stop_file]
def seg_depart(sentence):
    """Clean, segment and stop-word-filter a collection of comment rows.

    Args:
        sentence: iterable of rows; the text is taken from index 4 of each
            row (presumably the comment-content column -- confirm against
            getAllCommentsData).

    Returns:
        str: the kept tokens concatenated with no separator between them.
    """
    # Clean every comment, then join them with single spaces into one text.
    cleaned_text = " ".join(clean(row[4]) for row in sentence).strip()

    # A set makes each membership test O(1); the stop-word list would
    # otherwise be scanned once per token.
    stop_words = set(stopWordList())

    # str.join avoids repeated string concatenation inside the loop.
    return ''.join(
        word
        for word in jieba.cut(cleaned_text)
        if word not in stop_words and word != '\t'
    )
def writer_comments_cuts():
    """Segment all comments and write them to the ``targetTxt`` file.

    The filtered text returned by seg_depart() is segmented again with
    jieba, the tokens are joined with single spaces, and the result is
    written as one line. Any previous content of the file is overwritten.
    """
    with open(targetTxt, mode='w+', encoding='utf-8') as out_file:
        filtered_text = seg_depart(getAllCommentsData())
        tokens = jieba.cut(filtered_text)
        # One line of space-separated tokens followed by a newline.
        out_file.writelines((' '.join(tokens), '\n'))
        print('写入成功')
# Patterns are compiled once at import time rather than on every call.
# "@user" mentions, optionally preceded by the reply marker or "//".
_MENTION_RE = re.compile(r"(回复)?(//)?\s*@\S*?\s*(:| |$)")
# Bracketed emoticon codes such as "[doge]". The character class excludes
# brackets and whitespace so one match can never span two emoticons and
# swallow the text between them.
_EMOTICON_RE = re.compile(r"\[[^\[\]\s]+\]")
# Code points outside the Basic Multilingual Plane (where emoji live).
_ASTRAL_RE = re.compile(u'[\U00010000-\U0010ffff]')
_URL_RE = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
    re.IGNORECASE)
_WHITESPACE_RE = re.compile(r"\s+")


def clean(text):
    """Strip mentions, emoticons, emoji, URLs and boilerplate from a post.

    Args:
        text (str): raw post or comment text.

    Returns:
        str: the cleaned text with runs of whitespace collapsed to single
        spaces and leading/trailing whitespace removed. Hashtag topics
        (``#...#``) are intentionally kept.
    """
    # Replace "@user" mentions (and reply/forward prefixes) with a space.
    text = _MENTION_RE.sub(" ", text)
    # Remove bracketed emoticon codes.
    text = _EMOTICON_RE.sub("", text)
    # Remove emoji and other characters outside the BMP.
    text = _ASTRAL_RE.sub('', text)
    # Remove URLs.
    text = _URL_RE.sub("", text)
    # Remove the boilerplate phrase that carries no meaning.
    text = text.replace("转发微博", "")
    # Collapse runs of whitespace into a single space.
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()


if __name__ == '__main__':
    writer_comments_cuts()
    # For a quick manual check, print the result of clean() on a sample comment.
File diff suppressed because one or more lines are too long
-1
View File
@@ -1,4 +1,3 @@
from flask import render_template
def errorResponse(errorMsg):
    """Render the ``error.html`` template for the given message.

    Args:
        errorMsg: value passed to the template under the name ``errorMsg``.

    Returns:
        Whatever ``render_template`` returns for ``error.html``.
    """
    template_context = {'errorMsg': errorMsg}
    return render_template('error.html', **template_context)
+1 -1
View File
@@ -79,7 +79,7 @@ def getAllArticleData():
def getAllHotWords():
data = []
df = pd.read_csv('./model/cipingTotal.csv',encoding='utf8')
df = pd.read_csv('./utils/cipingTotal.csv',encoding='utf8')
for i in df.values:
try:
data.append([
+1 -1
View File
@@ -1,5 +1,5 @@
from pymysql import *
conn = connect(host='10.92.35.13',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
conn = connect(host='localhost',port=3306,user='root',password='123456',database='weiboarticles')
cursor = conn.cursor()
def query(sql,params,type="no_select"):
params = tuple(params)
+8
View File
@@ -719,6 +719,8 @@ sup
哈哈
哈哈哈
哈哈哈哈
哎呀
@@ -742,7 +744,13 @@ sup
哼唷
唯有
特别
超级
越来
越来越
啊啊
啊啊啊
啊呀
啊哈
啊哟