bug修改
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,100 @@
|
||||
('宝宝', 142)
|
||||
('祝福', 80)
|
||||
('期待', 77)
|
||||
('喜欢', 73)
|
||||
('恭喜', 73)
|
||||
('接接', 71)
|
||||
('真的', 62)
|
||||
('第一', 50)
|
||||
('快乐', 49)
|
||||
('祖国', 34)
|
||||
('舞台', 33)
|
||||
('朋友', 33)
|
||||
('老公', 32)
|
||||
('毕业', 32)
|
||||
('谢谢', 28)
|
||||
('好好', 27)
|
||||
('开心', 27)
|
||||
('维维', 26)
|
||||
('加油', 25)
|
||||
('哥哥', 25)
|
||||
('视频', 24)
|
||||
('世界', 24)
|
||||
('永远', 23)
|
||||
('好听', 23)
|
||||
('香港', 23)
|
||||
('希望', 22)
|
||||
('孩子', 21)
|
||||
('七月', 20)
|
||||
('朋友圈', 19)
|
||||
('敦豪', 19)
|
||||
('生活', 18)
|
||||
('宝贝', 18)
|
||||
('合作', 18)
|
||||
('day', 18)
|
||||
('好看', 18)
|
||||
('可爱', 17)
|
||||
('老师', 17)
|
||||
('涂山', 17)
|
||||
('致敬', 17)
|
||||
('中国', 17)
|
||||
('感觉', 16)
|
||||
('生日', 16)
|
||||
('幸福', 16)
|
||||
('记得', 16)
|
||||
('追风', 16)
|
||||
('蟑螂', 16)
|
||||
('终于', 16)
|
||||
('评论', 15)
|
||||
('厉害', 15)
|
||||
('下次', 15)
|
||||
('一点', 15)
|
||||
('双人', 15)
|
||||
('见面', 15)
|
||||
('关注', 15)
|
||||
('实至名归', 14)
|
||||
('妹妹', 14)
|
||||
('打开', 14)
|
||||
('热巴', 14)
|
||||
('流水', 14)
|
||||
('任何', 13)
|
||||
('手机', 13)
|
||||
('活动', 13)
|
||||
('呜呜', 13)
|
||||
('何人', 13)
|
||||
('电影', 13)
|
||||
('你好', 13)
|
||||
('任何人', 13)
|
||||
('北京', 13)
|
||||
('粉丝', 13)
|
||||
('顺利', 13)
|
||||
('太棒', 12)
|
||||
('支持', 12)
|
||||
('奥运', 12)
|
||||
('人气', 12)
|
||||
('by', 12)
|
||||
('漂亮', 12)
|
||||
('大哥', 12)
|
||||
('生日快乐', 12)
|
||||
('老婆', 12)
|
||||
('精彩', 12)
|
||||
('工作', 12)
|
||||
('照顾', 12)
|
||||
('迢迢', 12)
|
||||
('时间', 12)
|
||||
('初心', 12)
|
||||
('更好', 11)
|
||||
('早安', 11)
|
||||
('未来', 11)
|
||||
('美好', 11)
|
||||
('造型', 11)
|
||||
('晚上', 11)
|
||||
('满满', 11)
|
||||
('火炬', 10)
|
||||
('明天', 10)
|
||||
('魅力', 10)
|
||||
('实况', 10)
|
||||
('爷爷', 10)
|
||||
('骄傲', 10)
|
||||
('有没有', 10)
|
||||
('火炬手', 10)
|
||||
|
@@ -0,0 +1,31 @@
|
||||
import jieba
|
||||
import re
|
||||
|
||||
def main():
    """Count word frequencies in ``./cutComments.txt`` and save the top 100.

    The text is segmented with jieba in full mode (``cut_all=True``). Tokens
    that are a single character long, or that contain a digit or a non-word
    character, are discarded. The most frequent remaining tokens are written
    to ``cipingTotal.csv``, one ``(word, count)`` tuple per line, most
    frequent first. If fewer than 100 distinct tokens survive, all of them
    are written.
    """
    from collections import Counter

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if segmentation raises.
    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        strs = reader.read()

    # A token is rejected as soon as any character is a digit or a non-word
    # character; this is the union of the two separate searches used before.
    rejected = re.compile(r"\d|\W")
    words = (
        token
        for token in jieba.cut(strs, cut_all=True)
        if len(token) > 1 and not rejected.search(token)
    )

    # Counter tallies in a single pass; most_common(100) returns at most 100
    # entries, so short inputs no longer raise IndexError.
    top_words = Counter(words).most_common(100)

    with open('cipingTotal.csv', 'w', encoding='utf8') as result:
        for entry in top_words:
            # Each line is the tuple's repr, e.g. ('word', 42).
            print(entry, file=result)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,44 @@
|
||||
from utils.getPublicData import getAllCommentsData
|
||||
import jieba
|
||||
import re
|
||||
targetTxt = 'cutComments.txt'
|
||||
|
||||
def stopWordList():
    """Load the stop-word list from ``./stopWords.txt``.

    Returns:
        list[str]: One entry per line of the file, with surrounding
        whitespace stripped. Blank lines yield empty strings.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open('./stopWords.txt', encoding='utf8') as stop_file:
        return [line.strip() for line in stop_file]
|
||||
|
||||
def seg_depart(sentence):
    """Clean, segment and stop-word-filter a collection of comment rows.

    Args:
        sentence: Iterable of rows. Element ``[4]`` of each row is passed to
            ``clean`` (presumably the comment text column -- confirm against
            ``getAllCommentsData``).

    Returns:
        str: The tokens that are neither stop words nor a tab character,
        concatenated back together with no separator added.
    """
    joined = " ".join([clean(x[4]) for x in sentence]).strip()
    # A set makes each membership test O(1); the stop-word file is long and
    # is consulted once per token.
    stopWords = set(stopWordList())
    # join() builds the result in one pass instead of repeated `+=`.
    return "".join(
        word
        for word in jieba.cut(joined)
        if word not in stopWords and word != '\t'
    )
|
||||
|
||||
def writer_comments_cuts():
    """Segment all comments and write them, space-separated, to ``targetTxt``.

    The comments come from ``getAllCommentsData``, are filtered by
    ``seg_depart``, segmented again with jieba, and written as a single line
    followed by a newline. A confirmation message is printed on success.
    """
    # Build the full output before opening the file: opening in write mode
    # truncates it, so a failure while fetching or segmenting would otherwise
    # leave an empty file behind.
    filtered = seg_depart(getAllCommentsData())
    output = ' '.join(jieba.cut(filtered))

    # Plain 'w' is enough; the file is never read back here.
    with open(targetTxt, 'w', encoding='utf-8') as targetFile:
        targetFile.write(output)
        targetFile.write('\n')
    print('写入成功')
|
||||
|
||||
# Patterns used by clean(); compiled once at import time.
_MENTION_RE = re.compile(r"(回复)?(//)?\s*@\S*?\s*(:| |$)")
# Non-greedy so that each "[...]" emoticon is matched on its own; a greedy
# match would swallow everything between the first "[" and the last "]".
_EMOTICON_RE = re.compile(r"\[\S+?\]")
_SUPPLEMENTARY_RE = re.compile('[\U00010000-\U0010ffff]')
_URL_RE = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
    re.IGNORECASE)
_WHITESPACE_RE = re.compile(r"\s+")


def clean(text):
    """Strip mentions, emoticons, emoji, URLs and filler from a comment.

    Steps, in order:
      1. Replace ``@user`` mentions (optionally prefixed by "回复" or "//")
         with a single space.
      2. Remove bracketed emoticon codes such as ``[doge]``.
      3. Remove characters outside the Basic Multilingual Plane (the range
         that covers most emoji).
      4. Remove URLs.
      5. Remove the filler phrase "转发微博".
      6. Collapse runs of whitespace to one space and trim both ends.

    Hashtag topics (``#...#``) are intentionally kept.

    Args:
        text: Raw comment text.

    Returns:
        str: The cleaned text.
    """
    text = _MENTION_RE.sub(" ", text)
    text = _EMOTICON_RE.sub("", text)
    text = _SUPPLEMENTARY_RE.sub('', text)
    text = _URL_RE.sub("", text)
    text = text.replace("转发微博", "")
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()
|
||||
|
||||
if __name__ == '__main__':
|
||||
writer_comments_cuts()
|
||||
# print(clean("想到一次我也看到了这样的,我把我的外套(喷了淡茉莉香水的)递过去了,我当时觉得她可能是因为地铁空调有点冷一直环抱着,我借给她说冷的话可以披一下,我坐到终点站的,然后她说不用了,我好尴尬哇"))
|
||||
File diff suppressed because one or more lines are too long
@@ -1,4 +1,3 @@
|
||||
from flask import render_template
|
||||
|
||||
def errorResponse(errorMsg):
    """Render the ``error.html`` template with the given message.

    Args:
        errorMsg: Value passed to the template as ``errorMsg``.

    Returns:
        The result of ``render_template`` for ``error.html``.
    """
    page = render_template('error.html', errorMsg=errorMsg)
    return page
|
||||
@@ -79,7 +79,7 @@ def getAllArticleData():
|
||||
|
||||
def getAllHotWords():
|
||||
data = []
|
||||
df = pd.read_csv('./model/cipingTotal.csv',encoding='utf8')
|
||||
df = pd.read_csv('./utils/cipingTotal.csv',encoding='utf8')
|
||||
for i in df.values:
|
||||
try:
|
||||
data.append([
|
||||
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
from pymysql import *
|
||||
conn = connect(host='10.92.35.13',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
|
||||
conn = connect(host='localhost',port=3306,user='root',password='123456',database='weiboarticles')
|
||||
cursor = conn.cursor()
|
||||
def query(sql,params,type="no_select"):
|
||||
params = tuple(params)
|
||||
|
||||
@@ -719,6 +719,8 @@ sup
|
||||
哇
|
||||
哈
|
||||
哈哈
|
||||
哈哈哈
|
||||
哈哈哈哈
|
||||
哉
|
||||
哎
|
||||
哎呀
|
||||
@@ -742,7 +744,13 @@ sup
|
||||
哼唷
|
||||
唉
|
||||
唯有
|
||||
特别
|
||||
超级
|
||||
越来
|
||||
越来越
|
||||
啊
|
||||
啊啊
|
||||
啊啊啊
|
||||
啊呀
|
||||
啊哈
|
||||
啊哟
|
||||
|
||||
Reference in New Issue
Block a user