【大修bug】添加csv表格原始数据,修改词频统计函数bug

This commit is contained in:
戒酒的李白
2024-07-03 15:45:08 +08:00
parent d93da880cf
commit f98d111c32
6 changed files with 2773 additions and 49 deletions
+100
View File
@@ -0,0 +1,100 @@
('哈哈', 1236)
('哈哈哈', 537)
('哈哈哈哈', 157)
('真的', 154)
('期待', 89)
('喜欢', 89)
('doge', 88)
('宝宝', 87)
('可爱', 79)
('第一', 73)
('演唱', 71)
('亲亲', 71)
('苦涩', 70)
('啊啊啊', 68)
('抱抱', 64)
('cry', 64)
('宝贝', 62)
('姐姐', 51)
('花花', 50)
('送花', 48)
('开心', 47)
('加油', 47)
('老师', 46)
('call', 45)
('特别', 42)
('一个', 42)
('抓狂', 40)
('嘻嘻', 39)
('心心', 38)
('悲伤', 38)
('世界', 37)
('感觉', 35)
('孩子', 35)
('朋友', 34)
('鲜花', 34)
('开学', 34)
('好好', 34)
('演唱会', 33)
('感谢', 32)
('憧憬', 31)
('学季', 31)
('快乐', 30)
('漂亮', 30)
('中国', 30)
('音乐', 29)
('电影', 28)
('莲花', 28)
('骄阳', 28)
('视频', 27)
('老公', 27)
('老婆', 27)
('值得', 26)
('好看', 26)
('消失', 26)
('希望', 25)
('呜呜', 25)
('少年', 25)
('东西', 25)
('实力', 24)
('评论', 24)
('舞台', 24)
('生活', 24)
('单身', 24)
('努力', 23)
('唯一', 23)
('幸福', 23)
('时间', 23)
('超级', 23)
('辈子', 22)
('童年', 22)
('时代', 22)
('可怜', 21)
('不见', 21)
('工作', 21)
('有人', 21)
('终于', 21)
('粉丝', 21)
('国家', 21)
('callcallcall', 21)
('永远', 21)
('太阳', 20)
('直播', 20)
('小时', 20)
('星期', 20)
('安全', 20)
('代言', 19)
('支持', 19)
('彩虹', 19)
('妈妈', 18)
('华为', 18)
('优秀', 18)
('好像', 18)
('越来', 18)
('大人', 18)
('父母', 18)
('害怕', 18)
('安哥', 18)
('加班', 18)
('一点', 18)
('一场', 17)
1 ('哈哈' 1236)
2 ('哈哈哈' 537)
3 ('哈哈哈哈' 157)
4 ('真的' 154)
5 ('期待' 89)
6 ('喜欢' 89)
7 ('doge' 88)
8 ('宝宝' 87)
9 ('可爱' 79)
10 ('第一' 73)
11 ('演唱' 71)
12 ('亲亲' 71)
13 ('苦涩' 70)
14 ('啊啊啊' 68)
15 ('抱抱' 64)
16 ('cry' 64)
17 ('宝贝' 62)
18 ('姐姐' 51)
19 ('花花' 50)
20 ('送花' 48)
21 ('开心' 47)
22 ('加油' 47)
23 ('老师' 46)
24 ('call' 45)
25 ('特别' 42)
26 ('一个' 42)
27 ('抓狂' 40)
28 ('嘻嘻' 39)
29 ('心心' 38)
30 ('悲伤' 38)
31 ('世界' 37)
32 ('感觉' 35)
33 ('孩子' 35)
34 ('朋友' 34)
35 ('鲜花' 34)
36 ('开学' 34)
37 ('好好' 34)
38 ('演唱会' 33)
39 ('感谢' 32)
40 ('憧憬' 31)
41 ('学季' 31)
42 ('快乐' 30)
43 ('漂亮' 30)
44 ('中国' 30)
45 ('音乐' 29)
46 ('电影' 28)
47 ('莲花' 28)
48 ('骄阳' 28)
49 ('视频' 27)
50 ('老公' 27)
51 ('老婆' 27)
52 ('值得' 26)
53 ('好看' 26)
54 ('消失' 26)
55 ('希望' 25)
56 ('呜呜' 25)
57 ('少年' 25)
58 ('东西' 25)
59 ('实力' 24)
60 ('评论' 24)
61 ('舞台' 24)
62 ('生活' 24)
63 ('单身' 24)
64 ('努力' 23)
65 ('唯一' 23)
66 ('幸福' 23)
67 ('时间' 23)
68 ('超级' 23)
69 ('辈子' 22)
70 ('童年' 22)
71 ('时代' 22)
72 ('可怜' 21)
73 ('不见' 21)
74 ('工作' 21)
75 ('有人' 21)
76 ('终于' 21)
77 ('粉丝' 21)
78 ('国家' 21)
79 ('callcallcall' 21)
80 ('永远' 21)
81 ('太阳' 20)
82 ('直播' 20)
83 ('小时' 20)
84 ('星期' 20)
85 ('安全' 20)
86 ('代言' 19)
87 ('支持' 19)
88 ('彩虹' 19)
89 ('妈妈' 18)
90 ('华为' 18)
91 ('优秀' 18)
92 ('好像' 18)
93 ('越来' 18)
94 ('大人' 18)
95 ('父母' 18)
96 ('害怕' 18)
97 ('安哥' 18)
98 ('加班' 18)
99 ('一点' 18)
100 ('一场' 17)
+2565
View File
File diff suppressed because it is too large Load Diff
+60
View File
@@ -0,0 +1,60 @@
typeName,gid,containerid
热门,102803,102803
同城,1028032222,102803_2222
榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
明星,1028034288,102803_ctg1_4288_-_ctg1_4288
车展,1028035188,102803_ctg1_5188_-_ctg1_5188
搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
情感,1028031988,102803_ctg1_1988_-_ctg1_1988
周末,102803600195,102803_ctg1_600195_-_ctg1_600195
电影,1028033288,102803_ctg1_3288_-_ctg1_3288
社会,1028034188,102803_ctg1_4188_-_ctg1_4188
电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
美食,1028032688,102803_ctg1_2688_-_ctg1_2688
俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
国际,1028036288,102803_ctg1_6288_-_ctg1_6288
深度,102803600155,102803_ctg1_600155_-_ctg1_600155
财经,1028036388,102803_ctg1_6388_-_ctg1_6388
读书,1028034588,102803_ctg1_4588_-_ctg1_4588
摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
体育,1028031388,102803_ctg1_1388_-_ctg1_1388
数码,1028035088,102803_ctg1_5088_-_ctg1_5088
综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
星座,1028031688,102803_ctg1_1688_-_ctg1_1688
军事,1028036688,102803_ctg1_6688_-_ctg1_6688
股市,1028031288,102803_ctg1_1288_-_ctg1_1288
房产,1028035588,102803_ctg1_5588_-_ctg1_5588
家居,1028035888,102803_ctg1_5888_-_ctg1_5888
萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
科技,1028032088,102803_ctg1_2088_-_ctg1_2088
科普,1028035988,102803_ctg1_5988_-_ctg1_5988
动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
好物,102803600094,102803_ctg1_600094_-_ctg1_600094
历史,1028036788,102803_ctg1_6788_-_ctg1_6788
艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
法律,1028037388,102803_ctg1_7388_-_ctg1_7388
设计,1028035388,102803_ctg1_5388_-_ctg1_5388
健康,1028032188,102803_ctg1_2188_-_ctg1_2188
音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
校园,102803600177,102803_ctg1_600177_-_ctg1_600177
收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
政务,1028035788,102803_ctg1_5788_-_ctg1_5788
养生,1028036588,102803_ctg1_6588_-_ctg1_6588
育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
教育,102803600080,102803_ctg1_600080_-_ctg1_600080
婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
公益,102803600057,102803_ctg1_600057_-_ctg1_600057
问答,1028037977,102803_ctg1_7977_-_ctg1_7977
三农,1028037188,102803_ctg1_7188_-_ctg1_7188
1 typeName gid containerid
2 热门 102803 102803
3 同城 1028032222 102803_2222
4 榜单 102803600169 102803_ctg1_600169_-_ctg1_600169
5 男篮 102803600279 102803_ctg1_600279_-_ctg1_600279
6 明星 1028034288 102803_ctg1_4288_-_ctg1_4288
7 车展 1028035188 102803_ctg1_5188_-_ctg1_5188
8 搞笑 1028034388 102803_ctg1_4388_-_ctg1_4388
9 情感 1028031988 102803_ctg1_1988_-_ctg1_1988
10 周末 102803600195 102803_ctg1_600195_-_ctg1_600195
11 电影 1028033288 102803_ctg1_3288_-_ctg1_3288
12 社会 1028034188 102803_ctg1_4188_-_ctg1_4188
13 电视剧 1028032488 102803_ctg1_2488_-_ctg1_2488
14 美食 1028032688 102803_ctg1_2688_-_ctg1_2688
15 俄乌局势 102803600267 102803_ctg1_600267_-_ctg1_600267
16 国际 1028036288 102803_ctg1_6288_-_ctg1_6288
17 深度 102803600155 102803_ctg1_600155_-_ctg1_600155
18 财经 1028036388 102803_ctg1_6388_-_ctg1_6388
19 读书 1028034588 102803_ctg1_4588_-_ctg1_4588
20 摄影 1028034988 102803_ctg1_4988_-_ctg1_4988
21 颜值 102803600165 102803_ctg1_600165_-_ctg1_600165
22 体育 1028031388 102803_ctg1_1388_-_ctg1_1388
23 数码 1028035088 102803_ctg1_5088_-_ctg1_5088
24 综艺 1028034688 102803_ctg1_4688_-_ctg1_4688
25 时尚 1028034488 102803_ctg1_4488_-_ctg1_4488
26 星座 1028031688 102803_ctg1_1688_-_ctg1_1688
27 军事 1028036688 102803_ctg1_6688_-_ctg1_6688
28 股市 1028031288 102803_ctg1_1288_-_ctg1_1288
29 房产 1028035588 102803_ctg1_5588_-_ctg1_5588
30 家居 1028035888 102803_ctg1_5888_-_ctg1_5888
31 萌宠 1028032788 102803_ctg1_2788_-_ctg1_2788
32 科技 1028032088 102803_ctg1_2088_-_ctg1_2088
33 科普 1028035988 102803_ctg1_5988_-_ctg1_5988
34 动漫 1028032388 102803_ctg1_2388_-_ctg1_2388
35 运动健身 1028034788 102803_ctg1_4788_-_ctg1_4788
36 旅游 1028032588 102803_ctg1_2588_-_ctg1_2588
37 瘦身 1028036488 102803_ctg1_6488_-_ctg1_6488
38 好物 102803600094 102803_ctg1_600094_-_ctg1_600094
39 历史 1028036788 102803_ctg1_6788_-_ctg1_6788
40 艺术 1028035488 102803_ctg1_5488_-_ctg1_5488
41 美妆 1028031588 102803_ctg1_1588_-_ctg1_1588
42 法律 1028037388 102803_ctg1_7388_-_ctg1_7388
43 设计 1028035388 102803_ctg1_5388_-_ctg1_5388
44 健康 1028032188 102803_ctg1_2188_-_ctg1_2188
45 音乐 1028035288 102803_ctg1_5288_-_ctg1_5288
46 游戏 1028034888 102803_ctg1_4888_-_ctg1_4888
47 新时代 1028037968 102803_ctg1_7968_-_ctg1_7968
48 校园 102803600177 102803_ctg1_600177_-_ctg1_600177
49 收藏 1028038189 102803_ctg1_8189_-_ctg1_8189
50 政务 1028035788 102803_ctg1_5788_-_ctg1_5788
51 养生 1028036588 102803_ctg1_6588_-_ctg1_6588
52 育儿 1028033188 102803_ctg1_3188_-_ctg1_3188
53 抽奖 102803600037 102803_ctg1_600037_-_ctg1_600037
54 教育 102803600080 102803_ctg1_600080_-_ctg1_600080
55 婚恋 1028031788 102803_ctg1_1788_-_ctg1_1788
56 舞蹈 1028038788 102803_ctg1_8788_-_ctg1_8788
57 辟谣 1028036988 102803_ctg1_6988_-_ctg1_6988
58 公益 102803600057 102803_ctg1_600057_-_ctg1_600057
59 问答 1028037977 102803_ctg1_7977_-_ctg1_7977
60 三农 1028037188 102803_ctg1_7188_-_ctg1_7188
+48 -49
View File
@@ -1,21 +1,22 @@
from utils.getPublicData import *
from snownlp import SnowNLP
# Module-level snapshots of the article and comment rows, evaluated once when
# this module is imported; the chart helpers below iterate over these lists.
articleList = getAllArticleData()
commentList = getAllCommentsData()
def getTypeList(articles=None):
    """Return the distinct article types found in the article rows.

    Args:
        articles: Optional iterable of article rows; the type is read from
            index 8 of each row. When omitted, the rows are fetched with
            ``getAllArticleData()``.

    Returns:
        A list of the unique type values. The order is unspecified because
        the values are de-duplicated through a set.
    """
    if articles is None:
        articles = getAllArticleData()
    return list({row[8] for row in articles})
def getArticleByType(type, articles=None):
    """Return the article rows whose type column equals ``type``.

    Args:
        type: Type value to match against index 8 of each row. The name
            shadows the builtin but is kept so keyword callers still work.
        articles: Optional iterable of article rows. When omitted, the
            module-level ``articleList`` is used.

    Returns:
        A new list with the matching rows, in their original order.
    """
    if articles is None:
        articles = articleList
    return [row for row in articles if row[8] == type]
def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
def getArticleCharLikeCount(type):
articles = getArticleByType(type)
xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
yData = [0 for x in range(len(xData))]
for article in articles:
likeCount = int(article[1])
if likeCount < 100:
@@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
yData[6] += 1
return xData,yData
def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
def getArticleCharCommentsLen(type):
articles = getArticleByType(type)
xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
yData = [0 for x in range(len(xData))]
for article in articles:
commentLen = int(article[2])
if commentLen < 100:
@@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
yData[8] += 1
return xData,yData
def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
def getArticleCharRepotsLen(type):
articles = getArticleByType(type)
xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
yData = [0 for x in range(len(xData))]
@@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
yData[11] += 1
return xData,yData
def getIPCharByArticleRegion():#统计文章发布地域的分布情况
def getIPCharByArticleRegion():
articleRegionDic = {}
for i in articleList:
if i[4] != '':# 如果ip为确定值的话就进行下一步
if i[4] in articleRegionDic.keys():
articleRegionDic[i[4]] += 1
else:
if i[4] != '':
if articleRegionDic.get(i[4],-1) == -1:
articleRegionDic[i[4]] = 1
else:
articleRegionDic[i[4]] += 1
resultData = []
for key,value in articleRegionDic.items():
resultData.append({
@@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
})
return resultData
def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
def getIPCharByCommentsRegion():
commentRegionDic = {}
for i in commentList:
if i[3] != '':
if i[3] in commentRegionDic.keys():
commentRegionDic[i[3]] += 1
else:
if commentRegionDic.get(i[3],-1) == -1:
commentRegionDic[i[3]] = 1
else:
commentRegionDic[i[3]] += 1
resultData = []
for key,value in commentRegionDic.items():
resultData.append({
@@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
})
return resultData
def getCommentCharDataOne(comments=None):
    """Bucket comments by like count for the like-distribution chart.

    The like count is read from index 2 of each comment row and converted
    with ``int``. There are 99 buckets: the first collects every value
    below 40, each following bucket is 20 wide, and the last one ends at
    2000. Values of 2000 or more are not counted in any bucket.

    Args:
        comments: Optional iterable of comment rows. When omitted, the
            module-level ``commentList`` is used.

    Returns:
        A tuple ``(xData, yData)``: the bucket labels and, at the same
        positions, the number of comments falling into each bucket.
    """
    if comments is None:
        comments = commentList

    rangeNum = 20
    bucketCount = 99
    upperLimit = rangeNum * (bucketCount + 1)

    # The first bucket also receives the values below ``rangeNum``, so it is
    # labelled from 0; it was previously shown as '20-40' although it
    # counted 0-39.
    xData = ['0-' + str(rangeNum * 2)]
    for item in range(2, bucketCount + 1):
        xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))

    yData = [0] * len(xData)
    for comment in comments:
        likeCount = int(comment[2])
        if likeCount >= upperLimit:
            # Beyond the last bucket: left out of the chart.
            continue
        # Direct index instead of scanning the buckets one by one; the
        # clamp folds everything below 2 * rangeNum into the first bucket.
        yData[max(likeCount // rangeNum - 1, 0)] += 1
    return xData, yData
def getCommentCharDataTwo(comments=None):
    """Count comments per gender value for the gender chart.

    The gender value is read from index 6 of each comment row.

    Args:
        comments: Optional iterable of comment rows. When omitted, the
            module-level ``commentList`` is used.

    Returns:
        A list of ``{'name': gender, 'value': count}`` dicts, one per
        distinct gender value, in the order the values were first seen.
    """
    if comments is None:
        comments = commentList

    genderDic = {}
    for comment in comments:
        gender = comment[6]
        genderDic[gender] = genderDic.get(gender, 0) + 1

    return [{
        'name': name,
        'value': count
    } for name, count in genderDic.items()]
def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
def getYuQingCharDataOne():
hotWordList = getAllHotWords()
xData = ['正面','中性','负面']
yData = [0,0,0]
@@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
yData[1] += 1
elif emotionValue < 0.5:
yData[2] += 1
finalData = [{
bieData = [{
'name':x,
'value':yData[index]
} for index,x in enumerate(xData)]
return xData,yData,finalData
return xData,yData,bieData
def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
def getYuQingCharDataTwo():
xData = ['正面', '中性', '负面']
finalData1 = [{
bieData1 = [{
'name':x,
'value':0
} for x in xData]
finalData2 = [{
bieData2 = [{
'name': x,
'value': 0
} for x in xData]
@@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
for comment in commentList:
emotionValue = SnowNLP(comment[4]).sentiments
if emotionValue > 0.5:
finalData1[0]['value'] += 1
bieData1[0]['value'] += 1
elif emotionValue == 0.5:
finalData1[1]['value'] += 1
bieData1[1]['value'] += 1
elif emotionValue < 0.5:
finalData1[2]['value'] += 1
bieData1[2]['value'] += 1
for artile in articleList:
emotionValue = SnowNLP(artile[5]).sentiments
if emotionValue > 0.5:
finalData2[0]['value'] += 1
bieData2[0]['value'] += 1
elif emotionValue == 0.5:
finalData2[1]['value'] += 1
bieData2[1]['value'] += 1
elif emotionValue < 0.5:
finalData2[2]['value'] += 1
return finalData1,finalData2
bieData2[2]['value'] += 1
return bieData1,bieData2
def getYuQingCharDataThree(hotWords=None):
    """Return the first ten hot words and their frequencies.

    Args:
        hotWords: Optional sequence of hot-word rows, each holding the word
            at index 0 and its count at index 1. When omitted, the rows are
            fetched with ``getAllHotWords()``.
            NOTE(review): the rows are not sorted here; presumably the
            source is already ordered by frequency -- confirm.

    Returns:
        A tuple ``(x1Data, y1Data)``: the words and, at the same positions,
        their counts converted with ``int``. Fewer than ten entries are
        returned when fewer rows are available.
    """
    if hotWords is None:
        hotWords = getAllHotWords()

    topWords = hotWords[:10]
    x1Data = [row[0] for row in topWords]
    y1Data = [int(row[1]) for row in topWords]
    return x1Data, y1Data