【大修bug】添加csv表格原始数据,修改词频统计函数bug
This commit is contained in:
@@ -0,0 +1,100 @@
|
||||
('哈哈', 1236)
|
||||
('哈哈哈', 537)
|
||||
('哈哈哈哈', 157)
|
||||
('真的', 154)
|
||||
('期待', 89)
|
||||
('喜欢', 89)
|
||||
('doge', 88)
|
||||
('宝宝', 87)
|
||||
('可爱', 79)
|
||||
('第一', 73)
|
||||
('演唱', 71)
|
||||
('亲亲', 71)
|
||||
('苦涩', 70)
|
||||
('啊啊啊', 68)
|
||||
('抱抱', 64)
|
||||
('cry', 64)
|
||||
('宝贝', 62)
|
||||
('姐姐', 51)
|
||||
('花花', 50)
|
||||
('送花', 48)
|
||||
('开心', 47)
|
||||
('加油', 47)
|
||||
('老师', 46)
|
||||
('call', 45)
|
||||
('特别', 42)
|
||||
('一个', 42)
|
||||
('抓狂', 40)
|
||||
('嘻嘻', 39)
|
||||
('心心', 38)
|
||||
('悲伤', 38)
|
||||
('世界', 37)
|
||||
('感觉', 35)
|
||||
('孩子', 35)
|
||||
('朋友', 34)
|
||||
('鲜花', 34)
|
||||
('开学', 34)
|
||||
('好好', 34)
|
||||
('演唱会', 33)
|
||||
('感谢', 32)
|
||||
('憧憬', 31)
|
||||
('学季', 31)
|
||||
('快乐', 30)
|
||||
('漂亮', 30)
|
||||
('中国', 30)
|
||||
('音乐', 29)
|
||||
('电影', 28)
|
||||
('莲花', 28)
|
||||
('骄阳', 28)
|
||||
('视频', 27)
|
||||
('老公', 27)
|
||||
('老婆', 27)
|
||||
('值得', 26)
|
||||
('好看', 26)
|
||||
('消失', 26)
|
||||
('希望', 25)
|
||||
('呜呜', 25)
|
||||
('少年', 25)
|
||||
('东西', 25)
|
||||
('实力', 24)
|
||||
('评论', 24)
|
||||
('舞台', 24)
|
||||
('生活', 24)
|
||||
('单身', 24)
|
||||
('努力', 23)
|
||||
('唯一', 23)
|
||||
('幸福', 23)
|
||||
('时间', 23)
|
||||
('超级', 23)
|
||||
('辈子', 22)
|
||||
('童年', 22)
|
||||
('时代', 22)
|
||||
('可怜', 21)
|
||||
('不见', 21)
|
||||
('工作', 21)
|
||||
('有人', 21)
|
||||
('终于', 21)
|
||||
('粉丝', 21)
|
||||
('国家', 21)
|
||||
('callcallcall', 21)
|
||||
('永远', 21)
|
||||
('太阳', 20)
|
||||
('直播', 20)
|
||||
('小时', 20)
|
||||
('星期', 20)
|
||||
('安全', 20)
|
||||
('代言', 19)
|
||||
('支持', 19)
|
||||
('彩虹', 19)
|
||||
('妈妈', 18)
|
||||
('华为', 18)
|
||||
('优秀', 18)
|
||||
('好像', 18)
|
||||
('越来', 18)
|
||||
('大人', 18)
|
||||
('父母', 18)
|
||||
('害怕', 18)
|
||||
('安哥', 18)
|
||||
('加班', 18)
|
||||
('一点', 18)
|
||||
('一场', 17)
|
||||
|
+2565
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,60 @@
|
||||
typeName,gid,containerid
|
||||
热门,102803,102803
|
||||
同城,1028032222,102803_2222
|
||||
榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
|
||||
男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
|
||||
明星,1028034288,102803_ctg1_4288_-_ctg1_4288
|
||||
车展,1028035188,102803_ctg1_5188_-_ctg1_5188
|
||||
搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
|
||||
情感,1028031988,102803_ctg1_1988_-_ctg1_1988
|
||||
周末,102803600195,102803_ctg1_600195_-_ctg1_600195
|
||||
电影,1028033288,102803_ctg1_3288_-_ctg1_3288
|
||||
社会,1028034188,102803_ctg1_4188_-_ctg1_4188
|
||||
电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
|
||||
美食,1028032688,102803_ctg1_2688_-_ctg1_2688
|
||||
俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
|
||||
国际,1028036288,102803_ctg1_6288_-_ctg1_6288
|
||||
深度,102803600155,102803_ctg1_600155_-_ctg1_600155
|
||||
财经,1028036388,102803_ctg1_6388_-_ctg1_6388
|
||||
读书,1028034588,102803_ctg1_4588_-_ctg1_4588
|
||||
摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
|
||||
颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
|
||||
体育,1028031388,102803_ctg1_1388_-_ctg1_1388
|
||||
数码,1028035088,102803_ctg1_5088_-_ctg1_5088
|
||||
综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
|
||||
时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
|
||||
星座,1028031688,102803_ctg1_1688_-_ctg1_1688
|
||||
军事,1028036688,102803_ctg1_6688_-_ctg1_6688
|
||||
股市,1028031288,102803_ctg1_1288_-_ctg1_1288
|
||||
房产,1028035588,102803_ctg1_5588_-_ctg1_5588
|
||||
家居,1028035888,102803_ctg1_5888_-_ctg1_5888
|
||||
萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
|
||||
科技,1028032088,102803_ctg1_2088_-_ctg1_2088
|
||||
科普,1028035988,102803_ctg1_5988_-_ctg1_5988
|
||||
动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
|
||||
运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
|
||||
旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
|
||||
瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
|
||||
好物,102803600094,102803_ctg1_600094_-_ctg1_600094
|
||||
历史,1028036788,102803_ctg1_6788_-_ctg1_6788
|
||||
艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
|
||||
美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
|
||||
法律,1028037388,102803_ctg1_7388_-_ctg1_7388
|
||||
设计,1028035388,102803_ctg1_5388_-_ctg1_5388
|
||||
健康,1028032188,102803_ctg1_2188_-_ctg1_2188
|
||||
音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
|
||||
游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
|
||||
新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
|
||||
校园,102803600177,102803_ctg1_600177_-_ctg1_600177
|
||||
收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
|
||||
政务,1028035788,102803_ctg1_5788_-_ctg1_5788
|
||||
养生,1028036588,102803_ctg1_6588_-_ctg1_6588
|
||||
育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
|
||||
抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
|
||||
教育,102803600080,102803_ctg1_600080_-_ctg1_600080
|
||||
婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
|
||||
舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
|
||||
辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
|
||||
公益,102803600057,102803_ctg1_600057_-_ctg1_600057
|
||||
问答,1028037977,102803_ctg1_7977_-_ctg1_7977
|
||||
三农,1028037188,102803_ctg1_7188_-_ctg1_7188
|
||||
|
+48
-49
@@ -1,21 +1,22 @@
|
||||
from utils.getPublicData import *
|
||||
from snownlp import SnowNLP
|
||||
articleList = getAllArticleData()
|
||||
commentList = getAllCommentsData()
|
||||
|
||||
def getTypeList():# 返回爬取到的所有文章的类型(已去重)
|
||||
def getTypeList():
|
||||
return list(set([x[8] for x in getAllArticleData()]))
|
||||
|
||||
def getArticleByType(type):# 根据特定文章类型筛选文章
|
||||
def getArticleByType(type):
|
||||
articles = []
|
||||
for i in articleList:
|
||||
if i[8] == type:
|
||||
articles.append(i)
|
||||
return articles
|
||||
|
||||
def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
|
||||
def getArticleCharLikeCount(type):
|
||||
articles = getArticleByType(type)
|
||||
xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
|
||||
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
|
||||
yData = [0 for x in range(len(xData))]
|
||||
for article in articles:
|
||||
likeCount = int(article[1])
|
||||
if likeCount < 100:
|
||||
@@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
|
||||
yData[6] += 1
|
||||
return xData,yData
|
||||
|
||||
def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
|
||||
def getArticleCharCommentsLen(type):
|
||||
articles = getArticleByType(type)
|
||||
xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
|
||||
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
|
||||
yData = [0 for x in range(len(xData))]
|
||||
for article in articles:
|
||||
commentLen = int(article[2])
|
||||
if commentLen < 100:
|
||||
@@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
|
||||
yData[8] += 1
|
||||
return xData,yData
|
||||
|
||||
def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
|
||||
def getArticleCharRepotsLen(type):
|
||||
articles = getArticleByType(type)
|
||||
xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
|
||||
yData = [0 for x in range(len(xData))]
|
||||
@@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
|
||||
yData[11] += 1
|
||||
return xData,yData
|
||||
|
||||
def getIPCharByArticleRegion():#统计文章发布地域的分布情况
|
||||
def getIPCharByArticleRegion():
|
||||
articleRegionDic = {}
|
||||
for i in articleList:
|
||||
if i[4] != '无':# 如果ip为确定值的话就进行下一步
|
||||
if i[4] in articleRegionDic.keys():
|
||||
articleRegionDic[i[4]] += 1
|
||||
else:
|
||||
if i[4] != '无':
|
||||
if articleRegionDic.get(i[4],-1) == -1:
|
||||
articleRegionDic[i[4]] = 1
|
||||
else:
|
||||
articleRegionDic[i[4]] += 1
|
||||
resultData = []
|
||||
for key,value in articleRegionDic.items():
|
||||
resultData.append({
|
||||
@@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
|
||||
})
|
||||
return resultData
|
||||
|
||||
def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
|
||||
def getIPCharByCommentsRegion():
|
||||
commentRegionDic = {}
|
||||
for i in commentList:
|
||||
if i[3] != '无':
|
||||
if i[3] in commentRegionDic.keys():
|
||||
commentRegionDic[i[3]] += 1
|
||||
else:
|
||||
if commentRegionDic.get(i[3],-1) == -1:
|
||||
commentRegionDic[i[3]] = 1
|
||||
else:
|
||||
commentRegionDic[i[3]] += 1
|
||||
resultData = []
|
||||
for key,value in commentRegionDic.items():
|
||||
resultData.append({
|
||||
@@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
|
||||
})
|
||||
return resultData
|
||||
|
||||
def getCommentCharDataOne():# 统计评论点赞数的分布情况
|
||||
def getCommentCharDataOne():
|
||||
xData = []
|
||||
rangeNum = 20
|
||||
for item in range(100):
|
||||
for item in range(1,100):
|
||||
xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
|
||||
yData = [0 for x in range(len(xData))]
|
||||
for comment in commentList:
|
||||
for item in range(100):
|
||||
if int(comment[2]) < rangeNum * (item + 1):
|
||||
for item in range(99):
|
||||
if int(comment[2]) < rangeNum * (item + 2):
|
||||
yData[item] += 1
|
||||
break
|
||||
return xData,yData
|
||||
|
||||
def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
|
||||
def getCommentCharDataTwo():
|
||||
genderDic = {}
|
||||
for i in commentList:
|
||||
if i[6] in genderDic.keys():
|
||||
genderDic[i[6]] += 1
|
||||
else:
|
||||
if genderDic.get(i[6],-1) == -1:
|
||||
genderDic[i[6]] = 1
|
||||
resultData = []
|
||||
for key,value in genderDic.items():
|
||||
resultData.append({
|
||||
'name':key,
|
||||
'value':value
|
||||
})
|
||||
else:
|
||||
genderDic[i[6]] += 1
|
||||
resultData = [{
|
||||
'name':x[0],
|
||||
'value':x[1]
|
||||
} for x in genderDic.items()]
|
||||
return resultData
|
||||
|
||||
def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
|
||||
def getYuQingCharDataOne():
|
||||
hotWordList = getAllHotWords()
|
||||
xData = ['正面','中性','负面']
|
||||
yData = [0,0,0]
|
||||
@@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
|
||||
yData[1] += 1
|
||||
elif emotionValue < 0.5:
|
||||
yData[2] += 1
|
||||
finalData = [{
|
||||
bieData = [{
|
||||
'name':x,
|
||||
'value':yData[index]
|
||||
} for index,x in enumerate(xData)]
|
||||
return xData,yData,finalData
|
||||
return xData,yData,bieData
|
||||
|
||||
def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
|
||||
def getYuQingCharDataTwo():
|
||||
xData = ['正面', '中性', '负面']
|
||||
finalData1 = [{
|
||||
bieData1 = [{
|
||||
'name':x,
|
||||
'value':0
|
||||
} for x in xData]
|
||||
finalData2 = [{
|
||||
bieData2 = [{
|
||||
'name': x,
|
||||
'value': 0
|
||||
} for x in xData]
|
||||
@@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
|
||||
for comment in commentList:
|
||||
emotionValue = SnowNLP(comment[4]).sentiments
|
||||
if emotionValue > 0.5:
|
||||
finalData1[0]['value'] += 1
|
||||
bieData1[0]['value'] += 1
|
||||
elif emotionValue == 0.5:
|
||||
finalData1[1]['value'] += 1
|
||||
bieData1[1]['value'] += 1
|
||||
elif emotionValue < 0.5:
|
||||
finalData1[2]['value'] += 1
|
||||
bieData1[2]['value'] += 1
|
||||
for artile in articleList:
|
||||
emotionValue = SnowNLP(artile[5]).sentiments
|
||||
if emotionValue > 0.5:
|
||||
finalData2[0]['value'] += 1
|
||||
bieData2[0]['value'] += 1
|
||||
elif emotionValue == 0.5:
|
||||
finalData2[1]['value'] += 1
|
||||
bieData2[1]['value'] += 1
|
||||
elif emotionValue < 0.5:
|
||||
finalData2[2]['value'] += 1
|
||||
return finalData1,finalData2
|
||||
bieData2[2]['value'] += 1
|
||||
return bieData1,bieData2
|
||||
|
||||
def getYuQingCharDataThree():# 提取前10个热词及其对应的出现频率
|
||||
def getYuQingCharDataThree():
|
||||
hotWordList = getAllHotWords()
|
||||
xData = []
|
||||
yData = []
|
||||
x1Data = []
|
||||
y1Data = []
|
||||
for i in hotWordList[:10]:
|
||||
xData.append(i[0])
|
||||
yData.append(int(i[1]))
|
||||
return xData,yData
|
||||
x1Data.append(i[0])
|
||||
y1Data.append(int(i[1]))
|
||||
return x1Data,y1Data
|
||||
|
||||
|
||||
Reference in New Issue
Block a user