【大修bug】添加csv表格原始数据,修改词频统计函数bug
This commit is contained in:
@@ -0,0 +1,100 @@
|
|||||||
|
('哈哈', 1236)
|
||||||
|
('哈哈哈', 537)
|
||||||
|
('哈哈哈哈', 157)
|
||||||
|
('真的', 154)
|
||||||
|
('期待', 89)
|
||||||
|
('喜欢', 89)
|
||||||
|
('doge', 88)
|
||||||
|
('宝宝', 87)
|
||||||
|
('可爱', 79)
|
||||||
|
('第一', 73)
|
||||||
|
('演唱', 71)
|
||||||
|
('亲亲', 71)
|
||||||
|
('苦涩', 70)
|
||||||
|
('啊啊啊', 68)
|
||||||
|
('抱抱', 64)
|
||||||
|
('cry', 64)
|
||||||
|
('宝贝', 62)
|
||||||
|
('姐姐', 51)
|
||||||
|
('花花', 50)
|
||||||
|
('送花', 48)
|
||||||
|
('开心', 47)
|
||||||
|
('加油', 47)
|
||||||
|
('老师', 46)
|
||||||
|
('call', 45)
|
||||||
|
('特别', 42)
|
||||||
|
('一个', 42)
|
||||||
|
('抓狂', 40)
|
||||||
|
('嘻嘻', 39)
|
||||||
|
('心心', 38)
|
||||||
|
('悲伤', 38)
|
||||||
|
('世界', 37)
|
||||||
|
('感觉', 35)
|
||||||
|
('孩子', 35)
|
||||||
|
('朋友', 34)
|
||||||
|
('鲜花', 34)
|
||||||
|
('开学', 34)
|
||||||
|
('好好', 34)
|
||||||
|
('演唱会', 33)
|
||||||
|
('感谢', 32)
|
||||||
|
('憧憬', 31)
|
||||||
|
('学季', 31)
|
||||||
|
('快乐', 30)
|
||||||
|
('漂亮', 30)
|
||||||
|
('中国', 30)
|
||||||
|
('音乐', 29)
|
||||||
|
('电影', 28)
|
||||||
|
('莲花', 28)
|
||||||
|
('骄阳', 28)
|
||||||
|
('视频', 27)
|
||||||
|
('老公', 27)
|
||||||
|
('老婆', 27)
|
||||||
|
('值得', 26)
|
||||||
|
('好看', 26)
|
||||||
|
('消失', 26)
|
||||||
|
('希望', 25)
|
||||||
|
('呜呜', 25)
|
||||||
|
('少年', 25)
|
||||||
|
('东西', 25)
|
||||||
|
('实力', 24)
|
||||||
|
('评论', 24)
|
||||||
|
('舞台', 24)
|
||||||
|
('生活', 24)
|
||||||
|
('单身', 24)
|
||||||
|
('努力', 23)
|
||||||
|
('唯一', 23)
|
||||||
|
('幸福', 23)
|
||||||
|
('时间', 23)
|
||||||
|
('超级', 23)
|
||||||
|
('辈子', 22)
|
||||||
|
('童年', 22)
|
||||||
|
('时代', 22)
|
||||||
|
('可怜', 21)
|
||||||
|
('不见', 21)
|
||||||
|
('工作', 21)
|
||||||
|
('有人', 21)
|
||||||
|
('终于', 21)
|
||||||
|
('粉丝', 21)
|
||||||
|
('国家', 21)
|
||||||
|
('callcallcall', 21)
|
||||||
|
('永远', 21)
|
||||||
|
('太阳', 20)
|
||||||
|
('直播', 20)
|
||||||
|
('小时', 20)
|
||||||
|
('星期', 20)
|
||||||
|
('安全', 20)
|
||||||
|
('代言', 19)
|
||||||
|
('支持', 19)
|
||||||
|
('彩虹', 19)
|
||||||
|
('妈妈', 18)
|
||||||
|
('华为', 18)
|
||||||
|
('优秀', 18)
|
||||||
|
('好像', 18)
|
||||||
|
('越来', 18)
|
||||||
|
('大人', 18)
|
||||||
|
('父母', 18)
|
||||||
|
('害怕', 18)
|
||||||
|
('安哥', 18)
|
||||||
|
('加班', 18)
|
||||||
|
('一点', 18)
|
||||||
|
('一场', 17)
|
||||||
|
+2565
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,60 @@
|
|||||||
|
typeName,gid,containerid
|
||||||
|
热门,102803,102803
|
||||||
|
同城,1028032222,102803_2222
|
||||||
|
榜单,102803600169,102803_ctg1_600169_-_ctg1_600169
|
||||||
|
男篮,102803600279,102803_ctg1_600279_-_ctg1_600279
|
||||||
|
明星,1028034288,102803_ctg1_4288_-_ctg1_4288
|
||||||
|
车展,1028035188,102803_ctg1_5188_-_ctg1_5188
|
||||||
|
搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388
|
||||||
|
情感,1028031988,102803_ctg1_1988_-_ctg1_1988
|
||||||
|
周末,102803600195,102803_ctg1_600195_-_ctg1_600195
|
||||||
|
电影,1028033288,102803_ctg1_3288_-_ctg1_3288
|
||||||
|
社会,1028034188,102803_ctg1_4188_-_ctg1_4188
|
||||||
|
电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488
|
||||||
|
美食,1028032688,102803_ctg1_2688_-_ctg1_2688
|
||||||
|
俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267
|
||||||
|
国际,1028036288,102803_ctg1_6288_-_ctg1_6288
|
||||||
|
深度,102803600155,102803_ctg1_600155_-_ctg1_600155
|
||||||
|
财经,1028036388,102803_ctg1_6388_-_ctg1_6388
|
||||||
|
读书,1028034588,102803_ctg1_4588_-_ctg1_4588
|
||||||
|
摄影,1028034988,102803_ctg1_4988_-_ctg1_4988
|
||||||
|
颜值,102803600165,102803_ctg1_600165_-_ctg1_600165
|
||||||
|
体育,1028031388,102803_ctg1_1388_-_ctg1_1388
|
||||||
|
数码,1028035088,102803_ctg1_5088_-_ctg1_5088
|
||||||
|
综艺,1028034688,102803_ctg1_4688_-_ctg1_4688
|
||||||
|
时尚,1028034488,102803_ctg1_4488_-_ctg1_4488
|
||||||
|
星座,1028031688,102803_ctg1_1688_-_ctg1_1688
|
||||||
|
军事,1028036688,102803_ctg1_6688_-_ctg1_6688
|
||||||
|
股市,1028031288,102803_ctg1_1288_-_ctg1_1288
|
||||||
|
房产,1028035588,102803_ctg1_5588_-_ctg1_5588
|
||||||
|
家居,1028035888,102803_ctg1_5888_-_ctg1_5888
|
||||||
|
萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788
|
||||||
|
科技,1028032088,102803_ctg1_2088_-_ctg1_2088
|
||||||
|
科普,1028035988,102803_ctg1_5988_-_ctg1_5988
|
||||||
|
动漫,1028032388,102803_ctg1_2388_-_ctg1_2388
|
||||||
|
运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788
|
||||||
|
旅游,1028032588,102803_ctg1_2588_-_ctg1_2588
|
||||||
|
瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488
|
||||||
|
好物,102803600094,102803_ctg1_600094_-_ctg1_600094
|
||||||
|
历史,1028036788,102803_ctg1_6788_-_ctg1_6788
|
||||||
|
艺术,1028035488,102803_ctg1_5488_-_ctg1_5488
|
||||||
|
美妆,1028031588,102803_ctg1_1588_-_ctg1_1588
|
||||||
|
法律,1028037388,102803_ctg1_7388_-_ctg1_7388
|
||||||
|
设计,1028035388,102803_ctg1_5388_-_ctg1_5388
|
||||||
|
健康,1028032188,102803_ctg1_2188_-_ctg1_2188
|
||||||
|
音乐,1028035288,102803_ctg1_5288_-_ctg1_5288
|
||||||
|
游戏,1028034888,102803_ctg1_4888_-_ctg1_4888
|
||||||
|
新时代,1028037968,102803_ctg1_7968_-_ctg1_7968
|
||||||
|
校园,102803600177,102803_ctg1_600177_-_ctg1_600177
|
||||||
|
收藏,1028038189,102803_ctg1_8189_-_ctg1_8189
|
||||||
|
政务,1028035788,102803_ctg1_5788_-_ctg1_5788
|
||||||
|
养生,1028036588,102803_ctg1_6588_-_ctg1_6588
|
||||||
|
育儿,1028033188,102803_ctg1_3188_-_ctg1_3188
|
||||||
|
抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037
|
||||||
|
教育,102803600080,102803_ctg1_600080_-_ctg1_600080
|
||||||
|
婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788
|
||||||
|
舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788
|
||||||
|
辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988
|
||||||
|
公益,102803600057,102803_ctg1_600057_-_ctg1_600057
|
||||||
|
问答,1028037977,102803_ctg1_7977_-_ctg1_7977
|
||||||
|
三农,1028037188,102803_ctg1_7188_-_ctg1_7188
|
||||||
|
+48
-49
@@ -1,21 +1,22 @@
|
|||||||
from utils.getPublicData import *
|
from utils.getPublicData import *
|
||||||
|
from snownlp import SnowNLP
|
||||||
articleList = getAllArticleData()
|
articleList = getAllArticleData()
|
||||||
commentList = getAllCommentsData()
|
commentList = getAllCommentsData()
|
||||||
|
|
||||||
def getTypeList():# 返回爬取到的所有文章的类型(已去重)
|
def getTypeList():
|
||||||
return list(set([x[8] for x in getAllArticleData()]))
|
return list(set([x[8] for x in getAllArticleData()]))
|
||||||
|
|
||||||
def getArticleByType(type):# 根据特定文章类型筛选文章
|
def getArticleByType(type):
|
||||||
articles = []
|
articles = []
|
||||||
for i in articleList:
|
for i in articleList:
|
||||||
if i[8] == type:
|
if i[8] == type:
|
||||||
articles.append(i)
|
articles.append(i)
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
|
def getArticleCharLikeCount(type):
|
||||||
articles = getArticleByType(type)
|
articles = getArticleByType(type)
|
||||||
xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
|
xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
|
||||||
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
|
yData = [0 for x in range(len(xData))]
|
||||||
for article in articles:
|
for article in articles:
|
||||||
likeCount = int(article[1])
|
likeCount = int(article[1])
|
||||||
if likeCount < 100:
|
if likeCount < 100:
|
||||||
@@ -34,10 +35,10 @@ def getArticleCharLikeCount(type):# 统计特定类型文章的点赞数分布
|
|||||||
yData[6] += 1
|
yData[6] += 1
|
||||||
return xData,yData
|
return xData,yData
|
||||||
|
|
||||||
def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
|
def getArticleCharCommentsLen(type):
|
||||||
articles = getArticleByType(type)
|
articles = getArticleByType(type)
|
||||||
xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
|
xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
|
||||||
yData = [0 for x in range(len(xData))]# 初始化为长度和xData相同但是每一个元素都是零的列表
|
yData = [0 for x in range(len(xData))]
|
||||||
for article in articles:
|
for article in articles:
|
||||||
commentLen = int(article[2])
|
commentLen = int(article[2])
|
||||||
if commentLen < 100:
|
if commentLen < 100:
|
||||||
@@ -60,7 +61,7 @@ def getArticleCharCommentsLen(type):# 统计特定类型文章的评论数分布
|
|||||||
yData[8] += 1
|
yData[8] += 1
|
||||||
return xData,yData
|
return xData,yData
|
||||||
|
|
||||||
def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
|
def getArticleCharRepotsLen(type):
|
||||||
articles = getArticleByType(type)
|
articles = getArticleByType(type)
|
||||||
xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
|
xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
|
||||||
yData = [0 for x in range(len(xData))]
|
yData = [0 for x in range(len(xData))]
|
||||||
@@ -92,14 +93,14 @@ def getArticleCharRepotsLen(type):# 统计特定类型文章的转发数分布
|
|||||||
yData[11] += 1
|
yData[11] += 1
|
||||||
return xData,yData
|
return xData,yData
|
||||||
|
|
||||||
def getIPCharByArticleRegion():#统计文章发布地域的分布情况
|
def getIPCharByArticleRegion():
|
||||||
articleRegionDic = {}
|
articleRegionDic = {}
|
||||||
for i in articleList:
|
for i in articleList:
|
||||||
if i[4] != '无':# 如果ip为确定值的话就进行下一步
|
if i[4] != '无':
|
||||||
if i[4] in articleRegionDic.keys():
|
if articleRegionDic.get(i[4],-1) == -1:
|
||||||
articleRegionDic[i[4]] += 1
|
|
||||||
else:
|
|
||||||
articleRegionDic[i[4]] = 1
|
articleRegionDic[i[4]] = 1
|
||||||
|
else:
|
||||||
|
articleRegionDic[i[4]] += 1
|
||||||
resultData = []
|
resultData = []
|
||||||
for key,value in articleRegionDic.items():
|
for key,value in articleRegionDic.items():
|
||||||
resultData.append({
|
resultData.append({
|
||||||
@@ -108,14 +109,14 @@ def getIPCharByArticleRegion():#统计文章发布地域的分布情况
|
|||||||
})
|
})
|
||||||
return resultData
|
return resultData
|
||||||
|
|
||||||
def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
|
def getIPCharByCommentsRegion():
|
||||||
commentRegionDic = {}
|
commentRegionDic = {}
|
||||||
for i in commentList:
|
for i in commentList:
|
||||||
if i[3] != '无':
|
if i[3] != '无':
|
||||||
if i[3] in commentRegionDic.keys():
|
if commentRegionDic.get(i[3],-1) == -1:
|
||||||
commentRegionDic[i[3]] += 1
|
|
||||||
else:
|
|
||||||
commentRegionDic[i[3]] = 1
|
commentRegionDic[i[3]] = 1
|
||||||
|
else:
|
||||||
|
commentRegionDic[i[3]] += 1
|
||||||
resultData = []
|
resultData = []
|
||||||
for key,value in commentRegionDic.items():
|
for key,value in commentRegionDic.items():
|
||||||
resultData.append({
|
resultData.append({
|
||||||
@@ -124,35 +125,33 @@ def getIPCharByCommentsRegion():#统计评论发布地域的分布情况
|
|||||||
})
|
})
|
||||||
return resultData
|
return resultData
|
||||||
|
|
||||||
def getCommentCharDataOne():# 统计评论点赞数的分布情况
|
def getCommentCharDataOne():
|
||||||
xData = []
|
xData = []
|
||||||
rangeNum = 20
|
rangeNum = 20
|
||||||
for item in range(100):
|
for item in range(1,100):
|
||||||
xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
|
xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
|
||||||
yData = [0 for x in range(len(xData))]
|
yData = [0 for x in range(len(xData))]
|
||||||
for comment in commentList:
|
for comment in commentList:
|
||||||
for item in range(100):
|
for item in range(99):
|
||||||
if int(comment[2]) < rangeNum * (item + 1):
|
if int(comment[2]) < rangeNum * (item + 2):
|
||||||
yData[item] += 1
|
yData[item] += 1
|
||||||
break
|
break
|
||||||
return xData,yData
|
return xData,yData
|
||||||
|
|
||||||
def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
|
def getCommentCharDataTwo():
|
||||||
genderDic = {}
|
genderDic = {}
|
||||||
for i in commentList:
|
for i in commentList:
|
||||||
if i[6] in genderDic.keys():
|
if genderDic.get(i[6],-1) == -1:
|
||||||
genderDic[i[6]] += 1
|
|
||||||
else:
|
|
||||||
genderDic[i[6]] = 1
|
genderDic[i[6]] = 1
|
||||||
resultData = []
|
else:
|
||||||
for key,value in genderDic.items():
|
genderDic[i[6]] += 1
|
||||||
resultData.append({
|
resultData = [{
|
||||||
'name':key,
|
'name':x[0],
|
||||||
'value':value
|
'value':x[1]
|
||||||
})
|
} for x in genderDic.items()]
|
||||||
return resultData
|
return resultData
|
||||||
|
|
||||||
def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
|
def getYuQingCharDataOne():
|
||||||
hotWordList = getAllHotWords()
|
hotWordList = getAllHotWords()
|
||||||
xData = ['正面','中性','负面']
|
xData = ['正面','中性','负面']
|
||||||
yData = [0,0,0]
|
yData = [0,0,0]
|
||||||
@@ -164,19 +163,19 @@ def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
|
|||||||
yData[1] += 1
|
yData[1] += 1
|
||||||
elif emotionValue < 0.5:
|
elif emotionValue < 0.5:
|
||||||
yData[2] += 1
|
yData[2] += 1
|
||||||
finalData = [{
|
bieData = [{
|
||||||
'name':x,
|
'name':x,
|
||||||
'value':yData[index]
|
'value':yData[index]
|
||||||
} for index,x in enumerate(xData)]
|
} for index,x in enumerate(xData)]
|
||||||
return xData,yData,finalData
|
return xData,yData,bieData
|
||||||
|
|
||||||
def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
|
def getYuQingCharDataTwo():
|
||||||
xData = ['正面', '中性', '负面']
|
xData = ['正面', '中性', '负面']
|
||||||
finalData1 = [{
|
bieData1 = [{
|
||||||
'name':x,
|
'name':x,
|
||||||
'value':0
|
'value':0
|
||||||
} for x in xData]
|
} for x in xData]
|
||||||
finalData2 = [{
|
bieData2 = [{
|
||||||
'name': x,
|
'name': x,
|
||||||
'value': 0
|
'value': 0
|
||||||
} for x in xData]
|
} for x in xData]
|
||||||
@@ -184,27 +183,27 @@ def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
|
|||||||
for comment in commentList:
|
for comment in commentList:
|
||||||
emotionValue = SnowNLP(comment[4]).sentiments
|
emotionValue = SnowNLP(comment[4]).sentiments
|
||||||
if emotionValue > 0.5:
|
if emotionValue > 0.5:
|
||||||
finalData1[0]['value'] += 1
|
bieData1[0]['value'] += 1
|
||||||
elif emotionValue == 0.5:
|
elif emotionValue == 0.5:
|
||||||
finalData1[1]['value'] += 1
|
bieData1[1]['value'] += 1
|
||||||
elif emotionValue < 0.5:
|
elif emotionValue < 0.5:
|
||||||
finalData1[2]['value'] += 1
|
bieData1[2]['value'] += 1
|
||||||
for artile in articleList:
|
for artile in articleList:
|
||||||
emotionValue = SnowNLP(artile[5]).sentiments
|
emotionValue = SnowNLP(artile[5]).sentiments
|
||||||
if emotionValue > 0.5:
|
if emotionValue > 0.5:
|
||||||
finalData2[0]['value'] += 1
|
bieData2[0]['value'] += 1
|
||||||
elif emotionValue == 0.5:
|
elif emotionValue == 0.5:
|
||||||
finalData2[1]['value'] += 1
|
bieData2[1]['value'] += 1
|
||||||
elif emotionValue < 0.5:
|
elif emotionValue < 0.5:
|
||||||
finalData2[2]['value'] += 1
|
bieData2[2]['value'] += 1
|
||||||
return finalData1,finalData2
|
return bieData1,bieData2
|
||||||
|
|
||||||
def getYuQingCharDataThree():# 提取前10个热词及其对应的出现频率
|
def getYuQingCharDataThree():
|
||||||
hotWordList = getAllHotWords()
|
hotWordList = getAllHotWords()
|
||||||
xData = []
|
x1Data = []
|
||||||
yData = []
|
y1Data = []
|
||||||
for i in hotWordList[:10]:
|
for i in hotWordList[:10]:
|
||||||
xData.append(i[0])
|
x1Data.append(i[0])
|
||||||
yData.append(int(i[1]))
|
y1Data.append(int(i[1]))
|
||||||
return xData,yData
|
return x1Data,y1Data
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user