爬虫结束自动打标注
This commit is contained in:
+71
-71
@@ -13,85 +13,85 @@ def getArticleByType(type):
|
||||
articles.append(i)
|
||||
return articles
|
||||
|
||||
def getArticleLikeCount(type):
    """Build a histogram from element ``[1]`` of each article row.

    Rows come from ``getArticleByType(type)``. Element ``[1]`` of every row
    is converted with ``int`` (the like count, going by the name the value
    is bound to) and tallied into fixed ranges.

    Returns:
        A ``(labels, counts)`` tuple of two equal-length lists: the range
        labels and the number of rows that fell into each range.
    """
    labels = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
    # Exclusive upper edge of every range except the open-ended last one.
    edges = (100, 1000, 5000, 15000, 30000, 50000)
    counts = [0] * len(labels)
    for row in getArticleByType(type):
        likes = int(row[1])
        # The edges ascend, so the number of edges at or below the value is
        # exactly the index of the range that contains it.
        counts[sum(1 for edge in edges if likes >= edge)] += 1
    return labels, counts
|
||||
|
||||
def getArticleCommentsLen(type):
    """Build a histogram from element ``[2]`` of each article row.

    Rows come from ``getArticleByType(type)``. Element ``[2]`` of every row
    is converted with ``int`` (the comment count, going by the name the value
    is bound to) and tallied into the ranges named by the labels.

    The previous elif chain tested ``< 5000`` in the third position, which
    made the following four branches unreachable, filed 5000-9999 under
    '10000-15000' and dropped 10000-14999 entirely. The edges below are
    derived directly from the labels so every value lands in the range that
    names it.

    Returns:
        A ``(labels, counts)`` tuple of two equal-length lists: the range
        labels and the number of rows that fell into each range.
    """
    from bisect import bisect_right

    labels = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
    # Exclusive upper edge of every range except the open-ended last one.
    edges = (100, 500, 1000, 1500, 3000, 5000, 10000, 15000)
    counts = [0] * len(labels)
    for row in getArticleByType(type):
        # bisect_right returns how many edges are <= the value, which is the
        # index of the half-open range [lower, upper) containing it.
        counts[bisect_right(edges, int(row[2]))] += 1
    return labels, counts
|
||||
|
||||
def getArticleRepotsLen(type):
    """Build a histogram from element ``[3]`` of each article row.

    Rows come from ``getArticleByType(type)``. Element ``[3]`` of every row
    is converted with ``int`` (the repost count, going by the name the value
    is bound to) and tallied into the ranges named by the labels.

    The previous elif chain had twelve branches for thirteen labels: the
    ``< 2000`` test was missing, so every count from '1000-2000' onward was
    stored one label too low and the final '70000-~' slot was never
    incremented. The edges below are derived directly from the labels.

    Returns:
        A ``(labels, counts)`` tuple of two equal-length lists: the range
        labels and the number of rows that fell into each range.
    """
    from bisect import bisect_right

    labels = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
    # Exclusive upper edge of every range except the open-ended last one.
    edges = (100, 300, 500, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 30000, 70000)
    counts = [0] * len(labels)
    for row in getArticleByType(type):
        # bisect_right returns how many edges are <= the value, which is the
        # index of the half-open range [lower, upper) containing it.
        counts[bisect_right(edges, int(row[3]))] += 1
    return labels, counts
|
||||
|
||||
def getIPCharByArticleRegion():
|
||||
articleRegionDic = {}
|
||||
@@ -125,26 +125,26 @@ def getIPCharByCommentsRegion():
|
||||
})
|
||||
return resultData
|
||||
|
||||
def getCommentDataOne():
    """Histogram element ``[2]`` of every ``commentList`` row in steps of 20.

    One hundred consecutive ranges are produced: '0-20', '20-40', ...,
    '1980-2000'. Each row's element ``[2]`` is converted with ``int`` and
    counted in the first range whose upper edge is greater than the value.

    Returns:
        A ``(labels, counts)`` tuple of two 100-element lists.
    """
    width = 20
    bucket_count = 100
    labels = ['{}-{}'.format(width * k, width * (k + 1)) for k in range(bucket_count)]
    counts = [0] * bucket_count
    for row in commentList:
        value = int(row[2])
        # Values at or above the last upper edge match no range and are
        # deliberately left uncounted, as before.
        slot = next((k for k in range(bucket_count) if value < width * (k + 1)), None)
        if slot is not None:
            counts[slot] += 1
    return labels, counts
|
||||
|
||||
def getCommentCharDataTwo():
|
||||
def getCommentDataTwo():
|
||||
genderDic = {}
|
||||
for i in commentList:
|
||||
if genderDic.get(i[6],-1) == -1:
|
||||
genderDic[i[6]] = 1
|
||||
else:
|
||||
if i[6] in genderDic.keys():
|
||||
genderDic[i[6]] += 1
|
||||
else:
|
||||
genderDic[i[6]] = 1
|
||||
resultData = [{
|
||||
'name':x[0],
|
||||
'value':x[1]
|
||||
@@ -153,50 +153,50 @@ def getCommentCharDataTwo():
|
||||
|
||||
def getYuQingCharDataOne():
    """Count hot words by sentiment class.

    Each entry returned by ``getAllHotWords()`` has its element ``[0]``
    scored with ``SnowNLP(...).sentiments``. Scores above 0.4 count as
    positive, scores below 0.2 as negative, everything else as neutral.

    Returns:
        A ``(labels, counts, pie)`` tuple: the three class labels
        (positive, neutral, negative), the matching counts, and a list of
        ``{'name': label, 'value': count}`` dicts pairing the two.
    """
    labels = ['正面','中性','负面']
    counts = [0] * len(labels)
    for entry in getAllHotWords():
        score = SnowNLP(entry[0]).sentiments
        if score > 0.4:
            slot = 0
        elif score < 0.2:
            slot = 2
        else:
            slot = 1
        counts[slot] += 1
    pie = [{'name': label, 'value': total} for label, total in zip(labels, counts)]
    return labels, counts, pie
|
||||
|
||||
def getYuQingCharDataTwo():
    """Count comments and articles by sentiment class.

    Element ``[4]`` of every ``commentList`` row and element ``[5]`` of every
    ``articleList`` row is scored with ``SnowNLP(...).sentiments``. Scores
    above 0.4 count as positive, scores below 0.2 as negative, everything
    else as neutral.

    Returns:
        A ``(comment_pie, article_pie)`` tuple. Each is a list of three
        ``{'name': label, 'value': count}`` dicts in the order positive,
        neutral, negative.
    """
    labels = ['正面', '中性', '负面']

    def sentiment_slot(text):
        # Map a sentiment score onto the index of its label.
        score = SnowNLP(text).sentiments
        if score > 0.4:
            return 0
        if score < 0.2:
            return 2
        return 1

    comment_pie = [{'name': label, 'value': 0} for label in labels]
    article_pie = [{'name': label, 'value': 0} for label in labels]

    for row in commentList:
        comment_pie[sentiment_slot(row[4])]['value'] += 1
    for row in articleList:
        article_pie[sentiment_slot(row[5])]['value'] += 1
    return comment_pie, article_pie
|
||||
|
||||
def getYuQingCharDataThree():
|
||||
hotWordList = getAllHotWords()
|
||||
|
||||
@@ -27,14 +27,14 @@ def getHomeCommentsLikeCountTopFore():# 获取评论中点赞最高的前四条
|
||||
return list(sorted(commentsList,key=lambda x:int(x[2]),reverse=True))[:4]
|
||||
|
||||
def getHomeArticleCreatedAtChart():  # count the number of articles for each date
    """Count ``articleList`` rows per distinct value of element ``[7]``.

    Element ``[7]`` of each row is parsed as a ``'%Y-%m-%d'`` date string for
    ordering; the distinct values are sorted newest first.

    Returns:
        A ``(dates, counts)`` tuple of two equal-length lists: the distinct
        date strings in descending order and the number of rows carrying
        each of them.
    """
    def as_timestamp(day):
        return datetime.strptime(day, '%Y-%m-%d').timestamp()

    dates = sorted(set([row[7] for row in articleList]), key=as_timestamp, reverse=True)
    # Every date is unique in `dates`, so a position lookup replaces the
    # inner scan without changing which slot a row is counted in.
    position = {day: index for index, day in enumerate(dates)}
    counts = [0] * len(dates)
    for row in articleList:
        counts[position[row[7]]] += 1
    return dates, counts
|
||||
|
||||
def getHomeTypeChart():# 统计每种类型的文章数量
|
||||
typeDic = {}
|
||||
|
||||
@@ -50,9 +50,9 @@ def getTopicData():
|
||||
# 读取合并文件 merge.csv # 取前十个话题
|
||||
top_10_topics = pd.read_csv('./merged_topics.csv').head(10)
|
||||
# 获取话题名称和对应的值
|
||||
xData = top_10_topics['name'].tolist()
|
||||
yData = top_10_topics['value'].tolist()
|
||||
return xData, yData
|
||||
X = top_10_topics['name'].tolist()
|
||||
Y = top_10_topics['value'].tolist()
|
||||
return X, Y
|
||||
|
||||
def getTopicCreatedAtandpredictData(topic):# 统计特定话题的评论在每个日期的数量,并返回日期和对应的评论数量
|
||||
createdAt = {}
|
||||
|
||||
Reference in New Issue
Block a user