爬虫结束自动打标注

This commit is contained in:
juanboy
2024-07-04 19:22:17 +08:00
parent 59b18fa5a2
commit d139169e09
5 changed files with 108 additions and 108 deletions
+71 -71
View File
@@ -13,85 +13,85 @@ def getArticleByType(type):
articles.append(i)
return articles
def getArticleLikeCount(type):
    """Build a histogram of like counts for the articles of one type.

    Args:
        type: Passed unchanged to ``getArticleByType`` to select the rows.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of bucket labels and ``Y`` the
        number of articles whose ``int(article[1])`` falls in each bucket.
        Each bucket includes its lower bound and excludes its upper bound.
    """
    from bisect import bisect_right

    X = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
    # Exclusive upper bound of every bucket except the open-ended last one.
    upper_bounds = [100, 1000, 5000, 15000, 30000, 50000]
    Y = [0] * len(X)
    for article in getArticleByType(type):
        # bisect_right returns how many bounds are <= the value, which is
        # exactly the index of the bucket the value belongs to.
        Y[bisect_right(upper_bounds, int(article[1]))] += 1
    return X, Y


# The span also carries the pre-rename spelling; keep it bound to the same
# function so either name resolves.
getArticleCharLikeCount = getArticleLikeCount
def getArticleCommentsLen(type):
    """Build a histogram of comment counts for the articles of one type.

    The earlier if/elif chain tested ``< 5000`` in the third position, which
    shifted every later bucket by one slot and left values between 10000 and
    15000 uncounted. The thresholds below are derived directly from the
    labels so the two cannot drift apart again.

    Args:
        type: Passed unchanged to ``getArticleByType`` to select the rows.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of bucket labels and ``Y`` the
        number of articles whose ``int(article[2])`` falls in each bucket.
        Each bucket includes its lower bound and excludes its upper bound.
    """
    from bisect import bisect_right

    X = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
    # Exclusive upper bound of every bucket except the open-ended last one.
    upper_bounds = [100, 500, 1000, 1500, 3000, 5000, 10000, 15000]
    Y = [0] * len(X)
    for article in getArticleByType(type):
        # bisect_right returns how many bounds are <= the value, which is
        # exactly the index of the bucket the value belongs to.
        Y[bisect_right(upper_bounds, int(article[2]))] += 1
    return X, Y


# The span also carries the pre-rename spelling; keep it bound to the same
# function so either name resolves.
getArticleCharCommentsLen = getArticleCommentsLen
def getArticleRepotsLen(type):
    """Build a histogram of repost counts for the articles of one type.

    The earlier if/elif chain had no ``< 2000`` test, so every bucket after
    ``500-1000`` was shifted by one slot and the final ``70000-~`` bucket was
    never incremented. The thresholds below are derived directly from the
    labels so the two cannot drift apart again.

    Args:
        type: Passed unchanged to ``getArticleByType`` to select the rows.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of bucket labels and ``Y`` the
        number of articles whose ``int(article[3])`` falls in each bucket.
        Each bucket includes its lower bound and excludes its upper bound.
    """
    from bisect import bisect_right

    X = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
    # Exclusive upper bound of every bucket except the open-ended last one.
    upper_bounds = [100, 300, 500, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 30000, 70000]
    Y = [0] * len(X)
    for article in getArticleByType(type):
        # bisect_right returns how many bounds are <= the value, which is
        # exactly the index of the bucket the value belongs to.
        Y[bisect_right(upper_bounds, int(article[3]))] += 1
    return X, Y


# The span also carries the pre-rename spelling; keep it bound to the same
# function so either name resolves.
getArticleCharRepotsLen = getArticleRepotsLen
def getIPCharByArticleRegion():
articleRegionDic = {}
@@ -125,26 +125,26 @@ def getIPCharByCommentsRegion():
})
return resultData
def getCommentDataOne():
    """Bucket every comment's ``int(comment[2])`` into 100 ranges of width 20.

    Reads the module-level ``commentList``. The bucketed value is presumably
    the comment's like count -- confirm against the loader that fills
    ``commentList``.

    Returns:
        A tuple ``(X, Y)``: ``X`` holds the labels ``'0-20'`` ... ``'1980-2000'``
        and ``Y`` the number of comments per range. Values of 2000 or more
        match no range and are left out, as in the original scan; values
        below zero are counted in the first range.
    """
    rangeNum = 20
    bucketTotal = 100
    X = [
        str(rangeNum * item) + '-' + str(rangeNum * (item + 1))
        for item in range(bucketTotal)
    ]
    Y = [0] * bucketTotal
    for comment in commentList:
        # Floor division yields the range index directly, replacing a scan of
        # up to 100 comparisons per comment; max() keeps negatives in range 0.
        bucket = max(int(comment[2]) // rangeNum, 0)
        if bucket < bucketTotal:
            Y[bucket] += 1
    return X, Y


# The span also carries the pre-rename spelling; keep it bound to the same
# function so either name resolves.
getCommentCharDataOne = getCommentDataOne
def getCommentCharDataTwo():
def getCommentDataTwo():
genderDic = {}
for i in commentList:
if genderDic.get(i[6],-1) == -1:
genderDic[i[6]] = 1
else:
if i[6] in genderDic.keys():
genderDic[i[6]] += 1
else:
genderDic[i[6]] = 1
resultData = [{
'name':x[0],
'value':x[1]
@@ -153,50 +153,50 @@ def getCommentCharDataTwo():
def getYuQingCharDataOne():
    """Classify every hot word by SnowNLP sentiment score and count per class.

    Each row from ``getAllHotWords()`` contributes ``row[0]`` as the text to
    score. A score above 0.4 counts as positive, below 0.2 as negative, and
    anything in between as neutral.

    Returns:
        A tuple ``(X, Y, finaldata)``: ``X`` is the list of class labels, ``Y``
        the matching counts, and ``finaldata`` the same pairs as a list of
        ``{'name': label, 'value': count}`` dicts.
    """
    X = ['正面','中性','负面']
    Y = [0,0,0]
    for word in getAllHotWords():
        emotionValue = SnowNLP(word[0]).sentiments
        if emotionValue > 0.4:
            Y[0] += 1
        elif emotionValue < 0.2:
            Y[2] += 1
        else:
            Y[1] += 1
    finaldata = [{
        'name':label,
        'value':count
    } for label,count in zip(X,Y)]
    return X,Y,finaldata
def getYuQingCharDataTwo():
    """Count sentiment classes separately for comments and for articles.

    Scores ``comment[4]`` for every row of the module-level ``commentList``
    and ``article[5]`` for every row of ``articleList`` with SnowNLP. A score
    above 0.4 counts as positive, below 0.2 as negative, and anything in
    between as neutral.

    Returns:
        A tuple ``(finaldata1, finaldata2)`` for comments and articles
        respectively, each a list of ``{'name': label, 'value': count}`` dicts
        in label order.
    """
    X = ['正面', '中性', '负面']

    def tally(texts):
        # One classification routine shared by both data sets, so the
        # thresholds are written once instead of in two copied loops.
        data = [{
            'name': x,
            'value': 0
        } for x in X]
        for text in texts:
            emotionValue = SnowNLP(text).sentiments
            if emotionValue > 0.4:
                data[0]['value'] += 1
            elif emotionValue < 0.2:
                data[2]['value'] += 1
            else:
                data[1]['value'] += 1
        return data

    finaldata1 = tally(comment[4] for comment in commentList)
    finaldata2 = tally(article[5] for article in articleList)
    return finaldata1,finaldata2
def getYuQingCharDataThree():
hotWordList = getAllHotWords()
+6 -6
View File
@@ -27,14 +27,14 @@ def getHomeCommentsLikeCountTopFore():# 获取评论中点赞最高的前四条
return list(sorted(commentsList,key=lambda x:int(x[2]),reverse=True))[:4]
def getHomeArticleCreatedAtChart():
    """Count the articles per creation date, newest date first.

    Reads ``article[7]`` of every row in the module-level ``articleList`` and
    parses it with the ``'%Y-%m-%d'`` format for ordering.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of distinct date strings sorted
        from newest to oldest and ``Y`` the number of articles on each date.

    Raises:
        ValueError: From ``datetime.strptime`` if a date string does not match
            ``'%Y-%m-%d'``.
    """
    from collections import Counter

    # A single pass replaces comparing every article against every date.
    perDay = Counter(article[7] for article in articleList)
    X = sorted(
        perDay,
        key=lambda day: datetime.strptime(day,'%Y-%m-%d').timestamp(),
        reverse=True,
    )
    Y = [perDay[day] for day in X]
    return X,Y
def getHomeTypeChart():# 统计每种类型的文章数量
typeDic = {}
+3 -3
View File
@@ -50,9 +50,9 @@ def getTopicData():
# 读取合并文件 merge.csv # 取前十个话题
top_10_topics = pd.read_csv('./merged_topics.csv').head(10)
# 获取话题名称和对应的值
xData = top_10_topics['name'].tolist()
yData = top_10_topics['value'].tolist()
return xData, yData
X = top_10_topics['name'].tolist()
Y = top_10_topics['value'].tolist()
return X, Y
def getTopicCreatedAtandpredictData(topic):# 统计特定话题的评论在每个日期的数量,并返回日期和对应的评论数量
createdAt = {}