爬虫结束自动打标注

2024-07-04 19:22:17 +08:00
parent 59b18fa5a2
commit d139169e09
5 changed files with 108 additions and 108 deletions
@@ -13,85 +13,85 @@ def getArticleByType(type):
            articles.append(i)
    return articles

-def getArticleCharLikeCount(type):
+def getArticleLikeCount(type):
    articles = getArticleByType(type)
-    xData = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
-    yData = [0 for x in range(len(xData))]
+    X = ['0-100','100-1000','1000-5000','5000-15000','15000-30000','30000-50000','50000-~']
+    Y = [0 for x in range(len(X))]
    for article in articles:
        likeCount = int(article[1])
        if likeCount < 100:
-            yData[0] += 1
+            Y[0] += 1
        elif likeCount < 1000:
-            yData[1] += 1
+            Y[1] += 1
        elif likeCount < 5000:
-            yData[2] += 1
+            Y[2] += 1
        elif likeCount < 15000:
-            yData[3] += 1
+            Y[3] += 1
        elif likeCount < 30000:
-            yData[4] += 1
+            Y[4] += 1
        elif likeCount < 50000:
-            yData[5] += 1
+            Y[5] += 1
        elif likeCount >= 50000:
-            yData[6] += 1
-    return xData,yData
+            Y[6] += 1
+    return X,Y

-def getArticleCharCommentsLen(type):
+def getArticleCommentsLen(type):
    articles = getArticleByType(type)
-    xData = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
-    yData = [0 for x in range(len(xData))]
+    X = ['0-100','100-500','500-1000','1000-1500','1500-3000','3000-5000','5000-10000','10000-15000','15000-~']
+    Y = [0 for x in range(len(X))]
    for article in articles:
        commentLen = int(article[2])
        if commentLen < 100:
-            yData[0] += 1
+            Y[0] += 1
        elif commentLen < 500:
-            yData[1] += 1
+            Y[1] += 1
        elif commentLen < 5000:
-            yData[2] += 1
+            Y[2] += 1
        elif commentLen < 1000:
-            yData[3] += 1
+            Y[3] += 1
        elif commentLen < 1500:
-            yData[4] += 1
+            Y[4] += 1
        elif commentLen < 3000:
-            yData[5] += 1
+            Y[5] += 1
        elif commentLen < 5000:
-            yData[6] += 1
+            Y[6] += 1
        elif commentLen < 10000:
-            yData[7] += 1
+            Y[7] += 1
        elif commentLen >= 15000:
-            yData[8] += 1
-    return xData,yData
+            Y[8] += 1
+    return X,Y

-def getArticleCharRepotsLen(type):
+def getArticleRepotsLen(type):
    articles = getArticleByType(type)
-    xData = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
-    yData = [0 for x in range(len(xData))]
+    X = ['0-100','100-300','300-500','500-1000','1000-2000','2000-3000','3000-4000','4000-5000','5000-10000','10000-15000','15000-30000','30000-70000','70000-~']
+    Y = [0 for x in range(len(X))]
    for article in articles:
        repostsCount = int(article[3])
        if repostsCount < 100:
-            yData[0] += 1
+            Y[0] += 1
        elif repostsCount < 300:
-            yData[1] += 1
+            Y[1] += 1
        elif repostsCount < 500:
-            yData[2] += 1
+            Y[2] += 1
        elif repostsCount < 1000:
-            yData[3] += 1
+            Y[3] += 1
        elif repostsCount < 3000:
-            yData[4] += 1
+            Y[4] += 1
        elif repostsCount < 4000:
-            yData[5] += 1
+            Y[5] += 1
        elif repostsCount < 5000:
-            yData[6] += 1
+            Y[6] += 1
        elif repostsCount < 10000:
-            yData[7] += 1
+            Y[7] += 1
        elif repostsCount < 15000:
-            yData[8] += 1
+            Y[8] += 1
        elif repostsCount < 30000:
-            yData[9] += 1
+            Y[9] += 1
        elif repostsCount < 70000:
-            yData[10] += 1
+            Y[10] += 1
        elif repostsCount >= 70000:
-            yData[11] += 1
-    return xData,yData
+            Y[11] += 1
+    return X,Y

 def getIPCharByArticleRegion():
    articleRegionDic = {}
@@ -125,26 +125,26 @@ def getIPCharByCommentsRegion():
        })
    return resultData

-def getCommentCharDataOne():
-    xData = []
+def getCommentDataOne():
+    X = []
    rangeNum = 20
    for item in range(100):
-        xData.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
-    yData = [0 for x in range(len(xData))]
+        X.append(str(rangeNum * item) + '-' + str(rangeNum * (item + 1)))
+    Y = [0 for x in range(len(X))]
    for comment in commentList:
        for item in range(100):
            if int(comment[2]) < rangeNum * (item + 1):
-                yData[item] += 1
+                Y[item] += 1
                break
-    return xData,yData
+    return X,Y

-def getCommentCharDataTwo():
+def getCommentDataTwo():
    genderDic = {}
    for i in commentList:
-        if genderDic.get(i[6],-1) == -1:
-            genderDic[i[6]] = 1
-        else:
+        if i[6] in genderDic.keys():
            genderDic[i[6]] += 1
+        else:
+            genderDic[i[6]] = 1
    resultData = [{
        'name':x[0],
        'value':x[1]
@@ -153,50 +153,50 @@ def getCommentCharDataTwo():

 def getYuQingCharDataOne():
    hotWordList = getAllHotWords()
-    xData = ['正面','中性','负面']
-    yData = [0,0,0]
+    X = ['正面','中性','负面']
+    Y = [0,0,0]
    for word in hotWordList:
        emotionValue = SnowNLP(word[0]).sentiments
        if emotionValue > 0.4:
-            yData[0] += 1
+            Y[0] += 1
        elif emotionValue < 0.2:
-            yData[2] += 1
+            Y[2] += 1
        else:
-            yData[1] += 1
-    bieData = [{
+            Y[1] += 1
+    finaldata = [{
        'name':x,
-        'value':yData[index]
-    } for index,x in enumerate(xData)]
-    return xData,yData,bieData
+        'value':Y[index]
+    } for index,x in enumerate(X)]
+    return X,Y,finaldata

 def getYuQingCharDataTwo():
-    xData = ['正面', '中性', '负面']
-    bieData1 = [{
+    X = ['正面', '中性', '负面']
+    finaldata1 = [{
        'name':x,
        'value':0
-    } for x in xData]
-    bieData2 = [{
+    } for x in X]
+    finaldata2 = [{
        'name': x,
        'value': 0
-    } for x in xData]
+    } for x in X]

    for comment in commentList:
        emotionValue = SnowNLP(comment[4]).sentiments
        if emotionValue > 0.4:
-            bieData1[0]['value'] += 1
+            finaldata1[0]['value'] += 1
        elif emotionValue < 0.2:
-            bieData1[2]['value'] += 1
+            finaldata1[2]['value'] += 1
        else:
-            bieData1[1]['value'] += 1
+            finaldata1[1]['value'] += 1
    for artile in articleList:
        emotionValue = SnowNLP(artile[5]).sentiments
        if emotionValue > 0.4:
-            bieData2[0]['value'] += 1
+            finaldata2[0]['value'] += 1
        elif emotionValue < 0.2:
-            bieData2[2]['value'] += 1
+            finaldata2[2]['value'] += 1
        else:
-            bieData2[1]['value'] += 1
-    return bieData1,bieData2
+            finaldata2[1]['value'] += 1
+    return finaldata1,finaldata2

 def getYuQingCharDataThree():
    hotWordList = getAllHotWords()
@@ -27,14 +27,14 @@ def getHomeCommentsLikeCountTopFore():# 获取评论中点赞最高的前四条
    return list(sorted(commentsList,key=lambda x:int(x[2]),reverse=True))[:4]

 def getHomeArticleCreatedAtChart():# 根据日期分别计算该日期的文章数
-    xData = list(set([x[7] for x in articleList]))
-    xData = list(sorted(xData,key=lambda x:datetime.strptime(x,'%Y-%m-%d').timestamp(),reverse=True))
-    yData = [0 for x in range(len(xData))]
+    X = list(set([x[7] for x in articleList]))
+    X = list(sorted(X,key=lambda x:datetime.strptime(x,'%Y-%m-%d').timestamp(),reverse=True))
+    Y = [0 for x in range(len(X))]
    for article in articleList:
-        for index,j in enumerate(xData):# 返回索引和值
+        for index,j in enumerate(X):# 返回索引和值
            if article[7] == j:
-                yData[index] += 1
-    return xData,yData
+                Y[index] += 1
+    return X,Y

 def getHomeTypeChart():# 统计每种类型的文章数量
    typeDic = {}
@@ -50,9 +50,9 @@ def getTopicData():
    # 读取合并文件 merge.csv    # 取前十个话题
    top_10_topics = pd.read_csv('./merged_topics.csv').head(10)
    # 获取话题名称和对应的值
-    xData = top_10_topics['name'].tolist()
-    yData = top_10_topics['value'].tolist()
-    return xData, yData
+    X = top_10_topics['name'].tolist()
+    Y = top_10_topics['value'].tolist()
+    return X, Y

 def getTopicCreatedAtandpredictData(topic):# 统计特定话题的评论在每个日期的数量，并返回日期和对应的评论数量
    createdAt = {}