Merge branch 'main' of https://github.com/666ghj/Weibo_PublicOpinion_AnalysisSystem

2024-07-02 23:14:20 +08:00
parent 37df0f52b2 01ed2ce623
commit 07496a62a0
5 changed files with 266 additions and 0 deletions
@@ -0,0 +1,47 @@
+-- Turn off foreign-key checking before the tables below are created.
+-- NOTE(review): the visible part of this script never turns it back on.
+SET FOREIGN_KEY_CHECKS=0;
+
+-- ----------------------------
+-- article table
+-- Filled from articleData.csv by the loader script in this commit.
+-- NOTE(review): no primary key or index is declared and every column is
+-- nullable. Python code in this commit reads rows by position (index 5 is
+-- `content`), so the column order below must not be changed casually.
+-- ----------------------------
+CREATE TABLE `article` (
+  `id` bigint(20) DEFAULT NULL, -- presumably the Weibo post id; confirm
+  `likeNum` bigint(20) DEFAULT NULL,
+  `commentsLen` bigint(20) DEFAULT NULL,
+  `reposts_count` bigint(20) DEFAULT NULL,
+  `region` text,
+  `content` text,
+  `contentLen` bigint(20) DEFAULT NULL,
+  `created_at` text, -- kept as text, not as a DATE/DATETIME column
+  `type` text,
+  `detailUrl` text,
+  `authorAvatar` text,
+  `authorName` text,
+  `authorDetail` text,
+  `isVip` double DEFAULT NULL -- stored as a double; presumably a 0/1 flag, confirm
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- ----------------------------
+-- comments table
+-- Filled from articleComments.csv by the loader script in this commit.
+-- NOTE(review): no primary key, index or foreign key is declared. Python code
+-- in this commit reads rows by position (index 4 is `content`), so the column
+-- order below must not be changed casually.
+-- ----------------------------
+CREATE TABLE `comments` (
+  `articleId` bigint(20) DEFAULT NULL, -- id of the article the comment was crawled for
+  `created_at` text, -- the crawler writes this as a YYYY-MM-DD string
+  `likes_counts` bigint(20) DEFAULT NULL,
+  `region` text,
+  `content` text,
+  `authorName` text,
+  `authorGender` text,
+  `authorAddress` text,
+  `authorAvatar` text
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- ----------------------------
+-- user table
+-- NOTE(review): this table uses CHARSET=utf8 while `article` and `comments`
+-- use utf8mb4.
+-- ----------------------------
+CREATE TABLE `user` (
+  `username` varchar(255) DEFAULT NULL,
+  `password` varchar(255) DEFAULT NULL, -- NOTE(review): cannot tell from here whether this holds a hash or plain text; verify
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `createTime` varchar(255) DEFAULT NULL, -- kept as a string, not as a DATETIME column
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
@@ -0,0 +1,43 @@
+from spiderContent import start as spiderContentStart
+from spiderComments import start as spiderCommentsStart
+import os
+from sqlalchemy import create_engine
+import pandas as pd
+
+# Shared SQLAlchemy engine used for every read and write in this script.
+# NOTE(review): the URL embeds the database user, password and host in source
+# control; consider loading it from configuration or the environment instead.
+engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
+
+def save_to_sql():
+    try:
+        artileOldPd = pd.read_sql('select * from article',engine)
+        articleNewPd = pd.read_csv('articleData.csv')
+        commentOldPd = pd.read_sql('select * from comments',engine)
+        commentNewPd = pd.read_csv('articleComments.csv')
+
+        concatArticlePd = pd.concat([articleNewPd,artileOldPd],join='inner')
+        concatCommentsPd = pd.concat([commentNewPd,commentOldPd],join='inner')
+
+        concatArticlePd.drop_duplicates(subset='id',keep='last',inplace=True)
+        concatCommentsPd.drop_duplicates(subset='content',keep='last',inplace=True)
+
+        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
+        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
+    except:
+        articleNewPd = pd.read_csv('articleData.csv')
+        commentNewPd = pd.read_csv('articleComments.csv')
+        articleNewPd.to_sql('article',con=engine,if_exists='replace',index=False)
+        commentNewPd.to_sql('comments',con=engine,if_exists='replace',index=False)
+
+    os.remove('./articleData.csv')
+    os.remove('./articleComments.csv')
+
+def main():
+    print('正在爬取文章数据')
+    spiderContentStart(1,1)
+    print('正在爬取文章评论数据')
+    spiderCommentsStart()
+    print('正在存储数据')
+    save_to_sql()
+
+
+# Run the crawl-and-store pipeline only when this file is executed directly.
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,99 @@
+import time
+import requests
+import csv
+import os
+from datetime import datetime
+
+def init():
+    if not os.path.exists('./articleComments.csv'):
+        with open('./articleComments.csv','w',encoding='utf-8',newline='') as csvFile:
+            writer = csv.writer(csvFile)
+            writer.writerow([
+                'articleId',
+                'created_at',
+                'likes_counts',
+                'region',
+                'content',
+                'authorName',
+                'authorGender',
+                'authorAddress',
+                'authorAvatar'
+            ])
+
+def writerRow(row):
+    with open('./articleComments.csv', 'a', encoding='utf-8', newline='') as csvFile:
+        writer = csv.writer(csvFile)
+        writer.writerow(row)
+
+def get_data(url,params):
+    headers = {
+        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
+        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+    }
+    response = requests.get(url,headers=headers,params=params)
+    if response.status_code == 200:
+        return response.json()['data']
+    else:
+        return None
+
+def getAllArticleList():
+    artileList = []
+    with open('./articleData.csv','r',encoding='utf-8') as reader:
+        readerCsv = csv.reader(reader)
+        next(reader)
+        for nav in readerCsv:
+            artileList.append(nav)
+    return artileList
+
+def parse_json(response,artileId):
+    for comment in response:
+        created_at = datetime.strptime(comment['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        likes_counts = comment['like_counts']
+        try:
+            region = comment['source'].replace('来自', '')
+        except:
+            region = '无'
+        content = comment['text_raw']
+        authorName = comment['user']['screen_name']
+        authorGender = comment['user']['gender']
+        authorAddress = comment['user']['location']
+        authorAvatar = comment['user']['avatar_large']
+        writerRow([
+            artileId,
+            created_at,
+            likes_counts,
+            region,
+            content,
+            authorName,
+            authorGender,
+            authorAddress,
+            authorAvatar
+        ])
+
+def start():
+    commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
+    init()
+    articleList = getAllArticleList()
+    for article in articleList:
+        articleId = article[0]
+        print('正在爬取id值为%s的文章评论' % articleId)
+        time.sleep(2)
+        params = {
+            'id':int(articleId),
+            'is_show_bulletin':2
+        }
+        response = get_data(commentUrl,params)
+        parse_json(response,articleId)
+
+
+
+# Start the comment crawl only when this file is executed directly.
+if __name__ == '__main__':
+    start()
+
+
+
+
+
+
+
+
@@ -152,3 +152,59 @@ def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
        })
    return resultData

+def getYuQingCharDataOne():# 统计热词中正面、中性、负面的数量
+    hotWordList = getAllHotWords()
+    xData = ['正面','中性','负面']
+    yData = [0,0,0]
+    for word in hotWordList:
+        emotionValue = SnowNLP(word[0]).sentiments
+        if emotionValue > 0.5:
+            yData[0] += 1
+        elif emotionValue == 0.5:
+            yData[1] += 1
+        elif emotionValue < 0.5:
+            yData[2] += 1
+    finalData = [{
+        'name':x,
+        'value':yData[index]
+    } for index,x in enumerate(xData)]
+    return xData,yData,finalData
+
+def getYuQingCharDataTwo():# 统计评论列表和文章列表中的情感值
+    xData = ['正面', '中性', '负面']
+    finalData1 = [{
+        'name':x,
+        'value':0
+    } for x in xData]
+    finalData2 = [{
+        'name': x,
+        'value': 0
+    } for x in xData]
+
+    for comment in commentList:
+        emotionValue = SnowNLP(comment[4]).sentiments
+        if emotionValue > 0.5:
+            finalData1[0]['value'] += 1
+        elif emotionValue == 0.5:
+            finalData1[1]['value'] += 1
+        elif emotionValue < 0.5:
+            finalData1[2]['value'] += 1
+    for artile in articleList:
+        emotionValue = SnowNLP(artile[5]).sentiments
+        if emotionValue > 0.5:
+            finalData2[0]['value'] += 1
+        elif emotionValue == 0.5:
+            finalData2[1]['value'] += 1
+        elif emotionValue < 0.5:
+            finalData2[2]['value'] += 1
+    return finalData1,finalData2
+
+def getYuQingCharDataThree():# 提取前10个热词及其对应的出现频率
+    hotWordList = getAllHotWords()
+    xData = []
+    yData = []
+    for i in hotWordList[:10]:
+        xData.append(i[0])
+        yData.append(int(i[1]))
+    return xData,yData
+
@@ -0,0 +1,21 @@
+from utils.getPublicData import getAllArticleData
+from snownlp import SnowNLP
+
+def getTableDataList(flag):
+    if flag:
+        tableList = []
+        articeList = getAllArticleData()
+        for article in articeList:
+            item = list(article)
+            value = ''
+            if SnowNLP(item[5]).sentiments > 0.5:
+                value = '正面'
+            elif SnowNLP(item[5]).sentiments < 0.5:
+                value = '负面'
+            else:
+                value = '中性'
+            item.append(value)
+            tableList.append(item)
+        return tableList
+    else:
+        return getAllArticleData()