diff --git a/createTables.sql b/createTables.sql
new file mode 100644
index 0000000..d8f5760
--- /dev/null
+++ b/createTables.sql
@@ -0,0 +1,47 @@
+SET FOREIGN_KEY_CHECKS=0;
+
+-- ----------------------------
+-- article table
+-- ----------------------------
+CREATE TABLE `article` (
+  `id` bigint(20) DEFAULT NULL,
+  `likeNum` bigint(20) DEFAULT NULL,
+  `commentsLen` bigint(20) DEFAULT NULL,
+  `reposts_count` bigint(20) DEFAULT NULL,
+  `region` text,
+  `content` text,
+  `contentLen` bigint(20) DEFAULT NULL,
+  `created_at` text,
+  `type` text,
+  `detailUrl` text,
+  `authorAvatar` text,
+  `authorName` text,
+  `authorDetail` text,
+  `isVip` double DEFAULT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- ----------------------------
+-- comments table
+-- ----------------------------
+CREATE TABLE `comments` (
+  `articleId` bigint(20) DEFAULT NULL,
+  `created_at` text,
+  `likes_counts` bigint(20) DEFAULT NULL,
+  `region` text,
+  `content` text,
+  `authorName` text,
+  `authorGender` text,
+  `authorAddress` text,
+  `authorAvatar` text
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
+
+-- ----------------------------
+-- user table
+-- ----------------------------
+CREATE TABLE `user` (
+  `username` varchar(255) DEFAULT NULL,
+  `password` varchar(255) DEFAULT NULL,
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `createTime` varchar(255) DEFAULT NULL,
+  PRIMARY KEY (`id`)
+) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
\ No newline at end of file
diff --git a/spider/main.py b/spider/main.py
new file mode 100644
index 0000000..e668948
--- /dev/null
+++ b/spider/main.py
@@ -0,0 +1,65 @@
+from spiderContent import start as spiderContentStart
+from spiderComments import start as spiderCommentsStart
+import os
+from sqlalchemy import create_engine
+import pandas as pd
+
+ARTICLE_CSV = './articleData.csv'
+COMMENTS_CSV = './articleComments.csv'
+
+# NOTE(review): the default URL below embeds database credentials in source
+# control. Set WEIBO_DB_URL in the environment to override it.
+DB_URL = os.environ.get(
+    'WEIBO_DB_URL',
+    'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4'
+)
+engine = create_engine(DB_URL)
+
+
+def mergeWithStored(table, newPd, dedupKey):
+    """Combine freshly crawled rows with the rows already stored in `table`.
+
+    The stored rows are read with `select *`; if that read fails (for example
+    because the table does not exist yet) only the new rows are returned.
+    Otherwise both frames are concatenated on their common columns and
+    de-duplicated on `dedupKey`, keeping the stored row of each duplicate.
+    """
+    try:
+        oldPd = pd.read_sql('select * from %s' % table, engine)
+    except Exception as e:
+        # Report the reason instead of hiding it behind a bare `except:`.
+        print('读取%s表失败，仅保存本次爬取的数据：%s' % (table, e))
+        return newPd
+    mergedPd = pd.concat([newPd, oldPd], join='inner')
+    return mergedPd.drop_duplicates(subset=dedupKey, keep='last')
+
+
+def save_to_sql():
+    """Write the crawled CSV files into MySQL and delete the CSV files.
+
+    Both tables are rewritten with `if_exists='replace'`. A failure while
+    writing propagates to the caller and leaves the CSV files on disk, so a
+    failed write is never retried with a smaller data set.
+    """
+    articlePd = mergeWithStored('article', pd.read_csv(ARTICLE_CSV), 'id')
+    commentsPd = mergeWithStored('comments', pd.read_csv(COMMENTS_CSV), 'content')
+
+    articlePd.to_sql('article', con=engine, if_exists='replace', index=False)
+    commentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
+
+    os.remove(ARTICLE_CSV)
+    os.remove(COMMENTS_CSV)
+
+
+def main():
+    """Run the article crawler, the comment crawler and the database import."""
+    print('正在爬取文章数据')
+    spiderContentStart(1,1)
+    print('正在爬取文章评论数据')
+    spiderCommentsStart()
+    print('正在存储数据')
+    save_to_sql()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/spider/spiderComments.py b/spider/spiderComments.py
new file mode 100644
index 0000000..ea00843
--- /dev/null
+++ b/spider/spiderComments.py
@@ -0,0 +1,123 @@
+import time
+import requests
+import csv
+import os
+from datetime import datetime
+
+COMMENTS_CSV = './articleComments.csv'
+ARTICLE_CSV = './articleData.csv'
+COMMENT_FIELDS = [
+    'articleId',
+    'created_at',
+    'likes_counts',
+    'region',
+    'content',
+    'authorName',
+    'authorGender',
+    'authorAddress',
+    'authorAvatar'
+]
+
+
+def init():
+    """Create the comments CSV with its header row unless it already exists."""
+    if not os.path.exists(COMMENTS_CSV):
+        with open(COMMENTS_CSV,'w',encoding='utf-8',newline='') as csvFile:
+            csv.writer(csvFile).writerow(COMMENT_FIELDS)
+
+
+def writerRow(row):
+    """Append one row to the comments CSV."""
+    with open(COMMENTS_CSV, 'a', encoding='utf-8', newline='') as csvFile:
+        csv.writer(csvFile).writerow(row)
+
+
+def get_data(url,params):
+    """Send a GET request to `url` with the Weibo session headers.
+
+    Returns the `data` field of the JSON response, or None when the request
+    fails, the status is not 200, or the body is not a JSON object.
+    """
+    # NOTE(review): this session cookie is a credential committed to source
+    # control; it should be supplied from configuration instead.
+    headers = {
+        'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868',
+        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
+    }
+    try:
+        # A timeout keeps one stalled connection from hanging the whole crawl.
+        response = requests.get(url,headers=headers,params=params,timeout=10)
+    except requests.RequestException as e:
+        print('请求失败：%s' % e)
+        return None
+    if response.status_code != 200:
+        return None
+    try:
+        return response.json().get('data')
+    except (ValueError, AttributeError):
+        return None
+
+
+def getAllArticleList():
+    """Return every non-empty data row of the article CSV (header excluded)."""
+    # newline='' lets the csv module handle line breaks inside quoted fields.
+    with open(ARTICLE_CSV,'r',encoding='utf-8',newline='') as csvFile:
+        readerCsv = csv.reader(csvFile)
+        # Skip the header through the csv reader (not the raw file) and do not
+        # raise StopIteration when the file is empty.
+        next(readerCsv, None)
+        return [row for row in readerCsv if row]
+
+
+def parse_json(response,artileId):
+    """Append one CSV row per comment in `response` for article `artileId`.
+
+    `response` is the list returned by get_data(); None or an empty list
+    writes nothing.
+    """
+    for comment in response or []:
+        created_at = datetime.strptime(comment['created_at'],'%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d')
+        try:
+            region = comment['source'].replace('来自', '')
+        except (KeyError, AttributeError):
+            # The comment has no usable `source` field.
+            region = '无'
+        user = comment['user']
+        writerRow([
+            artileId,
+            created_at,
+            comment['like_counts'],
+            region,
+            comment['text_raw'],
+            user['screen_name'],
+            user['gender'],
+            user['location'],
+            user['avatar_large']
+        ])
+
+
+def start():
+    """Crawl the comments of every article listed in the article CSV.
+
+    One request is sent per article; a failed request is reported and skipped.
+    """
+    commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
+    init()
+    for article in getAllArticleList():
+        articleId = article[0]
+        print('正在爬取id值为%s的文章评论' % articleId)
+        time.sleep(2)
+        params = {
+            'id':int(articleId),
+            'is_show_bulletin':2
+        }
+        response = get_data(commentUrl,params)
+        if response is None:
+            # get_data() signals failure with None, which cannot be iterated.
+            print('id值为%s的文章评论获取失败，已跳过' % articleId)
+            continue
+        parse_json(response,articleId)
+
+
+if __name__ == '__main__':
+    start()
diff --git a/utils/getEchartsData.py b/utils/getEchartsData.py
index f4a3cde..771e1be 100644
--- a/utils/getEchartsData.py
+++ b/utils/getEchartsData.py
@@ -152,3 +152,64 @@ def getCommentCharDataTwo():# 统计评论数据中不同性别的数量
         })
     return resultData
 
+def getEmotionIndex(text):
+    """Return the index into ['正面','中性','负面'] for the sentiment of `text`.
+
+    A SnowNLP score above 0.5 is positive (0), exactly 0.5 is neutral (1) and
+    anything else is negative (2).
+    """
+    emotionValue = SnowNLP(text).sentiments
+    if emotionValue > 0.5:
+        return 0
+    if emotionValue == 0.5:
+        return 1
+    return 2
+
+def getYuQingCharDataOne():
+    """Count the hot words whose sentiment is positive, neutral and negative.
+
+    Returns (xData, yData, finalData): the three labels, the matching counts,
+    and the same pairs as a list of {'name', 'value'} dicts.
+    """
+    xData = ['正面','中性','负面']
+    yData = [0,0,0]
+    for word in getAllHotWords():
+        yData[getEmotionIndex(word[0])] += 1
+    finalData = [{
+        'name':x,
+        'value':y
+    } for x,y in zip(xData,yData)]
+    return xData,yData,finalData
+
+def getYuQingCharDataTwo():
+    """Count positive/neutral/negative sentiment over comments and articles.
+
+    Column 4 of each comment row and column 5 of each article row are scored.
+    Returns (finalData1, finalData2): {'name', 'value'} lists for the
+    comments and for the articles respectively.
+    """
+    xData = ['正面', '中性', '负面']
+    finalData1 = [{
+        'name':x,
+        'value':0
+    } for x in xData]
+    finalData2 = [{
+        'name': x,
+        'value': 0
+    } for x in xData]
+
+    for comment in commentList:
+        finalData1[getEmotionIndex(comment[4])]['value'] += 1
+    for article in articleList:
+        finalData2[getEmotionIndex(article[5])]['value'] += 1
+    return finalData1,finalData2
+
+def getYuQingCharDataThree():
+    """Return the words and integer frequencies of the first 10 hot words."""
+    xData = []
+    yData = []
+    for i in getAllHotWords()[:10]:
+        xData.append(i[0])
+        yData.append(int(i[1]))
+    return xData,yData
+
diff --git a/utils/getTableData.py b/utils/getTableData.py
new file mode 100644
index 0000000..87c4824
--- /dev/null
+++ b/utils/getTableData.py
@@ -0,0 +1,27 @@
+from utils.getPublicData import getAllArticleData
+from snownlp import SnowNLP
+
+def getTableDataList(flag):
+    """Return all article rows, optionally with a sentiment label appended.
+
+    When `flag` is truthy each row is copied to a list and '正面', '负面' or
+    '中性' is appended according to the SnowNLP score of column 5 (> 0.5,
+    < 0.5, otherwise). When `flag` is falsy the rows are returned unchanged.
+    """
+    articleList = getAllArticleData()
+    if not flag:
+        return articleList
+    tableList = []
+    for article in articleList:
+        item = list(article)
+        # Score once and reuse the value for both comparisons.
+        emotionValue = SnowNLP(item[5]).sentiments
+        if emotionValue > 0.5:
+            value = '正面'
+        elif emotionValue < 0.5:
+            value = '负面'
+        else:
+            value = '中性'
+        item.append(value)
+        tableList.append(item)
+    return tableList
\ No newline at end of file