"""Entry point of the Weibo spider: crawl articles and comments, then store them.

Reconstructed from the patch that created ``spider/main.py`` (commit
24a81848f476758464d057cf4c0747a4ef93b637, dated 2024-07-02). The commit
subject translates to: "[main.py] Implement the complete data-crawling
process and store the crawled data in the database".
"""
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

from spiderContent import start as spiderContentStart
from spiderComments import start as spiderCommentsStart

# CSV files read (and afterwards deleted) by save_to_sql().
ARTICLE_CSV = 'articleData.csv'
COMMENTS_CSV = 'articleComments.csv'

# NOTE(review): credentials are committed in source. The URL can be overridden
# through the WEIBO_DB_URL environment variable; the literal below is kept only
# as a backward-compatible default.
DEFAULT_DB_URL = 'mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4'

engine = create_engine(os.environ.get('WEIBO_DB_URL', DEFAULT_DB_URL))


def _merge_with_table(csv_path: str, table: str, dedup_column: str) -> pd.DataFrame:
    """Return the rows of *csv_path* merged with the current content of *table*.

    If *table* does not exist yet, the CSV rows are returned unchanged.
    Otherwise both row sets are concatenated on their common columns
    (``join='inner'``) and de-duplicated on *dedup_column*.
    """
    new_rows = pd.read_csv(csv_path)

    # Check for the table explicitly instead of relying on a failed query, so
    # an unrelated error can never be mistaken for "first run".
    if not inspect(engine).has_table(table):
        return new_rows

    old_rows = pd.read_sql(f'select * from {table}', engine)
    merged = pd.concat([new_rows, old_rows], join='inner')
    # New rows come first and keep='last', so on a duplicate key the row that
    # is already in the database wins.
    # NOTE(review): confirm this is intended and that fresh data should not
    # override stored data.
    return merged.drop_duplicates(subset=dedup_column, keep='last')


def save_to_sql() -> None:
    """Merge the crawled CSV files into the database and delete the CSVs.

    ``articleData.csv`` is merged into table ``article`` (de-duplicated on
    ``id``) and ``articleComments.csv`` into table ``comments`` (de-duplicated
    on ``content``). Each table is rewritten with ``if_exists='replace'``.

    Any error (missing CSV, database failure, missing de-duplication column)
    propagates to the caller. In that case the CSV files are left on disk so
    the run can be repeated.
    """
    # Build both merged frames before writing anything, so a failure while
    # preparing the second table does not leave only the first one rewritten.
    articles = _merge_with_table(ARTICLE_CSV, 'article', 'id')
    comments = _merge_with_table(COMMENTS_CSV, 'comments', 'content')

    articles.to_sql('article', con=engine, if_exists='replace', index=False)
    comments.to_sql('comments', con=engine, if_exists='replace', index=False)

    os.remove(ARTICLE_CSV)
    os.remove(COMMENTS_CSV)


def main() -> None:
    """Run the pipeline: crawl articles, crawl their comments, store both."""
    print('正在爬取文章数据')
    # NOTE(review): the meaning of the two arguments is defined in
    # spiderContent.start, which is not visible here.
    spiderContentStart(1, 1)
    print('正在爬取文章评论数据')
    spiderCommentsStart()
    print('正在存储数据')
    save_to_sql()


if __name__ == '__main__':
    main()