diff --git a/spider/main.py b/spider/main.py
index e668948..d164331 100644
--- a/spider/main.py
+++ b/spider/main.py
@@ -1,34 +1,6 @@
 from spiderContent import start as spiderContentStart
 from spiderComments import start as spiderCommentsStart
-import os
-from sqlalchemy import create_engine
-import pandas as pd
-
-engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
-
-def save_to_sql():
-    try:
-        artileOldPd = pd.read_sql('select * from article',engine)
-        articleNewPd = pd.read_csv('articleData.csv')
-        commentOldPd = pd.read_sql('select * from comments',engine)
-        commentNewPd = pd.read_csv('articleComments.csv')
-
-        concatArticlePd = pd.concat([articleNewPd,artileOldPd],join='inner')
-        concatCommentsPd = pd.concat([commentNewPd,commentOldPd],join='inner')
-
-        concatArticlePd.drop_duplicates(subset='id',keep='last',inplace=True)
-        concatCommentsPd.drop_duplicates(subset='content',keep='last',inplace=True)
-
-        concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
-        concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
-    except:
-        articleNewPd = pd.read_csv('articleData.csv')
-        commentNewPd = pd.read_csv('articleComments.csv')
-        articleNewPd.to_sql('article',con=engine,if_exists='replace',index=False)
-        commentNewPd.to_sql('comments',con=engine,if_exists='replace',index=False)
-
-    os.remove('./articleData.csv')
-    os.remove('./articleComments.csv')
+from saveData import save_to_sql as saveData
 
 def main():
     print('正在爬取文章数据')
@@ -36,8 +8,7 @@ def main():
     print('正在爬取文章评论数据')
     spiderCommentsStart()
     print('正在存储数据')
-    save_to_sql()
-
+    saveData()
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/spider/saveData.py b/spider/saveData.py
new file mode 100644
index 0000000..7bcf35d
--- /dev/null
+++ b/spider/saveData.py
@@ -0,0 +1,47 @@
+import os
+from sqlalchemy import create_engine, inspect
+import pandas as pd
+
+# NOTE(review): the connection URL embeds the database credentials; consider
+# loading it from configuration instead of keeping it in source control.
+engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
+
+
+def _merge_into_table(csv_path, table, dedup_column):
+    """Merge the rows of ``csv_path`` into ``table`` and rewrite the table.
+
+    Rows already stored in ``table`` are combined with the CSV rows, keeping
+    only the columns both share, and de-duplicated on ``dedup_column``.  When
+    the table does not exist yet, the CSV rows are written as they are.
+    Database and file errors propagate to the caller so that a failed read
+    can never cause the stored rows to be overwritten with the CSV alone.
+    """
+    new_rows = pd.read_csv(csv_path)
+    if inspect(engine).has_table(table):
+        old_rows = pd.read_sql('select * from ' + table, engine)
+        merged = pd.concat([new_rows, old_rows], join='inner')
+        # The stored rows come last, so keep='last' prefers them on a clash.
+        merged = merged.drop_duplicates(subset=dedup_column, keep='last')
+    else:
+        # First run: nothing is stored yet, the CSV becomes the table.
+        merged = new_rows
+    merged.to_sql(table, con=engine, if_exists='replace', index=False)
+
+
+def save_to_sql():
+    """Store the crawled article and comment CSV files in MySQL.
+
+    ``articleData.csv`` is merged into the ``article`` table (unique on
+    ``id``) and ``articleComments.csv`` into ``comments`` (unique on
+    ``content``).  The CSV files are deleted only after both tables were
+    written; on any error they are left in place so the run can be retried.
+    """
+    _merge_into_table('articleData.csv', 'article', 'id')
+    _merge_into_table('articleComments.csv', 'comments', 'content')
+
+    os.remove('./articleData.csv')
+    os.remove('./articleComments.csv')
+
+
+if __name__ == '__main__':
+    save_to_sql()