修改【main.py】,增加【saveData.py】,将数据存储模块与调度模块分离
This commit is contained in:
+2
-31
@@ -1,34 +1,6 @@
|
|||||||
from spiderContent import start as spiderContentStart
|
from spiderContent import start as spiderContentStart
|
||||||
from spiderComments import start as spiderCommentsStart
|
from spiderComments import start as spiderCommentsStart
|
||||||
import os
|
from saveData import save_to_sql as saveData
|
||||||
from sqlalchemy import create_engine
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
|
|
||||||
|
|
||||||
def save_to_sql():
|
|
||||||
try:
|
|
||||||
artileOldPd = pd.read_sql('select * from article',engine)
|
|
||||||
articleNewPd = pd.read_csv('articleData.csv')
|
|
||||||
commentOldPd = pd.read_sql('select * from comments',engine)
|
|
||||||
commentNewPd = pd.read_csv('articleComments.csv')
|
|
||||||
|
|
||||||
concatArticlePd = pd.concat([articleNewPd,artileOldPd],join='inner')
|
|
||||||
concatCommentsPd = pd.concat([commentNewPd,commentOldPd],join='inner')
|
|
||||||
|
|
||||||
concatArticlePd.drop_duplicates(subset='id',keep='last',inplace=True)
|
|
||||||
concatCommentsPd.drop_duplicates(subset='content',keep='last',inplace=True)
|
|
||||||
|
|
||||||
concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
|
|
||||||
concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
|
|
||||||
except:
|
|
||||||
articleNewPd = pd.read_csv('articleData.csv')
|
|
||||||
commentNewPd = pd.read_csv('articleComments.csv')
|
|
||||||
articleNewPd.to_sql('article',con=engine,if_exists='replace',index=False)
|
|
||||||
commentNewPd.to_sql('comments',con=engine,if_exists='replace',index=False)
|
|
||||||
|
|
||||||
os.remove('./articleData.csv')
|
|
||||||
os.remove('./articleComments.csv')
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print('正在爬取文章数据')
|
print('正在爬取文章数据')
|
||||||
@@ -36,8 +8,7 @@ def main():
|
|||||||
print('正在爬取文章评论数据')
|
print('正在爬取文章评论数据')
|
||||||
spiderCommentsStart()
|
spiderCommentsStart()
|
||||||
print('正在存储数据')
|
print('正在存储数据')
|
||||||
save_to_sql()
|
saveData()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
import os
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@10.92.35.13/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
|
||||||
|
|
||||||
|
def save_to_sql():
|
||||||
|
try:
|
||||||
|
artileOldPd = pd.read_sql('select * from article',engine)
|
||||||
|
articleNewPd = pd.read_csv('articleData.csv')
|
||||||
|
commentOldPd = pd.read_sql('select * from comments',engine)
|
||||||
|
commentNewPd = pd.read_csv('articleComments.csv')
|
||||||
|
|
||||||
|
concatArticlePd = pd.concat([articleNewPd,artileOldPd],join='inner')
|
||||||
|
concatCommentsPd = pd.concat([commentNewPd,commentOldPd],join='inner')
|
||||||
|
|
||||||
|
concatArticlePd.drop_duplicates(subset='id',keep='last',inplace=True)
|
||||||
|
concatCommentsPd.drop_duplicates(subset='content',keep='last',inplace=True)
|
||||||
|
|
||||||
|
concatArticlePd.to_sql('article', con=engine, if_exists='replace', index=False)
|
||||||
|
concatCommentsPd.to_sql('comments', con=engine, if_exists='replace', index=False)
|
||||||
|
except:
|
||||||
|
articleNewPd = pd.read_csv('articleData.csv')
|
||||||
|
commentNewPd = pd.read_csv('articleComments.csv')
|
||||||
|
articleNewPd.to_sql('article',con=engine,if_exists='replace',index=False)
|
||||||
|
commentNewPd.to_sql('comments',con=engine,if_exists='replace',index=False)
|
||||||
|
|
||||||
|
os.remove('./articleData.csv')
|
||||||
|
os.remove('./articleComments.csv')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
save_to_sql()
|
||||||
Reference in New Issue
Block a user