From 9bebd200b71e86ff83f3f8e3bf1cce33d6309d4a Mon Sep 17 00:00:00 2001 From: YYL469 <2049360881@qq.com> Date: Thu, 4 Jul 2024 09:41:18 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=88=AC=E8=99=AB=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 2 +- spider/main.py | 9 +++------ spider/spiderData.py | 16 ++++++++++++++++ spider/spiderDataPackage/__init__.py | 0 spider/spiderDataPackage/settings.py | 3 +++ spider/{ => spiderDataPackage}/spiderComments.py | 9 +++++---- spider/{ => spiderDataPackage}/spiderContent.py | 9 +++++---- spider/{ => spiderDataPackage}/spiderNav.py | 16 +++++++++------- 8 files changed, 42 insertions(+), 22 deletions(-) create mode 100644 spider/spiderData.py create mode 100644 spider/spiderDataPackage/__init__.py create mode 100644 spider/spiderDataPackage/settings.py rename spider/{ => spiderDataPackage}/spiderComments.py (91%) rename spider/{ => spiderDataPackage}/spiderContent.py (93%) rename spider/{ => spiderDataPackage}/spiderNav.py (88%) diff --git a/app.py b/app.py index 3c1c876..0606542 100644 --- a/app.py +++ b/app.py @@ -36,7 +36,7 @@ def run_spider_script(): if __name__ == '__main__': scheduler = BackgroundScheduler(timezone=utc) - scheduler.add_job(run_spider_script, 'interval', hours=5) + scheduler.add_job(run_spider_script, 'interval', minutes=1) scheduler.start() try: diff --git a/spider/main.py b/spider/main.py index 7947a96..5aeb4d2 100644 --- a/spider/main.py +++ b/spider/main.py @@ -1,12 +1,9 @@ -from spiderContent import start as spiderContent -from spiderComments import start as spiderComments +from spiderData import spiderData from saveData import save_to_sql as saveData def main(): - print('正在爬取文章数据') - spiderContent(1,1) - print('正在爬取文章评论数据') - spiderComments() + print('正在爬取数据') + spiderData() print('正在存储数据') saveData() print("爬取数据更新") diff --git a/spider/spiderData.py b/spider/spiderData.py new file mode 100644 index 0000000..97498a9 --- /dev/null +++ b/spider/spiderData.py @@ -0,0 +1,16 @@ +from spiderDataPackage.spiderNav import start as spiderNav +from spiderDataPackage.spiderContent import start as spiderContent +from spiderDataPackage.spiderComments import start as spiderComments +import os + +def spiderData(): + if not os.path.exists('./nav.csv'): + print('正在爬取导航栏数据') + spiderNav() + print('正在爬取文章数据') + spiderContent(1,1) + print('正在爬取文章评论数据') + spiderComments() + +if __name__ == '__main__': + spiderData() \ No newline at end of file diff --git a/spider/spiderDataPackage/__init__.py b/spider/spiderDataPackage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider/spiderDataPackage/settings.py b/spider/spiderDataPackage/settings.py new file mode 100644 index 0000000..7e613b5 --- /dev/null +++ b/spider/spiderDataPackage/settings.py @@ -0,0 +1,3 @@ +navAddr="./nav.csv" +articleAddr="./article.csv" +commentsAddr="./comments.csv" \ No newline at end of file diff --git a/spider/spiderComments.py b/spider/spiderDataPackage/spiderComments.py similarity index 91% rename from spider/spiderComments.py rename to spider/spiderDataPackage/spiderComments.py index 6cd368d..6bfc121 100644 --- a/spider/spiderComments.py +++ b/spider/spiderDataPackage/spiderComments.py @@ -3,10 +3,11 @@ import requests import csv import os from datetime import datetime +from settings import articleAddr,commentsAddr def init(): - if not os.path.exists('./comments.csv'): - with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists(commentsAddr): + with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'articleId', @@ -21,7 +22,7 @@ def init(): ]) def write(row): - with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -38,7 +39,7 @@ def fetchData(url,params): def getArticleList(): articleList = [] - with open('./article.csv','r',encoding='utf-8') as reader: + with open(articleAddr,'r',encoding='utf-8') as reader: readerCsv = csv.reader(reader) next(reader) for nav in readerCsv: diff --git a/spider/spiderContent.py b/spider/spiderDataPackage/spiderContent.py similarity index 93% rename from spider/spiderContent.py rename to spider/spiderDataPackage/spiderContent.py index c03dc5d..f0afd84 100644 --- a/spider/spiderContent.py +++ b/spider/spiderDataPackage/spiderContent.py @@ -3,10 +3,11 @@ import requests import csv import os from datetime import datetime +from settings import navAddr,articleAddr def init(): - if not os.path.exists('./article.csv'): - with open('./article.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists(articleAddr): + with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'id', @@ -26,7 +27,7 @@ def init(): ]) def write(row): - with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -43,7 +44,7 @@ def fetchData(url,params): def getTypeList(): typeList = [] - with open('./nav.csv','r',encoding='utf-8') as reader: + with open(navAddr,'r',encoding='utf-8') as reader: readerCsv = csv.reader(reader) next(reader) for nav in readerCsv: diff --git a/spider/spiderNav.py b/spider/spiderDataPackage/spiderNav.py similarity index 88% rename from spider/spiderNav.py rename to spider/spiderDataPackage/spiderNav.py index 8cba3a9..7322034 100644 --- a/spider/spiderNav.py +++ b/spider/spiderDataPackage/spiderNav.py @@ -2,10 +2,10 @@ import requests import csv import numpy as np import os - +from settings import navAddr def init(): - if not os.path.exists('./nav.csv'): - with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists(navAddr): + with open(navAddr,'w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'typeName', @@ -14,7 +14,7 @@ def init(): ]) def write(row): - with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -45,9 +45,11 @@ def readJson(response): containerid ]) - -if __name__ == '__main__': +def start(): init() url = 'https://weibo.com/ajax/feed/allGroups' response = fetchData(url) - readJson(response) \ No newline at end of file + readJson(response) + +if __name__ == '__main__': + start() \ No newline at end of file