优化爬虫代码

This commit is contained in:
YYL469
2024-07-04 09:41:18 +08:00
parent fe1e02877b
commit 9bebd200b7
8 changed files with 42 additions and 22 deletions
+1 -1
View File
@@ -36,7 +36,7 @@ def run_spider_script():
if __name__ == '__main__': if __name__ == '__main__':
scheduler = BackgroundScheduler(timezone=utc) scheduler = BackgroundScheduler(timezone=utc)
scheduler.add_job(run_spider_script, 'interval', hours=5) scheduler.add_job(run_spider_script, 'interval', minutes=1)
scheduler.start() scheduler.start()
try: try:
+3 -6
View File
@@ -1,12 +1,9 @@
from spiderContent import start as spiderContent from spiderData import spiderData
from spiderComments import start as spiderComments
from saveData import save_to_sql as saveData from saveData import save_to_sql as saveData
def main(): def main():
print('正在爬取文章数据') print('正在爬取数据')
spiderContent(1,1) spiderData()
print('正在爬取文章评论数据')
spiderComments()
print('正在存储数据') print('正在存储数据')
saveData() saveData()
print("爬取数据更新") print("爬取数据更新")
+16
View File
@@ -0,0 +1,16 @@
from spiderDataPackage.spiderNav import start as spiderNav
from spiderDataPackage.spiderContent import start as spiderContent
from spiderDataPackage.spiderComments import start as spiderComments
import os
def spiderData():
    """Run the crawl stages in order: navigation, articles, then comments.

    The navigation stage runs only when ``./nav.csv`` does not exist yet;
    the article and comment stages run on every call. Each stage is
    delegated to the ``start`` function of the matching spider module, and
    a progress message is printed before each stage.
    """
    # NOTE(review): the indentation of this block was lost in the view it
    # was recovered from. Only the navigation stage is assumed to sit under
    # the existence check -- confirm against the repository.
    # NOTE(review): the path is hard-coded here although the spider modules
    # import navAddr from settings; consider sharing that constant.
    nav_csv_missing = not os.path.exists('./nav.csv')
    if nav_csv_missing:
        print('正在爬取导航栏数据')
        spiderNav()

    print('正在爬取文章数据')
    spiderContent(1,1)

    print('正在爬取文章评论数据')
    spiderComments()


if __name__ == '__main__':
    # Allow running this module directly as a one-off crawl.
    spiderData()
+3
View File
@@ -0,0 +1,3 @@
# Shared CSV locations, imported by the spider modules via
# `from settings import ...`. The paths are relative, so they resolve
# against the current working directory of the running process.

# Navigation CSV: created and appended to by the nav spider's init()/write(),
# and read back by getTypeList().
navAddr="./nav.csv"
# Article CSV: created and appended to by the article spider's init()/write(),
# and read back by getArticleList().
articleAddr="./article.csv"
# Comments CSV: created and appended to by the comments spider's init()/write().
commentsAddr="./comments.csv"
@@ -3,10 +3,11 @@ import requests
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from settings import articleAddr,commentsAddr
def init(): def init():
if not os.path.exists('./comments.csv'): if not os.path.exists(commentsAddr):
with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile: with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow([ writer.writerow([
'articleId', 'articleId',
@@ -21,7 +22,7 @@ def init():
]) ])
def write(row): def write(row):
with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile: with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow(row) writer.writerow(row)
@@ -38,7 +39,7 @@ def fetchData(url,params):
def getArticleList(): def getArticleList():
articleList = [] articleList = []
with open('./article.csv','r',encoding='utf-8') as reader: with open(articleAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader) readerCsv = csv.reader(reader)
next(reader) next(reader)
for nav in readerCsv: for nav in readerCsv:
@@ -3,10 +3,11 @@ import requests
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from settings import navAddr,articleAddr
def init(): def init():
if not os.path.exists('./article.csv'): if not os.path.exists(articleAddr):
with open('./article.csv','w',encoding='utf-8',newline='') as csvFile: with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow([ writer.writerow([
'id', 'id',
@@ -26,7 +27,7 @@ def init():
]) ])
def write(row): def write(row):
with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile: with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow(row) writer.writerow(row)
@@ -43,7 +44,7 @@ def fetchData(url,params):
def getTypeList(): def getTypeList():
typeList = [] typeList = []
with open('./nav.csv','r',encoding='utf-8') as reader: with open(navAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader) readerCsv = csv.reader(reader)
next(reader) next(reader)
for nav in readerCsv: for nav in readerCsv:
@@ -2,10 +2,10 @@ import requests
import csv import csv
import numpy as np import numpy as np
import os import os
from settings import navAddr
def init(): def init():
if not os.path.exists('./nav.csv'): if not os.path.exists(navAddr):
with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile: with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow([ writer.writerow([
'typeName', 'typeName',
@@ -14,7 +14,7 @@ def init():
]) ])
def write(row): def write(row):
with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile: with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow(row) writer.writerow(row)
@@ -45,9 +45,11 @@ def readJson(response):
containerid containerid
]) ])
def start():
if __name__ == '__main__':
init() init()
url = 'https://weibo.com/ajax/feed/allGroups' url = 'https://weibo.com/ajax/feed/allGroups'
response = fetchData(url) response = fetchData(url)
readJson(response) readJson(response)
if __name__ == '__main__':
start()