优化爬虫代码

This commit is contained in:
YYL469
2024-07-04 09:41:18 +08:00
parent fe1e02877b
commit 9bebd200b7
8 changed files with 42 additions and 22 deletions
+1 -1
View File
@@ -36,7 +36,7 @@ def run_spider_script():
if __name__ == '__main__':
scheduler = BackgroundScheduler(timezone=utc)
scheduler.add_job(run_spider_script, 'interval', hours=5)
scheduler.add_job(run_spider_script, 'interval', minutes=1)
scheduler.start()
try:
+3 -6
View File
@@ -1,12 +1,9 @@
from spiderContent import start as spiderContent
from spiderComments import start as spiderComments
from spiderData import spiderData
from saveData import save_to_sql as saveData
def main():
    """Run one crawl-and-store cycle.

    The crawl is delegated entirely to ``spiderData()``, which runs the
    article and comment spiders itself. Calling those spiders here as well
    would crawl everything twice per cycle, so this function only triggers
    the combined crawl and then persists the result.
    """
    print('正在爬取数据')
    spiderData()
    print('正在存储数据')
    saveData()
    print("爬取数据更新")
+16
View File
@@ -0,0 +1,16 @@
from spiderDataPackage.spiderNav import start as spiderNav
from spiderDataPackage.spiderContent import start as spiderContent
from spiderDataPackage.spiderComments import start as spiderComments
import os
def spiderData():
    """Run the crawl stages in order: navigation, articles, comments.

    The navigation stage is skipped when ``./nav.csv`` already exists; the
    article and comment stages run on every call.
    """
    # NOTE(review): assumes only the navigation stage sits under the
    # existence check -- confirm against the original indentation.
    nav_cached = os.path.exists('./nav.csv')
    if not nav_cached:
        print('正在爬取导航栏数据')
        spiderNav()

    print('正在爬取文章数据')
    spiderContent(1, 1)

    print('正在爬取文章评论数据')
    spiderComments()


if __name__ == '__main__':
    # Allow running the crawl directly as a script.
    spiderData()
+3
View File
@@ -0,0 +1,3 @@
# Shared CSV locations used by the spider modules.
# All paths are relative, so they resolve against the process's current
# working directory at the time the file is opened.

# Navigation CSV: created/appended by the nav spider, read by the article spider.
navAddr="./nav.csv"
# Article CSV: created/appended by the article spider, read by the comments spider.
articleAddr="./article.csv"
# Comments CSV: created/appended by the comments spider.
commentsAddr="./comments.csv"
@@ -3,10 +3,11 @@ import requests
import csv
import os
from datetime import datetime
from settings import articleAddr,commentsAddr
def init():
if not os.path.exists('./comments.csv'):
with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(commentsAddr):
with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'articleId',
@@ -21,7 +22,7 @@ def init():
])
def write(row):
    """Append one record to the comments CSV.

    The destination comes from ``commentsAddr`` only, so the location is
    controlled in one place and the file is opened exactly once per call.

    Args:
        row: Iterable of field values, written as one CSV line in the
            order given.
    """
    # newline='' hands line-ending control to the csv module.
    with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)
@@ -38,7 +39,7 @@ def fetchData(url,params):
def getArticleList():
articleList = []
with open('./article.csv','r',encoding='utf-8') as reader:
with open(articleAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader)
next(reader)
for nav in readerCsv:
@@ -3,10 +3,11 @@ import requests
import csv
import os
from datetime import datetime
from settings import navAddr,articleAddr
def init():
if not os.path.exists('./article.csv'):
with open('./article.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(articleAddr):
with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'id',
@@ -26,7 +27,7 @@ def init():
])
def write(row):
    """Append one record to the article CSV.

    The destination comes from ``articleAddr`` only, so the location is
    controlled in one place and the file is opened exactly once per call.

    Args:
        row: Iterable of field values, written as one CSV line in the
            order given.
    """
    # newline='' hands line-ending control to the csv module.
    with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)
@@ -43,7 +44,7 @@ def fetchData(url,params):
def getTypeList():
typeList = []
with open('./nav.csv','r',encoding='utf-8') as reader:
with open(navAddr,'r',encoding='utf-8') as reader:
readerCsv = csv.reader(reader)
next(reader)
for nav in readerCsv:
@@ -2,10 +2,10 @@ import requests
import csv
import numpy as np
import os
from settings import navAddr
def init():
if not os.path.exists('./nav.csv'):
with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile:
if not os.path.exists(navAddr):
with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
writer = csv.writer(csvFile)
writer.writerow([
'typeName',
@@ -14,7 +14,7 @@ def init():
])
def write(row):
    """Append one record to the navigation CSV.

    The destination comes from ``navAddr`` only, so the location is
    controlled in one place and the file is opened exactly once per call.

    Args:
        row: Iterable of field values, written as one CSV line in the
            order given.
    """
    # newline='' hands line-ending control to the csv module.
    with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)
@@ -45,9 +45,11 @@ def readJson(response):
containerid
])
def start():
    """Crawl the navigation groups once and store them.

    Defined at module level so other modules can import it. Initialises the
    output via ``init()``, fetches the group listing, and passes the response
    to ``readJson`` exactly once.
    """
    init()
    url = 'https://weibo.com/ajax/feed/allGroups'
    response = fetchData(url)
    # Parse the response a single time; a second pass would handle the same data again.
    readJson(response)


if __name__ == '__main__':
    # Allow running the navigation crawl directly as a script.
    start()