优化爬虫代码
This commit is contained in:
@@ -36,7 +36,7 @@ def run_spider_script():
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
scheduler = BackgroundScheduler(timezone=utc)
|
scheduler = BackgroundScheduler(timezone=utc)
|
||||||
scheduler.add_job(run_spider_script, 'interval', hours=5)
|
scheduler.add_job(run_spider_script, 'interval', minutes=1)
|
||||||
scheduler.start()
|
scheduler.start()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
+3
-6
@@ -1,12 +1,9 @@
|
|||||||
from spiderContent import start as spiderContent
|
from spiderData import spiderData
|
||||||
from spiderComments import start as spiderComments
|
|
||||||
from saveData import save_to_sql as saveData
|
from saveData import save_to_sql as saveData
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print('正在爬取文章数据')
|
print('正在爬取数据')
|
||||||
spiderContent(1,1)
|
spiderData()
|
||||||
print('正在爬取文章评论数据')
|
|
||||||
spiderComments()
|
|
||||||
print('正在存储数据')
|
print('正在存储数据')
|
||||||
saveData()
|
saveData()
|
||||||
print("爬取数据更新")
|
print("爬取数据更新")
|
||||||
|
|||||||
@@ -0,0 +1,16 @@
|
|||||||
|
from spiderDataPackage.spiderNav import start as spiderNav
|
||||||
|
from spiderDataPackage.spiderContent import start as spiderContent
|
||||||
|
from spiderDataPackage.spiderComments import start as spiderComments
|
||||||
|
import os
|
||||||
|
|
||||||
|
def spiderData():
|
||||||
|
if not os.path.exists('./nav.csv'):
|
||||||
|
print('正在爬取导航栏数据')
|
||||||
|
spiderNav()
|
||||||
|
print('正在爬取文章数据')
|
||||||
|
spiderContent(1,1)
|
||||||
|
print('正在爬取文章评论数据')
|
||||||
|
spiderComments()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
spiderData()
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
navAddr="./nav.csv"
|
||||||
|
articleAddr="./article.csv"
|
||||||
|
commentsAddr="./comments.csv"
|
||||||
@@ -3,10 +3,11 @@ import requests
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from settings import articleAddr,commentsAddr
|
||||||
|
|
||||||
def init():
|
def init():
|
||||||
if not os.path.exists('./comments.csv'):
|
if not os.path.exists(commentsAddr):
|
||||||
with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile:
|
with open(commentsAddr,'w',encoding='utf-8',newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow([
|
writer.writerow([
|
||||||
'articleId',
|
'articleId',
|
||||||
@@ -21,7 +22,7 @@ def init():
|
|||||||
])
|
])
|
||||||
|
|
||||||
def write(row):
|
def write(row):
|
||||||
with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile:
|
with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
@@ -38,7 +39,7 @@ def fetchData(url,params):
|
|||||||
|
|
||||||
def getArticleList():
|
def getArticleList():
|
||||||
articleList = []
|
articleList = []
|
||||||
with open('./article.csv','r',encoding='utf-8') as reader:
|
with open(articleAddr,'r',encoding='utf-8') as reader:
|
||||||
readerCsv = csv.reader(reader)
|
readerCsv = csv.reader(reader)
|
||||||
next(reader)
|
next(reader)
|
||||||
for nav in readerCsv:
|
for nav in readerCsv:
|
||||||
@@ -3,10 +3,11 @@ import requests
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from settings import navAddr,articleAddr
|
||||||
|
|
||||||
def init():
|
def init():
|
||||||
if not os.path.exists('./article.csv'):
|
if not os.path.exists(articleAddr):
|
||||||
with open('./article.csv','w',encoding='utf-8',newline='') as csvFile:
|
with open(articleAddr,'w',encoding='utf-8',newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow([
|
writer.writerow([
|
||||||
'id',
|
'id',
|
||||||
@@ -26,7 +27,7 @@ def init():
|
|||||||
])
|
])
|
||||||
|
|
||||||
def write(row):
|
def write(row):
|
||||||
with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile:
|
with open(articleAddr, 'a', encoding='utf-8', newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
@@ -43,7 +44,7 @@ def fetchData(url,params):
|
|||||||
|
|
||||||
def getTypeList():
|
def getTypeList():
|
||||||
typeList = []
|
typeList = []
|
||||||
with open('./nav.csv','r',encoding='utf-8') as reader:
|
with open(navAddr,'r',encoding='utf-8') as reader:
|
||||||
readerCsv = csv.reader(reader)
|
readerCsv = csv.reader(reader)
|
||||||
next(reader)
|
next(reader)
|
||||||
for nav in readerCsv:
|
for nav in readerCsv:
|
||||||
@@ -2,10 +2,10 @@ import requests
|
|||||||
import csv
|
import csv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
import os
|
||||||
|
from settings import navAddr
|
||||||
def init():
|
def init():
|
||||||
if not os.path.exists('./nav.csv'):
|
if not os.path.exists(navAddr):
|
||||||
with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile:
|
with open(navAddr,'w',encoding='utf-8',newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow([
|
writer.writerow([
|
||||||
'typeName',
|
'typeName',
|
||||||
@@ -14,7 +14,7 @@ def init():
|
|||||||
])
|
])
|
||||||
|
|
||||||
def write(row):
|
def write(row):
|
||||||
with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile:
|
with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
|
||||||
writer = csv.writer(csvFile)
|
writer = csv.writer(csvFile)
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
@@ -45,9 +45,11 @@ def readJson(response):
|
|||||||
containerid
|
containerid
|
||||||
])
|
])
|
||||||
|
|
||||||
|
def start():
|
||||||
if __name__ == '__main__':
|
|
||||||
init()
|
init()
|
||||||
url = 'https://weibo.com/ajax/feed/allGroups'
|
url = 'https://weibo.com/ajax/feed/allGroups'
|
||||||
response = fetchData(url)
|
response = fetchData(url)
|
||||||
readJson(response)
|
readJson(response)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
start()
|
||||||
Reference in New Issue
Block a user