diff --git a/spider/main.py b/spider/main.py index d164331..898fad6 100644 --- a/spider/main.py +++ b/spider/main.py @@ -1,14 +1,13 @@ -from spiderContent import start as spiderContentStart -from spiderComments import start as spiderCommentsStart +from spiderData import spiderData from saveData import save_to_sql as saveData def main(): - print('正在爬取文章数据') - spiderContentStart(1,1) - print('正在爬取文章评论数据') - spiderCommentsStart() - print('正在存储数据') - saveData() + try: + spiderData() + saveData() + print("爬取数据更新") + except: + print("爬取数据失败") if __name__ == '__main__': main() \ No newline at end of file diff --git a/spider/navData.csv b/spider/navData.csv deleted file mode 100644 index b61708e..0000000 --- a/spider/navData.csv +++ /dev/null @@ -1,60 +0,0 @@ -typeName,gid,containerid -热门,102803,102803 -同城,1028032222,102803_2222 -榜单,102803600169,102803_ctg1_600169_-_ctg1_600169 -男篮,102803600279,102803_ctg1_600279_-_ctg1_600279 -明星,1028034288,102803_ctg1_4288_-_ctg1_4288 -车展,1028035188,102803_ctg1_5188_-_ctg1_5188 -搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388 -情感,1028031988,102803_ctg1_1988_-_ctg1_1988 -周末,102803600195,102803_ctg1_600195_-_ctg1_600195 -电影,1028033288,102803_ctg1_3288_-_ctg1_3288 -社会,1028034188,102803_ctg1_4188_-_ctg1_4188 -电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488 -美食,1028032688,102803_ctg1_2688_-_ctg1_2688 -俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267 -国际,1028036288,102803_ctg1_6288_-_ctg1_6288 -深度,102803600155,102803_ctg1_600155_-_ctg1_600155 -财经,1028036388,102803_ctg1_6388_-_ctg1_6388 -读书,1028034588,102803_ctg1_4588_-_ctg1_4588 -摄影,1028034988,102803_ctg1_4988_-_ctg1_4988 -颜值,102803600165,102803_ctg1_600165_-_ctg1_600165 -体育,1028031388,102803_ctg1_1388_-_ctg1_1388 -数码,1028035088,102803_ctg1_5088_-_ctg1_5088 -综艺,1028034688,102803_ctg1_4688_-_ctg1_4688 -时尚,1028034488,102803_ctg1_4488_-_ctg1_4488 -星座,1028031688,102803_ctg1_1688_-_ctg1_1688 -军事,1028036688,102803_ctg1_6688_-_ctg1_6688 -股市,1028031288,102803_ctg1_1288_-_ctg1_1288 -房产,1028035588,102803_ctg1_5588_-_ctg1_5588 -家居,1028035888,102803_ctg1_5888_-_ctg1_5888 -萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788 -科技,1028032088,102803_ctg1_2088_-_ctg1_2088 -科普,1028035988,102803_ctg1_5988_-_ctg1_5988 -动漫,1028032388,102803_ctg1_2388_-_ctg1_2388 -运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788 -旅游,1028032588,102803_ctg1_2588_-_ctg1_2588 -瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488 -好物,102803600094,102803_ctg1_600094_-_ctg1_600094 -历史,1028036788,102803_ctg1_6788_-_ctg1_6788 -艺术,1028035488,102803_ctg1_5488_-_ctg1_5488 -美妆,1028031588,102803_ctg1_1588_-_ctg1_1588 -法律,1028037388,102803_ctg1_7388_-_ctg1_7388 -设计,1028035388,102803_ctg1_5388_-_ctg1_5388 -健康,1028032188,102803_ctg1_2188_-_ctg1_2188 -音乐,1028035288,102803_ctg1_5288_-_ctg1_5288 -游戏,1028034888,102803_ctg1_4888_-_ctg1_4888 -新时代,1028037968,102803_ctg1_7968_-_ctg1_7968 -校园,102803600177,102803_ctg1_600177_-_ctg1_600177 -收藏,1028038189,102803_ctg1_8189_-_ctg1_8189 -政务,1028035788,102803_ctg1_5788_-_ctg1_5788 -养生,1028036588,102803_ctg1_6588_-_ctg1_6588 -育儿,1028033188,102803_ctg1_3188_-_ctg1_3188 -抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037 -教育,102803600080,102803_ctg1_600080_-_ctg1_600080 -婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788 -舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788 -辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988 -公益,102803600057,102803_ctg1_600057_-_ctg1_600057 -问答,1028037977,102803_ctg1_7977_-_ctg1_7977 -三农,1028037188,102803_ctg1_7188_-_ctg1_7188 diff --git a/spider/spiderData.py b/spider/spiderData.py new file mode 100644 index 0000000..131c87a --- /dev/null +++ b/spider/spiderData.py @@ -0,0 +1,13 @@ +from spiderDataPack.spiderNav import start as spiderNavStart +from spiderDataPack.spiderContent import start as spiderContentStart +from spiderDataPack.spiderComments import start as spiderCommentsStart +import os + +def spiderData(): + if not os.path.exists('./nav.csv'): + spiderNavStart() + spiderContentStart(1,1) + spiderCommentsStart() + +if __name__ == '__main__': + spiderData() \ No newline at end of file diff --git a/spider/spiderDataPack/__init__.py b/spider/spiderDataPack/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc b/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..a5d539c Binary files /dev/null and b/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc differ diff --git a/spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc b/spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc new file mode 100644 index 0000000..44cf39d Binary files /dev/null and b/spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc differ diff --git a/spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc b/spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc new file mode 100644 index 0000000..b9cdb70 Binary files /dev/null and b/spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc differ diff --git a/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc b/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc new file mode 100644 index 0000000..935a533 Binary files /dev/null and b/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc differ diff --git a/spider/spiderComments.py b/spider/spiderDataPack/spiderComments.py similarity index 91% rename from spider/spiderComments.py rename to spider/spiderDataPack/spiderComments.py index ea00843..7d2861c 100644 --- a/spider/spiderComments.py +++ b/spider/spiderDataPack/spiderComments.py @@ -5,8 +5,8 @@ import os from datetime import datetime def init(): - if not os.path.exists('./articleComments.csv'): - with open('./articleComments.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists('./comments.csv'): + with open('./comments.csv','w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'articleId', @@ -21,7 +21,7 @@ def init(): ]) def writerRow(row): - with open('./articleComments.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open('./comments.csv', 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -38,7 +38,7 @@ def get_data(url,params): def getAllArticleList(): artileList = [] - with open('./articleData.csv','r',encoding='utf-8') as reader: + with open('./article.csv','r',encoding='utf-8') as reader: readerCsv = csv.reader(reader) next(reader) for nav in readerCsv: diff --git a/spider/spiderContent.py b/spider/spiderDataPack/spiderContent.py similarity index 93% rename from spider/spiderContent.py rename to spider/spiderDataPack/spiderContent.py index 6da3a57..fd860e6 100644 --- a/spider/spiderContent.py +++ b/spider/spiderDataPack/spiderContent.py @@ -5,8 +5,8 @@ import os from datetime import datetime def init(): - if not os.path.exists('./articleData.csv'): - with open('./articleData.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists('./article.csv'): + with open('./article.csv','w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'id', @@ -26,7 +26,7 @@ def init(): ]) def writerRow(row): - with open('./articleData.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open('./article.csv', 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -43,7 +43,7 @@ def get_data(url,params): def getAllTypeList(): typeList = [] - with open('./navData.csv','r',encoding='utf-8') as reader: + with open('./nav.csv','r',encoding='utf-8') as reader: readerCsv = csv.reader(reader) next(reader) for nav in readerCsv: diff --git a/spider/spiderNav.py b/spider/spiderDataPack/spiderNav.py similarity index 89% rename from spider/spiderNav.py rename to spider/spiderDataPack/spiderNav.py index fbcdf7d..1223698 100644 --- a/spider/spiderNav.py +++ b/spider/spiderDataPack/spiderNav.py @@ -4,8 +4,8 @@ import numpy as np import os def init(): - if not os.path.exists('./navData.csv'): - with open('./navData.csv','w',encoding='utf-8',newline='') as csvFile: + if not os.path.exists('./nav.csv'): + with open('./nav.csv','w',encoding='utf-8',newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow([ 'typeName', @@ -14,7 +14,7 @@ def init(): ]) def writerRow(row): - with open('./navData.csv', 'a', encoding='utf-8', newline='') as csvFile: + with open('./nav.csv', 'a', encoding='utf-8', newline='') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) @@ -45,9 +45,11 @@ def parse_json(response): containerid ]) - -if __name__ == '__main__': +def start(): init() url = 'https://weibo.com/ajax/feed/allGroups' response = get_data(url) - parse_json(response) \ No newline at end of file + parse_json(response) + +if __name__ == '__main__': + start() \ No newline at end of file