diff --git a/spider/__pycache__/saveData.cpython-38.pyc b/spider/__pycache__/saveData.cpython-38.pyc new file mode 100644 index 0000000..97fee39 Binary files /dev/null and b/spider/__pycache__/saveData.cpython-38.pyc differ diff --git a/spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc b/spider/__pycache__/spiderComments.cpython-38.pyc similarity index 94% rename from spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc rename to spider/__pycache__/spiderComments.cpython-38.pyc index 44cf39d..16f9383 100644 Binary files a/spider/spiderDataPack/__pycache__/spiderComments.cpython-38.pyc and b/spider/__pycache__/spiderComments.cpython-38.pyc differ diff --git a/spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc b/spider/__pycache__/spiderContent.cpython-38.pyc similarity index 57% rename from spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc rename to spider/__pycache__/spiderContent.cpython-38.pyc index b9cdb70..7eaf8e1 100644 Binary files a/spider/spiderDataPack/__pycache__/spiderContent.cpython-38.pyc and b/spider/__pycache__/spiderContent.cpython-38.pyc differ diff --git a/spider/__pycache__/spiderData.cpython-38.pyc b/spider/__pycache__/spiderData.cpython-38.pyc new file mode 100644 index 0000000..254127d Binary files /dev/null and b/spider/__pycache__/spiderData.cpython-38.pyc differ diff --git a/spider/article.csv b/spider/article.csv new file mode 100644 index 0000000..a481b83 --- /dev/null +++ b/spider/article.csv @@ -0,0 +1 @@ +id,likeNum,commentsLen,reposts_count,region,content,contentLen,created_at,type,detailUrl,authorAvatar,authorName,authorDetail,isVip diff --git a/spider/main.py b/spider/main.py index 898fad6..132b661 100644 --- a/spider/main.py +++ b/spider/main.py @@ -1,13 +1,15 @@ -from spiderData import spiderData +from spiderContent import start as spiderContentStart +from spiderComments import start as spiderCommentsStart from saveData import save_to_sql as saveData def main(): - try: - spiderData() - saveData() - print("爬取数据更新") - except: - print("爬取数据失败") + print('正在爬取文章数据') + spiderContentStart(1,1) + print('正在爬取文章评论数据') + spiderCommentsStart() + print('正在存储数据') + saveData() + print("爬取数据更新") if __name__ == '__main__': main() \ No newline at end of file diff --git a/spider/nav.csv b/spider/nav.csv new file mode 100644 index 0000000..b61708e --- /dev/null +++ b/spider/nav.csv @@ -0,0 +1,60 @@ +typeName,gid,containerid +热门,102803,102803 +同城,1028032222,102803_2222 +榜单,102803600169,102803_ctg1_600169_-_ctg1_600169 +男篮,102803600279,102803_ctg1_600279_-_ctg1_600279 +明星,1028034288,102803_ctg1_4288_-_ctg1_4288 +车展,1028035188,102803_ctg1_5188_-_ctg1_5188 +搞笑,1028034388,102803_ctg1_4388_-_ctg1_4388 +情感,1028031988,102803_ctg1_1988_-_ctg1_1988 +周末,102803600195,102803_ctg1_600195_-_ctg1_600195 +电影,1028033288,102803_ctg1_3288_-_ctg1_3288 +社会,1028034188,102803_ctg1_4188_-_ctg1_4188 +电视剧,1028032488,102803_ctg1_2488_-_ctg1_2488 +美食,1028032688,102803_ctg1_2688_-_ctg1_2688 +俄乌局势,102803600267,102803_ctg1_600267_-_ctg1_600267 +国际,1028036288,102803_ctg1_6288_-_ctg1_6288 +深度,102803600155,102803_ctg1_600155_-_ctg1_600155 +财经,1028036388,102803_ctg1_6388_-_ctg1_6388 +读书,1028034588,102803_ctg1_4588_-_ctg1_4588 +摄影,1028034988,102803_ctg1_4988_-_ctg1_4988 +颜值,102803600165,102803_ctg1_600165_-_ctg1_600165 +体育,1028031388,102803_ctg1_1388_-_ctg1_1388 +数码,1028035088,102803_ctg1_5088_-_ctg1_5088 +综艺,1028034688,102803_ctg1_4688_-_ctg1_4688 +时尚,1028034488,102803_ctg1_4488_-_ctg1_4488 +星座,1028031688,102803_ctg1_1688_-_ctg1_1688 +军事,1028036688,102803_ctg1_6688_-_ctg1_6688 +股市,1028031288,102803_ctg1_1288_-_ctg1_1288 +房产,1028035588,102803_ctg1_5588_-_ctg1_5588 +家居,1028035888,102803_ctg1_5888_-_ctg1_5888 +萌宠,1028032788,102803_ctg1_2788_-_ctg1_2788 +科技,1028032088,102803_ctg1_2088_-_ctg1_2088 +科普,1028035988,102803_ctg1_5988_-_ctg1_5988 +动漫,1028032388,102803_ctg1_2388_-_ctg1_2388 +运动健身,1028034788,102803_ctg1_4788_-_ctg1_4788 +旅游,1028032588,102803_ctg1_2588_-_ctg1_2588 +瘦身,1028036488,102803_ctg1_6488_-_ctg1_6488 +好物,102803600094,102803_ctg1_600094_-_ctg1_600094 +历史,1028036788,102803_ctg1_6788_-_ctg1_6788 +艺术,1028035488,102803_ctg1_5488_-_ctg1_5488 +美妆,1028031588,102803_ctg1_1588_-_ctg1_1588 +法律,1028037388,102803_ctg1_7388_-_ctg1_7388 +设计,1028035388,102803_ctg1_5388_-_ctg1_5388 +健康,1028032188,102803_ctg1_2188_-_ctg1_2188 +音乐,1028035288,102803_ctg1_5288_-_ctg1_5288 +游戏,1028034888,102803_ctg1_4888_-_ctg1_4888 +新时代,1028037968,102803_ctg1_7968_-_ctg1_7968 +校园,102803600177,102803_ctg1_600177_-_ctg1_600177 +收藏,1028038189,102803_ctg1_8189_-_ctg1_8189 +政务,1028035788,102803_ctg1_5788_-_ctg1_5788 +养生,1028036588,102803_ctg1_6588_-_ctg1_6588 +育儿,1028033188,102803_ctg1_3188_-_ctg1_3188 +抽奖,102803600037,102803_ctg1_600037_-_ctg1_600037 +教育,102803600080,102803_ctg1_600080_-_ctg1_600080 +婚恋,1028031788,102803_ctg1_1788_-_ctg1_1788 +舞蹈,1028038788,102803_ctg1_8788_-_ctg1_8788 +辟谣,1028036988,102803_ctg1_6988_-_ctg1_6988 +公益,102803600057,102803_ctg1_600057_-_ctg1_600057 +问答,1028037977,102803_ctg1_7977_-_ctg1_7977 +三农,1028037188,102803_ctg1_7188_-_ctg1_7188 diff --git a/spider/spiderDataPack/spiderComments.py b/spider/spiderComments.py similarity index 100% rename from spider/spiderDataPack/spiderComments.py rename to spider/spiderComments.py diff --git a/spider/spiderDataPack/spiderContent.py b/spider/spiderContent.py similarity index 100% rename from spider/spiderDataPack/spiderContent.py rename to spider/spiderContent.py diff --git a/spider/spiderData.py b/spider/spiderData.py deleted file mode 100644 index 131c87a..0000000 --- a/spider/spiderData.py +++ /dev/null @@ -1,13 +0,0 @@ -from spiderDataPack.spiderNav import start as spiderNavStart -from spiderDataPack.spiderContent import start as spiderContentStart -from spiderDataPack.spiderComments import start as spiderCommentsStart -import os - -def spiderData(): - if not os.path.exists('./nav.csv'): - spiderNavStart() - spiderContentStart(1,1) - spiderCommentsStart() - -if __name__ == '__main__': - spiderData() \ No newline at end of file diff --git a/spider/spiderDataPack/__init__.py b/spider/spiderDataPack/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc b/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index a5d539c..0000000 Binary files a/spider/spiderDataPack/__pycache__/__init__.cpython-38.pyc and /dev/null differ diff --git a/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc b/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc deleted file mode 100644 index 935a533..0000000 Binary files a/spider/spiderDataPack/__pycache__/spiderNav.cpython-38.pyc and /dev/null differ diff --git a/spider/spiderDataPack/spiderNav.py b/spider/spiderNav.py similarity index 97% rename from spider/spiderDataPack/spiderNav.py rename to spider/spiderNav.py index 1223698..8a0f75e 100644 --- a/spider/spiderDataPack/spiderNav.py +++ b/spider/spiderNav.py @@ -45,11 +45,9 @@ def parse_json(response): containerid ]) -def start(): + +if __name__ == '__main__': init() url = 'https://weibo.com/ajax/feed/allGroups' response = get_data(url) - parse_json(response) - -if __name__ == '__main__': - start() \ No newline at end of file + parse_json(response) \ No newline at end of file