Update spiderNav.py

This commit is contained in:
lintsinghua
2024-12-14 20:05:58 +08:00
committed by GitHub
parent 82be6f864f
commit babb9d54a9
+44 -34
View File
@@ -2,54 +2,64 @@ import requests
import csv import csv
import numpy as np import numpy as np
import os import os
import random
from .settings import navAddr from .settings import navAddr
from requests.exceptions import RequestException
# 初始化导航数据文件
def init(): def init():
if not os.path.exists(navAddr): if not os.path.exists(navAddr):
with open(navAddr,'w',encoding='utf-8',newline='') as csvFile: with open(navAddr, 'w', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow([ writer.writerow(['typeName', 'gid', 'containerid'])
'typeName',
'gid',
'containerid'
])
# 写入导航数据
def write(row): def write(row):
with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile: with open(navAddr, 'a', encoding='utf-8', newline='') as csvFile:
writer = csv.writer(csvFile) writer = csv.writer(csvFile)
writer.writerow(row) writer.writerow(row)
def fetchData(url): # 获取数据,支持多账号
headers = { def fetchData(url, headers_list):
'Cookie':'SINAGLOBAL=2555941826014.1074.1676801766625; ULV=1719829459275:6:1:2:4660996305989.918.1719827559898:1719743122299; UOR=,,www.baidu.com; XSRF-TOKEN=VtLXviYSIs8lor7sz4iGyigL; SUB=_2A25LhvU9DeRhGeFH6FIX-S3MyD2IHXVo-gj1rDV8PUJbkNAGLRXMkW1Ne2nhI3Gle25QJK0Z99J3trq_NZn6YKJ-; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW3Mv8V5EupQbbKh.vaZIwU5JpX5KzhUgL.FoM4e05c1Ke7e022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoM41hz41hqReKqN; WBPSESS=Dt2hbAUaXfkVprjyrAZT_LRaDLsnxG-kIbeYwnBb5OUKZiwfVr_UrcYfWuqG-4ZVDM5HeU3HXkDNK_thfRfdS9Ao6ezT30jDksv-CpaVmlTAqGUHjJ7PYkH5aCK4HLxmRq14ZalmQNwzfWMPa4y0VNRLuYdg7L1s49ymNq_5v5vusoz0r4ki6u-MHGraF0fbUTgX14x0kHayEwOoxfLI-w==; SCF=AqmJWo31oFV5itnRgWNU1-wHQTL6PmkBLf3gDuqpdqAIfaWguDTMre6Oxjf5Uzs74JAh2r0DdV1sJ1g6m-wJ5NQ.; _s_tentry=-; Apache=4660996305989.918.1719827559898; PC_TOKEN=7955a7ab1f; appkey=; geetest_token=602cd4e3a7ed1898808f8adfe1a2048b; ALF=1722421868', headers = random.choice(headers_list)
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0' try:
} response = requests.get(url, headers=headers, timeout=10)
params = { if response.status_code == 200:
'is_new_segment':1, return response.json()['data']['modules']
'fetch_hot':1 else:
} return None
response = requests.get(url,headers=headers,params=params) except RequestException as e:
if response.status_code == 200: print(f"请求失败:{e}")
return response.json()
else:
return None return None
# 解析导航数据
def readJson(response): def readJson(response):
navList = np.append(response['groups'][3]['group'],response['groups'][4]['group']) for module in response:
for nav in navList: if 'type' in module and 'typeName' in module:
navName = nav['title'] typeName = module['typeName']
gid = nav['gid'] for submodule in module['modules']:
containerid = nav['containerid'] if 'id' in submodule and 'containerid' in submodule:
write([ gid = submodule['id']
navName, containerid = submodule['containerid']
gid, write([typeName, gid, containerid])
containerid
])
def start(): # 启动爬虫
def start(headers_list):
navUrl = 'https://weibo.com/ajax/side/hot'
init() init()
url = 'https://weibo.com/ajax/feed/allGroups' response = fetchData(navUrl, headers_list)
response = fetchData(url) if response:
readJson(response) readJson(response)
if __name__ == '__main__': if __name__ == '__main__':
start() headers_list = [
{
'Cookie': 'your_cookie_here',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
},
{
'Cookie': 'another_cookie_here',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
}
]
start(headers_list)