Update spiderContent.py

This commit is contained in:
lintsinghua
2024-12-14 20:07:14 +08:00
committed by GitHub
parent a7472b4287
commit 8c4f3e9b52
+61 -87
View File
import csv
import os
import random
import time
from datetime import datetime

import requests
from requests.exceptions import RequestException

from .settings import navAddr, articleAddr
def init(path=None):
    """Create the article CSV with its header row if it does not exist yet.

    Args:
        path: Destination CSV file. When omitted, ``articleAddr`` from the
            project settings is used.

    An already existing file is left untouched, so rows written by earlier
    runs are preserved.
    """
    target = articleAddr if path is None else path
    parent = os.path.dirname(target)
    if parent:
        # The data directory may be missing on a fresh checkout.
        os.makedirs(parent, exist_ok=True)
    try:
        # Mode 'x' creates the file only if it is absent, which avoids the
        # check-then-create race of os.path.exists() followed by mode 'w'.
        csvFile = open(target, 'x', encoding='utf-8', newline='')
    except FileExistsError:
        return
    with csvFile:
        csv.writer(csvFile).writerow([
            'id',
            'likeNum',
            'commentsLen',
            'reposts_count',
            'region',
            'content',
            'contentLen',
            'created_at',
            'type',
            'detailUrl',
            'authorAvatar',
            'authorName',
            'authorDetail',
            'isVip',
        ])
def write(row, path=None):
    """Append one row to the article CSV.

    Args:
        row: Sequence of cell values, in the column order of the header row.
        path: Destination CSV file. When omitted, ``articleAddr`` from the
            project settings is used.
    """
    target = articleAddr if path is None else path
    # newline='' lets the csv module control line endings, so cells that
    # contain line breaks are written and read back intact.
    with open(target, 'a', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)
def fetchData(url, params, headers_list):
    """Fetch one page of statuses, using a randomly chosen set of headers.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        headers_list: Sequence of header dicts (one per account); one entry
            is picked at random for each call.

    Returns:
        The value of the ``statuses`` key of the JSON response, or ``None``
        when no headers are available, the request fails, the status code is
        not 200, or the body is not a JSON object.
    """
    if not headers_list:
        # random.choice() raises IndexError on an empty sequence.
        print('请求失败:headers_list 为空')
        return None
    headers = random.choice(headers_list)
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except RequestException as e:
        print(f"请求失败:{e}")
        return None
    if response.status_code != 200:
        return None
    try:
        payload = response.json()
    except ValueError as e:
        # A 200 response can still carry a non-JSON body; the JSON decode
        # errors raised by response.json() are ValueError subclasses.
        print(f"请求失败:{e}")
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('statuses')
def getTypeList(path=None):
    """Read the navigation CSV and return its data rows.

    Args:
        path: CSV file to read. When omitted, ``navAddr`` from the project
            settings is used.

    Returns:
        A list of rows (each a list of strings) with the header record and
        blank rows removed. An empty file yields an empty list.
    """
    source = navAddr if path is None else path
    with open(source, 'r', encoding='utf-8', newline='') as reader:
        readerCsv = csv.reader(reader)
        # Skip the header through the csv reader, not the raw file, so a
        # quoted multi-line header is consumed as one record; the default
        # avoids StopIteration on an empty file.
        next(readerCsv, None)
        # csv.reader yields [] for blank lines; those carry no category.
        return [nav for nav in readerCsv if nav]
def _buildRow(article, type):
    """Turn one status dict into a CSV row matching the header column order."""
    user = article['user']
    # region_name may be absent or None; either way the region is empty.
    region = (article.get('region_name') or '').replace('发布于 ', '')
    created_at = datetime.strptime(
        article['created_at'], '%a %b %d %H:%M:%S %z %Y'
    ).strftime('%Y-%m-%d')
    if 'mblogid' in article:
        # NOTE(review): the URL is built from the status id; confirm whether
        # the author's uid was intended here.
        detailUrl = f"https://weibo.com/{article['id']}/{article['mblogid']}"
    else:
        detailUrl = ''
    return [
        article['id'],
        article['attitudes_count'],
        article['comments_count'],
        article['reposts_count'],
        region,
        article['text_raw'],
        article['textLength'],
        created_at,
        type,
        detailUrl,
        user['avatar_large'],
        user['screen_name'],
        f"https://weibo.com/u/{user['id']}",
        user['v_plus'],
    ]


def readJson(response, type, writer=None):
    """Convert each status in ``response`` to a row and hand it to a sink.

    Args:
        response: Iterable of status dicts; ``None`` is treated as empty.
        type: Category name stored in the ``type`` column of every row.
        writer: Callable receiving each row. When omitted, the module-level
            ``write`` function is used, which appends to the article CSV.

    A status that lacks a required field or has an unparsable date is
    reported and skipped, so one bad entry does not abort the rest.
    """
    emit = write if writer is None else writer
    for article in response or []:
        try:
            row = _buildRow(article, type)
        except (KeyError, TypeError, ValueError) as e:
            print(f"跳过无法解析的文章:{e!r}")
            continue
        emit(row)
def start(headers_list, typeNum=14, pageNum=3, delay=2):
    """Crawl the hot timeline for the first ``typeNum`` categories.

    Args:
        headers_list: Non-empty sequence of request header dicts, passed on
            to ``fetchData`` for every request.
        typeNum: Maximum number of categories (rows of the navigation list)
            to crawl.
        pageNum: Number of pages requested per category.
        delay: Other bound of the random pause taken before each request;
            the pause is drawn from ``random.uniform(1, delay)`` seconds.

    Raises:
        ValueError: If ``headers_list`` is empty.
    """
    if not headers_list:
        # Fail before creating files or sleeping; no request could succeed.
        raise ValueError('headers_list must contain at least one headers dict')
    articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    for nav in getTypeList()[:typeNum]:
        if len(nav) < 3:
            # A usable row needs name, group id and container id.
            continue
        name, group_id, containerid = nav[:3]
        for page in range(pageNum):
            print(f'正在爬取的类型:{name} 中的第{page + 1}页文章数据')
            # Clamp at zero: time.sleep() rejects negative values, which a
            # negative delay could otherwise produce.
            time.sleep(max(0.0, random.uniform(1, delay)))
            params = {
                'group_id': group_id,
                'containerid': containerid,
                'max_id': page,
                'count': 10,
                'extparam': 'discover|new_feed',
            }
            response = fetchData(articleUrl, params, headers_list)
            if response:
                readJson(response, name)
if __name__ == '__main__':
    # One entry per account. Replace the placeholder cookies with real
    # session cookies before running.
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    headers_list = [
        {
            'Cookie': 'your_cookie_here',
            'User-Agent': userAgent,
        },
        {
            'Cookie': 'another_cookie_here',
            'User-Agent': userAgent,
        },
    ]
    start(headers_list)