Update spiderComments.py

This commit is contained in:
lintsinghua
2024-12-14 20:06:39 +08:00
committed by GitHub
parent babb9d54a9
commit a7472b4287
+46 -58
View File
@@ -2,99 +2,87 @@ import time
import requests import requests
import csv import csv
import os import os
import random
from datetime import datetime from datetime import datetime
from .settings import articleAddr,commentsAddr from .settings import articleAddr, commentsAddr
from requests.exceptions import RequestException
# Initialise the comments data file.
def init():
    """Create the comments CSV with its header row if it does not exist yet.

    An existing file is left untouched, so rows written by earlier runs are
    kept. The parent directory of ``commentsAddr`` is created when missing so
    the first run does not fail with ``FileNotFoundError``.
    """
    if os.path.exists(commentsAddr):
        return

    parentDir = os.path.dirname(commentsAddr)
    if parentDir:
        os.makedirs(parentDir, exist_ok=True)

    # newline='' lets the csv module control line endings itself.
    with open(commentsAddr, 'w', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow([
            'articleId',
            'created_at',
            'likes_counts',
            'region',
            'content',
            'authorName',
            'authorGender',
            'authorAddress',
            'authorAvatar',
        ])
# Append one comment record to the CSV file.
def write(row):
    """Append ``row`` as a single CSV record to the file at ``commentsAddr``.

    Args:
        row: Iterable of field values, written in the order given.
    """
    # The file is reopened in append mode for every row, so each record is
    # written out and the file closed before the next request is made.
    with open(commentsAddr, 'a', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)
# Fetch data, switching randomly between several accounts.
def fetchData(url, params, headers_list):
    """Request ``url`` and return the ``data`` field of its JSON response.

    One header dict is picked at random from ``headers_list`` for each call.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        headers_list: Non-empty sequence of header dicts.

    Returns:
        The value stored under ``data`` in the JSON body, or ``None`` when the
        request fails, the status code is not 200, the body is not valid
        JSON, or the body is not an object containing ``data``.

    Raises:
        IndexError: If ``headers_list`` is empty (raised by ``random.choice``).
    """
    headers = random.choice(headers_list)

    # Keep the try body limited to the network call itself.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except RequestException as e:
        print(f"请求失败:{e}")
        return None

    if response.status_code != 200:
        return None

    # A 200 response can still carry a non-JSON body (e.g. a login page);
    # the JSON decode errors raised by requests are ValueError subclasses.
    try:
        payload = response.json()
    except ValueError as e:
        print(f"请求失败:{e}")
        return None

    if not isinstance(payload, dict):
        return None
    # .get avoids a KeyError when the body has no 'data' key.
    return payload.get('data')
# Load the article list.
def getArticleList():
    """Read every data row of the article CSV at ``articleAddr``.

    The first CSV record is treated as a header and skipped.

    Returns:
        A list of rows, each a list of strings as produced by ``csv.reader``.
        An empty file yields an empty list.
    """
    # newline='' is what the csv module expects, so quoted fields containing
    # line breaks are parsed correctly.
    with open(articleAddr, 'r', encoding='utf-8', newline='') as reader:
        readerCsv = csv.reader(reader)
        # Skip the header through the CSV reader (not the raw file); the
        # default stops an empty file from raising StopIteration.
        next(readerCsv, None)
        return list(readerCsv)
# Parse comment data.
def readJson(response, articleId):
    """Convert each comment in ``response`` to a CSV row and append it.

    Args:
        response: Iterable of comment dicts. Presumably the ``data`` list
            returned by the comments endpoint - verify against the caller.
        articleId: Identifier of the article the comments belong to; written
            as the first column of every row.

    Raises:
        KeyError: If a comment lacks one of the required fields
            (``created_at``, ``like_counts``, ``text_raw``, ``user`` and the
            user sub-fields read below).
    """
    for comment in response:
        # Reduce the full timestamp to a plain date (YYYY-MM-DD).
        created_at = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')

        # 'source' may be missing or None; either case gives an empty region.
        region = (comment.get('source') or '').replace('来自', '')

        user = comment['user']
        write([
            articleId,
            created_at,
            comment['like_counts'],
            region,
            comment['text_raw'],
            user['screen_name'],
            user['gender'],
            user['location'],
            user['avatar_large'],
        ])
# Start the crawler.
def start(headers_list, delay=2):
    """Crawl the comments of every article in the article list.

    Args:
        headers_list: Sequence of header dicts, passed to ``fetchData``,
            which picks one at random per request.
        delay: Other bound of the random pause before each request; the
            pause is drawn from ``random.uniform(1, delay)`` seconds.
    """
    commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
    init()

    for article in getArticleList():
        # csv.reader yields [] for blank lines; skip them so article[0]
        # cannot raise IndexError.
        if not article:
            continue

        articleId = article[0]
        print(f'正在爬取id值为{articleId}的文章评论')

        # Random pause so requests are not sent at a fixed, rapid rate.
        time.sleep(random.uniform(1, delay))

        params = {'id': int(articleId), 'is_show_bulletin': 2}
        response = fetchData(commentUrl, params, headers_list)
        # fetchData returns None on failure; an empty result is skipped too.
        if response:
            readJson(response, articleId)
if __name__ == '__main__':
    # headers_list should hold one header dict per account, each with that
    # account's own cookie; replace the placeholder values before running.
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
    headers_list = [
        {
            'Cookie': 'your_cookie_here',
            'User-Agent': userAgent,
        },
        {
            'Cookie': 'another_cookie_here',
            'User-Agent': userAgent,
        },
    ]
    start(headers_list)