更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -58,11 +58,47 @@ class WeiboClient:
if enable_return_response:
return response
data: Dict = response.json()
# 检查响应状态码
if response.status_code != 200:
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
raise DataFetchError(error_msg)
# 检查响应内容类型
content_type = response.headers.get("content-type", "").lower()
if "application/json" not in content_type and "text/json" not in content_type:
# 可能是HTML响应(如登录页面)
response_text = response.text[:500]
utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
# 如果看起来像是HTML,可能是需要登录
if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
raise DataFetchError("Response is HTML, may need to login or cookie expired")
raise DataFetchError(f"Unexpected content type: {content_type}")
# 安全地解析JSON
try:
data: Dict = response.json()
except ValueError as e:
# JSON解析失败
response_text = response.text[:500]
utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
raise DataFetchError(f"Failed to parse JSON response: {e}")
# 检查响应是否为空
if not data:
utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
return {"cards": []}
ok_code = data.get("ok")
if ok_code == 0: # response error
msg = data.get("msg", "response error")
# "这里还没有内容" 是正常情况,表示没有更多数据,不应该抛出异常
if msg == "这里还没有内容" or "还没有内容" in msg:
utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
return {"cards": []} # 返回空结果,而不是抛出异常
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "response error"))
raise DataFetchError(msg)
elif ok_code != 1: # unknown error
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "unknown error"))
@@ -15,6 +15,7 @@
import asyncio
import os
import re
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
self.cdp_manager = None
async def start(self):
# 初始化数据库表(如果需要)
if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
try:
from database.db_session import create_tables
utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
await create_tables(config.SAVE_DATA_OPTION)
utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
except Exception as e:
utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
raise
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
page += 1
continue
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
try:
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
cards = search_res.get("cards", [])
utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
# 如果没有更多内容,跳出循环
if len(cards) == 0:
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
break
except DataFetchError as e:
# 如果是"没有内容"的错误,正常结束
if "还没有内容" in str(e) or "没有内容" in str(e):
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
break
# 其他错误继续抛出
raise
note_id_list: List[str] = []
note_list = filter_search_result_card(search_res.get("cards"))
note_list = filter_search_result_card(cards)
utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
for note_item in note_list:
if note_item:
mblog: Dict = note_item.get("mblog")
if mblog:
note_id_list.append(mblog.get("id"))
await weibo_store.update_weibo_note(note_item)
await self.get_note_images(mblog)
note_id = mblog.get("id")
note_id_list.append(note_id)
try:
await weibo_store.update_weibo_note(note_item)
await self.get_note_images(mblog)
except Exception as e:
utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
# 继续处理其他笔记,不中断整个流程
page += 1