更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -58,11 +58,47 @@ class WeiboClient:
|
||||
if enable_return_response:
|
||||
return response
|
||||
|
||||
data: Dict = response.json()
|
||||
# 检查响应状态码
|
||||
if response.status_code != 200:
|
||||
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
|
||||
raise DataFetchError(error_msg)
|
||||
|
||||
# 检查响应内容类型
|
||||
content_type = response.headers.get("content-type", "").lower()
|
||||
if "application/json" not in content_type and "text/json" not in content_type:
|
||||
# 可能是HTML响应(如登录页面)
|
||||
response_text = response.text[:500]
|
||||
utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
|
||||
# 如果看起来像是HTML,可能是需要登录
|
||||
if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
|
||||
raise DataFetchError("Response is HTML, may need to login or cookie expired")
|
||||
raise DataFetchError(f"Unexpected content type: {content_type}")
|
||||
|
||||
# 安全地解析JSON
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except ValueError as e:
|
||||
# JSON解析失败
|
||||
response_text = response.text[:500]
|
||||
utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
|
||||
utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
|
||||
raise DataFetchError(f"Failed to parse JSON response: {e}")
|
||||
|
||||
# 检查响应是否为空
|
||||
if not data:
|
||||
utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
|
||||
return {"cards": []}
|
||||
|
||||
ok_code = data.get("ok")
|
||||
if ok_code == 0: # response error
|
||||
msg = data.get("msg", "response error")
|
||||
# "这里还没有内容" 是正常情况,表示没有更多数据,不应该抛出异常
|
||||
if msg == "这里还没有内容" or "还没有内容" in msg:
|
||||
utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
|
||||
return {"cards": []} # 返回空结果,而不是抛出异常
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "response error"))
|
||||
raise DataFetchError(msg)
|
||||
elif ok_code != 1: # unknown error
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "unknown error"))
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self):
|
||||
# 初始化数据库表(如果需要)
|
||||
if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
|
||||
try:
|
||||
from database.db_session import create_tables
|
||||
utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
|
||||
await create_tables(config.SAVE_DATA_OPTION)
|
||||
utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
|
||||
page += 1
|
||||
continue
|
||||
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
try:
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
cards = search_res.get("cards", [])
|
||||
utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
|
||||
|
||||
# 如果没有更多内容,跳出循环
|
||||
if len(cards) == 0:
|
||||
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
|
||||
break
|
||||
except DataFetchError as e:
|
||||
# 如果是"没有内容"的错误,正常结束
|
||||
if "还没有内容" in str(e) or "没有内容" in str(e):
|
||||
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
|
||||
break
|
||||
# 其他错误继续抛出
|
||||
raise
|
||||
|
||||
note_id_list: List[str] = []
|
||||
note_list = filter_search_result_card(search_res.get("cards"))
|
||||
note_list = filter_search_result_card(cards)
|
||||
utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
|
||||
|
||||
for note_item in note_list:
|
||||
if note_item:
|
||||
mblog: Dict = note_item.get("mblog")
|
||||
if mblog:
|
||||
note_id_list.append(mblog.get("id"))
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.get_note_images(mblog)
|
||||
note_id = mblog.get("id")
|
||||
note_id_list.append(note_id)
|
||||
try:
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.get_note_images(mblog)
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
|
||||
# 继续处理其他笔记,不中断整个流程
|
||||
|
||||
page += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user