本地化&2.0

This commit is contained in:
z66
2025-12-02 14:01:39 +08:00
parent ec1baf539c
commit a9eda60493
15 changed files with 409 additions and 140 deletions
@@ -10,7 +10,7 @@
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union, Set
from urllib.parse import urlencode, quote
import requests
@@ -328,6 +328,8 @@ class BaiduTieBaClient(AbstractApiClient):
result: List[TiebaComment] = []
current_page = 1
seen_comment_ids: Set[str] = set()
duplicate_page_count = 0
while note_detail.total_replay_page >= current_page and len(result) < max_count:
# 构造评论页URL
@@ -353,6 +355,26 @@ class BaiduTieBaClient(AbstractApiClient):
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
break
if config.ENABLE_COMMENT_DEDUP:
new_comments: List[TiebaComment] = []
for comment in comments:
comment_id = getattr(comment, "comment_id", None)
if comment_id and comment_id not in seen_comment_ids:
seen_comment_ids.add(comment_id)
new_comments.append(comment)
if not new_comments:
duplicate_page_count += 1
utils.logger.info(
f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有出现新的评论(重复数据),计数={duplicate_page_count}"
)
if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 连续 {duplicate_page_count} 页无新增评论,提前结束抓取")
break
current_page += 1
continue
comments = new_comments
duplicate_page_count = 0
# 限制评论数量
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
@@ -408,6 +430,8 @@ class BaiduTieBaClient(AbstractApiClient):
current_page = 1
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
seen_sub_ids: Set[str] = set()
duplicate_page_count = 0
while max_sub_page_num >= current_page:
# 构造子评论URL
@@ -442,6 +466,28 @@ class BaiduTieBaClient(AbstractApiClient):
)
break
if config.ENABLE_COMMENT_DEDUP:
new_sub_comments: List[TiebaComment] = []
for sub_comment in sub_comments:
sub_comment_id = getattr(sub_comment, "comment_id", None)
if sub_comment_id and sub_comment_id not in seen_sub_ids:
seen_sub_ids.add(sub_comment_id)
new_sub_comments.append(sub_comment)
if not new_sub_comments:
duplicate_page_count += 1
utils.logger.info(
f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}{current_page}页未出现新子评论,计数={duplicate_page_count}"
)
if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
utils.logger.info(
f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}连续 {duplicate_page_count} 页无新增子评论,提前结束"
)
break
current_page += 1
continue
sub_comments = new_sub_comments
duplicate_page_count = 0
if callback:
await callback(parment_comment.note_id, sub_comments)
@@ -662,10 +662,13 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
# 如果使用CDP模式,需要特殊处理
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
else:
await self.browser_context.close()
utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
try:
# 如果使用CDP模式,需要特殊处理
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
elif self.browser_context:
await self.browser_context.close()
utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
except Exception as e:
utils.logger.error(f"[BaiduTieBaCrawler.close] An error occurred during close: {e}")