本地化&2.0
This commit is contained in:
@@ -362,13 +362,16 @@ class DouYinCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
async def get_aweme_media(self, aweme_item: Dict):
|
||||
"""
|
||||
|
||||
@@ -426,10 +426,13 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[KuaishouCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Union, Set
|
||||
from urllib.parse import urlencode, quote
|
||||
|
||||
import requests
|
||||
@@ -328,6 +328,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
result: List[TiebaComment] = []
|
||||
current_page = 1
|
||||
seen_comment_ids: Set[str] = set()
|
||||
duplicate_page_count = 0
|
||||
|
||||
while note_detail.total_replay_page >= current_page and len(result) < max_count:
|
||||
# 构造评论页URL
|
||||
@@ -353,6 +355,26 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
|
||||
break
|
||||
|
||||
if config.ENABLE_COMMENT_DEDUP:
|
||||
new_comments: List[TiebaComment] = []
|
||||
for comment in comments:
|
||||
comment_id = getattr(comment, "comment_id", None)
|
||||
if comment_id and comment_id not in seen_comment_ids:
|
||||
seen_comment_ids.add(comment_id)
|
||||
new_comments.append(comment)
|
||||
if not new_comments:
|
||||
duplicate_page_count += 1
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有出现新的评论(重复数据),计数={duplicate_page_count}"
|
||||
)
|
||||
if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 连续 {duplicate_page_count} 页无新增评论,提前结束抓取")
|
||||
break
|
||||
current_page += 1
|
||||
continue
|
||||
comments = new_comments
|
||||
duplicate_page_count = 0
|
||||
|
||||
# 限制评论数量
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
@@ -408,6 +430,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
current_page = 1
|
||||
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
|
||||
seen_sub_ids: Set[str] = set()
|
||||
duplicate_page_count = 0
|
||||
|
||||
while max_sub_page_num >= current_page:
|
||||
# 构造子评论URL
|
||||
@@ -442,6 +466,28 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
)
|
||||
break
|
||||
|
||||
if config.ENABLE_COMMENT_DEDUP:
|
||||
new_sub_comments: List[TiebaComment] = []
|
||||
for sub_comment in sub_comments:
|
||||
sub_comment_id = getattr(sub_comment, "comment_id", None)
|
||||
if sub_comment_id and sub_comment_id not in seen_sub_ids:
|
||||
seen_sub_ids.add(sub_comment_id)
|
||||
new_sub_comments.append(sub_comment)
|
||||
if not new_sub_comments:
|
||||
duplicate_page_count += 1
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}第{current_page}页未出现新子评论,计数={duplicate_page_count}"
|
||||
)
|
||||
if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}连续 {duplicate_page_count} 页无新增子评论,提前结束"
|
||||
)
|
||||
break
|
||||
current_page += 1
|
||||
continue
|
||||
sub_comments = new_sub_comments
|
||||
duplicate_page_count = 0
|
||||
|
||||
if callback:
|
||||
await callback(parment_comment.note_id, sub_comments)
|
||||
|
||||
|
||||
@@ -662,10 +662,13 @@ class TieBaCrawler(AbstractCrawler):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BaiduTieBaCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
@@ -383,10 +383,13 @@ class WeiboCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
@@ -437,13 +437,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
async def get_notice_media(self, note_detail: Dict):
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
|
||||
@@ -139,6 +139,12 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
|
||||
start_page = config.START_PAGE
|
||||
|
||||
# 统计信息
|
||||
total_saved_contents = 0
|
||||
total_failed_contents = 0
|
||||
total_saved_comments = 0
|
||||
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
@@ -164,7 +170,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
)
|
||||
)
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] Search contents :{content_list}"
|
||||
f"[ZhihuCrawler.search] Search contents :{len(content_list)} 条"
|
||||
)
|
||||
if not content_list:
|
||||
utils.logger.info("No more content!")
|
||||
@@ -175,13 +181,41 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
|
||||
|
||||
page += 1
|
||||
# 保存内容,添加异常处理和统计
|
||||
saved_count = 0
|
||||
failed_count = 0
|
||||
for content in content_list:
|
||||
await zhihu_store.update_zhihu_content(content)
|
||||
try:
|
||||
await zhihu_store.update_zhihu_content(content)
|
||||
saved_count += 1
|
||||
except Exception as e:
|
||||
failed_count += 1
|
||||
utils.logger.error(
|
||||
f"[ZhihuCrawler.search] 保存内容失败 (content_id={content.content_id}): {e}"
|
||||
)
|
||||
|
||||
if saved_count > 0:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] 关键词 '{keyword}' 第 {page-1} 页: 成功保存 {saved_count} 条内容"
|
||||
)
|
||||
total_saved_contents += saved_count
|
||||
if failed_count > 0:
|
||||
utils.logger.warning(
|
||||
f"[ZhihuCrawler.search] 关键词 '{keyword}' 第 {page-1} 页: 保存失败 {failed_count} 条内容"
|
||||
)
|
||||
total_failed_contents += failed_count
|
||||
|
||||
await self.batch_get_content_comments(content_list)
|
||||
except DataFetchError:
|
||||
utils.logger.error("[ZhihuCrawler.search] Search content error")
|
||||
return
|
||||
|
||||
# 输出最终统计信息
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] 关键词搜索完成统计: "
|
||||
f"成功保存 {total_saved_contents} 条内容, "
|
||||
f"失败 {total_failed_contents} 条内容"
|
||||
)
|
||||
|
||||
async def batch_get_content_comments(self, content_list: List[ZhihuContent]):
|
||||
"""
|
||||
@@ -473,10 +507,13 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
Reference in New Issue
Block a user