本地化&2.0

This commit is contained in:
z66
2025-12-02 14:01:39 +08:00
parent ec1baf539c
commit a9eda60493
15 changed files with 409 additions and 140 deletions
@@ -139,6 +139,12 @@ class ZhihuCrawler(AbstractCrawler):
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
start_page = config.START_PAGE
# 统计信息
total_saved_contents = 0
total_failed_contents = 0
total_saved_comments = 0
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(
@@ -164,7 +170,7 @@ class ZhihuCrawler(AbstractCrawler):
)
)
utils.logger.info(
f"[ZhihuCrawler.search] Search contents :{content_list}"
f"[ZhihuCrawler.search] Search contents :{len(content_list)}"
)
if not content_list:
utils.logger.info("No more content!")
@@ -175,13 +181,41 @@ class ZhihuCrawler(AbstractCrawler):
utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
page += 1
# 保存内容,添加异常处理和统计
saved_count = 0
failed_count = 0
for content in content_list:
await zhihu_store.update_zhihu_content(content)
try:
await zhihu_store.update_zhihu_content(content)
saved_count += 1
except Exception as e:
failed_count += 1
utils.logger.error(
f"[ZhihuCrawler.search] 保存内容失败 (content_id={content.content_id}): {e}"
)
if saved_count > 0:
utils.logger.info(
f"[ZhihuCrawler.search] 关键词 '{keyword}'{page-1} 页: 成功保存 {saved_count} 条内容"
)
total_saved_contents += saved_count
if failed_count > 0:
utils.logger.warning(
f"[ZhihuCrawler.search] 关键词 '{keyword}'{page-1} 页: 保存失败 {failed_count} 条内容"
)
total_failed_contents += failed_count
await self.batch_get_content_comments(content_list)
except DataFetchError:
utils.logger.error("[ZhihuCrawler.search] Search content error")
return
# 输出最终统计信息
utils.logger.info(
f"[ZhihuCrawler.search] 关键词搜索完成统计: "
f"成功保存 {total_saved_contents} 条内容, "
f"失败 {total_failed_contents} 条内容"
)
async def batch_get_content_comments(self, content_list: List[ZhihuContent]):
"""
@@ -473,10 +507,13 @@ class ZhihuCrawler(AbstractCrawler):
async def close(self):
    """Close the browser context without letting teardown errors escape.

    If ``self.cdp_manager`` is set, it is cleaned up and the reference is
    cleared. Otherwise ``self.browser_context`` is closed, provided one
    exists. Any exception raised during teardown is logged rather than
    re-raised.
    """
    try:
        # CDP mode needs its own teardown path.
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        elif self.browser_context:
            # Guarded so that a missing context does not turn close() into an error.
            await self.browser_context.close()
        utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
    except Exception as e:
        # Best-effort shutdown: report the failure and return normally.
        utils.logger.error(f"[ZhihuCrawler.close] An error occurred during close: {e}")