更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
def __init__(
self,
timeout=10,
timeout=30, # 增加超时时间到30秒,避免请求卡住
proxy=None,
*,
headers: Dict[str, str],
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
"""
d_c0 = self.cookie_dict.get("d_c0")
if not d_c0:
raise Exception("d_c0 not found in cookies")
utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
sign_res = sign(url, self.default_headers["cookie"])
headers = self.default_headers.copy()
headers['x-zst-81'] = sign_res["x-zst-81"]
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
Returns:
"""
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
uri = "/api/v4/search_v3"
params = {
"gk_version": "gz-gaokao",
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
"sort": sort.value,
"vertical": note_type.value,
}
search_res = await self.get(uri, params)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
return self._extractor.extract_contents_from_search(search_res)
try:
utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
search_res = await self.get(uri, params)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功,开始解析结果")
contents = self._extractor.extract_contents_from_search(search_res)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成,找到 {len(contents)} 条内容")
return contents
except Exception as e:
utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
raise
async def get_root_comments(
self,
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
# 设置页面超时时间为30秒
self.context_page.set_default_timeout(30000)
await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)
# Create a client to interact with the zhihu website.
self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
cookie_str=config.COOKIES,
)
await login_obj.begin()
# 登录后等待页面稳定
await asyncio.sleep(2)
await self.zhihu_client.update_cookies(
browser_context=self.browser_context
)
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
# 使用用户配置的第一个关键词,如果没有关键词则使用默认的"test"
search_keyword = "test" # 默认关键词
if config.KEYWORDS and config.KEYWORDS.strip():
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if keywords_list:
search_keyword = keywords_list[0]
utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
else:
utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空,使用默认关键词 'test'")
else:
utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词,使用默认关键词 'test'")
utils.logger.info(
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
)
await self.context_page.goto(
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
utils.logger.info(
f"[ZhihuCrawler.start] 关键词: {search_keyword}"
)
await asyncio.sleep(5)
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
try:
# 使用用户配置的关键词,而不是硬编码的python
from urllib.parse import quote
encoded_keyword = quote(search_keyword)
search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
# 添加超时时间,避免卡住
await self.context_page.goto(
search_url,
wait_until="domcontentloaded",
timeout=30000
)
utils.logger.info("[ZhihuCrawler.start] 页面跳转完成,等待页面稳定...")
# 等待页面基本加载完成,不等待networkidle(知乎页面可能一直有请求)
await asyncio.sleep(3)
utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载,开始更新cookies")
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
except Exception as e:
utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e},尝试继续执行", exc_info=True)
# 即使跳转失败,也尝试更新cookies
try:
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
utils.logger.info("[ZhihuCrawler.start] Cookies更新完成(跳转失败后)")
except Exception as cookie_error:
utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)
crawler_type_var.set(config.CRAWLER_TYPE)
utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
await self.get_creators_and_notes()
else:
pass
utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")
utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
async def search(self) -> None:
"""Search for notes and retrieve their comment information."""
utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
zhihu_limit_count = 20 # zhihu limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
total_failed_contents = 0
total_saved_comments = 0
for keyword in config.KEYWORDS.split(","):
# 安全地处理关键词列表
if not config.KEYWORDS or not config.KEYWORDS.strip():
utils.logger.error("[ZhihuCrawler.search] 关键词配置为空,无法执行搜索任务")
return
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if not keywords_list:
utils.logger.error("[ZhihuCrawler.search] 关键词列表为空,无法执行搜索任务")
return
utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
for keyword in keywords_list:
source_keyword_var.set(keyword)
utils.logger.info(
f"[ZhihuCrawler.search] Current search keyword: {keyword}"
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
# 获取用户配置的关键词用于 referer,如果没有则使用默认值
referer_keyword = "test"
if config.KEYWORDS and config.KEYWORDS.strip():
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if keywords_list:
referer_keyword = keywords_list[0]
from urllib.parse import quote
encoded_referer_keyword = quote(referer_keyword)
referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
zhihu_client_obj = ZhiHuClient(
proxy=httpx_proxy,
headers={
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
"accept-language": "zh-CN,zh;q=0.9",
"cookie": cookie_str,
"priority": "u=1, i",
"referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
"referer": referer_url,
"user-agent": self.user_agent,
"x-api-version": "3.0.91",
"x-app-za": "OS=Web",