更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=10,
|
||||
timeout=30, # 增加超时时间到30秒,避免请求卡住
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
|
||||
"""
|
||||
d_c0 = self.cookie_dict.get("d_c0")
|
||||
if not d_c0:
|
||||
raise Exception("d_c0 not found in cookies")
|
||||
utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
|
||||
raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
|
||||
sign_res = sign(url, self.default_headers["cookie"])
|
||||
headers = self.default_headers.copy()
|
||||
headers['x-zst-81'] = sign_res["x-zst-81"]
|
||||
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
|
||||
uri = "/api/v4/search_v3"
|
||||
params = {
|
||||
"gk_version": "gz-gaokao",
|
||||
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
|
||||
"sort": sort.value,
|
||||
"vertical": note_type.value,
|
||||
}
|
||||
search_res = await self.get(uri, params)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
|
||||
return self._extractor.extract_contents_from_search(search_res)
|
||||
try:
|
||||
utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
|
||||
search_res = await self.get(uri, params)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功,开始解析结果")
|
||||
contents = self._extractor.extract_contents_from_search(search_res)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成,找到 {len(contents)} 条内容")
|
||||
return contents
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def get_root_comments(
|
||||
self,
|
||||
|
||||
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
|
||||
# 设置页面超时时间为30秒
|
||||
self.context_page.set_default_timeout(30000)
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)
|
||||
|
||||
# Create a client to interact with the zhihu website.
|
||||
self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
|
||||
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
# 登录后等待页面稳定
|
||||
await asyncio.sleep(2)
|
||||
await self.zhihu_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
|
||||
# 使用用户配置的第一个关键词,如果没有关键词则使用默认的"test"
|
||||
search_keyword = "test" # 默认关键词
|
||||
if config.KEYWORDS and config.KEYWORDS.strip():
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if keywords_list:
|
||||
search_keyword = keywords_list[0]
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
|
||||
else:
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空,使用默认关键词 'test'")
|
||||
else:
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词,使用默认关键词 'test'")
|
||||
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
|
||||
f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
|
||||
)
|
||||
await self.context_page.goto(
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.start] 关键词: {search_keyword}"
|
||||
)
|
||||
await asyncio.sleep(5)
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
try:
|
||||
# 使用用户配置的关键词,而不是硬编码的python
|
||||
from urllib.parse import quote
|
||||
encoded_keyword = quote(search_keyword)
|
||||
search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
|
||||
|
||||
# 添加超时时间,避免卡住
|
||||
await self.context_page.goto(
|
||||
search_url,
|
||||
wait_until="domcontentloaded",
|
||||
timeout=30000
|
||||
)
|
||||
utils.logger.info("[ZhihuCrawler.start] 页面跳转完成,等待页面稳定...")
|
||||
# 等待页面基本加载完成,不等待networkidle(知乎页面可能一直有请求)
|
||||
await asyncio.sleep(3)
|
||||
utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载,开始更新cookies")
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e},尝试继续执行", exc_info=True)
|
||||
# 即使跳转失败,也尝试更新cookies
|
||||
try:
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
utils.logger.info("[ZhihuCrawler.start] Cookies更新完成(跳转失败后)")
|
||||
except Exception as cookie_error:
|
||||
utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
|
||||
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")
|
||||
|
||||
utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
|
||||
utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
|
||||
zhihu_limit_count = 20 # zhihu limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
|
||||
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
total_failed_contents = 0
|
||||
total_saved_comments = 0
|
||||
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
# 安全地处理关键词列表
|
||||
if not config.KEYWORDS or not config.KEYWORDS.strip():
|
||||
utils.logger.error("[ZhihuCrawler.search] 关键词配置为空,无法执行搜索任务")
|
||||
return
|
||||
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if not keywords_list:
|
||||
utils.logger.error("[ZhihuCrawler.search] 关键词列表为空,无法执行搜索任务")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
|
||||
|
||||
for keyword in keywords_list:
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] Current search keyword: {keyword}"
|
||||
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
)
|
||||
|
||||
# 获取用户配置的关键词用于 referer,如果没有则使用默认值
|
||||
referer_keyword = "test"
|
||||
if config.KEYWORDS and config.KEYWORDS.strip():
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if keywords_list:
|
||||
referer_keyword = keywords_list[0]
|
||||
|
||||
from urllib.parse import quote
|
||||
encoded_referer_keyword = quote(referer_keyword)
|
||||
referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
|
||||
|
||||
zhihu_client_obj = ZhiHuClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"cookie": cookie_str,
|
||||
"priority": "u=1, i",
|
||||
"referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
|
||||
"referer": referer_url,
|
||||
"user-agent": self.user_agent,
|
||||
"x-api-version": "3.0.91",
|
||||
"x-app-za": "OS=Web",
|
||||
|
||||
Reference in New Issue
Block a user