更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -49,8 +49,27 @@ class BilibiliClient(AbstractApiClient):
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
"""
|
||||
Basic HTTP request wrapper with retries for transient network errors.
|
||||
"""
|
||||
verify = getattr(config, "HTTPX_VERIFY", True)
|
||||
# 优先使用传入 proxy,其次是 config.HTTPX_PROXY,最后走系统环境变量
|
||||
proxy = self.proxy or getattr(config, "HTTPX_PROXY", "") or None
|
||||
|
||||
async with httpx.AsyncClient(proxy=proxy, timeout=self.timeout, verify=verify) as client:
|
||||
# 简单重试,处理短暂的连接失败
|
||||
last_exc: Optional[Exception] = None
|
||||
for attempt in range(3):
|
||||
try:
|
||||
response = await client.request(method, url, **kwargs)
|
||||
break
|
||||
except httpx.HTTPError as e:
|
||||
last_exc = e
|
||||
if attempt == 2:
|
||||
# 3rd failure -> give up
|
||||
utils.logger.error(f"[BilibiliClient.request] Network error on {method} {url}: {repr(e)}")
|
||||
raise DataFetchError(f"network error: {e}") from e
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except json.JSONDecodeError:
|
||||
|
||||
@@ -68,10 +68,23 @@ class BilibiliLogin(AbstractLogin):
|
||||
return True
|
||||
return False
|
||||
|
||||
async def _has_valid_login_cookie(self) -> bool:
    """
    Report whether the browser context already holds a Bilibili login cookie.

    Reads the cookies of ``self.browser_context`` and converts them with
    ``utils.convert_cookies``; the caller uses the answer to skip the
    QR-code flow.

    Returns:
        True when the ``SESSDATA`` or the ``DedeUserID`` cookie has a truthy
        value, False otherwise.
    """
    browser_cookies = await self.browser_context.cookies()
    _cookie_str, cookies_by_name = utils.convert_cookies(browser_cookies)
    # Either cookie is accepted as evidence of an existing session.
    return any(cookies_by_name.get(name) for name in ("SESSDATA", "DedeUserID"))
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login bilibili website and keep webdriver login state"""
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
|
||||
|
||||
# 如果已经登录则直接跳过扫码流程
|
||||
if await self._has_valid_login_cookie():
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] 已检测到有效登录态,跳过扫码登录")
|
||||
return
|
||||
|
||||
# click login button
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//div[@class='right-entry__outside go-login-btn']//div"
|
||||
|
||||
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
try:
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
return response.json()
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
try:
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
raise DataFetchError(f"{e}, {response.text}")
|
||||
except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
|
||||
utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
|
||||
raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}")
|
||||
except httpx.TimeoutException as e:
|
||||
utils.logger.error(f"请求超时: {e}")
|
||||
raise DataFetchError(f"请求超时: {e}")
|
||||
except Exception as e:
|
||||
raise DataFetchError(f"{e}, {response.text}")
|
||||
utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
|
||||
raise DataFetchError(f"请求失败: {type(e).__name__}: {e}")
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
"""
|
||||
|
||||
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
|
||||
page += 1
|
||||
continue
|
||||
posts_res = None
|
||||
retry_success = False
|
||||
try:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
retry_success = True
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
|
||||
# 如果是网络连接错误,等待后重试一次
|
||||
if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
|
||||
utils.logger.warning(f"[DouYinCrawler.search] 网络错误,等待3秒后重试...")
|
||||
await asyncio.sleep(3)
|
||||
try:
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
offset=page * dy_limit_count - dy_limit_count,
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
retry_success = True
|
||||
except Exception as retry_e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
|
||||
break
|
||||
else:
|
||||
break
|
||||
except DataFetchError:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
|
||||
break
|
||||
|
||||
# 如果请求失败(包括重试失败),跳过后续处理
|
||||
if not retry_success or posts_res is None:
|
||||
break
|
||||
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
break
|
||||
|
||||
page += 1
|
||||
|
||||
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
|
||||
self.graphql = KuaiShouGraphQL()
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
data: Dict = response.json()
|
||||
if data.get("errors"):
|
||||
raise DataFetchError(data.get("errors", "unkonw error"))
|
||||
else:
|
||||
return data.get("data", {})
|
||||
"""Make HTTP request with retry and proxy fallback."""
|
||||
max_retries = 3
|
||||
|
||||
# build proxy attempts: try proxy first (if set), then no-proxy
|
||||
proxy_attempts: List[Optional[str]] = []
|
||||
if self.proxy:
|
||||
proxy_attempts.append(self.proxy)
|
||||
proxy_attempts.append(None) # always allow a direct attempt
|
||||
|
||||
last_exc: Optional[Exception] = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
|
||||
try:
|
||||
async with httpx.AsyncClient(proxy=proxy_to_use) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
data: Dict = response.json()
|
||||
if data.get("errors"):
|
||||
raise DataFetchError(data.get("errors", "unkonw error"))
|
||||
return data.get("data", {})
|
||||
except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
|
||||
last_exc = e
|
||||
utils.logger.warning(
|
||||
f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
|
||||
f"proxy={proxy_to_use} url={url} err={e!r}"
|
||||
)
|
||||
if attempt < max_retries - 1:
|
||||
await asyncio.sleep(1)
|
||||
continue
|
||||
utils.logger.error(
|
||||
f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
|
||||
f"proxy={proxy_to_use} url={url} err={e!r}"
|
||||
)
|
||||
raise
|
||||
except Exception as e:
|
||||
# For other exceptions (like DataFetchError), don't retry
|
||||
last_exc = e
|
||||
utils.logger.error(
|
||||
f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
|
||||
)
|
||||
raise
|
||||
|
||||
# If somehow we exit the loop without returning, raise last exception
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
|
||||
async def get(self, uri: str, params=None) -> Dict:
|
||||
final_uri = uri
|
||||
|
||||
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1")
|
||||
# 添加重试机制处理网络连接错误
|
||||
max_retries = 3
|
||||
retry_count = 0
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
|
||||
break
|
||||
except Exception as e:
|
||||
retry_count += 1
|
||||
error_msg = str(e)
|
||||
if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
|
||||
if retry_count < max_retries:
|
||||
utils.logger.warning(f"[KuaishouCrawler] 网络连接错误,第 {retry_count} 次重试: {e}")
|
||||
await asyncio.sleep(2 * retry_count) # 递增等待时间
|
||||
else:
|
||||
utils.logger.error(f"[KuaishouCrawler] 网络连接失败,已重试 {max_retries} 次: {e}")
|
||||
raise
|
||||
else:
|
||||
# 非网络错误直接抛出
|
||||
raise
|
||||
|
||||
# Create a client to interact with the kuaishou website.
|
||||
self.ks_client = await self.create_ks_client(httpx_proxy_format)
|
||||
|
||||
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
|
||||
else:
|
||||
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
async def _quick_check_login_state(self) -> bool:
|
||||
"""
|
||||
Quick check if the current login status is successful without retry
|
||||
Returns True if logged in, False otherwise
|
||||
"""
|
||||
try:
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
kuaishou_pass_token = cookie_dict.get("passToken")
|
||||
if kuaishou_pass_token:
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
|
||||
"""login kuaishou website and keep webdriver login state"""
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
|
||||
|
||||
# click login button
|
||||
# Check if already logged in (quick check without retry)
|
||||
is_logged_in = await self._quick_check_login_state()
|
||||
if is_logged_in:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
|
||||
return
|
||||
|
||||
# Check if login button exists (if not, might already be logged in)
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//p[text()='登录']"
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
try:
|
||||
# Wait for the element to be visible with a shorter timeout
|
||||
await login_button_ele.wait_for(state="visible", timeout=3000)
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
|
||||
|
||||
# Try normal click first
|
||||
await login_button_ele.click(timeout=5000)
|
||||
except Exception as e:
|
||||
# If login button is not found, might already be logged in
|
||||
if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
|
||||
# Double check login state (quick check)
|
||||
is_logged_in = await self._quick_check_login_state()
|
||||
if is_logged_in:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
|
||||
return
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
|
||||
raise
|
||||
else:
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
|
||||
try:
|
||||
# If normal click fails, try force click to bypass overlay
|
||||
await login_button_ele.click(force=True, timeout=5000)
|
||||
except Exception as e2:
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
|
||||
# If force click also fails, use JavaScript to click directly
|
||||
await login_button_ele.evaluate("element => element.click()")
|
||||
|
||||
# Wait a moment for the login modal to appear
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "//div[@class='qrcode-img']//img"
|
||||
|
||||
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
self.playwright_page = playwright_page # Playwright页面对象
|
||||
self._last_captcha_check_time = 0 # 上次验证码检测时间
|
||||
self._captcha_verified_recently = False # 是否最近完成过验证码
|
||||
|
||||
def _sync_request(self, method, url, proxy=None, **kwargs):
|
||||
"""
|
||||
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self.headers["Cookie"] = cookie_str
|
||||
utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
|
||||
|
||||
async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
|
||||
"""
|
||||
检测并等待百度验证码完成(在爬虫过程中使用)
|
||||
等待用户手动拖动验证码,验证成功后自动识别并继续
|
||||
|
||||
Args:
|
||||
max_wait_time: 最大等待时间(秒),默认120秒
|
||||
"""
|
||||
if not self.playwright_page:
|
||||
return
|
||||
|
||||
import time
|
||||
|
||||
async def _detect_captcha() -> bool:
|
||||
"""更全面地检测验证码(包含文本、URL 及常见容器)"""
|
||||
# DOM 选择器
|
||||
selector_hits = [
|
||||
'.tang-pass-slider',
|
||||
'#captcha',
|
||||
'.vcode-img',
|
||||
'.pass-verify',
|
||||
'.tang-pass-verify',
|
||||
'.pass-verify-slider',
|
||||
'div[id*="captcha"]',
|
||||
'div[class*="verify"]',
|
||||
'div[class*="captcha"]',
|
||||
'text=安全验证',
|
||||
'text=请输入验证码',
|
||||
'text=拖动',
|
||||
'text=滑动',
|
||||
]
|
||||
for selector in selector_hits:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element and await element.is_visible():
|
||||
return True
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# URL 关键词
|
||||
url_lower = (self.playwright_page.url or "").lower()
|
||||
if any(key in url_lower for key in ["verify", "captcha", "wappass"]):
|
||||
return True
|
||||
|
||||
# 页面文本关键词(截断以降低开销)
|
||||
try:
|
||||
page_text = (await self.playwright_page.content())[:4000]
|
||||
if any(
|
||||
kw in page_text
|
||||
for kw in ["安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证"]
|
||||
):
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
# 如果最近5秒内刚完成过验证码,跳过检测(避免重复检测)
|
||||
if self._captcha_verified_recently:
|
||||
time_since_last_check = time.time() - self._last_captcha_check_time
|
||||
if time_since_last_check < 5:
|
||||
utils.logger.debug(
|
||||
f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码,跳过检测"
|
||||
)
|
||||
return
|
||||
else:
|
||||
self._captcha_verified_recently = False
|
||||
|
||||
# 基础选择器(用于后续反复检测)
|
||||
captcha_selectors = [
|
||||
'.tang-pass-slider',
|
||||
'#captcha',
|
||||
'.vcode-img',
|
||||
'.pass-verify',
|
||||
'.tang-pass-verify',
|
||||
'.pass-verify-slider',
|
||||
'div[id*="captcha"]',
|
||||
'div[class*="verify"]',
|
||||
'div[class*="captcha"]',
|
||||
]
|
||||
success_selectors = [
|
||||
'.tang-pass-success',
|
||||
'.pass-verify-success',
|
||||
'div[class*="success"]',
|
||||
]
|
||||
|
||||
# 检测验证码是否存在
|
||||
captcha_found = await _detect_captcha()
|
||||
if captcha_found:
|
||||
utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码,请手动拖动完成验证...")
|
||||
if not captcha_found:
|
||||
return
|
||||
|
||||
# 记录当前URL,用于检测页面跳转
|
||||
initial_url = self.playwright_page.url
|
||||
utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
|
||||
utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码(最多等待 {max_wait_time} 秒)...")
|
||||
|
||||
start_time = time.time()
|
||||
last_log_time = 0
|
||||
check_interval = 1 # 检查间隔改为1秒,更快响应
|
||||
|
||||
while True:
|
||||
# 检查是否超时
|
||||
elapsed_time = time.time() - start_time
|
||||
if elapsed_time >= max_wait_time:
|
||||
utils.logger.warning(
|
||||
f"[BaiduTieBaClient] ⏰ 等待验证码超时({max_wait_time}秒),跳过当前百度贴吧爬取任务"
|
||||
)
|
||||
# 超时直接中断本次百度贴吧爬虫,交给上层捕获处理
|
||||
raise TimeoutError(
|
||||
f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
|
||||
)
|
||||
|
||||
try:
|
||||
# 检测验证成功的标识
|
||||
verification_success = False
|
||||
for selector in success_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
verification_success = True
|
||||
utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {selector})")
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 检测验证码是否还存在
|
||||
captcha_still_exists = False
|
||||
for selector in captcha_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
captcha_still_exists = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 检测页面URL是否变化(验证成功后可能会跳转)
|
||||
current_url = self.playwright_page.url
|
||||
url_changed = current_url != initial_url
|
||||
|
||||
# 判断验证是否成功
|
||||
# 成功条件:1. 验证码消失 2. 或者检测到成功标识 3. 或者URL变化(且不是验证码页面)
|
||||
if verification_success or (not captcha_still_exists and url_changed):
|
||||
# 验证码消失且URL变化,可能是验证成功后的跳转
|
||||
utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,检测到页面变化,等待3秒确认验证完成...")
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# 再次确认验证码是否真的消失了
|
||||
captcha_still_exists = False
|
||||
for selector in captcha_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
captcha_still_exists = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not captcha_still_exists:
|
||||
# 确认验证成功
|
||||
final_url = self.playwright_page.url
|
||||
utils.logger.info(f"[BaiduTieBaClient] ✅ 验证码验证成功!")
|
||||
if url_changed:
|
||||
utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
|
||||
else:
|
||||
utils.logger.info(f"[BaiduTieBaClient] 📍 页面URL未变化,验证在当前页面完成")
|
||||
|
||||
# 标记最近完成过验证码,避免立即再次检测
|
||||
self._captcha_verified_recently = True
|
||||
import time
|
||||
self._last_captcha_check_time = time.time()
|
||||
|
||||
# 等待页面稳定,避免立即再次检测验证码
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# 验证成功后,再次检查是否又出现了验证码(防止跳转到新的验证码页面)
|
||||
utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
captcha_reappeared = False
|
||||
for selector in captcha_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
captcha_reappeared = True
|
||||
utils.logger.warning(f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {selector}),继续等待...")
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not captcha_reappeared:
|
||||
utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
|
||||
break
|
||||
else:
|
||||
# 如果又出现了验证码,重置状态继续等待
|
||||
utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
|
||||
initial_url = self.playwright_page.url
|
||||
start_time = time.time()
|
||||
continue
|
||||
else:
|
||||
# 验证码又出现了,可能验证失败或页面刷新
|
||||
utils.logger.warning("[BaiduTieBaClient] ⚠️ 验证码重新出现,可能验证失败,继续等待...")
|
||||
elif not captcha_still_exists and not url_changed:
|
||||
# 验证码消失但URL未变化,可能是验证成功但未跳转
|
||||
utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,等待3秒确认验证完成...")
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# 再次确认
|
||||
captcha_still_exists = False
|
||||
for selector in captcha_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
captcha_still_exists = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not captcha_still_exists:
|
||||
utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功!")
|
||||
|
||||
# 标记最近完成过验证码
|
||||
self._captcha_verified_recently = True
|
||||
import time
|
||||
self._last_captcha_check_time = time.time()
|
||||
|
||||
# 等待页面稳定
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# 验证成功后,再次检查是否又出现了验证码
|
||||
utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
captcha_reappeared = False
|
||||
for selector in captcha_selectors:
|
||||
try:
|
||||
element = await self.playwright_page.query_selector(selector)
|
||||
if element:
|
||||
is_visible = await element.is_visible()
|
||||
if is_visible:
|
||||
captcha_reappeared = True
|
||||
utils.logger.warning(f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {selector}),继续等待...")
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not captcha_reappeared:
|
||||
utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
|
||||
break
|
||||
else:
|
||||
# 如果又出现了验证码,重置状态继续等待
|
||||
utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
|
||||
initial_url = self.playwright_page.url
|
||||
start_time = time.time()
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
# 如果检测过程中出现异常,继续等待
|
||||
utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")
|
||||
|
||||
# 等待一段时间后再次检查
|
||||
await asyncio.sleep(check_interval)
|
||||
|
||||
# 每10秒输出一次提示
|
||||
current_time = int(elapsed_time)
|
||||
if current_time != last_log_time and current_time % 10 == 0 and current_time > 0:
|
||||
remaining_time = max_wait_time - current_time
|
||||
utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...(剩余 {remaining_time} 秒)")
|
||||
last_log_time = current_time
|
||||
|
||||
async def get_notes_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问搜索页面
|
||||
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问帖子详情页面
|
||||
await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问评论页面
|
||||
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问子评论页面
|
||||
await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问贴吧页面
|
||||
await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问创作者主页
|
||||
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问创作者帖子列表页面
|
||||
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
|
||||
@@ -58,11 +58,47 @@ class WeiboClient:
|
||||
if enable_return_response:
|
||||
return response
|
||||
|
||||
data: Dict = response.json()
|
||||
# 检查响应状态码
|
||||
if response.status_code != 200:
|
||||
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
|
||||
raise DataFetchError(error_msg)
|
||||
|
||||
# 检查响应内容类型
|
||||
content_type = response.headers.get("content-type", "").lower()
|
||||
if "application/json" not in content_type and "text/json" not in content_type:
|
||||
# 可能是HTML响应(如登录页面)
|
||||
response_text = response.text[:500]
|
||||
utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
|
||||
# 如果看起来像是HTML,可能是需要登录
|
||||
if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
|
||||
raise DataFetchError("Response is HTML, may need to login or cookie expired")
|
||||
raise DataFetchError(f"Unexpected content type: {content_type}")
|
||||
|
||||
# 安全地解析JSON
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except ValueError as e:
|
||||
# JSON解析失败
|
||||
response_text = response.text[:500]
|
||||
utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
|
||||
utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
|
||||
raise DataFetchError(f"Failed to parse JSON response: {e}")
|
||||
|
||||
# 检查响应是否为空
|
||||
if not data:
|
||||
utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
|
||||
return {"cards": []}
|
||||
|
||||
ok_code = data.get("ok")
|
||||
if ok_code == 0: # response error
|
||||
msg = data.get("msg", "response error")
|
||||
# "这里还没有内容" 是正常情况,表示没有更多数据,不应该抛出异常
|
||||
if msg == "这里还没有内容" or "还没有内容" in msg:
|
||||
utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
|
||||
return {"cards": []} # 返回空结果,而不是抛出异常
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "response error"))
|
||||
raise DataFetchError(msg)
|
||||
elif ok_code != 1: # unknown error
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "unknown error"))
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self):
|
||||
# 初始化数据库表(如果需要)
|
||||
if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
|
||||
try:
|
||||
from database.db_session import create_tables
|
||||
utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
|
||||
await create_tables(config.SAVE_DATA_OPTION)
|
||||
utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
|
||||
page += 1
|
||||
continue
|
||||
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
try:
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
cards = search_res.get("cards", [])
|
||||
utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
|
||||
|
||||
# 如果没有更多内容,跳出循环
|
||||
if len(cards) == 0:
|
||||
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
|
||||
break
|
||||
except DataFetchError as e:
|
||||
# 如果是"没有内容"的错误,正常结束
|
||||
if "还没有内容" in str(e) or "没有内容" in str(e):
|
||||
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
|
||||
break
|
||||
# 其他错误继续抛出
|
||||
raise
|
||||
|
||||
note_id_list: List[str] = []
|
||||
note_list = filter_search_result_card(search_res.get("cards"))
|
||||
note_list = filter_search_result_card(cards)
|
||||
utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
|
||||
|
||||
for note_item in note_list:
|
||||
if note_item:
|
||||
mblog: Dict = note_item.get("mblog")
|
||||
if mblog:
|
||||
note_id_list.append(mblog.get("id"))
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.get_note_images(mblog)
|
||||
note_id = mblog.get("id")
|
||||
note_id_list.append(note_id)
|
||||
try:
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.get_note_images(mblog)
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
|
||||
# 继续处理其他笔记,不中断整个流程
|
||||
|
||||
page += 1
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=10,
|
||||
timeout=30, # 增加超时时间到30秒,避免请求卡住
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
|
||||
"""
|
||||
d_c0 = self.cookie_dict.get("d_c0")
|
||||
if not d_c0:
|
||||
raise Exception("d_c0 not found in cookies")
|
||||
utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
|
||||
raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
|
||||
sign_res = sign(url, self.default_headers["cookie"])
|
||||
headers = self.default_headers.copy()
|
||||
headers['x-zst-81'] = sign_res["x-zst-81"]
|
||||
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
|
||||
uri = "/api/v4/search_v3"
|
||||
params = {
|
||||
"gk_version": "gz-gaokao",
|
||||
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
|
||||
"sort": sort.value,
|
||||
"vertical": note_type.value,
|
||||
}
|
||||
search_res = await self.get(uri, params)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
|
||||
return self._extractor.extract_contents_from_search(search_res)
|
||||
try:
|
||||
utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
|
||||
search_res = await self.get(uri, params)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功,开始解析结果")
|
||||
contents = self._extractor.extract_contents_from_search(search_res)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成,找到 {len(contents)} 条内容")
|
||||
return contents
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
async def get_root_comments(
|
||||
self,
|
||||
|
||||
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
|
||||
# 设置页面超时时间为30秒
|
||||
self.context_page.set_default_timeout(30000)
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)
|
||||
|
||||
# Create a client to interact with the zhihu website.
|
||||
self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
|
||||
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
# 登录后等待页面稳定
|
||||
await asyncio.sleep(2)
|
||||
await self.zhihu_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
|
||||
# 使用用户配置的第一个关键词,如果没有关键词则使用默认的"test"
|
||||
search_keyword = "test" # 默认关键词
|
||||
if config.KEYWORDS and config.KEYWORDS.strip():
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if keywords_list:
|
||||
search_keyword = keywords_list[0]
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
|
||||
else:
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空,使用默认关键词 'test'")
|
||||
else:
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词,使用默认关键词 'test'")
|
||||
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
|
||||
f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
|
||||
)
|
||||
await self.context_page.goto(
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.start] 关键词: {search_keyword}"
|
||||
)
|
||||
await asyncio.sleep(5)
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
try:
|
||||
# 使用用户配置的关键词,而不是硬编码的python
|
||||
from urllib.parse import quote
|
||||
encoded_keyword = quote(search_keyword)
|
||||
search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
|
||||
|
||||
# 添加超时时间,避免卡住
|
||||
await self.context_page.goto(
|
||||
search_url,
|
||||
wait_until="domcontentloaded",
|
||||
timeout=30000
|
||||
)
|
||||
utils.logger.info("[ZhihuCrawler.start] 页面跳转完成,等待页面稳定...")
|
||||
# 等待页面基本加载完成,不等待networkidle(知乎页面可能一直有请求)
|
||||
await asyncio.sleep(3)
|
||||
utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载,开始更新cookies")
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e},尝试继续执行", exc_info=True)
|
||||
# 即使跳转失败,也尝试更新cookies
|
||||
try:
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
utils.logger.info("[ZhihuCrawler.start] Cookies更新完成(跳转失败后)")
|
||||
except Exception as cookie_error:
|
||||
utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
|
||||
utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
|
||||
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")
|
||||
|
||||
utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
|
||||
utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
|
||||
zhihu_limit_count = 20 # zhihu limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
|
||||
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
total_failed_contents = 0
|
||||
total_saved_comments = 0
|
||||
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
# 安全地处理关键词列表
|
||||
if not config.KEYWORDS or not config.KEYWORDS.strip():
|
||||
utils.logger.error("[ZhihuCrawler.search] 关键词配置为空,无法执行搜索任务")
|
||||
return
|
||||
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if not keywords_list:
|
||||
utils.logger.error("[ZhihuCrawler.search] 关键词列表为空,无法执行搜索任务")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
|
||||
|
||||
for keyword in keywords_list:
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] Current search keyword: {keyword}"
|
||||
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
)
|
||||
|
||||
# 获取用户配置的关键词用于 referer,如果没有则使用默认值
|
||||
referer_keyword = "test"
|
||||
if config.KEYWORDS and config.KEYWORDS.strip():
|
||||
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
|
||||
if keywords_list:
|
||||
referer_keyword = keywords_list[0]
|
||||
|
||||
from urllib.parse import quote
|
||||
encoded_referer_keyword = quote(referer_keyword)
|
||||
referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
|
||||
|
||||
zhihu_client_obj = ZhiHuClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"cookie": cookie_str,
|
||||
"priority": "u=1, i",
|
||||
"referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
|
||||
"referer": referer_url,
|
||||
"user-agent": self.user_agent,
|
||||
"x-api-version": "3.0.91",
|
||||
"x-app-za": "OS=Web",
|
||||
|
||||
Reference in New Issue
Block a user