更新部分爬虫以兼容本地运行及数据库存储

2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -49,8 +49,27 @@ class BilibiliClient(AbstractApiClient):
        self.cookie_dict = cookie_dict

    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+        """
+        Basic HTTP request wrapper with retries for transient network errors.
+        """
+        verify = getattr(config, "HTTPX_VERIFY", True)
+        # 优先使用传入 proxy，其次是 config.HTTPX_PROXY，最后走系统环境变量
+        proxy = self.proxy or getattr(config, "HTTPX_PROXY", "") or None
+
+        async with httpx.AsyncClient(proxy=proxy, timeout=self.timeout, verify=verify) as client:
+            # 简单重试，处理短暂的连接失败
+            last_exc: Optional[Exception] = None
+            for attempt in range(3):
+                try:
+                    response = await client.request(method, url, **kwargs)
+                    break
+                except httpx.HTTPError as e:
+                    last_exc = e
+                    if attempt == 2:
+                        # 3rd failure -> give up
+                        utils.logger.error(f"[BilibiliClient.request] Network error on {method} {url}: {repr(e)}")
+                        raise DataFetchError(f"network error: {e}") from e
+                    await asyncio.sleep(1)
        try:
            data: Dict = response.json()
        except json.JSONDecodeError:
@@ -68,10 +68,23 @@ class BilibiliLogin(AbstractLogin):
            return True
        return False

+    async def _has_valid_login_cookie(self) -> bool:
+        """
+        快速检查当前上下文是否已有登录态，用于避免重复扫码。
+        """
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        return bool(cookie_dict.get("SESSDATA") or cookie_dict.get("DedeUserID"))
+
    async def login_by_qrcode(self):
        """login bilibili website and keep webdriver login state"""
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")

+        # 如果已经登录则直接跳过扫码流程
+        if await self._has_valid_login_cookie():
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] 已检测到有效登录态，跳过扫码登录")
+            return
+
        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//div[@class='right-entry__outside go-login-btn']//div"
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
        params["a_bogus"] = a_bogus

    async def request(self, method, url, **kwargs):
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        try:
-            if response.text == "" or response.text == "blocked":
-                utils.logger.error(f"request params incrr, response.text: {response.text}")
-                raise Exception("account blocked")
-            return response.json()
+            async with httpx.AsyncClient(proxy=self.proxy) as client:
+                response = await client.request(method, url, timeout=self.timeout, **kwargs)
+            try:
+                if response.text == "" or response.text == "blocked":
+                    utils.logger.error(f"request params incrr, response.text: {response.text}")
+                    raise Exception("account blocked")
+                return response.json()
+            except Exception as e:
+                raise DataFetchError(f"{e}, {response.text}")
+        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
+            utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
+            raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}")
+        except httpx.TimeoutException as e:
+            utils.logger.error(f"请求超时: {e}")
+            raise DataFetchError(f"请求超时: {e}")
        except Exception as e:
-            raise DataFetchError(f"{e}, {response.text}")
+            utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
+            raise DataFetchError(f"请求失败: {type(e).__name__}: {e}")

    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                    page += 1
                    continue
+                posts_res = None
+                retry_success = False
                try:
                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                    posts_res = await self.dy_client.search_info_by_keyword(
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
                        publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                        search_id=dy_search_id,
                    )
-                    if posts_res.get("data") is None or posts_res.get("data") == []:
-                        utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
+                    retry_success = True
+                except DataFetchError as e:
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
+                    # 如果是网络连接错误，等待后重试一次
+                    if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
+                        utils.logger.warning(f"[DouYinCrawler.search] 网络错误，等待3秒后重试...")
+                        await asyncio.sleep(3)
+                        try:
+                            posts_res = await self.dy_client.search_info_by_keyword(
+                                keyword=keyword,
+                                offset=page * dy_limit_count - dy_limit_count,
+                                publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
+                                search_id=dy_search_id,
+                            )
+                            retry_success = True
+                        except Exception as retry_e:
+                            utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
+                            break
+                    else:
                        break
-                except DataFetchError:
-                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
+                except Exception as e:
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
+                    break
+                
+                # 如果请求失败（包括重试失败），跳过后续处理
+                if not retry_success or posts_res is None:
+                    break
+                    
+                if posts_res.get("data") is None or posts_res.get("data") == []:
+                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                    break

                page += 1
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
        self.graphql = KuaiShouGraphQL()

    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
-        data: Dict = response.json()
-        if data.get("errors"):
-            raise DataFetchError(data.get("errors", "unkonw error"))
-        else:
-            return data.get("data", {})
+        """Make HTTP request with retry and proxy fallback."""
+        max_retries = 3
+
+        # build proxy attempts: try proxy first (if set), then no-proxy
+        proxy_attempts: List[Optional[str]] = []
+        if self.proxy:
+            proxy_attempts.append(self.proxy)
+        proxy_attempts.append(None)  # always allow a direct attempt
+
+        last_exc: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
+            try:
+                async with httpx.AsyncClient(proxy=proxy_to_use) as client:
+                    response = await client.request(method, url, timeout=self.timeout, **kwargs)
+                data: Dict = response.json()
+                if data.get("errors"):
+                    raise DataFetchError(data.get("errors", "unkonw error"))
+                return data.get("data", {})
+            except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
+                last_exc = e
+                utils.logger.warning(
+                    f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
+                    f"proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(1)
+                    continue
+                utils.logger.error(
+                    f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
+                    f"proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                raise
+            except Exception as e:
+                # For other exceptions (like DataFetchError), don't retry
+                last_exc = e
+                utils.logger.error(
+                    f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                raise
+
+        # If somehow we exit the loop without returning, raise last exception
+        if last_exc:
+            raise last_exc

    async def get(self, uri: str, params=None) -> Dict:
        final_uri = uri
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):


            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(f"{self.index_url}?isHome=1")
+            # 添加重试机制处理网络连接错误
+            max_retries = 3
+            retry_count = 0
+            while retry_count < max_retries:
+                try:
+                    await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
+                    break
+                except Exception as e:
+                    retry_count += 1
+                    error_msg = str(e)
+                    if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
+                        if retry_count < max_retries:
+                            utils.logger.warning(f"[KuaishouCrawler] 网络连接错误，第 {retry_count} 次重试: {e}")
+                            await asyncio.sleep(2 * retry_count)  # 递增等待时间
+                        else:
+                            utils.logger.error(f"[KuaishouCrawler] 网络连接失败，已重试 {max_retries} 次: {e}")
+                            raise
+                    else:
+                        # 非网络错误直接抛出
+                        raise

            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

+    async def _quick_check_login_state(self) -> bool:
+        """
+            Quick check if the current login status is successful without retry
+            Returns True if logged in, False otherwise
+        """
+        try:
+            current_cookie = await self.browser_context.cookies()
+            _, cookie_dict = utils.convert_cookies(current_cookie)
+            kuaishou_pass_token = cookie_dict.get("passToken")
+            if kuaishou_pass_token:
+                return True
+            return False
+        except Exception:
+            return False
+
    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
        """login kuaishou website and keep webdriver login state"""
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")

-        # click login button
+        # Check if already logged in (quick check without retry)
+        is_logged_in = await self._quick_check_login_state()
+        if is_logged_in:
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
+            return
+
+        # Check if login button exists (if not, might already be logged in)
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
-        await login_button_ele.click()
+        
+        try:
+            # Wait for the element to be visible with a shorter timeout
+            await login_button_ele.wait_for(state="visible", timeout=3000)
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
+            
+            # Try normal click first
+            await login_button_ele.click(timeout=5000)
+        except Exception as e:
+            # If login button is not found, might already be logged in
+            if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
+                utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
+                # Double check login state (quick check)
+                is_logged_in = await self._quick_check_login_state()
+                if is_logged_in:
+                    utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
+                    return
+                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
+                raise
+            else:
+                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
+                try:
+                    # If normal click fails, try force click to bypass overlay
+                    await login_button_ele.click(force=True, timeout=5000)
+                except Exception as e2:
+                    utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
+                    # If force click also fails, use JavaScript to click directly
+                    await login_button_ele.evaluate("element => element.click()")
+        
+        # Wait a moment for the login modal to appear
+        await asyncio.sleep(1)

        # find login qrcode
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy
        self.playwright_page = playwright_page  # Playwright页面对象
+        self._last_captcha_check_time = 0  # 上次验证码检测时间
+        self._captcha_verified_recently = False  # 是否最近完成过验证码

    def _sync_request(self, method, url, proxy=None, **kwargs):
        """
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
        self.headers["Cookie"] = cookie_str
        utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")

+    async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
+        """
+        检测并等待百度验证码完成（在爬虫过程中使用）
+        等待用户手动拖动验证码，验证成功后自动识别并继续
+        
+        Args:
+            max_wait_time: 最大等待时间（秒），默认300秒
+        """
+        if not self.playwright_page:
+            return
+        
+        import time
+
+        async def _detect_captcha() -> bool:
+            """更全面地检测验证码（包含文本、URL 及常见容器）"""
+            # DOM 选择器
+            selector_hits = [
+                '.tang-pass-slider',
+                '#captcha',
+                '.vcode-img',
+                '.pass-verify',
+                '.tang-pass-verify',
+                '.pass-verify-slider',
+                'div[id*="captcha"]',
+                'div[class*="verify"]',
+                'div[class*="captcha"]',
+                'text=安全验证',
+                'text=请输入验证码',
+                'text=拖动',
+                'text=滑动',
+            ]
+            for selector in selector_hits:
+                try:
+                    element = await self.playwright_page.query_selector(selector)
+                    if element and await element.is_visible():
+                        return True
+                except Exception:
+                    continue
+
+            # URL 关键词
+            url_lower = (self.playwright_page.url or "").lower()
+            if any(key in url_lower for key in ["verify", "captcha", "wappass"]):
+                return True
+
+            # 页面文本关键词（截断以降低开销）
+            try:
+                page_text = (await self.playwright_page.content())[:4000]
+                if any(
+                    kw in page_text
+                    for kw in ["安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证"]
+                ):
+                    return True
+            except Exception:
+                pass
+            return False
+
+        # 如果最近5秒内刚完成过验证码，跳过检测（避免重复检测）
+        if self._captcha_verified_recently:
+            time_since_last_check = time.time() - self._last_captcha_check_time
+            if time_since_last_check < 5:
+                utils.logger.debug(
+                    f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码，跳过检测"
+                )
+                return
+            else:
+                self._captcha_verified_recently = False
+
+        # 基础选择器（用于后续反复检测）
+        captcha_selectors = [
+            '.tang-pass-slider',
+            '#captcha',
+            '.vcode-img',
+            '.pass-verify',
+            '.tang-pass-verify',
+            '.pass-verify-slider',
+            'div[id*="captcha"]',
+            'div[class*="verify"]',
+            'div[class*="captcha"]',
+        ]
+        success_selectors = [
+            '.tang-pass-success',
+            '.pass-verify-success',
+            'div[class*="success"]',
+        ]
+
+        # 检测验证码是否存在
+        captcha_found = await _detect_captcha()
+        if captcha_found:
+            utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码，请手动拖动完成验证...")
+        if not captcha_found:
+            return
+        
+        # 记录当前URL，用于检测页面跳转
+        initial_url = self.playwright_page.url
+        utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
+        utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码（最多等待 {max_wait_time} 秒）...")
+        
+        start_time = time.time()
+        last_log_time = 0
+        check_interval = 1  # 检查间隔改为1秒，更快响应
+        
+        while True:
+            # 检查是否超时
+            elapsed_time = time.time() - start_time
+            if elapsed_time >= max_wait_time:
+                utils.logger.warning(
+                    f"[BaiduTieBaClient] ⏰ 等待验证码超时（{max_wait_time}秒），跳过当前百度贴吧爬取任务"
+                )
+                # 超时直接中断本次百度贴吧爬虫，交给上层捕获处理
+                raise TimeoutError(
+                    f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
+                )
+            
+            try:
+                # 检测验证成功的标识
+                verification_success = False
+                for selector in success_selectors:
+                    try:
+                        element = await self.playwright_page.query_selector(selector)
+                        if element:
+                            is_visible = await element.is_visible()
+                            if is_visible:
+                                verification_success = True
+                                utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {selector})")
+                                break
+                    except Exception:
+                        continue
+                
+                # 检测验证码是否还存在
+                captcha_still_exists = False
+                for selector in captcha_selectors:
+                    try:
+                        element = await self.playwright_page.query_selector(selector)
+                        if element:
+                            is_visible = await element.is_visible()
+                            if is_visible:
+                                captcha_still_exists = True
+                                break
+                    except Exception:
+                        continue
+                
+                # 检测页面URL是否变化（验证成功后可能会跳转）
+                current_url = self.playwright_page.url
+                url_changed = current_url != initial_url
+                
+                # 判断验证是否成功
+                # 本分支的进入条件：1. 检测到成功标识；或 2. 验证码消失且URL已变化（验证码消失但URL未变化的情况由下方 elif 分支处理）
+                if verification_success or (not captcha_still_exists and url_changed):
+                    # 检测到成功标识，或验证码消失且URL变化（可能是验证成功后的跳转）
+                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，检测到页面变化，等待3秒确认验证完成...")
+                    await asyncio.sleep(3)
+                    
+                    # 再次确认验证码是否真的消失了
+                    captcha_still_exists = False
+                    for selector in captcha_selectors:
+                        try:
+                            element = await self.playwright_page.query_selector(selector)
+                            if element:
+                                is_visible = await element.is_visible()
+                                if is_visible:
+                                    captcha_still_exists = True
+                                    break
+                        except Exception:
+                            continue
+                    
+                    if not captcha_still_exists:
+                        # 确认验证成功
+                        final_url = self.playwright_page.url
+                        utils.logger.info(f"[BaiduTieBaClient] ✅ 验证码验证成功！")
+                        if url_changed:
+                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
+                        else:
+                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面URL未变化，验证在当前页面完成")
+                        
+                        # 标记最近完成过验证码，避免立即再次检测
+                        self._captcha_verified_recently = True
+                        import time
+                        self._last_captcha_check_time = time.time()
+                        
+                        # 等待页面稳定，避免立即再次检测验证码
+                        await asyncio.sleep(3)
+                        
+                        # 验证成功后，再次检查是否又出现了验证码（防止跳转到新的验证码页面）
+                        utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后，检查是否又出现验证码...")
+                        await asyncio.sleep(2)
+                        
+                        captcha_reappeared = False
+                        for selector in captcha_selectors:
+                            try:
+                                element = await self.playwright_page.query_selector(selector)
+                                if element:
+                                    is_visible = await element.is_visible()
+                                    if is_visible:
+                                        captcha_reappeared = True
+                                        utils.logger.warning(f"[BaiduTieBaClient] ⚠️  验证成功后检测到新的验证码 (selector: {selector})，继续等待...")
+                                        break
+                            except Exception:
+                                continue
+                        
+                        if not captcha_reappeared:
+                            utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功，未出现新的验证码，继续执行...")
+                            break
+                        else:
+                            # 如果又出现了验证码，重置状态继续等待
+                            utils.logger.warning("[BaiduTieBaClient] ⚠️  检测到新的验证码，重置等待状态...")
+                            initial_url = self.playwright_page.url
+                            start_time = time.time()
+                            continue
+                    else:
+                        # 验证码又出现了，可能验证失败或页面刷新
+                        utils.logger.warning("[BaiduTieBaClient] ⚠️  验证码重新出现，可能验证失败，继续等待...")
+                elif not captcha_still_exists and not url_changed:
+                    # 验证码消失但URL未变化，可能是验证成功但未跳转
+                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，等待3秒确认验证完成...")
+                    await asyncio.sleep(3)
+                    
+                    # 再次确认
+                    captcha_still_exists = False
+                    for selector in captcha_selectors:
+                        try:
+                            element = await self.playwright_page.query_selector(selector)
+                            if element:
+                                is_visible = await element.is_visible()
+                                if is_visible:
+                                    captcha_still_exists = True
+                                    break
+                        except Exception:
+                            continue
+                    
+                    if not captcha_still_exists:
+                        utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功！")
+                        
+                        # 标记最近完成过验证码
+                        self._captcha_verified_recently = True
+                        import time
+                        self._last_captcha_check_time = time.time()
+                        
+                        # 等待页面稳定
+                        await asyncio.sleep(3)
+                        
+                        # 验证成功后，再次检查是否又出现了验证码
+                        utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后，检查是否又出现验证码...")
+                        await asyncio.sleep(2)
+                        
+                        captcha_reappeared = False
+                        for selector in captcha_selectors:
+                            try:
+                                element = await self.playwright_page.query_selector(selector)
+                                if element:
+                                    is_visible = await element.is_visible()
+                                    if is_visible:
+                                        captcha_reappeared = True
+                                        utils.logger.warning(f"[BaiduTieBaClient] ⚠️  验证成功后检测到新的验证码 (selector: {selector})，继续等待...")
+                                        break
+                            except Exception:
+                                continue
+                        
+                        if not captcha_reappeared:
+                            utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功，未出现新的验证码，继续执行...")
+                            break
+                        else:
+                            # 如果又出现了验证码，重置状态继续等待
+                            utils.logger.warning("[BaiduTieBaClient] ⚠️  检测到新的验证码，重置等待状态...")
+                            initial_url = self.playwright_page.url
+                            start_time = time.time()
+                            continue
+                
+            except Exception as e:
+                # 如果检测过程中出现异常，继续等待
+                utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")
+            
+            # 等待一段时间后再次检查
+            await asyncio.sleep(check_interval)
+            
+            # 每10秒输出一次提示
+            current_time = int(elapsed_time)
+            if current_time != last_log_time and current_time % 10 == 0 and current_time > 0:
+                remaining_time = max_wait_time - current_time
+                utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...（剩余 {remaining_time} 秒）")
+                last_log_time = current_time
+
    async def get_notes_by_keyword(
        self,
        keyword: str,
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问搜索页面
            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问帖子详情页面
            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
                # 使用Playwright访问评论页面
                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")

+                # 检测并等待验证码完成
+                await self._wait_for_captcha_completion()
+
                # 等待页面加载,使用配置文件中的延时设置
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
                    # 使用Playwright访问子评论页面
                    await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")

+                    # 检测并等待验证码完成
+                    await self._wait_for_captcha_completion()
+
                    # 等待页面加载,使用配置文件中的延时设置
                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问贴吧页面
            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者主页
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者帖子列表页面
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -58,11 +58,47 @@ class WeiboClient:
        if enable_return_response:
            return response

-        data: Dict = response.json()
+        # Check the response status code
+        if response.status_code != 200:
+            error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+            utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
+            raise DataFetchError(error_msg)
+
+        # Check the response content type
+        content_type = response.headers.get("content-type", "").lower()
+        if "application/json" not in content_type and "text/json" not in content_type:
+            # Possibly an HTML response (e.g. a login page)
+            response_text = response.text[:500]
+            utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
+            # If it looks like HTML, a login is probably required
+            if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
+                raise DataFetchError("Response is HTML, may need to login or cookie expired")
+            raise DataFetchError(f"Unexpected content type: {content_type}")
+
+        # Parse the JSON defensively
+        try:
+            data: Dict = response.json()
+        except ValueError as e:
+            # JSON decoding failed; chain the cause so the original traceback is kept
+            response_text = response.text[:500]
+            utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
+            utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
+            raise DataFetchError(f"Failed to parse JSON response: {e}") from e
+
+        # Check whether the response is empty
+        if not data:
+            utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
+            return {"cards": []}
+
        ok_code = data.get("ok")
        if ok_code == 0:  # response error
+            msg = data.get("msg") or "response error"  # "msg" may be present but null/empty
+            # "这里还没有内容" ("nothing here yet") is a normal condition meaning no more data; do not raise for it
+            if "还没有内容" in str(msg):
+                utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
+                return {"cards": []}  # return an empty result instead of raising
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
-            raise DataFetchError(data.get("msg", "response error"))
+            raise DataFetchError(msg)
        elif ok_code != 1:  # unknown error
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
            raise DataFetchError(data.get("msg", "unknown error"))
@@ -15,6 +15,7 @@

 import asyncio
 import os
+import re
 # import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
        self.cdp_manager = None

    async def start(self):
+        # 初始化数据库表（如果需要）
+        if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
+            try:
+                from database.db_session import create_tables
+                utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
+                await create_tables(config.SAVE_DATA_OPTION)
+                utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
+            except Exception as e:
+                utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
+                raise
+        
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
                    page += 1
                    continue
                utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
-                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
+                try:
+                    search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
+                    cards = search_res.get("cards") or []  # "cards" may be present but null
+                    utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
+                    
+                    # Leave the pagination loop when there is no more content
+                    if not cards:
+                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
+                        break
+                except DataFetchError as e:
+                    # A "no content" error means the results ended normally
+                    if "没有内容" in str(e):
+                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
+                        break
+                    # Re-raise any other error
+                    raise
+                
                note_id_list: List[str] = []
-                note_list = filter_search_result_card(search_res.get("cards"))
+                note_list = filter_search_result_card(cards)
+                utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
+                
                for note_item in note_list:
                    if note_item:
                        mblog: Dict = note_item.get("mblog")
                        if mblog:
-                            note_id_list.append(mblog.get("id"))
-                            await weibo_store.update_weibo_note(note_item)
-                            await self.get_note_images(mblog)
+                            note_id = mblog.get("id")
+                            note_id_list.append(note_id)
+                            try:
+                                await weibo_store.update_weibo_note(note_item)
+                                await self.get_note_images(mblog)
+                            except Exception as e:
+                                utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
+                                # Keep processing the remaining notes; do not abort the whole run

                page += 1
                
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):

    def __init__(
        self,
-        timeout=10,
+        timeout=30,  # 增加超时时间到30秒，避免请求卡住
        proxy=None,
        *,
        headers: Dict[str, str],
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
        """
        d_c0 = self.cookie_dict.get("d_c0")
        if not d_c0:
-            raise Exception("d_c0 not found in cookies")
+            utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
+            raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
        sign_res = sign(url, self.default_headers["cookie"])
        headers = self.default_headers.copy()
        headers['x-zst-81'] = sign_res["x-zst-81"]
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
        Returns:

        """
+        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
        uri = "/api/v4/search_v3"
        params = {
            "gk_version": "gz-gaokao",
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
            "sort": sort.value,
            "vertical": note_type.value,
        }
-        search_res = await self.get(uri, params)
-        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
-        return self._extractor.extract_contents_from_search(search_res)
+        try:
+            utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
+            search_res = await self.get(uri, params)
+            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功，开始解析结果")
+            contents = self._extractor.extract_contents_from_search(search_res)
+            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成，找到 {len(contents)} 条内容")
+            return contents
+        except Exception as e:
+            utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
+            raise

    async def get_root_comments(
        self,
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
                await self.browser_context.add_init_script(path="libs/stealth.min.js")

            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+            # 设置页面超时时间为30秒
+            self.context_page.set_default_timeout(30000)
+            await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)

            # Create a client to interact with the zhihu website.
            self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
+                # 登录后等待页面稳定
+                await asyncio.sleep(2)
                await self.zhihu_client.update_cookies(
                    browser_context=self.browser_context
                )

            # 知乎的搜索接口需要打开搜索页面之后cookies才能访问API，单独的首页不行
+            # 使用用户配置的第一个关键词，如果没有关键词则使用默认的"test"
+            search_keyword = "test"  # 默认关键词
+            if config.KEYWORDS and config.KEYWORDS.strip():
+                keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+                if keywords_list:
+                    search_keyword = keywords_list[0]
+                    utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
+                else:
+                    utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空，使用默认关键词 'test'")
+            else:
+                utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词，使用默认关键词 'test'")
+            
            utils.logger.info(
-                "[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies，该过程需要5秒左右"
+                f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
            )
-            await self.context_page.goto(
-                f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
+            utils.logger.info(
+                f"[ZhihuCrawler.start] 关键词: {search_keyword}"
            )
-            await asyncio.sleep(5)
-            await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+            try:
+                # 使用用户配置的关键词，而不是硬编码的python
+                from urllib.parse import quote
+                encoded_keyword = quote(search_keyword)
+                search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
+                utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
+                
+                # 添加超时时间，避免卡住
+                await self.context_page.goto(
+                    search_url,
+                    wait_until="domcontentloaded",
+                    timeout=30000
+                )
+                utils.logger.info("[ZhihuCrawler.start] 页面跳转完成，等待页面稳定...")
+                # 等待页面基本加载完成，不等待networkidle（知乎页面可能一直有请求）
+                await asyncio.sleep(3)
+                utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载，开始更新cookies")
+                await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+                utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
+            except Exception as e:
+                utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e}，尝试继续执行", exc_info=True)
+                # 即使跳转失败，也尝试更新cookies
+                try:
+                    await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+                    utils.logger.info("[ZhihuCrawler.start] Cookies更新完成（跳转失败后）")
+                except Exception as cookie_error:
+                    utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)

            crawler_type_var.set(config.CRAWLER_TYPE)
+            utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
+            utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
+            
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
+                utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
+                utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
+                utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
                await self.get_creators_and_notes()
            else:
-                pass
+                utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")

            utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")

    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
-        utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
+        utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
        zhihu_limit_count = 20  # zhihu limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
        total_failed_contents = 0
        total_saved_comments = 0
        
-        for keyword in config.KEYWORDS.split(","):
+        # 安全地处理关键词列表
+        if not config.KEYWORDS or not config.KEYWORDS.strip():
+            utils.logger.error("[ZhihuCrawler.search] 关键词配置为空，无法执行搜索任务")
+            return
+        
+        keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+        if not keywords_list:
+            utils.logger.error("[ZhihuCrawler.search] 关键词列表为空，无法执行搜索任务")
+            return
+        
+        utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
+        
+        for keyword in keywords_list:
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[ZhihuCrawler.search] Current search keyword: {keyword}"
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
+        
+        # 获取用户配置的关键词用于 referer，如果没有则使用默认值
+        referer_keyword = "test"
+        if config.KEYWORDS and config.KEYWORDS.strip():
+            keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+            if keywords_list:
+                referer_keyword = keywords_list[0]
+        
+        from urllib.parse import quote
+        encoded_referer_keyword = quote(referer_keyword)
+        referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
+        
        zhihu_client_obj = ZhiHuClient(
            proxy=httpx_proxy,
            headers={
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
                "accept-language": "zh-CN,zh;q=0.9",
                "cookie": cookie_str,
                "priority": "u=1, i",
-                "referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
+                "referer": referer_url,
                "user-agent": self.user_agent,
                "x-api-version": "3.0.91",
                "x-app-za": "OS=Web",