更新部分爬虫以兼容本地运行及数据库存储

2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -9,8 +9,8 @@
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
 # 基础配置
-PLATFORM = "zhihu"  # 平台，xhs | dy | ks | bili | wb | tieba | zhihu
+PLATFORM = "ks"  # 平台，xhs | dy | ks | bili | wb | tieba | zhihu
-KEYWORDS = "F6智慧门店,南京爱福路汽车科技有限公司,汽车后市场,汽修店,新康众"  # 关键词搜索配置，以英文逗号分隔
+KEYWORDS = "F6智慧门店,F6智数,中国汽车后市场白皮书,南京爱福路汽车科技有限公司,汽车后市场,汽车修理厂,新康众,天猫养车,汽后,汽修厂,爱福路,康众"  # 关键词搜索配置，以英文逗号分隔
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 CRAWLER_TYPE = "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
@@ -30,6 +30,12 @@ IP_PROXY_PROVIDER_NAME = "kuaidaili"  # kuaidaili | wandouhttp
 # 抖音如果一直提示失败，打开浏览器看下是否扫码登录之后出现了手机号验证，如果出现了手动过一下再试。
 HEADLESS = True
 # HTTP / network configuration
# TLS certificate verification for outgoing httpx requests. Keep this True by
# default; set it to False only temporarily, e.g. when an intranet self-signed
# certificate makes the TLS handshake fail. Disabling verification exposes
# cookies and responses to man-in-the-middle interception.
HTTPX_VERIFY = True
# Upstream proxy for httpx (e.g. http://user:pass@host:port). Leave empty to
# fall back to the system / environment proxy settings.
HTTPX_PROXY = ""
 # 是否保存登录状态
 SAVE_LOGIN_STATE = True
@@ -13,10 +13,11 @@ _engines = {}
 async def create_database_if_not_exists(db_type: str):
    if db_type == "mysql" or db_type == "db":
        # Connect to the server without a database
-        server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}"
+        server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}?charset=utf8mb4"
        engine = create_async_engine(server_url, echo=False)
        async with engine.connect() as conn:
-            await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
+            # 确保数据库使用utf8mb4字符集
            await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
        await engine.dispose()
    elif db_type == "postgresql":
        # Connect to PostgreSQL default database (postgres) to create target database
@@ -48,7 +49,8 @@ def get_async_engine(db_type: str = None):
    if db_type == "sqlite":
        db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
    elif db_type == "mysql" or db_type == "db":
-        db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
+        # 添加charset=utf8mb4以支持完整的UTF-8编码（包括emoji和中文）
        db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}?charset=utf8mb4"
    elif db_type == "postgresql":
        db_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
    else:
@@ -11,42 +11,54 @@
 import asyncio
 import sys
-from typing import Optional
+from typing import Dict, Optional, Type
 import importlib
 import cmd_arg
 import config
 from database import db
 from base.base_crawler import AbstractCrawler
 from media_platform.bilibili import BilibiliCrawler
 from media_platform.douyin import DouYinCrawler
 from media_platform.kuaishou import KuaishouCrawler
 from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
 from tools.async_file_writer import AsyncFileWriter
 from var import crawler_type_var
 class CrawlerFactory:
-    CRAWLERS = {
+    _CRAWLER_PATHS = {
-        "xhs": XiaoHongShuCrawler,
+        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
-        "dy": DouYinCrawler,
+        "dy": "media_platform.douyin.DouYinCrawler",
-        "ks": KuaishouCrawler,
+        "ks": "media_platform.kuaishou.KuaishouCrawler",
-        "bili": BilibiliCrawler,
+        "bili": "media_platform.bilibili.BilibiliCrawler",
-        "wb": WeiboCrawler,
+        "wb": "media_platform.weibo.WeiboCrawler",
-        "tieba": TieBaCrawler,
+        "tieba": "media_platform.tieba.TieBaCrawler",
-        "zhihu": ZhihuCrawler,
+        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    _cache: Dict[str, Type[AbstractCrawler]] = {}
    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
-        crawler_class = CrawlerFactory.CRAWLERS.get(platform)
+        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
-        if not crawler_class:
+        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )
-        return crawler_class()
+
        if platform not in CrawlerFactory._cache:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
                crawler_class = getattr(module, class_name)
            except ModuleNotFoundError as exc:
                hint = (
                    "Please install optional dependency 'xhshow' (pip install xhshow) "
                    "or disable the xhs platform."
                    if platform == "xhs" and exc.name == "xhshow"
                    else f"Missing dependency while importing {module_name}"
                )
                raise ModuleNotFoundError(f"{exc}: {hint}") from exc
            CrawlerFactory._cache[platform] = crawler_class
        return CrawlerFactory._cache[platform]()
 crawler: Optional[AbstractCrawler] = None
@@ -59,6 +71,12 @@ crawler: Optional[AbstractCrawler] = None
 async def main():
    # Init crawler
    global crawler
    # 导入工具模块以初始化日志
    from tools import utils
    utils.logger.info("=" * 60)
    utils.logger.info("MediaCrawler 启动")
    utils.logger.info("=" * 60)
    # parse cmd
    args = await cmd_arg.parse_cmd()
@@ -49,8 +49,27 @@ class BilibiliClient(AbstractApiClient):
        self.cookie_dict = cookie_dict
    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
+        """
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+        Basic HTTP request wrapper with retries for transient network errors.
        """
        verify = getattr(config, "HTTPX_VERIFY", True)
        # 优先使用传入 proxy，其次是 config.HTTPX_PROXY，最后走系统环境变量
        proxy = self.proxy or getattr(config, "HTTPX_PROXY", "") or None
        async with httpx.AsyncClient(proxy=proxy, timeout=self.timeout, verify=verify) as client:
            # 简单重试，处理短暂的连接失败
            last_exc: Optional[Exception] = None
            for attempt in range(3):
                try:
                    response = await client.request(method, url, **kwargs)
                    break
                except httpx.HTTPError as e:
                    last_exc = e
                    if attempt == 2:
                        # 3rd failure -> give up
                        utils.logger.error(f"[BilibiliClient.request] Network error on {method} {url}: {repr(e)}")
                        raise DataFetchError(f"network error: {e}") from e
                    await asyncio.sleep(1)
        try:
            data: Dict = response.json()
        except json.JSONDecodeError:
@@ -68,10 +68,23 @@ class BilibiliLogin(AbstractLogin):
            return True
        return False
    async def _has_valid_login_cookie(self) -> bool:
        """
        Report whether the browser context already carries a login cookie.

        Lets the QR-code login flow be skipped when a session is already present.

        Returns:
            True when the ``SESSDATA`` or the ``DedeUserID`` cookie has a truthy
            value, False otherwise.
        """
        browser_cookies = await self.browser_context.cookies()
        _, cookies_by_name = utils.convert_cookies(browser_cookies)
        login_markers = ("SESSDATA", "DedeUserID")
        # any() stops at the first truthy cookie value, like the `or` chain it replaces.
        return any(cookies_by_name.get(marker) for marker in login_markers)
    async def login_by_qrcode(self):
        """login bilibili website and keep webdriver login state"""
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
        # 如果已经登录则直接跳过扫码流程
        if await self._has_valid_login_cookie():
            utils.logger.info("[BilibiliLogin.login_by_qrcode] 已检测到有效登录态，跳过扫码登录")
            return
        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//div[@class='right-entry__outside go-login-btn']//div"
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
        params["a_bogus"] = a_bogus
    async def request(self, method, url, **kwargs):
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        try:
-            if response.text == "" or response.text == "blocked":
+            async with httpx.AsyncClient(proxy=self.proxy) as client:
-                utils.logger.error(f"request params incrr, response.text: {response.text}")
+                response = await client.request(method, url, timeout=self.timeout, **kwargs)
-                raise Exception("account blocked")
+            try:
-            return response.json()
+                if response.text == "" or response.text == "blocked":
                    utils.logger.error(f"request params incrr, response.text: {response.text}")
                    raise Exception("account blocked")
                return response.json()
            except Exception as e:
                raise DataFetchError(f"{e}, {response.text}")
        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
            utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
            raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}")
        except httpx.TimeoutException as e:
            utils.logger.error(f"请求超时: {e}")
            raise DataFetchError(f"请求超时: {e}")
        except Exception as e:
-            raise DataFetchError(f"{e}, {response.text}")
+            utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
            raise DataFetchError(f"请求失败: {type(e).__name__}: {e}")
    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                    page += 1
                    continue
                posts_res = None
                retry_success = False
                try:
                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                    posts_res = await self.dy_client.search_info_by_keyword(
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
                        publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                        search_id=dy_search_id,
                    )
-                    if posts_res.get("data") is None or posts_res.get("data") == []:
+                    retry_success = True
-                        utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
+                except DataFetchError as e:
                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
                    # 如果是网络连接错误，等待后重试一次
                    if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
                        utils.logger.warning(f"[DouYinCrawler.search] 网络错误，等待3秒后重试...")
                        await asyncio.sleep(3)
                        try:
                            posts_res = await self.dy_client.search_info_by_keyword(
                                keyword=keyword,
                                offset=page * dy_limit_count - dy_limit_count,
                                publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                                search_id=dy_search_id,
                            )
                            retry_success = True
                        except Exception as retry_e:
                            utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
                            break
                    else:
                        break
-                except DataFetchError:
+                except Exception as e:
-                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
                    break
                # 如果请求失败（包括重试失败），跳过后续处理
                if not retry_success or posts_res is None:
                    break
                if posts_res.get("data") is None or posts_res.get("data") == []:
                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                    break
                page += 1
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
        self.graphql = KuaiShouGraphQL()
    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
+        """Make HTTP request with retry and proxy fallback."""
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+        max_retries = 3
-        data: Dict = response.json()
+
-        if data.get("errors"):
+        # build proxy attempts: try proxy first (if set), then no-proxy
-            raise DataFetchError(data.get("errors", "unkonw error"))
+        proxy_attempts: List[Optional[str]] = []
-        else:
+        if self.proxy:
-            return data.get("data", {})
+            proxy_attempts.append(self.proxy)
        proxy_attempts.append(None)  # always allow a direct attempt
        last_exc: Optional[Exception] = None
        for attempt in range(max_retries):
            proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
            try:
                async with httpx.AsyncClient(proxy=proxy_to_use) as client:
                    response = await client.request(method, url, timeout=self.timeout, **kwargs)
                data: Dict = response.json()
                if data.get("errors"):
                    raise DataFetchError(data.get("errors", "unkonw error"))
                return data.get("data", {})
            except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
                last_exc = e
                utils.logger.warning(
                    f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
                    f"proxy={proxy_to_use} url={url} err={e!r}"
                )
                if attempt < max_retries - 1:
                    await asyncio.sleep(1)
                    continue
                utils.logger.error(
                    f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
                    f"proxy={proxy_to_use} url={url} err={e!r}"
                )
                raise
            except Exception as e:
                # For other exceptions (like DataFetchError), don't retry
                last_exc = e
                utils.logger.error(
                    f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
                )
                raise
        # If somehow we exit the loop without returning, raise last exception
        if last_exc:
            raise last_exc
    async def get(self, uri: str, params=None) -> Dict:
        final_uri = uri
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):
            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(f"{self.index_url}?isHome=1")
+            # 添加重试机制处理网络连接错误
            max_retries = 3
            retry_count = 0
            while retry_count < max_retries:
                try:
                    await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
                    break
                except Exception as e:
                    retry_count += 1
                    error_msg = str(e)
                    if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
                        if retry_count < max_retries:
                            utils.logger.warning(f"[KuaishouCrawler] 网络连接错误，第 {retry_count} 次重试: {e}")
                            await asyncio.sleep(2 * retry_count)  # 递增等待时间
                        else:
                            utils.logger.error(f"[KuaishouCrawler] 网络连接失败，已重试 {max_retries} 次: {e}")
                            raise
                    else:
                        # 非网络错误直接抛出
                        raise
            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
    async def _quick_check_login_state(self) -> bool:
        """
        Check once, without retrying, whether the session is already logged in.

        Returns:
            True if the browser context holds a truthy ``passToken`` cookie;
            False otherwise, including when reading the cookies raises.
        """
        try:
            browser_cookies = await self.browser_context.cookies()
            _, cookies_by_name = utils.convert_cookies(browser_cookies)
            return bool(cookies_by_name.get("passToken"))
        except Exception:
            # Best-effort probe: any failure is reported as "not logged in".
            return False
    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
        """login kuaishou website and keep webdriver login state"""
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
-        # click login button
+        # Check if already logged in (quick check without retry)
        is_logged_in = await self._quick_check_login_state()
        if is_logged_in:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
            return
        # Check if login button exists (if not, might already be logged in)
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
-        await login_button_ele.click()
+        
        try:
            # Wait for the element to be visible with a shorter timeout
            await login_button_ele.wait_for(state="visible", timeout=3000)
            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
            # Try normal click first
            await login_button_ele.click(timeout=5000)
        except Exception as e:
            # If login button is not found, might already be logged in
            if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
                utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
                # Double check login state (quick check)
                is_logged_in = await self._quick_check_login_state()
                if is_logged_in:
                    utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
                    return
                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
                raise
            else:
                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
                try:
                    # If normal click fails, try force click to bypass overlay
                    await login_button_ele.click(force=True, timeout=5000)
                except Exception as e2:
                    utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
                    # If force click also fails, use JavaScript to click directly
                    await login_button_ele.evaluate("element => element.click()")
        # Wait a moment for the login modal to appear
        await asyncio.sleep(1)
        # find login qrcode
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy
        self.playwright_page = playwright_page  # Playwright页面对象
        self._last_captcha_check_time = 0  # 上次验证码检测时间
        self._captcha_verified_recently = False  # 是否最近完成过验证码
    def _sync_request(self, method, url, proxy=None, **kwargs):
        """
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
        self.headers["Cookie"] = cookie_str
        utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
    async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
        """
        Detect a Baidu captcha on the current Playwright page and wait until it is gone.

        The captcha is solved manually by the user; this method only polls the page
        and returns once the captcha selectors are no longer visible (re-checked
        after short settle delays). It returns immediately when there is no
        Playwright page, when a captcha was completed less than 5 seconds ago, or
        when no captcha is detected.

        Args:
            max_wait_time: Maximum number of seconds to wait (default 300). The
                budget restarts when a new captcha shows up right after a
                successful verification.

        Raises:
            TimeoutError: If the captcha is still pending after ``max_wait_time``
                seconds; callers are expected to abort the current tieba task.
        """
        if not self.playwright_page:
            return

        import time

        # Containers that indicate a captcha widget; polled repeatedly below.
        captcha_selectors = [
            '.tang-pass-slider',
            '#captcha',
            '.vcode-img',
            '.pass-verify',
            '.tang-pass-verify',
            '.pass-verify-slider',
            'div[id*="captcha"]',
            'div[class*="verify"]',
            'div[class*="captcha"]',
        ]
        # Text hints that are only used for the initial detection.
        captcha_text_selectors = [
            'text=安全验证',
            'text=请输入验证码',
            'text=拖动',
            'text=滑动',
        ]
        success_selectors = [
            '.tang-pass-success',
            '.pass-verify-success',
            'div[class*="success"]',
        ]

        async def _first_visible(selectors):
            """Return the first selector that matches a visible element, else None."""
            for selector in selectors:
                try:
                    element = await self.playwright_page.query_selector(selector)
                    if element and await element.is_visible():
                        return selector
                except Exception:
                    # A selector that cannot be evaluated counts as "not matched".
                    continue
            return None

        async def _detect_captcha() -> bool:
            """Initial detection using DOM selectors, URL keywords and page text."""
            if await _first_visible(captcha_selectors + captcha_text_selectors) is not None:
                return True

            url_lower = (self.playwright_page.url or "").lower()
            if any(key in url_lower for key in ("verify", "captcha", "wappass")):
                return True

            # Only the first 4000 characters are scanned to keep the check cheap.
            try:
                page_text = (await self.playwright_page.content())[:4000]
            except Exception:
                return False
            return any(
                kw in page_text
                for kw in ("安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证")
            )

        # Skip detection when a captcha was completed within the last 5 seconds.
        if self._captcha_verified_recently:
            time_since_last_check = time.time() - self._last_captcha_check_time
            if time_since_last_check < 5:
                utils.logger.debug(
                    f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码，跳过检测"
                )
                return
            self._captcha_verified_recently = False

        if not await _detect_captcha():
            return
        utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码，请手动拖动完成验证...")

        # Remember the URL so a redirect after verification can be recognised.
        initial_url = self.playwright_page.url
        utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
        utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码（最多等待 {max_wait_time} 秒）...")

        start_time = time.time()
        last_log_time = start_time
        check_interval = 1  # seconds between polls
        progress_log_interval = 10  # seconds between "still waiting" messages

        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time >= max_wait_time:
                utils.logger.warning(
                    f"[BaiduTieBaClient] ⏰ 等待验证码超时（{max_wait_time}秒），跳过当前百度贴吧爬取任务"
                )
                # Abort the tieba crawl; the caller is expected to handle this.
                raise TimeoutError(
                    f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
                )

            try:
                success_selector = await _first_visible(success_selectors)
                if success_selector is not None:
                    utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {success_selector})")
                captcha_visible = await _first_visible(captcha_selectors) is not None
                url_changed = self.playwright_page.url != initial_url

                # Candidate success: a success marker is visible, or the captcha is gone.
                if success_selector is not None or not captcha_visible:
                    # Distinguishes "marker seen / page redirected" from
                    # "captcha vanished in place"; only the log output differs.
                    page_changed = success_selector is not None or url_changed
                    if page_changed:
                        utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，检测到页面变化，等待3秒确认验证完成...")
                    else:
                        utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，等待3秒确认验证完成...")
                    await asyncio.sleep(3)

                    # Confirm the captcha is really gone after the settle delay.
                    if await _first_visible(captcha_selectors) is not None:
                        if page_changed:
                            utils.logger.warning("[BaiduTieBaClient] ⚠️  验证码重新出现，可能验证失败，继续等待...")
                    else:
                        utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功！")
                        if page_changed:
                            final_url = self.playwright_page.url
                            if url_changed:
                                utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
                            else:
                                utils.logger.info("[BaiduTieBaClient] 📍 页面URL未变化，验证在当前页面完成")

                        # Mark completion so the next call within 5 seconds skips detection.
                        self._captcha_verified_recently = True
                        self._last_captcha_check_time = time.time()

                        # Let the page settle, then make sure no follow-up captcha appeared.
                        await asyncio.sleep(3)
                        utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后，检查是否又出现验证码...")
                        await asyncio.sleep(2)
                        new_captcha = await _first_visible(captcha_selectors)
                        if new_captcha is None:
                            utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功，未出现新的验证码，继续执行...")
                            return

                        utils.logger.warning(f"[BaiduTieBaClient] ⚠️  验证成功后检测到新的验证码 (selector: {new_captcha})，继续等待...")
                        # A new captcha gets a fresh URL baseline and a fresh time budget.
                        utils.logger.warning("[BaiduTieBaClient] ⚠️  检测到新的验证码，重置等待状态...")
                        initial_url = self.playwright_page.url
                        start_time = time.time()
                        continue
            except Exception as e:
                # Detection errors are not fatal; keep polling until the timeout.
                utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")

            await asyncio.sleep(check_interval)

            # Progress message roughly every 10 seconds, based on fresh timing so
            # it cannot be skipped when an iteration takes longer than a second.
            now = time.time()
            if now - last_log_time >= progress_log_interval:
                remaining_time = max(0, int(max_wait_time - (now - start_time)))
                utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...（剩余 {remaining_time} 秒）")
                last_log_time = now
    async def get_notes_by_keyword(
        self,
        keyword: str,
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问搜索页面
            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
            # 检测并等待验证码完成
            await self._wait_for_captcha_completion()
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问帖子详情页面
            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
            # 检测并等待验证码完成
            await self._wait_for_captcha_completion()
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
                # 使用Playwright访问评论页面
                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
                # 检测并等待验证码完成
                await self._wait_for_captcha_completion()
                # 等待页面加载,使用配置文件中的延时设置
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
                    # 使用Playwright访问子评论页面
                    await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
                    # 检测并等待验证码完成
                    await self._wait_for_captcha_completion()
                    # 等待页面加载,使用配置文件中的延时设置
                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问贴吧页面
            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
            # 检测并等待验证码完成
            await self._wait_for_captcha_completion()
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者主页
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
            # 检测并等待验证码完成
            await self._wait_for_captcha_completion()
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者帖子列表页面
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
            # 检测并等待验证码完成
            await self._wait_for_captcha_completion()
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -58,11 +58,47 @@ class WeiboClient:
        if enable_return_response:
            return response
-        data: Dict = response.json()
+        # 检查响应状态码
        if response.status_code != 200:
            error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
            raise DataFetchError(error_msg)
        # 检查响应内容类型
        content_type = response.headers.get("content-type", "").lower()
        if "application/json" not in content_type and "text/json" not in content_type:
            # 可能是HTML响应（如登录页面）
            response_text = response.text[:500]
            utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
            # 如果看起来像是HTML，可能是需要登录
            if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
                raise DataFetchError("Response is HTML, may need to login or cookie expired")
            raise DataFetchError(f"Unexpected content type: {content_type}")
        # 安全地解析JSON
        try:
            data: Dict = response.json()
        except ValueError as e:
            # JSON解析失败
            response_text = response.text[:500]
            utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
            utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
            raise DataFetchError(f"Failed to parse JSON response: {e}")
        # 检查响应是否为空
        if not data:
            utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
            return {"cards": []}
        ok_code = data.get("ok")
        if ok_code == 0:  # response error
            msg = data.get("msg", "response error")
            # "这里还没有内容" 是正常情况，表示没有更多数据，不应该抛出异常
            if msg == "这里还没有内容" or "还没有内容" in msg:
                utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
                return {"cards": []}  # 返回空结果，而不是抛出异常
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
-            raise DataFetchError(data.get("msg", "response error"))
+            raise DataFetchError(msg)
        elif ok_code != 1:  # unknown error
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
            raise DataFetchError(data.get("msg", "unknown error"))
@@ -15,6 +15,7 @@
 import asyncio
 import os
 import re
 # import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
        self.cdp_manager = None
    async def start(self):
        # 初始化数据库表（如果需要）
        if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
            try:
                from database.db_session import create_tables
                utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
                await create_tables(config.SAVE_DATA_OPTION)
                utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
            except Exception as e:
                utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
                raise
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
                    page += 1
                    continue
                utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
-                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
+                try:
                    search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
                    cards = search_res.get("cards", [])
                    utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
                    # 如果没有更多内容，跳出循环
                    if len(cards) == 0:
                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
                        break
                except DataFetchError as e:
                    # 如果是"没有内容"的错误，正常结束
                    if "还没有内容" in str(e) or "没有内容" in str(e):
                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
                        break
                    # 其他错误继续抛出
                    raise
                note_id_list: List[str] = []
-                note_list = filter_search_result_card(search_res.get("cards"))
+                note_list = filter_search_result_card(cards)
                utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
                for note_item in note_list:
                    if note_item:
                        mblog: Dict = note_item.get("mblog")
                        if mblog:
-                            note_id_list.append(mblog.get("id"))
+                            note_id = mblog.get("id")
-                            await weibo_store.update_weibo_note(note_item)
+                            note_id_list.append(note_id)
-                            await self.get_note_images(mblog)
+                            try:
                                await weibo_store.update_weibo_note(note_item)
                                await self.get_note_images(mblog)
                            except Exception as e:
                                utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
                                # 继续处理其他笔记，不中断整个流程
                page += 1
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
    def __init__(
        self,
-        timeout=10,
+        timeout=30,  # 增加超时时间到30秒，避免请求卡住
        proxy=None,
        *,
        headers: Dict[str, str],
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
        """
        d_c0 = self.cookie_dict.get("d_c0")
        if not d_c0:
-            raise Exception("d_c0 not found in cookies")
+            utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
            raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
        sign_res = sign(url, self.default_headers["cookie"])
        headers = self.default_headers.copy()
        headers['x-zst-81'] = sign_res["x-zst-81"]
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
        Returns:
        """
        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
        uri = "/api/v4/search_v3"
        params = {
            "gk_version": "gz-gaokao",
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
            "sort": sort.value,
            "vertical": note_type.value,
        }
-        search_res = await self.get(uri, params)
+        try:
-        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
+            utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
-        return self._extractor.extract_contents_from_search(search_res)
+            search_res = await self.get(uri, params)
            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功，开始解析结果")
            contents = self._extractor.extract_contents_from_search(search_res)
            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成，找到 {len(contents)} 条内容")
            return contents
        except Exception as e:
            utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
            raise
    async def get_root_comments(
        self,
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
                await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+            # 设置页面超时时间为30秒
            self.context_page.set_default_timeout(30000)
            await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)
            # Create a client to interact with the zhihu website.
            self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                # 登录后等待页面稳定
                await asyncio.sleep(2)
                await self.zhihu_client.update_cookies(
                    browser_context=self.browser_context
                )
            # 知乎的搜索接口需要打开搜索页面之后cookies才能访问API，单独的首页不行
            # 使用用户配置的第一个关键词，如果没有关键词则使用默认的"test"
            search_keyword = "test"  # 默认关键词
            if config.KEYWORDS and config.KEYWORDS.strip():
                keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
                if keywords_list:
                    search_keyword = keywords_list[0]
                    utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
                else:
                    utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空，使用默认关键词 'test'")
            else:
                utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词，使用默认关键词 'test'")
            utils.logger.info(
-                "[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies，该过程需要5秒左右"
+                f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
            )
-            await self.context_page.goto(
+            utils.logger.info(
-                f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
+                f"[ZhihuCrawler.start] 关键词: {search_keyword}"
            )
-            await asyncio.sleep(5)
+            try:
-            await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+                # 使用用户配置的关键词，而不是硬编码的python
                from urllib.parse import quote
                encoded_keyword = quote(search_keyword)
                search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
                utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
                # 添加超时时间，避免卡住
                await self.context_page.goto(
                    search_url,
                    wait_until="domcontentloaded",
                    timeout=30000
                )
                utils.logger.info("[ZhihuCrawler.start] 页面跳转完成，等待页面稳定...")
                # 等待页面基本加载完成，不等待networkidle（知乎页面可能一直有请求）
                await asyncio.sleep(3)
                utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载，开始更新cookies")
                await self.zhihu_client.update_cookies(browser_context=self.browser_context)
                utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
            except Exception as e:
                utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e}，尝试继续执行", exc_info=True)
                # 即使跳转失败，也尝试更新cookies
                try:
                    await self.zhihu_client.update_cookies(browser_context=self.browser_context)
                    utils.logger.info("[ZhihuCrawler.start] Cookies更新完成（跳转失败后）")
                except Exception as cookie_error:
                    utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)
            crawler_type_var.set(config.CRAWLER_TYPE)
            utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
            utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
                await self.get_creators_and_notes()
            else:
-                pass
+                utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")
            utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
-        utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
+        utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
        zhihu_limit_count = 20  # zhihu limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
        total_failed_contents = 0
        total_saved_comments = 0
-        for keyword in config.KEYWORDS.split(","):
+        # 安全地处理关键词列表
        if not config.KEYWORDS or not config.KEYWORDS.strip():
            utils.logger.error("[ZhihuCrawler.search] 关键词配置为空，无法执行搜索任务")
            return
        keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
        if not keywords_list:
            utils.logger.error("[ZhihuCrawler.search] 关键词列表为空，无法执行搜索任务")
            return
        utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
        for keyword in keywords_list:
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[ZhihuCrawler.search] Current search keyword: {keyword}"
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        # 获取用户配置的关键词用于 referer，如果没有则使用默认值
        referer_keyword = "test"
        if config.KEYWORDS and config.KEYWORDS.strip():
            keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
            if keywords_list:
                referer_keyword = keywords_list[0]
        from urllib.parse import quote
        encoded_referer_keyword = quote(referer_keyword)
        referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
        zhihu_client_obj = ZhiHuClient(
            proxy=httpx_proxy,
            headers={
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
                "accept-language": "zh-CN,zh;q=0.9",
                "cookie": cookie_str,
                "priority": "u=1, i",
-                "referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
+                "referer": referer_url,
                "user-agent": self.user_agent,
                "x-api-version": "3.0.91",
                "x-app-za": "OS=Web",
@@ -119,6 +119,32 @@ class BiliDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        video_id = content_item.get("video_id")
        if not video_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            title = content_item.get("title", "")
            desc = content_item.get("desc", "")
            content_text = title + " " + desc
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[BilibiliDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[BilibiliDbStoreImplement.store_content] Failed to load keyword config: {e}")
        # 确保 video_id 为整数类型，匹配数据库 BigInteger 字段
        if video_id is not None:
            video_id = int(video_id) if not isinstance(video_id, int) else video_id
@@ -88,6 +88,30 @@ class DouyinDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        aweme_id = content_item.get("aweme_id")
        if not aweme_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            desc = content_item.get("desc", "")
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(desc, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[DouyinDbStoreImplement.store_content] ❌ Filtered aweme {aweme_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[DouyinDbStoreImplement.store_content] Failed to load keyword config: {e}")
        async with get_session() as session:
            result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
            aweme_detail = result.scalar_one_or_none()
@@ -89,6 +89,30 @@ class KuaishouDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        video_id = content_item.get("video_id")
        if not video_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            caption = content_item.get("caption", "")
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(caption, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[KuaishouDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[KuaishouDbStoreImplement.store_content] Failed to load keyword config: {e}")
        async with get_session() as session:
            result = await session.execute(select(KuaishouVideo).where(KuaishouVideo.video_id == video_id))
            video_detail = result.scalar_one_or_none()
@@ -95,6 +95,32 @@ class TieBaDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        note_id = content_item.get("note_id")
        if not note_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            title = content_item.get("title", "")
            text = content_item.get("text", "")
            content_text = title + " " + text
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[TiebaDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[TiebaDbStoreImplement.store_content] Failed to load keyword config: {e}")
        async with get_session() as session:
            stmt = select(TiebaNote).where(TiebaNote.note_id == note_id)
            res = await session.execute(stmt)
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
        "source_keyword": source_keyword_var.get(),
    }
    utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
-    await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
+    try:
        await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
        utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
    except Exception as e:
        utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
        raise
 async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
        "avatar": user_info.get("profile_image_url", ""),
    }
    utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
-    await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
+    try:
        await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
        utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
    except Exception as e:
        utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
        raise
 async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
@@ -21,7 +21,7 @@ import pathlib
 from typing import Dict
 import aiofiles
-from sqlalchemy import select
+from sqlalchemy import select, text
 from sqlalchemy.ext.asyncio import AsyncSession
 import config
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
 from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
 from tools import utils, words
 from tools.async_file_writer import AsyncFileWriter
-from database.db_session import get_session
+from database.db_session import get_session, get_async_engine
 from var import crawler_type_var
@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):
 class WeiboDbStoreImplement(AbstractStore):
    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base-class initializer."""
        super().__init__(**kwargs)
    async def _check_connection(self):
        """Probe the database once and remember a successful result on the class.

        The outcome of a successful probe is stored in the class attribute
        ``_global_connection_checked`` so that later calls (from any instance)
        return ``True`` immediately without touching the database. A failed
        probe is not remembered, so the next call probes again.

        Returns:
            True when the connection has been verified (now or earlier),
            False when no engine is available or the probe raised.
        """
        store_cls = WeiboDbStoreImplement

        # Lazily create the class-level flag the first time any instance asks.
        try:
            already_verified = store_cls._global_connection_checked
        except AttributeError:
            store_cls._global_connection_checked = False
            already_verified = False

        if already_verified:
            return True

        try:
            db_engine = get_async_engine(config.SAVE_DATA_OPTION)
            if db_engine is not None:
                # A trivial round-trip query is enough to prove connectivity.
                async with db_engine.connect() as connection:
                    await connection.execute(text("SELECT 1"))
                store_cls._global_connection_checked = True
                utils.logger.info(f"[WeiboDbStoreImplement._check_connection] Database connection verified")
                return True

            utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
            return False
        except Exception as exc:
            # Any failure while obtaining the engine or probing is reported
            # as "not connected" rather than propagated to the caller.
            utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {exc}", exc_info=True)
            return False
    async def store_content(self, content_item: Dict):
        """
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
        """
        note_id = content_item.get("note_id")
-        async with get_session() as session:
+        if not note_id:
-            stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
+            utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
-            res = await session.execute(stmt)
+            return
-            db_note = res.scalar_one_or_none()
+        
-            if db_note:
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
-                db_note.last_modify_ts = utils.get_current_timestamp()
+        # 支持精确匹配和模糊匹配两种模式
-                for key, value in content_item.items():
+        try:
-                    if hasattr(db_note, key):
+            import sys
-                        setattr(db_note, key, value)
+            from pathlib import Path
-            else:
+            project_root = Path(__file__).resolve().parents[4]
-                content_item["add_ts"] = utils.get_current_timestamp()
+            if str(project_root) not in sys.path:
-                content_item["last_modify_ts"] = utils.get_current_timestamp()
+                sys.path.insert(0, str(project_root))
-                db_note = WeiboNote(**content_item)
+            from config import settings
-                session.add(db_note)
+            
-            await session.commit()
+            content_text = content_item.get("content", "")
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
        # 检查数据库连接
        if not await self._check_connection():
            utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
            return
        try:
            async with get_session() as session:
                if session is None:
                    utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
                    return
                stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
                res = await session.execute(stmt)
                db_note = res.scalar_one_or_none()
                if db_note:
                    utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
                    db_note.last_modify_ts = utils.get_current_timestamp()
                    for key, value in content_item.items():
                        if hasattr(db_note, key):
                            setattr(db_note, key, value)
                else:
                    utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
                    content_item["add_ts"] = utils.get_current_timestamp()
                    content_item["last_modify_ts"] = utils.get_current_timestamp()
                    db_note = WeiboNote(**content_item)
                    session.add(db_note)
                await session.commit()
                utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
        except Exception as e:
            utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
            raise
    async def store_comment(self, comment_item: Dict):
        """
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
        """
        comment_id = comment_item.get("comment_id")
-        async with get_session() as session:
+        if not comment_id:
-            stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
+            utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
-            res = await session.execute(stmt)
+            return
-            db_comment = res.scalar_one_or_none()
+        
-            if db_comment:
+        try:
-                db_comment.last_modify_ts = utils.get_current_timestamp()
+            async with get_session() as session:
-                for key, value in comment_item.items():
+                if session is None:
-                    if hasattr(db_comment, key):
+                    utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
-                        setattr(db_comment, key, value)
+                    return
-            else:
+                
-                comment_item["add_ts"] = utils.get_current_timestamp()
+                stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
-                comment_item["last_modify_ts"] = utils.get_current_timestamp()
+                res = await session.execute(stmt)
-                db_comment = WeiboNoteComment(**comment_item)
+                db_comment = res.scalar_one_or_none()
-                session.add(db_comment)
+                if db_comment:
-            await session.commit()
+                    utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
                    db_comment.last_modify_ts = utils.get_current_timestamp()
                    for key, value in comment_item.items():
                        if hasattr(db_comment, key):
                            setattr(db_comment, key, value)
                else:
                    utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
                    comment_item["add_ts"] = utils.get_current_timestamp()
                    comment_item["last_modify_ts"] = utils.get_current_timestamp()
                    db_comment = WeiboNoteComment(**comment_item)
                    session.add(db_comment)
                await session.commit()
                utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
        except Exception as e:
            utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
            raise
    async def store_creator(self, creator: Dict):
        """
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
        note_id = content_item.get("note_id")
        if not note_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            # 添加项目根目录到路径，以便导入 MindSpider 的 config
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            title = content_item.get("title", "")
            desc = content_item.get("desc", "")
            content_text = title + " " + desc
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            # 如果配置了关键词，进行匹配检查
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
                    return
        except Exception as e:
            # 如果配置读取失败，记录警告但不阻止保存（向后兼容）
            utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
        async with get_session() as session:
            if await self.content_is_exist(session, note_id):
                await self.update_content(session, content_item)
@@ -85,8 +85,21 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
    if not comments:
        return
    success_count = 0
    error_count = 0
    for comment_item in comments:
-        await update_zhihu_content_comment(comment_item)
+        try:
            await update_zhihu_content_comment(comment_item)
            success_count += 1
        except Exception as e:
            error_count += 1
            comment_id = getattr(comment_item, 'comment_id', 'unknown')
            utils.logger.error(f"[store.zhihu.batch_update_zhihu_note_comments] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
    if error_count > 0:
        utils.logger.warning(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条, 失败 {error_count} 条")
    else:
        utils.logger.info(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条")
 async def update_zhihu_content_comment(comment_item: ZhihuComment):
@@ -98,10 +111,17 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
    Returns:
    """
-    local_db_item = comment_item.model_dump()
+    try:
-    local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
+        local_db_item = comment_item.model_dump()
-    utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}")
+        local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
-    await ZhihuStoreFactory.create_store().store_comment(local_db_item)
+        # 使用更安全的日志记录方式，避免编码问题导致日志输出异常
        comment_id = local_db_item.get('comment_id', 'unknown')
        utils.logger.debug(f"[store.zhihu.update_zhihu_note_comment] 准备保存评论: comment_id={comment_id}")
        await ZhihuStoreFactory.create_store().store_comment(local_db_item)
    except Exception as e:
        comment_id = getattr(comment_item, 'comment_id', 'unknown')
        utils.logger.error(f"[store.zhihu.update_zhihu_note_comment] 保存评论异常 (comment_id={comment_id}): {e}", exc_info=True)
        raise
 async def save_creator(creator: ZhihuCreator):
@@ -94,23 +94,71 @@ class ZhihuDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        content_id = content_item.get("content_id")
        if not content_id:
            return
        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
        # 支持精确匹配和模糊匹配两种模式
        try:
            import sys
            from pathlib import Path
            project_root = Path(__file__).resolve().parents[4]
            if str(project_root) not in sys.path:
                sys.path.insert(0, str(project_root))
            from config import settings
            title = content_item.get("title", "")
            content = content_item.get("content", "")
            content_text = title + " " + content
            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
            if strict_keywords or fuzzy_keywords:
                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
                    utils.logger.warning(f"[ZhihuDbStoreImplement.store_content] ❌ Filtered content {content_id} - content does not match any keyword")
                    return
        except Exception as e:
            utils.logger.debug(f"[ZhihuDbStoreImplement.store_content] Failed to load keyword config: {e}")
        try:
            # 确保所有字符串值都是正确的UTF-8编码
            cleaned_item = {}
            for key, value in content_item.items():
                if isinstance(value, bytes):
                    # 如果是bytes类型，尝试解码为UTF-8
                    try:
                        value = value.decode('utf-8')
                    except UnicodeDecodeError:
                        # 如果UTF-8解码失败，尝试其他编码
                        try:
                            value = value.decode('gbk', errors='replace')
                        except:
                            value = value.decode('utf-8', errors='replace')
                elif isinstance(value, str):
                    # 确保字符串是有效的UTF-8
                    try:
                        value.encode('utf-8')
                    except UnicodeEncodeError:
                        # 如果编码失败，尝试修复
                        value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
                cleaned_item[key] = value
            async with get_session() as session:
                stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
                result = await session.execute(stmt)
                existing_content = result.scalars().first()
                if existing_content:
-                    for key, value in content_item.items():
+                    for key, value in cleaned_item.items():
                        setattr(existing_content, key, value)
                    utils.logger.debug(f"[ZhihuDbStore] 更新内容: {content_id}")
                else:
-                    new_content = ZhihuContent(**content_item)
+                    new_content = ZhihuContent(**cleaned_item)
                    session.add(new_content)
                    utils.logger.debug(f"[ZhihuDbStore] 新增内容: {content_id}")
                await session.commit()
                utils.logger.info(f"[ZhihuDbStore] 成功保存内容到数据库: {content_id}")
        except Exception as e:
-            utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}")
+            utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}", exc_info=True)
            raise
    async def store_comment(self, comment_item: Dict):
@@ -121,22 +169,44 @@ class ZhihuDbStoreImplement(AbstractStore):
        """
        comment_id = comment_item.get("comment_id")
        try:
            # 确保所有字符串值都是正确的UTF-8编码
            cleaned_item = {}
            for key, value in comment_item.items():
                if isinstance(value, bytes):
                    # 如果是bytes类型，尝试解码为UTF-8
                    try:
                        value = value.decode('utf-8')
                    except UnicodeDecodeError:
                        # 如果UTF-8解码失败，尝试其他编码
                        try:
                            value = value.decode('gbk', errors='replace')
                        except:
                            value = value.decode('utf-8', errors='replace')
                elif isinstance(value, str):
                    # 确保字符串是有效的UTF-8
                    try:
                        value.encode('utf-8')
                    except UnicodeEncodeError:
                        # 如果编码失败，尝试修复
                        value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
                cleaned_item[key] = value
            async with get_session() as session:
                stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
                result = await session.execute(stmt)
                existing_comment = result.scalars().first()
                if existing_comment:
-                    for key, value in comment_item.items():
+                    for key, value in cleaned_item.items():
                        setattr(existing_comment, key, value)
                    utils.logger.debug(f"[ZhihuDbStore] 更新评论: {comment_id}")
                else:
-                    new_comment = ZhihuComment(**comment_item)
+                    new_comment = ZhihuComment(**cleaned_item)
                    session.add(new_comment)
                    utils.logger.debug(f"[ZhihuDbStore] 新增评论: {comment_id}")
                await session.commit()
                utils.logger.info(f"[ZhihuDbStore] 成功保存评论到数据库: {comment_id}")
        except Exception as e:
-            utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}")
+            utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
            raise
    async def store_creator(self, creator: Dict):
@@ -11,6 +11,11 @@
 import argparse
 import logging
 import os
 import re
 import sys
 from logging.handlers import RotatingFileHandler
 from pathlib import Path
 from .crawler_util import *
 from .slider_util import *
@@ -19,17 +24,80 @@ from .time_util import *
 def init_loging_config():
    """Configure process-wide logging and return the "MediaCrawler" logger.

    The root logger is reset to INFO and given two handlers:

    * a console handler writing to ``sys.stdout``;
    * a rotating file handler writing ``logs/mediacrawler.log`` (10 MB per
      file, 5 backups, UTF-8) under the detected project root.

    The project root is the nearest ancestor of this file that either
    contains a ``logs`` directory or is named ``BettaFish-1.2.0``; when no
    ancestor qualifies, the current working directory is used (stepping two
    levels up when that directory is named ``MediaCrawler``).

    File logging is best-effort: if the log directory or file cannot be
    created, a warning is printed and console logging keeps working.

    Returns:
        logging.Logger: the logger named ``"MediaCrawler"``, level INFO.
    """
    level = logging.INFO
    log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s"
    date_format = '%Y-%m-%d %H:%M:%S'

    # Step 1: walk up from this file until a directory that holds "logs"
    # (or is the known project directory name) is found.
    current_file = Path(__file__).resolve()
    project_root = None
    for parent in current_file.parents:
        if (parent / "logs").exists() or parent.name == "BettaFish-1.2.0":
            project_root = parent
            break

    # Step 2: fall back to the working directory; when launched from inside
    # the MediaCrawler directory, climb two levels up.
    if project_root is None:
        project_root = Path.cwd()
        if project_root.name == "MediaCrawler":
            project_root = project_root.parent.parent

    log_dir = project_root / "logs"
    log_file = log_dir / "mediacrawler.log"

    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    # Remove AND close the previous handlers. Merely clearing the list would
    # leave an earlier RotatingFileHandler's file open, which leaks the
    # descriptor and can block log rotation on platforms that refuse to
    # rename an open file.
    for old_handler in list(root_logger.handlers):
        root_logger.removeHandler(old_handler)
        old_handler.close()

    formatter = logging.Formatter(log_format, datefmt=date_format)

    # Console handler bound explicitly to stdout.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)

    # Push out anything already buffered before new log output starts.
    sys.stdout.flush()
    sys.stderr.flush()

    # Rotating file handler. Directory creation lives inside the try block so
    # that an unwritable or missing location degrades to console-only logging
    # instead of aborting logging setup altogether.
    try:
        log_dir.mkdir(parents=True, exist_ok=True)
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
    except Exception as e:
        # Console logging is already installed; just report the problem.
        print(f"警告: 无法初始化文件日志: {e}")

    _logger = logging.getLogger("MediaCrawler")
    _logger.setLevel(level)

    # Silence httpx's per-request INFO records.
    logging.getLogger("httpx").setLevel(logging.WARNING)

    # Tell the operator where the log file lives.
    _logger.info(f"日志文件: {log_file}")
    return _logger
@@ -44,3 +112,101 @@ def str2bool(v):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
 def check_keyword_match_strict(content: str, keyword: str) -> bool:
    """Report whether *content* contains at least one of the given keywords.

    Anything matching the pattern ``<.*?>`` (HTML-like tags) is removed from
    the content first, and the comparison is case-insensitive.

    Args:
        content: Text to inspect.
        keyword: A single keyword, or several separated by ASCII commas.
            Blank entries are ignored.

    Returns:
        bool: True when any keyword occurs as a substring of the cleaned
        content; False otherwise, including when either argument is empty.
    """
    if not (content and keyword):
        return False

    # Strip tags, then lowercase once for case-insensitive comparison.
    haystack = re.sub(r"<.*?>", "", content).lower()
    needles = (part.strip().lower() for part in keyword.split(","))
    # An empty needle (blank entry) is falsy and therefore never matches.
    return any(needle and needle in haystack for needle in needles)
 def check_keyword_match_fuzzy(content: str, keyword: str) -> bool:
    """Report whether *content* loosely matches at least one keyword.

    Anything matching the pattern ``<.*?>`` (HTML-like tags) is removed from
    the content first, and all comparisons are case-insensitive. Each
    keyword is tried against three progressively looser rules:

    1. the keyword occurs verbatim in the content;
    2. for keywords of 3+ characters, it occurs once spaces are removed
       from both the keyword and the content;
    3. for keywords of 4+ characters, the first half of the keyword
       (``len // 2`` characters) occurs in the content.

    NOTE: rule 3 is deliberately permissive: a 4-character keyword matches
    on its first 2 characters alone, so short keywords can produce many
    false positives.

    Args:
        content: Text to inspect.
        keyword: A single keyword, or several separated by ASCII commas.
            Blank entries are ignored.

    Returns:
        bool: True when any keyword satisfies any rule; False otherwise,
        including when either argument is empty.
    """
    if not content or not keyword:
        return False

    text = re.sub(r"<.*?>", "", content).lower()
    # Loop-invariant: build the space-free variant once instead of once per
    # keyword.
    text_no_space = text.replace(" ", "")

    for raw in keyword.split(","):
        kw = raw.strip().lower()
        if not kw:
            continue
        # Rule 1: verbatim substring.
        if kw in text:
            return True
        if len(kw) < 3:
            continue
        # Rule 2: match ignoring spaces on both sides.
        if kw.replace(" ", "") in text_no_space:
            return True
        # Rule 3: first half of the keyword (at least 2 characters).
        if len(kw) >= 4 and kw[: len(kw) // 2] in text:
            return True
    return False
 def check_keyword_match_with_modes(content: str, strict_keywords: str = None, fuzzy_keywords: str = None) -> bool:
    """Check *content* against strict keywords first, then fuzzy keywords.

    Args:
        content: Text to inspect.
        strict_keywords: Comma-separated keywords passed to
            ``check_keyword_match_strict``; skipped when empty or None.
        fuzzy_keywords: Comma-separated keywords passed to
            ``check_keyword_match_fuzzy``; skipped when empty or None.

    Returns:
        bool: True as soon as either configured matcher reports a hit;
        False when neither does or when no keywords are configured.
    """
    # Ordered (keywords, matcher) pairs; any() short-circuits, so the fuzzy
    # matcher only runs when the strict one did not hit.
    checks = (
        (strict_keywords, check_keyword_match_strict),
        (fuzzy_keywords, check_keyword_match_fuzzy),
    )
    return any(kws and matcher(content, kws) for kws, matcher in checks)
@@ -251,7 +251,8 @@ class KeywordManager:
    def _get_default_keywords(self) -> List[str]:
        """获取默认关键词列表"""
        return [
-            "F6智慧门店","南京爱福路汽车科技有限公司","汽车后市场","汽修店","新康众"
+            "F6智慧门店","F6智数","中国汽车后市场白皮书","南京爱福路汽车科技有限公司","汽车后市场","汽车修理厂",
            "新康众","天猫养车","汽后","汽修厂","爱福路","康众",
        ]
    def get_all_keywords_for_platforms(self, platforms: List[str], target_date: date = None, 
@@ -252,14 +252,63 @@ postgresql_db_config = {{
            logger.info(f"执行命令: {' '.join(cmd)}")
            # 切换到MediaCrawler目录并执行，捕获输出
-            result = subprocess.run(
+            # 使用utf-8编码，errors='surrogateescape'可以更好地处理编码问题
            # 设置环境变量确保子进程使用UTF-8编码
            env = os.environ.copy()
            env['PYTHONIOENCODING'] = 'utf-8'
            env['PYTHONUTF8'] = '1'
            # 使用 Popen 实时输出日志，而不是等到结束才显示
            import subprocess as sp
            process = sp.Popen(
                cmd,
                cwd=self.mediacrawler_path,
-                timeout=3600,  # 60分钟超时
+                stdout=sp.PIPE,
-                capture_output=True,
+                stderr=sp.STDOUT,  # 将stderr合并到stdout
                text=True,
                encoding='utf-8',
-                errors='replace'
+                errors='surrogateescape',
                env=env,
                bufsize=1,  # 行缓冲
                universal_newlines=True
            )
            # 实时读取并输出日志
            output_lines = []
            error_lines = []
            try:
                for line in process.stdout:
                    line = line.rstrip()
                    if line:
                        output_lines.append(line)
                        # 实时输出到控制台
                        print(f"[{platform}] {line}", flush=True)
                        logger.info(f"[{platform}] {line}")
                # 等待进程完成
                return_code = process.wait(timeout=3600)
            except sp.TimeoutExpired:
                process.kill()
                process.wait()
                return_code = -1
                logger.error(f"[{platform}] 爬取超时")
            except Exception as e:
                process.kill()
                process.wait()
                return_code = -1
                logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
            # 创建类似 subprocess.run 的 result 对象
            class Result:
                def __init__(self, returncode, stdout, stderr):
                    self.returncode = returncode
                    self.stdout = stdout
                    self.stderr = stderr
            result = Result(
                returncode=return_code,
                stdout='\n'.join(output_lines),
                stderr='\n'.join(error_lines)
            )
            end_time = datetime.now()
@@ -269,6 +318,19 @@ postgresql_db_config = {{
            output_lines = result.stdout.split('\n') if result.stdout else []
            error_lines = result.stderr.split('\n') if result.stderr else []
            # 输出日志到控制台和日志文件
            if output_lines:
                logger.info(f"[{platform}] 爬虫标准输出:")
                for line in output_lines:
                    if line.strip():  # 忽略空行
                        logger.info(f"[{platform}] {line}")
            if error_lines:
                logger.warning(f"[{platform}] 爬虫错误输出:")
                for line in error_lines:
                    if line.strip():  # 忽略空行
                        logger.warning(f"[{platform}] {line}")
            # 合并所有输出行用于解析
            all_lines = output_lines + error_lines
@@ -329,10 +391,64 @@ postgresql_db_config = {{
        # 合并所有行用于解析
        all_lines = output_lines + error_lines
        # 用于统计各平台的保存操作次数（通过日志关键字统计）
        # 视频/内容保存操作的关键字
        content_save_keywords = [
            "[store.bilibili.update_bilibili_video]",
            "update_bilibili_video",
            "[store.douyin.update_dy_aweme]",
            "update_dy_aweme",
            "[store.kuaishou.update_kuaishou_video]",
            "update_kuaishou_video",
            "[store.xhs.update_xhs_note]",
            "update_xhs_note",
            "[store.weibo.update_weibo_note]",
            "update_weibo_note",
            "[store.tieba.update_tieba_note]",
            "update_tieba_note",
            "[store.zhihu.update_zhihu_content]",
            "update_zhihu_content",
        ]
        # 评论保存操作的关键字
        comment_save_keywords = [
            "[store.bilibili.update_bilibili_video_comment]",
            "update_bilibili_video_comment",
            "[store.douyin.update_dy_aweme_comment]",
            "update_dy_aweme_comment",
            "[store.kuaishou.update_ks_video_comment]",
            "update_ks_video_comment",
            "[store.xhs.update_xhs_note_comment]",
            "update_xhs_note_comment",
            "[store.weibo.update_weibo_note_comment]",
            "update_weibo_note_comment",
            "[store.tieba.update_tieba_note_comment]",
            "update_tieba_note_comment",
            "[store.zhihu.update_zhihu_content_comment]",
            "update_zhihu_note_comment",
            "update_zhihu_content_comment",
        ]
        # 先统计日志关键字出现的次数（用于bilibili等没有汇总信息的平台）
        log_keyword_content_count = 0
        log_keyword_comment_count = 0
        # 解析输出行，查找各种可能的数据保存信息
        for line in all_lines:
            line_lower = line.lower()
            # 统计视频/内容保存操作（通过日志关键字）
            for keyword in content_save_keywords:
                if keyword in line or keyword.lower() in line_lower:
                    log_keyword_content_count += 1
                    break  # 避免重复计数
            # 统计评论保存操作（通过日志关键字）
            for keyword in comment_save_keywords:
                if keyword in line or keyword.lower() in line_lower:
                    log_keyword_comment_count += 1
                    break  # 避免重复计数
            # 查找保存的内容数量（多种可能的格式）
            # 例如："保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
            if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
@@ -367,10 +483,18 @@ postgresql_db_config = {{
            if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
                stats["login_required"] = True
        # 如果通过汇总信息没有找到保存数量，使用日志关键字统计的结果
        # 这样可以支持bilibili等没有输出汇总信息的平台
        if stats["notes_count"] == 0 and log_keyword_content_count > 0:
            stats["notes_count"] = log_keyword_content_count
        if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
            stats["comments_count"] = log_keyword_comment_count
        # 如果没有找到明确的保存数量，尝试从数据库操作日志中提取
        if stats["notes_count"] == 0 and stats["comments_count"] == 0:
            # 查找可能的数据库插入信息
            for line in all_lines:
                line_lower = line.lower()
                # 查找类似 "insert into" 或 "保存到" 的信息
                if "insert" in line_lower or "保存到" in line_lower:
                    try:
@@ -34,6 +34,14 @@ class Settings(BaseSettings):
        True,
        description="开启后运行基于关键词的爬取流程，关闭则完全跳过关键词搜索"
    )
    STRICT_KEYWORDS: Optional[str] = Field(
        None,
        description="精确匹配关键词（逗号分隔），内容必须完整包含这些关键词才能落库"
    )
    FUZZY_KEYWORDS: Optional[str] = Field(
        None,
        description="模糊匹配关键词（逗号分隔），内容包含这些关键词的部分即可落库"
    )
    MINDSPIDER_API_KEY: Optional[str] = Field(None, description="MINDSPIDER API密钥")
    MINDSPIDER_BASE_URL: Optional[str] = Field("https://api.deepseek.com", description="MINDSPIDER API基础URL，推荐deepseek-chat模型使用https://api.deepseek.com")
    MINDSPIDER_MODEL_NAME: Optional[str] = Field("deepseek-chat", description="MINDSPIDER API模型名称, 推荐deepseek-chat")