From 64b94d79f94365f054e7ea8336d6bcb5931ade29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=80=81=E8=91=9B?= <ptghb@qq.com>
Date: Sat, 29 Nov 2025 14:26:26 +0800
Subject: [PATCH] =?UTF-8?q?feat(xhs):=20=E9=9B=86=E6=88=90xhshow=E5=BA=93?=
 =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=AD=BE=E5=90=8D=E7=94=9F=E6=88=90=E4=B8=8E?=
 =?UTF-8?q?=E8=AF=B7=E6=B1=82=E5=8F=82=E6=95=B0=20(#330)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(xhs): 集成xhshow库优化签名生成与请求参数

- 引入xhshow库用于小红书API签名生成
- 替换原有的seccore_signv2_playwright签名校验方式
- 支持GET和POST请求的差异化签名处理
- 增加对b1值从localStorage获取的容错处理
- 更新x-t时间戳为毫秒级精度
- 在获取博主笔记接口中增加xsec_token和xsec_source参数
- 支持通过配置传递验证token和渠道来源
- 更新依赖文件引入xhshow库
- 调整配置示例适配新的token参数要求

* Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py

移除配置文件

* Add xhs_config.py for Xiaohongshu platform settings

恢复错误删除的文件

---------

Co-authored-by: gehongbin <gehongbin@autohome.com.cn>
Co-authored-by: Doiiars <doiiars@qq.com>
---
 .../MediaCrawler/media_platform/xhs/client.py | 68 +++++++++++++------
 .../MediaCrawler/media_platform/xhs/core.py   | 15 +++-
 .../MediaCrawler/requirements.txt             |  1 +
 MindSpider/requirements.txt                   |  1 +
 requirements.txt                              |  1 +
 5 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
index 652667f..3b0db95 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
@@ -17,6 +17,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
 from tenacity import retry, stop_after_attempt, wait_fixed
+from xhshow import Xhshow
 
 import config
 from base.base_crawler import AbstractApiClient
@@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
 from .help import get_search_id, sign
 from .extractor import XiaoHongShuExtractor
-from .secsign import seccore_signv2_playwright
 
 
 class XiaoHongShuClient(AbstractApiClient):
@@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient):
         self.playwright_page = playwright_page
         self.cookie_dict = cookie_dict
         self._extractor = XiaoHongShuExtractor()
+        # 初始化 xhshow 客户端用于签名生成
+        self._xhshow_client = Xhshow()
 
     async def _pre_headers(self, url: str, data=None) -> Dict:
         """
-        请求头参数签名
+        请求头参数签名，使用 xhshow 库生成签名
         Args:
-            url:
-            data:
+            url: 完整的 URI（GET 请求包含查询参数）
+            data: POST 请求的请求体数据
 
         Returns:
 
         """
-        x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
-        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+        # 获取 a1 cookie 值
+        a1_value = self.cookie_dict.get("a1", "")
+
+        # 根据请求类型使用不同的签名方法
+        if data is None:
+            # GET 请求：从 url 中提取参数
+            from urllib.parse import urlparse, parse_qs
+            parsed = urlparse(url)
+            params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
+            # 使用完整的 URL（包含 host）
+            full_url = f"{self._host}{url}"
+            x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
+        else:
+            # POST 请求：使用 data 作为 payload
+            full_url = f"{self._host}{url}"
+            x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
+
+        # 尝试获取 b1 值（从 localStorage），如果获取失败则使用空字符串
+        b1_value = ""
+        try:
+            if self.playwright_page:
+                local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+                b1_value = local_storage.get("b1", "")
+        except Exception as e:
+            utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")
+
+        # 使用 sign 函数生成其他签名头
         signs = sign(
-            a1=self.cookie_dict.get("a1", ""),
-            b1=local_storage.get("b1", ""),
+            a1=a1_value,
+            b1=b1_value,
             x_s=x_s,
-            x_t=str(int(time.time())),
+            x_t=str(int(time.time() * 1000)),  # x-t 使用毫秒时间戳
         )
 
         headers = {
@@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient):
         elif data["code"] == self.IP_ERROR_CODE:
             raise IPBlockError(self.IP_ERROR_STR)
         else:
-            raise DataFetchError(data.get("msg", None))
+            err_msg = data.get("msg", None) or f"{response.text}"
+            raise DataFetchError(err_msg)
 
     async def get(self, uri: str, params=None) -> Dict:
         """
@@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient):
         creator: str,
         cursor: str,
         page_size: int = 30,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
     ) -> Dict:
         """
         获取博主的笔记
@@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient):
             creator: 博主ID
             cursor: 上一页最后一条笔记的ID
             page_size: 分页数据长度
+            xsec_token: 验证token
+            xsec_source: 渠道来源
 
         Returns:
 
         """
-        uri = "/api/sns/web/v1/user_posted"
-        data = {
-            "user_id": creator,
-            "cursor": cursor,
-            "num": page_size,
-            "image_formats": "jpg,webp,avif",
-        }
-        return await self.get(uri, data)
+        uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
+        return await self.get(uri)
 
     async def get_all_notes_by_creator(
         self,
         user_id: str,
         crawl_interval: float = 1.0,
         callback: Optional[Callable] = None,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
     ) -> List[Dict]:
         """
         获取指定用户下的所有发过的帖子，该方法会一直查找一个用户下的所有帖子信息
@@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient):
             user_id: 用户ID
             crawl_interval: 爬取一次的延迟单位（秒）
             callback: 一次分页爬取结束后的更新回调函数
+            xsec_token: 验证token
+            xsec_source: 渠道来源
 
         Returns:
 
@@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient):
         notes_has_more = True
         notes_cursor = ""
         while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
-            notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
+            notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
             if not notes_res:
                 utils.logger.error(
                     f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
index 68d2139..bbc8ee7 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
@@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 user_id=user_id,
                 crawl_interval=crawl_interval,
                 callback=self.fetch_creator_notes_detail,
+                xsec_token=creator_info.xsec_token,
+                xsec_source=creator_info.xsec_source,
             )
 
             note_ids = []
@@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
             Dict: note detail
         """
         note_detail = None
+        utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
         async with semaphore:
             try:
-                utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
-                note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
+                except RetryError:
+                    pass
+
                 if not note_detail:
-                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=True)
+                    if not note_detail:
+                        raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
 
                 note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
                 
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt b/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt
index a04b4f6..8acb441 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt
@@ -24,3 +24,4 @@ cryptography>=45.0.7
 alembic>=1.16.5
 asyncmy>=0.2.10
 sqlalchemy>=2.0.43
+xhshow>=0.1.3
diff --git a/MindSpider/requirements.txt b/MindSpider/requirements.txt
index 87171f7..23de1ee 100644
--- a/MindSpider/requirements.txt
+++ b/MindSpider/requirements.txt
@@ -49,6 +49,7 @@ parsel==1.9.1
 pyexecjs==1.5.1
 typer>=0.12.3
 pyhumps==3.8.0
+xhshow>=0.1.3
 
 # ===============================
 # 工具包
diff --git a/requirements.txt b/requirements.txt
index cfd9872..69e8d92 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -48,6 +48,7 @@ beautifulsoup4>=4.12.0
 lxml>=4.9.0
 parsel==1.9.1
 pyexecjs==1.5.1
+xhshow>=0.1.3
 
 # ===== 可视化 =====
 plotly>=5.17.0