From 64b94d79f94365f054e7ea8336d6bcb5931ade29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=80=81=E8=91=9B?= Date: Sat, 29 Nov 2025 14:26:26 +0800 Subject: [PATCH] =?UTF-8?q?feat(xhs):=20=E9=9B=86=E6=88=90xhshow=E5=BA=93?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=AD=BE=E5=90=8D=E7=94=9F=E6=88=90=E4=B8=8E?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=E5=8F=82=E6=95=B0=20(#330)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(xhs): 集成xhshow库优化签名生成与请求参数 - 引入xhshow库用于小红书API签名生成 - 替换原有的seccore_signv2_playwright签名校验方式 - 支持GET和POST请求的差异化签名处理 - 增加对b1值从localStorage获取的容错处理 - 更新x-t时间戳为毫秒级精度 - 在获取博主笔记接口中增加xsec_token和xsec_source参数- 支持通过配置传递验证token和渠道来源 - 更新依赖文件引入xhshow库- 调整配置示例适配新的token参数要求 * Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py 移除配置文件 * Add xhs_config.py for Xiaohongshu platform settings 恢复错误删除的文件 --------- Co-authored-by: gehongbin Co-authored-by: Doiiars --- .../MediaCrawler/media_platform/xhs/client.py | 68 +++++++++++++------ .../MediaCrawler/media_platform/xhs/core.py | 15 +++- .../MediaCrawler/requirements.txt | 1 + MindSpider/requirements.txt | 1 + requirements.txt | 1 + 5 files changed, 64 insertions(+), 22 deletions(-) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py index 652667f..3b0db95 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py @@ -17,6 +17,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page from tenacity import retry, stop_after_attempt, wait_fixed +from xhshow import Xhshow import config from base.base_crawler import AbstractApiClient @@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType from .help import get_search_id, sign from .extractor import XiaoHongShuExtractor -from .secsign import seccore_signv2_playwright class XiaoHongShuClient(AbstractApiClient): @@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient): self.playwright_page = playwright_page self.cookie_dict = cookie_dict self._extractor = XiaoHongShuExtractor() + # 初始化 xhshow 客户端用于签名生成 + self._xhshow_client = Xhshow() async def _pre_headers(self, url: str, data=None) -> Dict: """ - 请求头参数签名 + 请求头参数签名,使用 xhshow 库生成签名 Args: - url: - data: + url: 完整的 URI(GET 请求包含查询参数) + data: POST 请求的请求体数据 Returns: """ - x_s = await seccore_signv2_playwright(self.playwright_page, url, data) - local_storage = await self.playwright_page.evaluate("() => window.localStorage") + # 获取 a1 cookie 值 + a1_value = self.cookie_dict.get("a1", "") + + # 根据请求类型使用不同的签名方法 + if data is None: + # GET 请求:从 url 中提取参数 + from urllib.parse import urlparse, parse_qs + parsed = urlparse(url) + params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()} + # 使用完整的 URL(包含 host) + full_url = f"{self._host}{url}" + x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params) + else: + # POST 请求:使用 data 作为 payload + full_url = f"{self._host}{url}" + x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data) + + # 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串 + b1_value = "" + try: + if self.playwright_page: + local_storage = await self.playwright_page.evaluate("() => window.localStorage") + b1_value = local_storage.get("b1", "") + except Exception as e: + utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string") + + # 使用 sign 函数生成其他签名头 signs = sign( - a1=self.cookie_dict.get("a1", ""), - b1=local_storage.get("b1", ""), + a1=a1_value, + b1=b1_value, x_s=x_s, - x_t=str(int(time.time())), + x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳 ) headers = { @@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient): elif data["code"] == self.IP_ERROR_CODE: raise IPBlockError(self.IP_ERROR_STR) else: - raise DataFetchError(data.get("msg", None)) + err_msg = data.get("msg", None) or f"{response.text}" + raise DataFetchError(err_msg) async def get(self, uri: str, params=None) -> Dict: """ @@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient): creator: str, cursor: str, page_size: int = 30, + xsec_token: str = "", + xsec_source: str = "pc_feed", ) -> Dict: """ 获取博主的笔记 @@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient): creator: 博主ID cursor: 上一页最后一条笔记的ID page_size: 分页数据长度 + xsec_token: 验证token + xsec_source: 渠道来源 Returns: """ - uri = "/api/sns/web/v1/user_posted" - data = { - "user_id": creator, - "cursor": cursor, - "num": page_size, - "image_formats": "jpg,webp,avif", - } - return await self.get(uri, data) + uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}" + return await self.get(uri) async def get_all_notes_by_creator( self, user_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, + xsec_token: str = "", + xsec_source: str = "pc_feed", ) -> List[Dict]: """ 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 @@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient): user_id: 用户ID crawl_interval: 爬取一次的延迟单位(秒) callback: 一次分页爬取结束后的更新回调函数 + xsec_token: 验证token + xsec_source: 渠道来源 Returns: @@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient): notes_has_more = True notes_cursor = "" while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: - notes_res = await self.get_notes_by_creator(user_id, notes_cursor) + notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source) if not notes_res: utils.logger.error( f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py index 68d2139..bbc8ee7 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py @@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler): user_id=user_id, crawl_interval=crawl_interval, callback=self.fetch_creator_notes_detail, + xsec_token=creator_info.xsec_token, + xsec_source=creator_info.xsec_source, ) note_ids = [] @@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler): Dict: note detail """ note_detail = None + utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}") async with semaphore: try: - utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}") - note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True) + try: + note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) + except RetryError: + pass + if not note_detail: - raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") + note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, + enable_cookie=True) + if not note_detail: + raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source}) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt b/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt index a04b4f6..8acb441 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/requirements.txt @@ -24,3 +24,4 @@ cryptography>=45.0.7 alembic>=1.16.5 asyncmy>=0.2.10 sqlalchemy>=2.0.43 +xhshow>=0.1.3 diff --git a/MindSpider/requirements.txt b/MindSpider/requirements.txt index 87171f7..23de1ee 100644 --- a/MindSpider/requirements.txt +++ b/MindSpider/requirements.txt @@ -49,6 +49,7 @@ parsel==1.9.1 pyexecjs==1.5.1 typer>=0.12.3 pyhumps==3.8.0 +xhshow>=0.1.3 # =============================== # 工具包 diff --git a/requirements.txt b/requirements.txt index cfd9872..69e8d92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,6 +48,7 @@ beautifulsoup4>=4.12.0 lxml>=4.9.0 parsel==1.9.1 pyexecjs==1.5.1 +xhshow>=0.1.3 # ===== 可视化 ===== plotly>=5.17.0