Uploading the AI Crawler System: MindSpider

2025-08-27 13:49:07 +08:00
parent 822bad557f
commit 587e709e82
174 changed files with 34562 additions and 25 deletions
@@ -0,0 +1,13 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+from .core import XiaoHongShuCrawler
+from .field import *
@@ -0,0 +1,592 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import asyncio
+import json
+import re
+from typing import Any, Callable, Dict, List, Optional, Union
+from urllib.parse import urlencode
+
+import httpx
+from playwright.async_api import BrowserContext, Page
+from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
+
+import config
+from base.base_crawler import AbstractApiClient
+from tools import utils
+from html import unescape
+
+from .exception import DataFetchError, IPBlockError
+from .field import SearchNoteType, SearchSortType
+from .help import get_search_id, sign
+
+
+class XiaoHongShuClient(AbstractApiClient):
+
+    def __init__(
+        self,
+        timeout=60,  # 若开启爬取媒体选项，xhs 的长视频需要更久的超时时间
+        proxy=None,
+        *,
+        headers: Dict[str, str],
+        playwright_page: Page,
+        cookie_dict: Dict[str, str],
+    ):
+        self.proxy = proxy
+        self.timeout = timeout
+        self.headers = headers
+        self._host = "https://edith.xiaohongshu.com"
+        self._domain = "https://www.xiaohongshu.com"
+        self.IP_ERROR_STR = "网络连接异常，请检查网络设置或重启试试"
+        self.IP_ERROR_CODE = 300012
+        self.NOTE_ABNORMAL_STR = "笔记状态异常，请稍后查看"
+        self.NOTE_ABNORMAL_CODE = -510001
+        self.playwright_page = playwright_page
+        self.cookie_dict = cookie_dict
+
+    async def _pre_headers(self, url: str, data=None) -> Dict:
+        """
+        请求头参数签名
+        Args:
+            url:
+            data:
+
+        Returns:
+
+        """
+        encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
+        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+        signs = sign(
+            a1=self.cookie_dict.get("a1", ""),
+            b1=local_storage.get("b1", ""),
+            x_s=encrypt_params.get("X-s", ""),
+            x_t=str(encrypt_params.get("X-t", "")),
+        )
+
+        headers = {
+            "X-S": signs["x-s"],
+            "X-T": signs["x-t"],
+            "x-S-Common": signs["x-s-common"],
+            "X-B3-Traceid": signs["x-b3-traceid"],
+        }
+        self.headers.update(headers)
+        return self.headers
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
+    async def request(self, method, url, **kwargs) -> Union[str, Any]:
+        """
+        封装httpx的公共请求方法，对请求响应做一些处理
+        Args:
+            method: 请求方法
+            url: 请求的URL
+            **kwargs: 其他请求参数，例如请求头、请求体等
+
+        Returns:
+
+        """
+        # return response.text
+        return_response = kwargs.pop("return_response", False)
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+
+        if response.status_code == 471 or response.status_code == 461:
+            # someday someone maybe will bypass captcha
+            verify_type = response.headers["Verifytype"]
+            verify_uuid = response.headers["Verifyuuid"]
+            msg = f"出现验证码，请求失败，Verifytype: {verify_type}，Verifyuuid: {verify_uuid}, Response: {response}"
+            utils.logger.error(msg)
+            raise Exception(msg)
+
+        if return_response:
+            return response.text
+        data: Dict = response.json()
+        if data["success"]:
+            return data.get("data", data.get("success", {}))
+        elif data["code"] == self.IP_ERROR_CODE:
+            raise IPBlockError(self.IP_ERROR_STR)
+        else:
+            raise DataFetchError(data.get("msg", None))
+
+    async def get(self, uri: str, params=None) -> Dict:
+        """
+        GET请求，对请求头签名
+        Args:
+            uri: 请求路由
+            params: 请求参数
+
+        Returns:
+
+        """
+        final_uri = uri
+        if isinstance(params, dict):
+            final_uri = f"{uri}?" f"{urlencode(params)}"
+        headers = await self._pre_headers(final_uri)
+        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
+
+    async def post(self, uri: str, data: dict, **kwargs) -> Dict:
+        """
+        POST请求，对请求头签名
+        Args:
+            uri: 请求路由
+            data: 请求体参数
+
+        Returns:
+
+        """
+        headers = await self._pre_headers(uri, data)
+        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
+        return await self.request(
+            method="POST",
+            url=f"{self._host}{uri}",
+            data=json_str,
+            headers=headers,
+            **kwargs,
+        )
+
+    async def get_note_media(self, url: str) -> Union[bytes, None]:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            try:
+                response = await client.request("GET", url, timeout=self.timeout)
+                response.raise_for_status()
+                if not response.reason_phrase == "OK":
+                    utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
+                    return None
+                else:
+                    return response.content
+            except httpx.HTTPError as exc:  # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
+                utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}")  # 保留原始异常类型名称，以便开发者调试
+                return None
+
+    async def pong(self) -> bool:
+        """
+        用于检查登录态是否失效了
+        Returns:
+
+        """
+        """get a note to check if login state is ok"""
+        utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
+        ping_flag = False
+        try:
+            note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
+            if note_card.get("items"):
+                ping_flag = True
+        except Exception as e:
+            utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
+            ping_flag = False
+        return ping_flag
+
+    async def update_cookies(self, browser_context: BrowserContext):
+        """
+        API客户端提供的更新cookies方法，一般情况下登录成功后会调用此方法
+        Args:
+            browser_context: 浏览器上下文对象
+
+        Returns:
+
+        """
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        self.cookie_dict = cookie_dict
+
+    async def get_note_by_keyword(
+        self,
+        keyword: str,
+        search_id: str = get_search_id(),
+        page: int = 1,
+        page_size: int = 20,
+        sort: SearchSortType = SearchSortType.GENERAL,
+        note_type: SearchNoteType = SearchNoteType.ALL,
+    ) -> Dict:
+        """
+        根据关键词搜索笔记
+        Args:
+            keyword: 关键词参数
+            page: 分页第几页
+            page_size: 分页数据长度
+            sort: 搜索结果排序指定
+            note_type: 搜索的笔记类型
+
+        Returns:
+
+        """
+        uri = "/api/sns/web/v1/search/notes"
+        data = {
+            "keyword": keyword,
+            "page": page,
+            "page_size": page_size,
+            "search_id": search_id,
+            "sort": sort.value,
+            "note_type": note_type.value,
+        }
+        return await self.post(uri, data)
+
+    async def get_note_by_id(
+        self,
+        note_id: str,
+        xsec_source: str,
+        xsec_token: str,
+    ) -> Dict:
+        """
+        获取笔记详情API
+        Args:
+            note_id:笔记ID
+            xsec_source: 渠道来源
+            xsec_token: 搜索关键字之后返回的比较列表中返回的token
+
+        Returns:
+
+        """
+        if xsec_source == "":
+            xsec_source = "pc_search"
+
+        data = {
+            "source_note_id": note_id,
+            "image_formats": ["jpg", "webp", "avif"],
+            "extra": {
+                "need_body_topic": 1
+            },
+            "xsec_source": xsec_source,
+            "xsec_token": xsec_token,
+        }
+        uri = "/api/sns/web/v1/feed"
+        res = await self.post(uri, data)
+        if res and res.get("items"):
+            res_dict: Dict = res["items"][0]["note_card"]
+            return res_dict
+        # 爬取频繁了可能会出现有的笔记能有结果有的没有
+        utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
+        return dict()
+
+    async def get_note_comments(
+        self,
+        note_id: str,
+        xsec_token: str,
+        cursor: str = "",
+    ) -> Dict:
+        """
+        获取一级评论的API
+        Args:
+            note_id: 笔记ID
+            xsec_token: 验证token
+            cursor: 分页游标
+
+        Returns:
+
+        """
+        uri = "/api/sns/web/v2/comment/page"
+        params = {
+            "note_id": note_id,
+            "cursor": cursor,
+            "top_comment_id": "",
+            "image_formats": "jpg,webp,avif",
+            "xsec_token": xsec_token,
+        }
+        return await self.get(uri, params)
+
+    async def get_note_sub_comments(
+        self,
+        note_id: str,
+        root_comment_id: str,
+        xsec_token: str,
+        num: int = 10,
+        cursor: str = "",
+    ):
+        """
+        获取指定父评论下的子评论的API
+        Args:
+            note_id: 子评论的帖子ID
+            root_comment_id: 根评论ID
+            xsec_token: 验证token
+            num: 分页数量
+            cursor: 分页游标
+
+        Returns:
+
+        """
+        uri = "/api/sns/web/v2/comment/sub/page"
+        params = {
+            "note_id": note_id,
+            "root_comment_id": root_comment_id,
+            "num": num,
+            "cursor": cursor,
+            "image_formats": "jpg,webp,avif",
+            "top_comment_id": "",
+            "xsec_token": xsec_token,
+        }
+        return await self.get(uri, params)
+
+    async def get_note_all_comments(
+        self,
+        note_id: str,
+        xsec_token: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_count: int = 10,
+    ) -> List[Dict]:
+        """
+        获取指定笔记下的所有一级评论，该方法会一直查找一个帖子下的所有评论信息
+        Args:
+            note_id: 笔记ID
+            xsec_token: 验证token
+            crawl_interval: 爬取一次笔记的延迟单位（秒）
+            callback: 一次笔记爬取结束后
+            max_count: 一次笔记爬取的最大评论数量
+        Returns:
+
+        """
+        result = []
+        comments_has_more = True
+        comments_cursor = ""
+        while comments_has_more and len(result) < max_count:
+            comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
+            comments_has_more = comments_res.get("has_more", False)
+            comments_cursor = comments_res.get("cursor", "")
+            if "comments" not in comments_res:
+                utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
+                break
+            comments = comments_res["comments"]
+            if len(result) + len(comments) > max_count:
+                comments = comments[:max_count - len(result)]
+            if callback:
+                await callback(note_id, comments)
+            await asyncio.sleep(crawl_interval)
+            result.extend(comments)
+            sub_comments = await self.get_comments_all_sub_comments(
+                comments=comments,
+                xsec_token=xsec_token,
+                crawl_interval=crawl_interval,
+                callback=callback,
+            )
+            result.extend(sub_comments)
+        return result
+
+    async def get_comments_all_sub_comments(
+        self,
+        comments: List[Dict],
+        xsec_token: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[Dict]:
+        """
+        获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
+        Args:
+            comments: 评论列表
+            xsec_token: 验证token
+            crawl_interval: 爬取一次评论的延迟单位（秒）
+            callback: 一次评论爬取结束后
+
+        Returns:
+
+        """
+        if not config.ENABLE_GET_SUB_COMMENTS:
+            utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            return []
+
+        result = []
+        for comment in comments:
+            note_id = comment.get("note_id")
+            sub_comments = comment.get("sub_comments")
+            if sub_comments and callback:
+                await callback(note_id, sub_comments)
+
+            sub_comment_has_more = comment.get("sub_comment_has_more")
+            if not sub_comment_has_more:
+                continue
+
+            root_comment_id = comment.get("id")
+            sub_comment_cursor = comment.get("sub_comment_cursor")
+
+            while sub_comment_has_more:
+                comments_res = await self.get_note_sub_comments(
+                    note_id=note_id,
+                    root_comment_id=root_comment_id,
+                    xsec_token=xsec_token,
+                    num=10,
+                    cursor=sub_comment_cursor,
+                )
+
+                if comments_res is None:
+                    utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
+                    continue
+                sub_comment_has_more = comments_res.get("has_more", False)
+                sub_comment_cursor = comments_res.get("cursor", "")
+                if "comments" not in comments_res:
+                    utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+                    break
+                comments = comments_res["comments"]
+                if callback:
+                    await callback(note_id, comments)
+                await asyncio.sleep(crawl_interval)
+                result.extend(comments)
+        return result
+
+    async def get_creator_info(self, user_id: str) -> Dict:
+        """
+        通过解析网页版的用户主页HTML，获取用户个人简要信息
+        PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的，解析它即可
+        eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
+        """
+        uri = f"/user/profile/{user_id}"
+        html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
+        match = re.search(r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html_content, re.M)
+
+        if match is None:
+            return {}
+
+        info = json.loads(match.group(1).replace(":undefined", ":null"), strict=False)
+        if info is None:
+            return {}
+        return info.get("user").get("userPageData")
+
+    async def get_notes_by_creator(
+        self,
+        creator: str,
+        cursor: str,
+        page_size: int = 30,
+    ) -> Dict:
+        """
+        获取博主的笔记
+        Args:
+            creator: 博主ID
+            cursor: 上一页最后一条笔记的ID
+            page_size: 分页数据长度
+
+        Returns:
+
+        """
+        uri = "/api/sns/web/v1/user_posted"
+        data = {
+            "user_id": creator,
+            "cursor": cursor,
+            "num": page_size,
+            "image_formats": "jpg,webp,avif",
+        }
+        return await self.get(uri, data)
+
+    async def get_all_notes_by_creator(
+        self,
+        user_id: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[Dict]:
+        """
+        获取指定用户下的所有发过的帖子，该方法会一直查找一个用户下的所有帖子信息
+        Args:
+            user_id: 用户ID
+            crawl_interval: 爬取一次的延迟单位（秒）
+            callback: 一次分页爬取结束后的更新回调函数
+
+        Returns:
+
+        """
+        result = []
+        notes_has_more = True
+        notes_cursor = ""
+        while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
+            notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
+            if not notes_res:
+                utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
+                break
+
+            notes_has_more = notes_res.get("has_more", False)
+            notes_cursor = notes_res.get("cursor", "")
+            if "notes" not in notes_res:
+                utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
+                break
+
+            notes = notes_res["notes"]
+            utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
+
+            remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result)
+            if remaining <= 0:
+                break
+
+            notes_to_add = notes[:remaining]
+            if callback:
+                await callback(notes_to_add)
+
+            result.extend(notes_to_add)
+            await asyncio.sleep(crawl_interval)
+
+        utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}")
+        return result
+
+    async def get_note_short_url(self, note_id: str) -> Dict:
+        """
+        获取笔记的短链接
+        Args:
+            note_id: 笔记ID
+
+        Returns:
+
+        """
+        uri = f"/api/sns/web/short_url"
+        data = {"original_url": f"{self._domain}/discovery/item/{note_id}"}
+        return await self.post(uri, data=data, return_response=True)
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
+    async def get_note_by_id_from_html(
+        self,
+        note_id: str,
+        xsec_source: str,
+        xsec_token: str,
+        enable_cookie: bool = False,
+    ) -> Optional[Dict]:
+        """
+        通过解析网页版的笔记详情页HTML，获取笔记详情, 该接口可能会出现失败的情况，这里尝试重试3次
+        copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
+        thanks for ReaJason
+        Args:
+            note_id:
+            xsec_source:
+            xsec_token:
+            enable_cookie:
+
+        Returns:
+
+        """
+
+        def camel_to_underscore(key):
+            return re.sub(r"(?<!^)(?=[A-Z])", "_", key).lower()
+
+        def transform_json_keys(json_data):
+            data_dict = json.loads(json_data)
+            dict_new = {}
+            for key, value in data_dict.items():
+                new_key = camel_to_underscore(key)
+                if not value:
+                    dict_new[new_key] = value
+                elif isinstance(value, dict):
+                    dict_new[new_key] = transform_json_keys(json.dumps(value))
+                elif isinstance(value, list):
+                    dict_new[new_key] = [(transform_json_keys(json.dumps(item)) if (item and isinstance(item, dict)) else item) for item in value]
+                else:
+                    dict_new[new_key] = value
+            return dict_new
+
+        url = ("https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}")
+        copy_headers = self.headers.copy()
+        if not enable_cookie:
+            del copy_headers["Cookie"]
+
+        html = await self.request(method="GET", url=url, return_response=True, headers=copy_headers)
+
+        def get_note_dict(html):
+            state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
+
+            if state != "{}":
+                note_dict = transform_json_keys(state)
+                return note_dict["note"]["note_detail_map"][note_id]["note"]
+            return {}
+
+        try:
+            return get_note_dict(html)
+        except:
+            return None
@@ -0,0 +1,485 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import asyncio
+import os
+import random
+import time
+from asyncio import Task
+from typing import Dict, List, Optional, Tuple
+
+from playwright.async_api import (
+    BrowserContext,
+    BrowserType,
+    Page,
+    Playwright,
+    async_playwright,
+)
+from tenacity import RetryError
+
+import config
+from base.base_crawler import AbstractCrawler
+from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
+from model.m_xiaohongshu import NoteUrlInfo
+from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import xhs as xhs_store
+from tools import utils
+from tools.cdp_browser import CDPBrowserManager
+from var import crawler_type_var, source_keyword_var
+
+from .client import XiaoHongShuClient
+from .exception import DataFetchError
+from .field import SearchSortType
+from .help import parse_note_info_from_note_url, get_search_id
+from .login import XiaoHongShuLogin
+
+
+class XiaoHongShuCrawler(AbstractCrawler):
+    context_page: Page
+    xhs_client: XiaoHongShuClient
+    browser_context: BrowserContext
+    cdp_manager: Optional[CDPBrowserManager]
+
+    def __init__(self) -> None:
+        self.index_url = "https://www.xiaohongshu.com"
+        # self.user_agent = utils.get_user_agent()
+        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
+        self.cdp_manager = None
+
+    async def start(self) -> None:
+        """
+        Crawler entry point: launch the browser, make sure the session is logged
+        in, then run the crawl selected by ``config.CRAWLER_TYPE``.
+        """
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            # Take one proxy from the pool and derive the playwright / httpx forms of it.
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
+
+        async with async_playwright() as playwright:
+            # Choose the browser launch mode from the config.
+            if config.ENABLE_CDP_MODE:
+                utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器")
+                self.browser_context = await self.launch_browser_with_cdp(
+                    playwright,
+                    playwright_proxy_format,
+                    self.user_agent,
+                    headless=config.CDP_HEADLESS,
+                )
+            else:
+                utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
+                # Launch a browser context.
+                chromium = playwright.chromium
+                self.browser_context = await self.launch_browser(
+                    chromium,
+                    playwright_proxy_format,
+                    self.user_agent,
+                    headless=config.HEADLESS,
+                )
+            # stealth.min.js is a js script to prevent the website from detecting the crawler.
+            await self.browser_context.add_init_script(path="libs/stealth.min.js")
+            self.context_page = await self.browser_context.new_page()
+            await self.context_page.goto(self.index_url)
+
+            # Create a client to interact with the xiaohongshu website.
+            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
+            # Log in only when the probe request shows the current session is not usable.
+            if not await self.xhs_client.pong():
+                login_obj = XiaoHongShuLogin(
+                    login_type=config.LOGIN_TYPE,
+                    login_phone="",  # input your phone number
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES,
+                )
+                await login_obj.begin()
+                # Copy the freshly obtained browser cookies into the API client.
+                await self.xhs_client.update_cookies(browser_context=self.browser_context)
+
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
+                # Search for notes and retrieve their comment information.
+                await self.search()
+            elif config.CRAWLER_TYPE == "detail":
+                # Get the information and comments of the specified post
+                await self.get_specified_notes()
+            elif config.CRAWLER_TYPE == "creator":
+                # Get creator's information and their notes and comments
+                await self.get_creators_and_notes()
+            else:
+                # Unknown crawler type: nothing is crawled.
+                pass
+
+            utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
+
+    async def search(self) -> None:
+        """
+        Search for notes and retrieve their comment information.
+
+        For every comma-separated keyword in ``config.KEYWORDS`` the search API is
+        paged through; each page's notes are fetched concurrently, stored, and
+        their comments crawled.
+        """
+        utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
+        xhs_limit_count = 20  # xhs limit page fixed value
+        # The configured maximum is raised (on the config module itself) to at least one page.
+        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
+            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
+            page = 1
+            # One search id per keyword, reused for all of its pages.
+            search_id = get_search_id()
+            # Pages are counted from start_page, so the note budget applies to crawled pages only.
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+
+                try:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
+                    note_ids: List[str] = []
+                    xsec_tokens: List[str] = []
+                    notes_res = await self.xhs_client.get_note_by_keyword(
+                        keyword=keyword,
+                        search_id=search_id,
+                        page=page,
+                        sort=(SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != "" else SearchSortType.GENERAL),
+                    )
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
+                    # NOTE(review): this breaks before the items are processed, so the items of a
+                    # page that reports has_more == False are not crawled - confirm this is intended.
+                    if not notes_res or not notes_res.get("has_more", False):
+                        utils.logger.info("No more content!")
+                        break
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    # Entries whose model_type is "rec_query" / "hot_query" are skipped.
+                    task_list = [
+                        self.get_note_detail_async_task(
+                            note_id=post_item.get("id"),
+                            xsec_source=post_item.get("xsec_source"),
+                            xsec_token=post_item.get("xsec_token"),
+                            semaphore=semaphore,
+                        ) for post_item in notes_res.get("items", {}) if post_item.get("model_type") not in ("rec_query", "hot_query")
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail:
+                            await xhs_store.update_xhs_note(note_detail)
+                            await self.get_notice_media(note_detail)
+                            note_ids.append(note_detail.get("note_id"))
+                            xsec_tokens.append(note_detail.get("xsec_token"))
+                    page += 1
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_ids, xsec_tokens)
+                except DataFetchError:
+                    # Stop paging the current keyword; the next keyword is still processed.
+                    utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
+                    break
+
+    async def get_creators_and_notes(self) -> None:
+        """Get creator's notes and retrieve their comment information."""
+        utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
+        for user_id in config.XHS_CREATOR_ID_LIST:
+            # get creator detail info from web html content
+            createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
+            if createor_info:
+                await xhs_store.save_creator(user_id, creator=createor_info)
+
+            # When proxy is not enabled, increase the crawling interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+            # Get all note information of the creator
+            all_notes_list = await self.xhs_client.get_all_notes_by_creator(
+                user_id=user_id,
+                crawl_interval=crawl_interval,
+                callback=self.fetch_creator_notes_detail,
+            )
+
+            note_ids = []
+            xsec_tokens = []
+            for note_item in all_notes_list:
+                note_ids.append(note_item.get("note_id"))
+                xsec_tokens.append(note_item.get("xsec_token"))
+            await self.batch_get_note_comments(note_ids, xsec_tokens)
+
+    async def fetch_creator_notes_detail(self, note_list: List[Dict]):
+        """
+        Concurrently obtain the specified post list and save the data
+        """
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail_async_task(
+                note_id=post_item.get("note_id"),
+                xsec_source=post_item.get("xsec_source"),
+                xsec_token=post_item.get("xsec_token"),
+                semaphore=semaphore,
+            ) for post_item in note_list
+        ]
+
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail:
+                await xhs_store.update_xhs_note(note_detail)
+                await self.get_notice_media(note_detail)
+
+    async def get_specified_notes(self):
+        """
+        Get the information and comments of the specified post
+        must be specified note_id, xsec_source, xsec_token⚠️⚠️⚠️
+        Returns:
+
+        """
+        get_note_detail_task_list = []
+        for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
+            note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
+            utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
+            crawler_task = self.get_note_detail_async_task(
+                note_id=note_url_info.note_id,
+                xsec_source=note_url_info.xsec_source,
+                xsec_token=note_url_info.xsec_token,
+                semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM),
+            )
+            get_note_detail_task_list.append(crawler_task)
+
+        need_get_comment_note_ids = []
+        xsec_tokens = []
+        note_details = await asyncio.gather(*get_note_detail_task_list)
+        for note_detail in note_details:
+            if note_detail:
+                need_get_comment_note_ids.append(note_detail.get("note_id", ""))
+                xsec_tokens.append(note_detail.get("xsec_token", ""))
+                await xhs_store.update_xhs_note(note_detail)
+                await self.get_notice_media(note_detail)
+        await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
+
+    async def get_note_detail_async_task(
+        self,
+        note_id: str,
+        xsec_source: str,
+        xsec_token: str,
+        semaphore: asyncio.Semaphore,
+    ) -> Optional[Dict]:
+        """Get note detail
+
+        Args:
+            note_id:
+            xsec_source:
+            xsec_token:
+            semaphore:
+
+        Returns:
+            Dict: note detail
+        """
+        note_detail = None
+        async with semaphore:
+            try:
+                utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
+
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
+                except RetryError as e:
+                    pass
+
+                if not note_detail:
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
+                    if not note_detail:
+                        raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+
+                note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
+                return note_detail
+
+            except DataFetchError as ex:
+                utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
+                return None
+            except KeyError as ex:
+                utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}")
+                return None
+
+    async def batch_get_note_comments(self, note_list: List[str], xsec_tokens: List[str]):
+        """Batch get note comments"""
+        if not config.ENABLE_GET_COMMENTS:
+            utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
+            return
+
+        utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list: List[Task] = []
+        for index, note_id in enumerate(note_list):
+            task = asyncio.create_task(
+                self.get_comments(note_id=note_id, xsec_token=xsec_tokens[index], semaphore=semaphore),
+                name=note_id,
+            )
+            task_list.append(task)
+        await asyncio.gather(*task_list)
+
+    async def get_comments(self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore):
+        """Get note comments with keyword filtering and quantity limitation"""
+        async with semaphore:
+            utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
+            # When proxy is not enabled, increase the crawling interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+            await self.xhs_client.get_note_all_comments(
+                note_id=note_id,
+                xsec_token=xsec_token,
+                crawl_interval=crawl_interval,
+                callback=xhs_store.batch_update_xhs_note_comments,
+                max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
+            )
+
+    async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
+        """Create xhs client"""
+        utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
+        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+        xhs_client_obj = XiaoHongShuClient(
+            proxy=httpx_proxy,
+            headers={
+                "accept": "application/json, text/plain, */*",
+                "accept-language": "zh-CN,zh;q=0.9",
+                "cache-control": "no-cache",
+                "content-type": "application/json;charset=UTF-8",
+                "origin": "https://www.xiaohongshu.com",
+                "pragma": "no-cache",
+                "priority": "u=1, i",
+                "referer": "https://www.xiaohongshu.com/",
+                "sec-ch-ua": '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
+                "sec-ch-ua-mobile": "?0",
+                "sec-ch-ua-platform": '"Windows"',
+                "sec-fetch-dest": "empty",
+                "sec-fetch-mode": "cors",
+                "sec-fetch-site": "same-site",
+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
+                "Cookie": cookie_str,
+            },
+            playwright_page=self.context_page,
+            cookie_dict=cookie_dict,
+        )
+        return xhs_client_obj
+
+    async def launch_browser(
+        self,
+        chromium: BrowserType,
+        playwright_proxy: Optional[Dict],
+        user_agent: Optional[str],
+        headless: bool = True,
+    ) -> BrowserContext:
+        """Launch browser and create browser context"""
+        utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
+        if config.SAVE_LOGIN_STATE:
+            # feat issue #14
+            # we will save login state to avoid login every time
+            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
+            browser_context = await chromium.launch_persistent_context(
+                user_data_dir=user_data_dir,
+                accept_downloads=True,
+                headless=headless,
+                proxy=playwright_proxy,  # type: ignore
+                viewport={
+                    "width": 1920,
+                    "height": 1080
+                },
+                user_agent=user_agent,
+            )
+            return browser_context
+        else:
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
+            return browser_context
+
+    async def launch_browser_with_cdp(
+        self,
+        playwright: Playwright,
+        playwright_proxy: Optional[Dict],
+        user_agent: Optional[str],
+        headless: bool = True,
+    ) -> BrowserContext:
+        """
+        使用CDP模式启动浏览器
+        """
+        try:
+            self.cdp_manager = CDPBrowserManager()
+            browser_context = await self.cdp_manager.launch_and_connect(
+                playwright=playwright,
+                playwright_proxy=playwright_proxy,
+                user_agent=user_agent,
+                headless=headless,
+            )
+
+            # 显示浏览器信息
+            browser_info = await self.cdp_manager.get_browser_info()
+            utils.logger.info(f"[XiaoHongShuCrawler] CDP浏览器信息: {browser_info}")
+
+            return browser_context
+
+        except Exception as e:
+            utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败，回退到标准模式: {e}")
+            # 回退到标准模式
+            chromium = playwright.chromium
+            return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
+
+    async def close(self):
+        """Close browser context"""
+        # 如果使用CDP模式，需要特殊处理
+        if self.cdp_manager:
+            await self.cdp_manager.cleanup()
+            self.cdp_manager = None
+        else:
+            await self.browser_context.close()
+        utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
+
+    async def get_notice_media(self, note_detail: Dict):
+        if not config.ENABLE_GET_MEIDAS:
+            utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
+            return
+        await self.get_note_images(note_detail)
+        await self.get_notice_video(note_detail)
+
+    async def get_note_images(self, note_item: Dict):
+        """
+        get note images. please use get_notice_media
+        :param note_item:
+        :return:
+        """
+        if not config.ENABLE_GET_MEIDAS:
+            return
+        note_id = note_item.get("note_id")
+        image_list: List[Dict] = note_item.get("image_list", [])
+
+        for img in image_list:
+            if img.get("url_default") != "":
+                img.update({"url": img.get("url_default")})
+
+        if not image_list:
+            return
+        picNum = 0
+        for pic in image_list:
+            url = pic.get("url")
+            if not url:
+                continue
+            content = await self.xhs_client.get_note_media(url)
+            await asyncio.sleep(random.random())
+            if content is None:
+                continue
+            extension_file_name = f"{picNum}.jpg"
+            picNum += 1
+            await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
+
+    async def get_notice_video(self, note_item: Dict):
+        """
+        get note videos. please use get_notice_media
+        :param note_item:
+        :return:
+        """
+        if not config.ENABLE_GET_MEIDAS:
+            return
+        note_id = note_item.get("note_id")
+
+        videos = xhs_store.get_video_url_arr(note_item)
+
+        if not videos:
+            return
+        videoNum = 0
+        for url in videos:
+            content = await self.xhs_client.get_note_media(url)
+            await asyncio.sleep(random.random())
+            if content is None:
+                continue
+            extension_file_name = f"{videoNum}.mp4"
+            videoNum += 1
+            await xhs_store.update_xhs_note_video(note_id, content, extension_file_name)
@@ -0,0 +1,20 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+from httpx import RequestError
+
+
+class DataFetchError(RequestError):
+    """Raised when fetching data fails; subclasses httpx's RequestError."""
+
+
+class IPBlockError(RequestError):
+    """Raised when requests were sent so fast that the server blocked our IP."""
@@ -0,0 +1,83 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+from enum import Enum
+from typing import NamedTuple
+
+
+class FeedType(Enum):
+    # 推荐
+    RECOMMEND = "homefeed_recommend"
+    # 穿搭
+    FASION = "homefeed.fashion_v3"
+    # 美食
+    FOOD = "homefeed.food_v3"
+    # 彩妆
+    COSMETICS = "homefeed.cosmetics_v3"
+    # 影视
+    MOVIE = "homefeed.movie_and_tv_v3"
+    # 职场
+    CAREER = "homefeed.career_v3"
+    # 情感
+    EMOTION = "homefeed.love_v3"
+    # 家居
+    HOURSE = "homefeed.household_product_v3"
+    # 游戏
+    GAME = "homefeed.gaming_v3"
+    # 旅行
+    TRAVEL = "homefeed.travel_v3"
+    # 健身
+    FITNESS = "homefeed.fitness_v3"
+
+
+class NoteType(Enum):
+    """Note kinds: a normal note or a video note."""
+    NORMAL = "normal"
+    VIDEO = "video"
+
+
+class SearchSortType(Enum):
+    """Sort order options for a note search; each value is the sort identifier string."""
+    # default ordering
+    GENERAL = "general"
+    # most popular first
+    MOST_POPULAR = "popularity_descending"
+    # latest first
+    LATEST = "time_descending"
+
+
+class SearchNoteType(Enum):
+    """Note-type filter options for a note search (integer codes).
+    """
+    # default: all notes
+    ALL = 0
+    # only video notes
+    VIDEO = 1
+    # only image notes
+    IMAGE = 2
+
+
+class Note(NamedTuple):
+    """Immutable record describing one note.
+
+    The count fields are declared as ``str`` and the two time fields as ``int``.
+    NOTE(review): the unit of ``time`` / ``last_update_time`` is not visible
+    here - confirm against the producer of these tuples.
+    """
+    note_id: str
+    title: str
+    desc: str
+    type: str
+    user: dict
+    img_urls: list
+    video_url: str
+    tag_list: list
+    at_user_list: list
+    collected_count: str
+    comment_count: str
+    liked_count: str
+    share_count: str
+    time: int
+    last_update_time: int
@@ -0,0 +1,316 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+import ctypes
+import json
+import random
+import time
+import urllib.parse
+
+from model.m_xiaohongshu import NoteUrlInfo
+from tools.crawler_util import extract_url_params_to_dict
+
+
+def sign(a1="", b1="", x_s="", x_t=""):
+    """
+    takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t".
+    """
+    common = {
+        "s0": 3,  # getPlatformCode
+        "s1": "",
+        "x0": "1",  # localStorage.getItem("b1b1")
+        "x1": "3.7.8-2",  # version
+        "x2": "Mac OS",
+        "x3": "xhs-pc-web",
+        "x4": "4.27.2",
+        "x5": a1,  # cookie of a1
+        "x6": x_t,
+        "x7": x_s,
+        "x8": b1,  # localStorage.getItem("b1")
+        "x9": mrc(x_t + x_s + b1),
+        "x10": 154,  # getSigCount
+    }
+    encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
+    x_s_common = b64Encode(encode_str)
+    x_b3_traceid = get_b3_trace_id()
+    return {
+        "x-s": x_s,
+        "x-t": x_t,
+        "x-s-common": x_s_common,
+        "x-b3-traceid": x_b3_traceid
+    }
+
+
+def get_b3_trace_id():
+    re = "abcdef0123456789"
+    je = 16
+    e = ""
+    for t in range(16):
+        e += re[random.randint(0, je - 1)]
+    return e
+
+
+def mrc(e):
+    """Compute a table-driven checksum over the first 57 characters of *e*.
+
+    Exactly 57 characters are read, so a shorter *e* raises IndexError. Each
+    character's code point is XOR-ed into a table index, which presumably
+    requires code points below 256 - verify against callers.
+
+    NOTE(review): the table looks like the standard CRC-32 lookup table, and
+    3988292384 equals 0xEDB88320 - confirm before relying on that.
+    """
+    # 256-entry lookup table, indexed by (low byte of state) XOR (char code).
+    ie = [
+        0, 1996959894, 3993919788, 2567524794, 124634137, 1886057615, 3915621685,
+        2657392035, 249268274, 2044508324, 3772115230, 2547177864, 162941995,
+        2125561021, 3887607047, 2428444049, 498536548, 1789927666, 4089016648,
+        2227061214, 450548861, 1843258603, 4107580753, 2211677639, 325883990,
+        1684777152, 4251122042, 2321926636, 335633487, 1661365465, 4195302755,
+        2366115317, 997073096, 1281953886, 3579855332, 2724688242, 1006888145,
+        1258607687, 3524101629, 2768942443, 901097722, 1119000684, 3686517206,
+        2898065728, 853044451, 1172266101, 3705015759, 2882616665, 651767980,
+        1373503546, 3369554304, 3218104598, 565507253, 1454621731, 3485111705,
+        3099436303, 671266974, 1594198024, 3322730930, 2970347812, 795835527,
+        1483230225, 3244367275, 3060149565, 1994146192, 31158534, 2563907772,
+        4023717930, 1907459465, 112637215, 2680153253, 3904427059, 2013776290,
+        251722036, 2517215374, 3775830040, 2137656763, 141376813, 2439277719,
+        3865271297, 1802195444, 476864866, 2238001368, 4066508878, 1812370925,
+        453092731, 2181625025, 4111451223, 1706088902, 314042704, 2344532202,
+        4240017532, 1658658271, 366619977, 2362670323, 4224994405, 1303535960,
+        984961486, 2747007092, 3569037538, 1256170817, 1037604311, 2765210733,
+        3554079995, 1131014506, 879679996, 2909243462, 3663771856, 1141124467,
+        855842277, 2852801631, 3708648649, 1342533948, 654459306, 3188396048,
+        3373015174, 1466479909, 544179635, 3110523913, 3462522015, 1591671054,
+        702138776, 2966460450, 3352799412, 1504918807, 783551873, 3082640443,
+        3233442989, 3988292384, 2596254646, 62317068, 1957810842, 3939845945,
+        2647816111, 81470997, 1943803523, 3814918930, 2489596804, 225274430,
+        2053790376, 3826175755, 2466906013, 167816743, 2097651377, 4027552580,
+        2265490386, 503444072, 1762050814, 4150417245, 2154129355, 426522225,
+        1852507879, 4275313526, 2312317920, 282753626, 1742555852, 4189708143,
+        2394877945, 397917763, 1622183637, 3604390888, 2714866558, 953729732,
+        1340076626, 3518719985, 2797360999, 1068828381, 1219638859, 3624741850,
+        2936675148, 906185462, 1090812512, 3747672003, 2825379669, 829329135,
+        1181335161, 3412177804, 3160834842, 628085408, 1382605366, 3423369109,
+        3138078467, 570562233, 1426400815, 3317316542, 2998733608, 733239954,
+        1555261956, 3268935591, 3050360625, 752459403, 1541320221, 2607071920,
+        3965973030, 1969922972, 40735498, 2617837225, 3943577151, 1913087877,
+        83908371, 2512341634, 3803740692, 2075208622, 213261112, 2463272603,
+        3855990285, 2094854071, 198958881, 2262029012, 4057260610, 1759359992,
+        534414190, 2176718541, 4139329115, 1873836001, 414664567, 2282248934,
+        4279200368, 1711684554, 285281116, 2405801727, 4167216745, 1634467795,
+        376229701, 2685067896, 3608007406, 1308918612, 956543938, 2808555105,
+        3495958263, 1231636301, 1047427035, 2932959818, 3654703836, 1088359270,
+        936918000, 2847714899, 3736837829, 1202900863, 817233897, 3183342108,
+        3401237130, 1404277552, 615818150, 3134207493, 3453421203, 1423857449,
+        601450431, 3009837614, 3294710456, 1567103746, 711928724, 3020668471,
+        3272380065, 1510334235, 755167117,
+    ]
+    # Running state; starts at -1 (all bits set).
+    o = -1
+
+    def right_without_sign(num: int, bit: int=0) -> int:
+        """Shift *num* right by *bit* after reinterpreting it as an unsigned 32-bit value."""
+        val = ctypes.c_uint32(num).value >> bit
+        MAX32INT = 4294967295
+        # For val in [0, 2**32) this expression evaluates to val itself.
+        return (val + (MAX32INT + 1)) % (2 * (MAX32INT + 1)) - MAX32INT - 1
+
+    # Fixed 57 iterations, independent of len(e).
+    for n in range(57):
+        o = ie[(o & 255) ^ ord(e[n])] ^ right_without_sign(o, 8)
+    # Final inversion (XOR with -1) followed by XOR with a fixed constant.
+    return o ^ -1 ^ 3988292384
+
+
+lookup = [
+    "Z",
+    "m",
+    "s",
+    "e",
+    "r",
+    "b",
+    "B",
+    "o",
+    "H",
+    "Q",
+    "t",
+    "N",
+    "P",
+    "+",
+    "w",
+    "O",
+    "c",
+    "z",
+    "a",
+    "/",
+    "L",
+    "p",
+    "n",
+    "g",
+    "G",
+    "8",
+    "y",
+    "J",
+    "q",
+    "4",
+    "2",
+    "K",
+    "W",
+    "Y",
+    "j",
+    "0",
+    "D",
+    "S",
+    "f",
+    "d",
+    "i",
+    "k",
+    "x",
+    "3",
+    "V",
+    "T",
+    "1",
+    "6",
+    "I",
+    "l",
+    "U",
+    "A",
+    "F",
+    "M",
+    "9",
+    "7",
+    "h",
+    "E",
+    "C",
+    "v",
+    "u",
+    "R",
+    "X",
+    "5",
+]
+
+
+def tripletToBase64(e):
+    return (
+            lookup[63 & (e >> 18)] +
+            lookup[63 & (e >> 12)] +
+            lookup[(e >> 6) & 63] +
+            lookup[e & 63]
+    )
+
+
+def encodeChunk(e, t, r):
+    m = []
+    for b in range(t, r, 3):
+        n = (16711680 & (e[b] << 16)) + \
+            ((e[b + 1] << 8) & 65280) + (e[b + 2] & 255)
+        m.append(tripletToBase64(n))
+    return ''.join(m)
+
+
+def b64Encode(e):
+    P = len(e)
+    W = P % 3
+    U = []
+    z = 16383
+    H = 0
+    Z = P - W
+    while H < Z:
+        U.append(encodeChunk(e, H, Z if H + z > Z else H + z))
+        H += z
+    if 1 == W:
+        F = e[P - 1]
+        U.append(lookup[F >> 2] + lookup[(F << 4) & 63] + "==")
+    elif 2 == W:
+        F = (e[P - 2] << 8) + e[P - 1]
+        U.append(lookup[F >> 10] + lookup[63 & (F >> 4)] +
+                 lookup[(F << 2) & 63] + "=")
+    return "".join(U)
+
+
+def encodeUtf8(e):
+    b = []
+    m = urllib.parse.quote(e, safe='~()*!.\'')
+    w = 0
+    while w < len(m):
+        T = m[w]
+        if T == "%":
+            E = m[w + 1] + m[w + 2]
+            S = int(E, 16)
+            b.append(S)
+            w += 2
+        else:
+            b.append(ord(T[0]))
+        w += 1
+    return b
+
+
+def base36encode(number, alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
+    """Converts an integer to a base36 string."""
+    if not isinstance(number, int):
+        raise TypeError('number must be an integer')
+
+    base36 = ''
+    sign = ''
+
+    if number < 0:
+        sign = '-'
+        number = -number
+
+    if 0 <= number < len(alphabet):
+        return sign + alphabet[number]
+
+    while number != 0:
+        number, i = divmod(number, len(alphabet))
+        base36 = alphabet[i] + base36
+
+    return sign + base36
+
+
+def base36decode(number):
+    return int(number, 36)
+
+
+def get_search_id():
+    e = int(time.time() * 1000) << 64
+    t = int(random.uniform(0, 2147483646))
+    return base36encode((e + t))
+
+
+img_cdns = [
+    "https://sns-img-qc.xhscdn.com",
+    "https://sns-img-hw.xhscdn.com",
+    "https://sns-img-bd.xhscdn.com",
+    "https://sns-img-qn.xhscdn.com",
+]
+
+def get_img_url_by_trace_id(trace_id: str, format_type: str = "png"):
+    return f"{random.choice(img_cdns)}/{trace_id}?imageView2/format/{format_type}"
+
+
+def get_img_urls_by_trace_id(trace_id: str, format_type: str = "png"):
+    return [f"{cdn}/{trace_id}?imageView2/format/{format_type}" for cdn in img_cdns]
+
+
+def get_trace_id(img_url: str):
+    # 浏览器端上传的图片多了 /spectrum/ 这个路径
+    return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]
+
+
+def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
+    """
+    从小红书笔记url中解析出笔记信息
+    Args:
+        url: "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    Returns:
+
+    """
+    note_id = url.split("/")[-1].split("?")[0]
+    params = extract_url_params_to_dict(url)
+    xsec_token = params.get("xsec_token", "")
+    xsec_source = params.get("xsec_source", "")
+    return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
+
+
+if __name__ == '__main__':
+    _img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
+    # 获取一个图片地址在多个cdn下的url地址
+    # final_img_urls = get_img_urls_by_trace_id(get_trace_id(_img_url))
+    final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
+    print(final_img_url)
+
+
@@ -0,0 +1,197 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+import asyncio
+import functools
+import sys
+from typing import Optional
+
+from playwright.async_api import BrowserContext, Page
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
+                      wait_fixed)
+
+import config
+from base.base_crawler import AbstractLogin
+from cache.cache_factory import CacheFactory
+from tools import utils
+
+
+class XiaoHongShuLogin(AbstractLogin):
+
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: Optional[str] = "",
+                 cookie_str: str = ""
+                 ):
+        config.LOGIN_TYPE = login_type
+        self.browser_context = browser_context
+        self.context_page = context_page
+        self.login_phone = login_phone
+        self.cookie_str = cookie_str
+
+    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
+    async def check_login_state(self, no_logged_in_session: str) -> bool:
+        """
+            Check if the current login status is successful and return True otherwise return False
+            retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
+            if max retry times reached, raise RetryError
+        """
+
+        if "请通过验证" in await self.context_page.content():
+            utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码，请手动验证")
+
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        current_web_session = cookie_dict.get("web_session")
+        if current_web_session != no_logged_in_session:
+            return True
+        return False
+
+    async def begin(self):
+        """Start login xiaohongshu"""
+        utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
+        if config.LOGIN_TYPE == "qrcode":
+            await self.login_by_qrcode()
+        elif config.LOGIN_TYPE == "phone":
+            await self.login_by_mobile()
+        elif config.LOGIN_TYPE == "cookie":
+            await self.login_by_cookies()
+        else:
+            raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
+
+    async def login_by_mobile(self):
+        """Login xiaohongshu by mobile"""
+        utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
+        await asyncio.sleep(1)
+        try:
+            # 小红书进入首页后，有可能不会自动弹出登录框，需要手动点击登录按钮
+            login_button_ele = await self.context_page.wait_for_selector(
+                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
+                timeout=5000
+            )
+            await login_button_ele.click()
+            # 弹窗的登录对话框也有两种形态，一种是直接可以看到手机号和验证码的
+            # 另一种是需要点击切换到手机登录的
+            element = await self.context_page.wait_for_selector(
+                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
+                timeout=5000
+            )
+            await element.click()
+        except Exception as e:
+            utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")
+
+        await asyncio.sleep(1)
+        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
+        input_ele = await login_container_ele.query_selector("label.phone > input")
+        await input_ele.fill(self.login_phone)
+        await asyncio.sleep(0.5)
+
+        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
+        await send_btn_ele.click()  # 点击发送验证码
+        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
+        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
+        cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
+        max_get_sms_code_time = 60 * 2  # 最长获取验证码的时间为2分钟
+        no_logged_in_session = ""
+        while max_get_sms_code_time > 0:
+            utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            await asyncio.sleep(1)
+            sms_code_key = f"xhs_{self.login_phone}"
+            sms_code_value = cache_client.get(sms_code_key)
+            if not sms_code_value:
+                max_get_sms_code_time -= 1
+                continue
+
+            current_cookie = await self.browser_context.cookies()
+            _, cookie_dict = utils.convert_cookies(current_cookie)
+            no_logged_in_session = cookie_dict.get("web_session")
+
+            await sms_code_input_ele.fill(value=sms_code_value.decode())  # 输入短信验证码
+            await asyncio.sleep(0.5)
+            agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
+            await agree_privacy_ele.click()  # 点击同意隐私协议
+            await asyncio.sleep(0.5)
+
+            await submit_btn_ele.click()  # 点击登录
+
+            # todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
+            break
+
+        try:
+            await self.check_login_state(no_logged_in_session)
+        except RetryError:
+            utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_qrcode(self):
+        """login xiaohongshu website and keep webdriver login state"""
+        utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
+        # login_selector = "div.login-container > div.left > div.qrcode > img"
+        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
+        # find login qrcode
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector=qrcode_img_selector
+        )
+        if not base64_qrcode_img:
+            utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+            # if this website does not automatically popup login dialog box, we will manual click login button
+            await asyncio.sleep(0.5)
+            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
+            await login_button_ele.click()
+            base64_qrcode_img = await utils.find_login_qrcode(
+                self.context_page,
+                selector=qrcode_img_selector
+            )
+            if not base64_qrcode_img:
+                sys.exit()
+
+        # get not logged session
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        no_logged_in_session = cookie_dict.get("web_session")
+
+        # show login qrcode
+        # fix issue #12
+        # we need to use partial function to call show_qrcode function and run in executor
+        # then current asyncio event loop will not be blocked
+        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
+        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
+
+        utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
+        try:
+            await self.check_login_state(no_logged_in_session)
+        except RetryError:
+            utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_cookies(self):
+        """login xiaohongshu website by cookies"""
+        utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
+        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
+            if key != "web_session":  # only set web_session cookie attr
+                continue
+            await self.browser_context.add_cookies([{
+                'name': key,
+                'value': value,
+                'domain': ".xiaohongshu.com",
+                'path': "/"
+            }])