Uploading the AI Crawler System: MindSpider

2025-08-27 13:49:07 +08:00
parent 822bad557f
commit 587e709e82
174 changed files with 34562 additions and 25 deletions
@@ -0,0 +1,13 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+# -*- coding: utf-8 -*-
+from .core import KuaishouCrawler
@@ -0,0 +1,313 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+
+# -*- coding: utf-8 -*-
+import asyncio
+import json
+from typing import Any, Callable, Dict, List, Optional
+from urllib.parse import urlencode
+
+import httpx
+from playwright.async_api import BrowserContext, Page
+
+import config
+from base.base_crawler import AbstractApiClient
+from tools import utils
+
+from .exception import DataFetchError
+from .graphql import KuaiShouGraphQL
+
+
+class KuaiShouClient(AbstractApiClient):
+    def __init__(
+        self,
+        timeout=10,
+        proxy=None,
+        *,
+        headers: Dict[str, str],
+        playwright_page: Page,
+        cookie_dict: Dict[str, str],
+    ):
+        self.proxy = proxy
+        self.timeout = timeout
+        self.headers = headers
+        self._host = "https://www.kuaishou.com/graphql"
+        self.playwright_page = playwright_page
+        self.cookie_dict = cookie_dict
+        self.graphql = KuaiShouGraphQL()
+
+    async def request(self, method, url, **kwargs) -> Any:
+        async with httpx.AsyncClient(proxy=self.proxy) as client:
+            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+        data: Dict = response.json()
+        if data.get("errors"):
+            raise DataFetchError(data.get("errors", "unkonw error"))
+        else:
+            return data.get("data", {})
+
+    async def get(self, uri: str, params=None) -> Dict:
+        final_uri = uri
+        if isinstance(params, dict):
+            final_uri = f"{uri}?" f"{urlencode(params)}"
+        return await self.request(
+            method="GET", url=f"{self._host}{final_uri}", headers=self.headers
+        )
+
+    async def post(self, uri: str, data: dict) -> Dict:
+        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
+        return await self.request(
+            method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers
+        )
+
+    async def pong(self) -> bool:
+        """get a note to check if login state is ok"""
+        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
+        ping_flag = False
+        try:
+            post_data = {
+                "operationName": "visionProfileUserList",
+                "variables": {
+                    "ftype": 1,
+                },
+                "query": self.graphql.get("vision_profile_user_list"),
+            }
+            res = await self.post("", post_data)
+            if res.get("visionProfileUserList", {}).get("result") == 1:
+                ping_flag = True
+        except Exception as e:
+            utils.logger.error(
+                f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..."
+            )
+            ping_flag = False
+        return ping_flag
+
+    async def update_cookies(self, browser_context: BrowserContext):
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        self.cookie_dict = cookie_dict
+
+    async def search_info_by_keyword(
+        self, keyword: str, pcursor: str, search_session_id: str = ""
+    ):
+        """
+        KuaiShou web search api
+        :param keyword: search keyword
+        :param pcursor: limite page curson
+        :param search_session_id: search session id
+        :return:
+        """
+        post_data = {
+            "operationName": "visionSearchPhoto",
+            "variables": {
+                "keyword": keyword,
+                "pcursor": pcursor,
+                "page": "search",
+                "searchSessionId": search_session_id,
+            },
+            "query": self.graphql.get("search_query"),
+        }
+        return await self.post("", post_data)
+
+    async def get_video_info(self, photo_id: str) -> Dict:
+        """
+        Kuaishou web video detail api
+        :param photo_id:
+        :return:
+        """
+        post_data = {
+            "operationName": "visionVideoDetail",
+            "variables": {"photoId": photo_id, "page": "search"},
+            "query": self.graphql.get("video_detail"),
+        }
+        return await self.post("", post_data)
+
+    async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
+        """get video comments
+        :param photo_id: photo id you want to fetch
+        :param pcursor: last you get pcursor, defaults to ""
+        :return:
+        """
+        post_data = {
+            "operationName": "commentListQuery",
+            "variables": {"photoId": photo_id, "pcursor": pcursor},
+            "query": self.graphql.get("comment_list"),
+        }
+        return await self.post("", post_data)
+
+    async def get_video_sub_comments(
+        self, photo_id: str, rootCommentId: str, pcursor: str = ""
+    ) -> Dict:
+        """get video sub comments
+        :param photo_id: photo id you want to fetch
+        :param pcursor: last you get pcursor, defaults to ""
+        :return:
+        """
+        post_data = {
+            "operationName": "visionSubCommentList",
+            "variables": {
+                "photoId": photo_id,
+                "pcursor": pcursor,
+                "rootCommentId": rootCommentId,
+            },
+            "query": self.graphql.get("vision_sub_comment_list"),
+        }
+        return await self.post("", post_data)
+
+    async def get_creator_profile(self, userId: str) -> Dict:
+        post_data = {
+            "operationName": "visionProfile",
+            "variables": {"userId": userId},
+            "query": self.graphql.get("vision_profile"),
+        }
+        return await self.post("", post_data)
+
+    async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
+        post_data = {
+            "operationName": "visionProfilePhotoList",
+            "variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
+            "query": self.graphql.get("vision_profile_photo_list"),
+        }
+        return await self.post("", post_data)
+
+    async def get_video_all_comments(
+        self,
+        photo_id: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_count: int = 10,
+    ):
+        """
+        get video all comments include sub comments
+        :param photo_id:
+        :param crawl_interval:
+        :param callback:
+        :param max_count:
+        :return:
+        """
+
+        result = []
+        pcursor = ""
+
+        while pcursor != "no_more" and len(result) < max_count:
+            comments_res = await self.get_video_comments(photo_id, pcursor)
+            vision_commen_list = comments_res.get("visionCommentList", {})
+            pcursor = vision_commen_list.get("pcursor", "")
+            comments = vision_commen_list.get("rootComments", [])
+            if len(result) + len(comments) > max_count:
+                comments = comments[: max_count - len(result)]
+            if callback:  # 如果有回调函数，就执行回调函数
+                await callback(photo_id, comments)
+            result.extend(comments)
+            await asyncio.sleep(crawl_interval)
+            sub_comments = await self.get_comments_all_sub_comments(
+                comments, photo_id, crawl_interval, callback
+            )
+            result.extend(sub_comments)
+        return result
+
+    async def get_comments_all_sub_comments(
+        self,
+        comments: List[Dict],
+        photo_id,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[Dict]:
+        """
+        获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
+        Args:
+            comments: 评论列表
+            photo_id: 视频id
+            crawl_interval: 爬取一次评论的延迟单位（秒）
+            callback: 一次评论爬取结束后
+        Returns:
+
+        """
+        if not config.ENABLE_GET_SUB_COMMENTS:
+            utils.logger.info(
+                f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+            )
+            return []
+
+        result = []
+        for comment in comments:
+            sub_comments = comment.get("subComments")
+            if sub_comments and callback:
+                await callback(photo_id, sub_comments)
+
+            sub_comment_pcursor = comment.get("subCommentsPcursor")
+            if sub_comment_pcursor == "no_more":
+                continue
+
+            root_comment_id = comment.get("commentId")
+            sub_comment_pcursor = ""
+
+            while sub_comment_pcursor != "no_more":
+                comments_res = await self.get_video_sub_comments(
+                    photo_id, root_comment_id, sub_comment_pcursor
+                )
+                vision_sub_comment_list = comments_res.get("visionSubCommentList", {})
+                sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more")
+
+                comments = vision_sub_comment_list.get("subComments", {})
+                if callback:
+                    await callback(photo_id, comments)
+                await asyncio.sleep(crawl_interval)
+                result.extend(comments)
+        return result
+
+    async def get_creator_info(self, user_id: str) -> Dict:
+        """
+        eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
+        快手用户主页
+        """
+
+        visionProfile = await self.get_creator_profile(user_id)
+        return visionProfile.get("userProfile")
+
+    async def get_all_videos_by_creator(
+        self,
+        user_id: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> List[Dict]:
+        """
+        获取指定用户下的所有发过的帖子，该方法会一直查找一个用户下的所有帖子信息
+        Args:
+            user_id: 用户ID
+            crawl_interval: 爬取一次的延迟单位（秒）
+            callback: 一次分页爬取结束后的更新回调函数
+        Returns:
+
+        """
+        result = []
+        pcursor = ""
+
+        while pcursor != "no_more":
+            videos_res = await self.get_video_by_creater(user_id, pcursor)
+            if not videos_res:
+                utils.logger.error(
+                    f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
+                )
+                break
+
+            vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {})
+            pcursor = vision_profile_photo_list.get("pcursor", "")
+
+            videos = vision_profile_photo_list.get("feeds", [])
+            utils.logger.info(
+                f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
+            )
+
+            if callback:
+                await callback(videos)
+            await asyncio.sleep(crawl_interval)
+            result.extend(videos)
+        return result
@@ -0,0 +1,396 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+
+import asyncio
+import os
+import random
+import time
+from asyncio import Task
+from typing import Dict, List, Optional, Tuple
+
+from playwright.async_api import (
+    BrowserContext,
+    BrowserType,
+    Page,
+    Playwright,
+    async_playwright,
+)
+
+import config
+from base.base_crawler import AbstractCrawler
+from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import kuaishou as kuaishou_store
+from tools import utils
+from tools.cdp_browser import CDPBrowserManager
+from var import comment_tasks_var, crawler_type_var, source_keyword_var
+
+from .client import KuaiShouClient
+from .exception import DataFetchError
+from .login import KuaishouLogin
+
+
+class KuaishouCrawler(AbstractCrawler):
+    context_page: Page
+    ks_client: KuaiShouClient
+    browser_context: BrowserContext
+    cdp_manager: Optional[CDPBrowserManager]
+
+    def __init__(self):
+        self.index_url = "https://www.kuaishou.com"
+        self.user_agent = utils.get_user_agent()
+        self.cdp_manager = None
+
+    async def start(self):
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(
+                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
+            )
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
+                ip_proxy_info
+            )
+
+        async with async_playwright() as playwright:
+            # 根据配置选择启动模式
+            if config.ENABLE_CDP_MODE:
+                utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器")
+                self.browser_context = await self.launch_browser_with_cdp(
+                    playwright,
+                    playwright_proxy_format,
+                    self.user_agent,
+                    headless=config.CDP_HEADLESS,
+                )
+            else:
+                utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器")
+                # Launch a browser context.
+                chromium = playwright.chromium
+                self.browser_context = await self.launch_browser(
+                    chromium, None, self.user_agent, headless=config.HEADLESS
+                )
+            # stealth.min.js is a js script to prevent the website from detecting the crawler.
+            await self.browser_context.add_init_script(path="libs/stealth.min.js")
+            self.context_page = await self.browser_context.new_page()
+            await self.context_page.goto(f"{self.index_url}?isHome=1")
+
+            # Create a client to interact with the kuaishou website.
+            self.ks_client = await self.create_ks_client(httpx_proxy_format)
+            if not await self.ks_client.pong():
+                login_obj = KuaishouLogin(
+                    login_type=config.LOGIN_TYPE,
+                    login_phone=httpx_proxy_format,
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES,
+                )
+                await login_obj.begin()
+                await self.ks_client.update_cookies(
+                    browser_context=self.browser_context
+                )
+
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
+                # Search for videos and retrieve their comment information.
+                await self.search()
+            elif config.CRAWLER_TYPE == "detail":
+                # Get the information and comments of the specified post
+                await self.get_specified_videos()
+            elif config.CRAWLER_TYPE == "creator":
+                # Get creator's information and their videos and comments
+                await self.get_creators_and_videos()
+            else:
+                pass
+
+            utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")
+
+    async def search(self):
+        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
+        ks_limit_count = 20  # kuaishou limit page fixed value
+        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
+            search_session_id = ""
+            source_keyword_var.set(keyword)
+            utils.logger.info(
+                f"[KuaishouCrawler.search] Current search keyword: {keyword}"
+            )
+            page = 1
+            while (
+                page - start_page + 1
+            ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+                utils.logger.info(
+                    f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}"
+                )
+                video_id_list: List[str] = []
+                videos_res = await self.ks_client.search_info_by_keyword(
+                    keyword=keyword,
+                    pcursor=str(page),
+                    search_session_id=search_session_id,
+                )
+                if not videos_res:
+                    utils.logger.error(
+                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data"
+                    )
+                    continue
+
+                vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
+                if vision_search_photo.get("result") != 1:
+                    utils.logger.error(
+                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data "
+                    )
+                    continue
+                search_session_id = vision_search_photo.get("searchSessionId", "")
+                for video_detail in vision_search_photo.get("feeds"):
+                    video_id_list.append(video_detail.get("photo", {}).get("id"))
+                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)
+
+                # batch fetch video comments
+                page += 1
+                await self.batch_get_video_comments(video_id_list)
+
+    async def get_specified_videos(self):
+        """Get the information and comments of the specified post"""
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_video_info_task(video_id=video_id, semaphore=semaphore)
+            for video_id in config.KS_SPECIFIED_ID_LIST
+        ]
+        video_details = await asyncio.gather(*task_list)
+        for video_detail in video_details:
+            if video_detail is not None:
+                await kuaishou_store.update_kuaishou_video(video_detail)
+        await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
+
+    async def get_video_info_task(
+        self, video_id: str, semaphore: asyncio.Semaphore
+    ) -> Optional[Dict]:
+        """Get video detail task"""
+        async with semaphore:
+            try:
+                result = await self.ks_client.get_video_info(video_id)
+                utils.logger.info(
+                    f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
+                )
+                return result.get("visionVideoDetail")
+            except DataFetchError as ex:
+                utils.logger.error(
+                    f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}"
+                )
+                return None
+            except KeyError as ex:
+                utils.logger.error(
+                    f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}"
+                )
+                return None
+
+    async def batch_get_video_comments(self, video_id_list: List[str]):
+        """
+        batch get video comments
+        :param video_id_list:
+        :return:
+        """
+        if not config.ENABLE_GET_COMMENTS:
+            utils.logger.info(
+                f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled"
+            )
+            return
+
+        utils.logger.info(
+            f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}"
+        )
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list: List[Task] = []
+        for video_id in video_id_list:
+            task = asyncio.create_task(
+                self.get_comments(video_id, semaphore), name=video_id
+            )
+            task_list.append(task)
+
+        comment_tasks_var.set(task_list)
+        await asyncio.gather(*task_list)
+
+    async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
+        """
+        get comment for video id
+        :param video_id:
+        :param semaphore:
+        :return:
+        """
+        async with semaphore:
+            try:
+                utils.logger.info(
+                    f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
+                )
+                await self.ks_client.get_video_all_comments(
+                    photo_id=video_id,
+                    crawl_interval=random.random(),
+                    callback=kuaishou_store.batch_update_ks_video_comments,
+                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
+                )
+            except DataFetchError as ex:
+                utils.logger.error(
+                    f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
+                )
+            except Exception as e:
+                utils.logger.error(
+                    f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}"
+                )
+                # use time.sleeep block main coroutine instead of asyncio.sleep and cacel running comment task
+                # maybe kuaishou block our request, we will take a nap and update the cookie again
+                current_running_tasks = comment_tasks_var.get()
+                for task in current_running_tasks:
+                    task.cancel()
+                time.sleep(20)
+                await self.context_page.goto(f"{self.index_url}?isHome=1")
+                await self.ks_client.update_cookies(
+                    browser_context=self.browser_context
+                )
+
+    async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
+        """Create ks client"""
+        utils.logger.info(
+            "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
+        )
+        cookie_str, cookie_dict = utils.convert_cookies(
+            await self.browser_context.cookies()
+        )
+        ks_client_obj = KuaiShouClient(
+            proxy=httpx_proxy,
+            headers={
+                "User-Agent": self.user_agent,
+                "Cookie": cookie_str,
+                "Origin": self.index_url,
+                "Referer": self.index_url,
+                "Content-Type": "application/json;charset=UTF-8",
+            },
+            playwright_page=self.context_page,
+            cookie_dict=cookie_dict,
+        )
+        return ks_client_obj
+
+    async def launch_browser(
+        self,
+        chromium: BrowserType,
+        playwright_proxy: Optional[Dict],
+        user_agent: Optional[str],
+        headless: bool = True,
+    ) -> BrowserContext:
+        """Launch browser and create browser context"""
+        utils.logger.info(
+            "[KuaishouCrawler.launch_browser] Begin create browser context ..."
+        )
+        if config.SAVE_LOGIN_STATE:
+            user_data_dir = os.path.join(
+                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
+            )  # type: ignore
+            browser_context = await chromium.launch_persistent_context(
+                user_data_dir=user_data_dir,
+                accept_downloads=True,
+                headless=headless,
+                proxy=playwright_proxy,  # type: ignore
+                viewport={"width": 1920, "height": 1080},
+                user_agent=user_agent,
+            )
+            return browser_context
+        else:
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser_context = await browser.new_context(
+                viewport={"width": 1920, "height": 1080}, user_agent=user_agent
+            )
+            return browser_context
+
+    async def launch_browser_with_cdp(
+        self,
+        playwright: Playwright,
+        playwright_proxy: Optional[Dict],
+        user_agent: Optional[str],
+        headless: bool = True,
+    ) -> BrowserContext:
+        """
+        使用CDP模式启动浏览器
+        """
+        try:
+            self.cdp_manager = CDPBrowserManager()
+            browser_context = await self.cdp_manager.launch_and_connect(
+                playwright=playwright,
+                playwright_proxy=playwright_proxy,
+                user_agent=user_agent,
+                headless=headless,
+            )
+
+            # 显示浏览器信息
+            browser_info = await self.cdp_manager.get_browser_info()
+            utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}")
+
+            return browser_context
+
+        except Exception as e:
+            utils.logger.error(
+                f"[KuaishouCrawler] CDP模式启动失败，回退到标准模式: {e}"
+            )
+            # 回退到标准模式
+            chromium = playwright.chromium
+            return await self.launch_browser(
+                chromium, playwright_proxy, user_agent, headless
+            )
+
+    async def get_creators_and_videos(self) -> None:
+        """Get creator's videos and retrieve their comment information."""
+        utils.logger.info(
+            "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
+        )
+        for user_id in config.KS_CREATOR_ID_LIST:
+            # get creator detail info from web html content
+            createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
+            if createor_info:
+                await kuaishou_store.save_creator(user_id, creator=createor_info)
+
+            # Get all video information of the creator
+            all_video_list = await self.ks_client.get_all_videos_by_creator(
+                user_id=user_id,
+                crawl_interval=random.random(),
+                callback=self.fetch_creator_video_detail,
+            )
+
+            video_ids = [
+                video_item.get("photo", {}).get("id") for video_item in all_video_list
+            ]
+            await self.batch_get_video_comments(video_ids)
+
+    async def fetch_creator_video_detail(self, video_list: List[Dict]):
+        """
+        Concurrently obtain the specified post list and save the data
+        """
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore)
+            for post_item in video_list
+        ]
+
+        video_details = await asyncio.gather(*task_list)
+        for video_detail in video_details:
+            if video_detail is not None:
+                await kuaishou_store.update_kuaishou_video(video_detail)
+
+    async def close(self):
+        """Close browser context"""
+        # 如果使用CDP模式，需要特殊处理
+        if self.cdp_manager:
+            await self.cdp_manager.cleanup()
+            self.cdp_manager = None
+        else:
+            await self.browser_context.close()
+        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
@@ -0,0 +1,20 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+from httpx import RequestError
+
+
+class DataFetchError(RequestError):
+    """something error when fetch"""
+
+
+class IPBlockError(RequestError):
+    """fetch so fast that the server block us ip"""
@@ -0,0 +1,12 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+# -*- coding: utf-8 -*-
@@ -0,0 +1,33 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+# 快手的数据传输是基于GraphQL实现的
+# 这个类负责获取一些GraphQL的schema
+from typing import Dict
+
+
+class KuaiShouGraphQL:
+    graphql_queries: Dict[str, str]= {}
+
+    def __init__(self):
+        self.graphql_dir = "media_platform/kuaishou/graphql/"
+        self.load_graphql_queries()
+
+    def load_graphql_queries(self):
+        graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
+
+        for file in graphql_files:
+            with open(self.graphql_dir + file, mode="r") as f:
+                query_name = file.split(".")[0]
+                self.graphql_queries[query_name] = f.read()
+
+    def get(self, query_name: str) -> str:
+        return self.graphql_queries.get(query_name, "Query not found")
@@ -0,0 +1,39 @@
+# Query document for the "commentListQuery" operation.
+# Selects visionCommentList for one photo at a given pcursor: the root
+# comments and, nested in each, the subComments returned with it.
+query commentListQuery($photoId: String, $pcursor: String) {
+  visionCommentList(photoId: $photoId, pcursor: $pcursor) {
+    commentCount
+    # The Python client feeds this value back as $pcursor for the next
+    # request and stops when it equals "no_more".
+    pcursor
+    rootComments {
+      commentId
+      authorId
+      authorName
+      content
+      headurl
+      timestamp
+      likedCount
+      realLikedCount
+      liked
+      status
+      authorLiked
+      subCommentCount
+      # The Python client skips fetching replies when this equals "no_more".
+      subCommentsPcursor
+      subComments {
+        commentId
+        authorId
+        authorName
+        content
+        headurl
+        timestamp
+        likedCount
+        realLikedCount
+        liked
+        status
+        authorLiked
+        replyToUserName
+        replyTo
+        __typename
+      }
+      __typename
+    }
+    __typename
+  }
+}
@@ -0,0 +1,111 @@
+# Field selection for PhotoEntity objects; spread into the photo selection
+# of the feedContent fragment in this file.
+fragment photoContent on PhotoEntity {
+  __typename
+  id
+  duration
+  caption
+  originCaption
+  likeCount
+  viewCount
+  commentCount
+  realLikeCount
+  coverUrl
+  photoUrl
+  photoH265Url
+  manifest
+  manifestH265
+  videoResource
+  coverUrls {
+    url
+    __typename
+  }
+  timestamp
+  expTag
+  animatedCoverUrl
+  distance
+  videoRatio
+  liked
+  stereoType
+  profileUserTopPhoto
+  musicBlocked
+}
+
+# Same field selection as photoContent, but with the type condition
+# recoPhotoEntity; also spread into the photo selection of feedContent.
+fragment recoPhotoFragment on recoPhotoEntity {
+  __typename
+  id
+  duration
+  caption
+  originCaption
+  likeCount
+  viewCount
+  commentCount
+  realLikeCount
+  coverUrl
+  photoUrl
+  photoH265Url
+  manifest
+  manifestH265
+  videoResource
+  coverUrls {
+    url
+    __typename
+  }
+  timestamp
+  expTag
+  animatedCoverUrl
+  distance
+  videoRatio
+  liked
+  stereoType
+  profileUserTopPhoto
+  musicBlocked
+}
+
+# Field selection for one Feed item: its author, its photo and its tags.
+# Used for every entry of visionSearchPhoto.feeds in the query below.
+fragment feedContent on Feed {
+  type
+  author {
+    id
+    name
+    headerUrl
+    following
+    headerUrls {
+      url
+      __typename
+    }
+    __typename
+  }
+  photo {
+    # Both fragments are spread; each applies only to its own type condition.
+    ...photoContent
+    ...recoPhotoFragment
+    __typename
+  }
+  canAddComment
+  llsid
+  status
+  currentPcursor
+  tags {
+    type
+    name
+    __typename
+  }
+  __typename
+}
+
+# Query document for the "visionSearchPhoto" operation (keyword search).
+# The Python client in this change supplies keyword, pcursor, page and
+# searchSessionId; $webPageArea is declared here but not supplied by it.
+query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
+  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
+    # The crawler treats any value other than 1 as "no data".
+    result
+    llsid
+    webPageArea
+    feeds {
+      ...feedContent
+      __typename
+    }
+    # The crawler sends this value back with the next page request.
+    searchSessionId
+    pcursor
+    aladdinBanner {
+      imgUrl
+      link
+      __typename
+    }
+    __typename
+  }
+}
@@ -0,0 +1,80 @@
+# Query visionVideoDetail: takes photoId, type, page and webPageArea (all
+# nullable String) and selects the author, the photo with its full manifest
+# tree (adaptationSet -> representation), tags, commentLimit, llsid and
+# danmakuSwitch.
+query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {
+  visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {
+    status
+    type
+    author {
+      id
+      name
+      following
+      headerUrl
+      __typename
+    }
+    photo {
+      id
+      duration
+      caption
+      likeCount
+      realLikeCount
+      coverUrl
+      photoUrl
+      liked
+      timestamp
+      expTag
+      llsid
+      viewCount
+      videoRatio
+      stereoType
+      musicBlocked
+      # Unlike the list fragments, manifest is expanded field by field here.
+      manifest {
+        mediaType
+        businessType
+        version
+        adaptationSet {
+          id
+          duration
+          representation {
+            id
+            defaultSelect
+            backupUrl
+            codecs
+            url
+            height
+            width
+            avgBitrate
+            maxBitrate
+            m3u8Slice
+            qualityType
+            qualityLabel
+            frameRate
+            featureP2sp
+            hidden
+            disableAdaptive
+            __typename
+          }
+          __typename
+        }
+        __typename
+      }
+      manifestH265
+      photoH265Url
+      coronaCropManifest
+      coronaCropManifestH265
+      croppedPhotoH265Url
+      croppedPhotoUrl
+      videoResource
+      __typename
+    }
+    tags {
+      type
+      name
+      __typename
+    }
+    commentLimit {
+      canAddComment
+      __typename
+    }
+    llsid
+    danmakuSwitch
+    __typename
+  }
+}
@@ -0,0 +1,27 @@
+# Query visionProfile: takes userId (nullable String) and selects the user's
+# ownerCount block (fan, photo, follow, photo_public), the profile block and
+# the isFollowing flag.
+query visionProfile($userId: String) {
+  visionProfile(userId: $userId) {
+    result
+    hostName
+    userProfile {
+      ownerCount {
+        fan
+        photo
+        follow
+        photo_public
+        __typename
+      }
+      profile {
+        gender
+        user_name
+        user_id
+        headurl
+        user_text
+        user_profile_bg_url
+        __typename
+      }
+      isFollowing
+      __typename
+    }
+    __typename
+  }
+}
@@ -0,0 +1,110 @@
+# Reusable selection set on PhotoEntity for the profile photo list: id,
+# caption text, counters, cover and video URLs, manifests, miscellaneous
+# flags, plus riskTagContent and riskTagUrl.
+fragment photoContent on PhotoEntity {
+  __typename
+  id
+  duration
+  caption
+  originCaption
+  likeCount
+  viewCount
+  commentCount
+  realLikeCount
+  coverUrl
+  photoUrl
+  photoH265Url
+  manifest
+  manifestH265
+  videoResource
+  coverUrls {
+    url
+    __typename
+  }
+  timestamp
+  expTag
+  animatedCoverUrl
+  distance
+  videoRatio
+  liked
+  stereoType
+  profileUserTopPhoto
+  musicBlocked
+  riskTagContent
+  riskTagUrl
+}
+
+# Same field list as this document's photoContent (including riskTagContent
+# and riskTagUrl), declared on recoPhotoEntity.
+fragment recoPhotoFragment on recoPhotoEntity {
+  __typename
+  id
+  duration
+  caption
+  originCaption
+  likeCount
+  viewCount
+  commentCount
+  realLikeCount
+  coverUrl
+  photoUrl
+  photoH265Url
+  manifest
+  manifestH265
+  videoResource
+  coverUrls {
+    url
+    __typename
+  }
+  timestamp
+  expTag
+  animatedCoverUrl
+  distance
+  videoRatio
+  liked
+  stereoType
+  profileUserTopPhoto
+  musicBlocked
+  riskTagContent
+  riskTagUrl
+}
+
+# Selection set on Feed: author summary, the photo (expanded through both
+# photoContent and recoPhotoFragment), comment permission, llsid, status,
+# currentPcursor and tags.
+fragment feedContent on Feed {
+  type
+  author {
+    id
+    name
+    headerUrl
+    following
+    headerUrls {
+      url
+      __typename
+    }
+    __typename
+  }
+  photo {
+    ...photoContent
+    ...recoPhotoFragment
+    __typename
+  }
+  canAddComment
+  llsid
+  status
+  currentPcursor
+  tags {
+    type
+    name
+    __typename
+  }
+  __typename
+}
+
+# Query visionProfilePhotoList: takes pcursor, userId, page and webPageArea
+# (all nullable String) and selects the user's feeds (via feedContent),
+# hostName and pcursor.
+# NOTE(review): pcursor is presumably the pagination cursor - confirm against the caller.
+query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
+  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
+    result
+    llsid
+    webPageArea
+    feeds {
+      ...feedContent
+      __typename
+    }
+    hostName
+    pcursor
+    __typename
+  }
+}
@@ -0,0 +1,16 @@
+# Query visionProfileUserList: takes pcursor (String) and ftype (Int) and
+# selects a `fols` list of user summaries plus hostName and pcursor.
+# NOTE(review): the meaning of the ftype values is not visible here - confirm with the caller.
+query visionProfileUserList($pcursor: String, $ftype: Int) {
+  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
+    result
+    fols {
+      user_name
+      headurl
+      user_text
+      isFollowing
+      user_id
+      __typename
+    }
+    hostName
+    pcursor
+    __typename
+  }
+}
@@ -0,0 +1,22 @@
+# Operation visionSubCommentList: takes photoId, rootCommentId and pcursor
+# (all nullable String) and selects the sub-comments under a root comment
+# plus pcursor.
+# NOTE(review): declared as a `mutation` although it only selects data;
+# presumably this mirrors the remote API - confirm before changing it.
+mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
+  visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
+    pcursor
+    subComments {
+      commentId
+      authorId
+      authorName
+      content
+      headurl
+      timestamp
+      likedCount
+      realLikedCount
+      liked
+      status
+      authorLiked
+      replyToUserName
+      replyTo
+      __typename
+    }
+    __typename
+  }
+}
@@ -0,0 +1,113 @@
+# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
+# 1. 不得用于任何商业用途。  
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
+# 3. 不得进行大规模爬取或对平台造成运营干扰。  
+# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
+# 5. 不得用于任何非法或不当的用途。
+#   
+# 详细许可条款请参阅项目根目录下的LICENSE文件。  
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
+
+
+import asyncio
+import functools
+import sys
+from typing import Optional
+
+from playwright.async_api import BrowserContext, Page
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
+                      wait_fixed)
+
+import config
+from base.base_crawler import AbstractLogin
+from tools import utils
+
+
+class KuaishouLogin(AbstractLogin):
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: Optional[str] = "",
+                 cookie_str: str = ""
+                 ):
+        config.LOGIN_TYPE = login_type
+        self.browser_context = browser_context
+        self.context_page = context_page
+        self.login_phone = login_phone
+        self.cookie_str = cookie_str
+
+    async def begin(self):
+        """Start login xiaohongshu"""
+        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
+        if config.LOGIN_TYPE == "qrcode":
+            await self.login_by_qrcode()
+        elif config.LOGIN_TYPE == "phone":
+            await self.login_by_mobile()
+        elif config.LOGIN_TYPE == "cookie":
+            await self.login_by_cookies()
+        else:
+            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+
+    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
+    async def check_login_state(self) -> bool:
+        """
+            Check if the current login status is successful and return True otherwise return False
+            retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
+            if max retry times reached, raise RetryError
+        """
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        kuaishou_pass_token = cookie_dict.get("passToken")
+        if kuaishou_pass_token:
+            return True
+        return False
+
+    async def login_by_qrcode(self):
+        """login kuaishou website and keep webdriver login state"""
+        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
+
+        # click login button
+        login_button_ele = self.context_page.locator(
+            "xpath=//p[text()='登录']"
+        )
+        await login_button_ele.click()
+
+        # find login qrcode
+        qrcode_img_selector = "//div[@class='qrcode-img']//img"
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector=qrcode_img_selector
+        )
+        if not base64_qrcode_img:
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+            sys.exit()
+
+
+        # show login qrcode
+        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
+        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
+
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s")
+        try:
+            await self.check_login_state()
+        except RetryError:
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_mobile(self):
+        pass
+
+    async def login_by_cookies(self):
+        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
+        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
+            await self.browser_context.add_cookies([{
+                'name': key,
+                'value': value,
+                'domain': ".kuaishou.com",
+                'path': "/"
+            }])