The framework has been restructured again, and the Flask framework has been abandoned.

2025-08-22 13:52:05 +08:00
parent 15b3a3343b
commit 0c31be4287
279 changed files with 2725 additions and 1648837 deletions
@@ -1,18 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
-# 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/23 15:40
-# @Desc    :
-from .client import WeiboClient
-from .core import WeiboCrawler
-from .login import WeiboLogin
@@ -1,381 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/23 15:40
-# @Desc    : 微博爬虫 API 请求 client
-
-import asyncio
-import copy
-import json
-import re
-from typing import Callable, Dict, List, Optional, Union
-from urllib.parse import parse_qs, unquote, urlencode
-
-import httpx
-from httpx import Response
-from playwright.async_api import BrowserContext, Page
-
-import config
-from tools import utils
-
-from .exception import DataFetchError
-from .field import SearchType
-
-
-class WeiboClient:
-    """Async HTTP client for the mobile Weibo site (``https://m.weibo.cn``).
-
-    Each call opens a short-lived ``httpx.AsyncClient``. JSON API answers are
-    unwrapped by :meth:`request`; API-level failures surface as ``DataFetchError``.
-    """
-
-    def __init__(
-        self,
-        timeout=60,  # image downloads (media crawling enabled) need a generous timeout
-        proxy=None,
-        *,
-        headers: Dict[str, str],
-        playwright_page: Page,
-        cookie_dict: Dict[str, str],
-    ):
-        """Store the connection settings.
-
-        :param timeout: timeout passed to every httpx request
-        :param proxy: proxy passed to ``httpx.AsyncClient``
-        :param headers: default request headers; ``update_cookies`` rewrites the ``Cookie`` entry
-        :param playwright_page: Playwright page kept on the instance
-        :param cookie_dict: cookies as a name -> value mapping
-        """
-        self.proxy = proxy
-        self.timeout = timeout
-        self.headers = headers
-        self._host = "https://m.weibo.cn"
-        self.playwright_page = playwright_page
-        self.cookie_dict = cookie_dict
-        # Images are fetched through this host instead of the Weibo image host directly.
-        self._image_agent_host = "https://i1.wp.com/"
-
-    async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
-        """Send one HTTP request and unwrap the Weibo JSON envelope.
-
-        :param method: HTTP method name
-        :param url: absolute URL
-        :param kwargs: forwarded to ``httpx.AsyncClient.request``; the extra keyword
-            ``return_response=True`` makes this method return the raw response untouched
-        :return: the raw ``Response``, or the envelope's ``data`` member (``{}`` when absent)
-        :raises DataFetchError: when the envelope's ``ok`` field is anything other than ``1``
-        """
-        want_raw_response = kwargs.pop("return_response", False)
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
-
-        if want_raw_response:
-            return response
-
-        payload: Dict = response.json()
-        ok_code = payload.get("ok")
-        if ok_code == 1:
-            return payload.get("data", {})
-
-        # ok == 0 is an explicit API error; any other value is reported as unknown.
-        utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{payload}")
-        fallback_msg = "response error" if ok_code == 0 else "unknown error"
-        raise DataFetchError(payload.get("msg", fallback_msg))
-
-    async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
-        """Issue a GET against the mobile host.
-
-        :param uri: path relative to the host
-        :param params: optional dict that is url-encoded into the query string
-        :param headers: headers for this call; the instance defaults are used when ``None``
-        :param kwargs: forwarded to :meth:`request`
-        """
-        final_uri = uri
-        if isinstance(params, dict):
-            final_uri = f"{uri}?{urlencode(params)}"
-
-        if headers is None:
-            headers = self.headers
-        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
-
-    async def post(self, uri: str, data: dict) -> Dict:
-        """POST ``data`` as a compact JSON document to ``uri`` on the mobile host."""
-        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
-        # The body is already serialised, so it is sent as raw content; httpx
-        # deprecates passing a plain string through ``data=``.
-        return await self.request(method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers)
-
-    async def pong(self) -> bool:
-        """Check the login state by querying ``/api/config``.
-
-        :return: ``True`` when the response reports a truthy ``login`` flag;
-            ``False`` otherwise, including when the request itself fails
-        """
-        utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
-        ping_flag = False
-        try:
-            uri = "/api/config"
-            resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
-            if resp_data.get("login"):
-                ping_flag = True
-            else:
-                utils.logger.error("[WeiboClient.pong] cookie may be invalid and again login...")
-        except Exception as e:
-            # Deliberately broad: any failure here just means "not logged in".
-            utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
-            ping_flag = False
-        return ping_flag
-
-    async def update_cookies(self, browser_context: BrowserContext):
-        """Copy the browser context's current cookies into the headers and ``cookie_dict``."""
-        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
-        self.headers["Cookie"] = cookie_str
-        self.cookie_dict = cookie_dict
-
-    async def get_note_by_keyword(
-        self,
-        keyword: str,
-        page: int = 1,
-        search_type: SearchType = SearchType.DEFAULT,
-    ) -> Dict:
-        """Search notes by keyword.
-
-        :param keyword: search keyword
-        :param page: page number to fetch
-        :param search_type: search category, a ``SearchType`` member
-        :return: the unwrapped API response
-        """
-        uri = "/api/container/getIndex"
-        containerid = f"100103type={search_type.value}&q={keyword}"
-        params = {
-            "containerid": containerid,
-            "page_type": "searchall",
-            "page": page,
-        }
-        return await self.get(uri, params)
-
-    async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
-        """Fetch one page of comments for a note.
-
-        :param mid_id: note ID
-        :param max_id: pagination cursor; only sent when it is greater than 0
-        :param max_id_type: pagination cursor type
-        :return: the unwrapped API response
-        """
-        uri = "/comments/hotflow"
-        params = {
-            "id": mid_id,
-            "mid": mid_id,
-            "max_id_type": max_id_type,
-        }
-        # A missing cursor (None) is treated like "no cursor" instead of raising on the comparison.
-        if max_id and max_id > 0:
-            params.update({"max_id": max_id})
-        referer_url = f"https://m.weibo.cn/detail/{mid_id}"
-        # Copy so the per-request Referer does not leak into the shared default headers.
-        headers = copy.copy(self.headers)
-        headers["Referer"] = referer_url
-
-        return await self.get(uri, params, headers=headers)
-
-    async def get_note_all_comments(
-        self,
-        note_id: str,
-        crawl_interval: float = 1.0,
-        callback: Optional[Callable] = None,
-        max_count: int = 10,
-    ):
-        """Page through a note's comments, including the sub comments embedded in them.
-
-        :param note_id: note ID
-        :param crawl_interval: seconds to sleep after each page
-        :param callback: optional coroutine function called as ``callback(note_id, comments)``
-        :param max_count: stop once this many items were collected; top-level comments are
-            truncated to the limit, embedded sub comments are appended on top of it
-        :return: list of collected comment dicts
-        """
-        result = []
-        is_end = False
-        max_id = -1
-        max_id_type = 0
-        while not is_end and len(result) < max_count:
-            comments_res = await self.get_note_comments(note_id, max_id, max_id_type)
-            # A response without a cursor is treated as the last page; previously the
-            # resulting None reached the ``max_id > 0`` comparison on the next request.
-            max_id = comments_res.get("max_id") or 0
-            max_id_type = comments_res.get("max_id_type") or 0
-            comment_list: List[Dict] = comments_res.get("data") or []
-            is_end = max_id == 0
-            remaining = max_count - len(result)
-            if len(comment_list) > remaining:
-                comment_list = comment_list[:remaining]
-            if callback:
-                await callback(note_id, comment_list)
-            await asyncio.sleep(crawl_interval)
-            result.extend(comment_list)
-            sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
-            result.extend(sub_comment_result)
-        return result
-
-    @staticmethod
-    async def get_comments_all_sub_comments(
-        note_id: str,
-        comment_list: List[Dict],
-        callback: Optional[Callable] = None,
-    ) -> List[Dict]:
-        """Collect the sub comments embedded in each comment's ``comments`` field.
-
-        Args:
-            note_id: note ID passed through to ``callback``
-            comment_list: top-level comment dicts
-            callback: optional coroutine function called as ``callback(note_id, sub_comments)``
-
-        Returns:
-            All sub comments found, or ``[]`` when ``config.ENABLE_GET_SUB_COMMENTS`` is off.
-        """
-        if not config.ENABLE_GET_SUB_COMMENTS:
-            utils.logger.info("[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
-            return []
-
-        res_sub_comments = []
-        for comment in comment_list:
-            sub_comments = comment.get("comments")
-            if sub_comments and isinstance(sub_comments, list):
-                # callback defaults to None, so it must be checked before awaiting.
-                if callback:
-                    await callback(note_id, sub_comments)
-                res_sub_comments.extend(sub_comments)
-        return res_sub_comments
-
-    async def get_note_info_by_id(self, note_id: str) -> Dict:
-        """Fetch a note's detail page and extract the note data embedded in it.
-
-        :param note_id: note ID
-        :return: ``{"mblog": <status>}``, or ``{}`` when the page has no ``$render_data`` block
-        :raises DataFetchError: when the page does not answer with HTTP 200
-        """
-        url = f"{self._host}/detail/{note_id}"
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
-        if response.status_code != 200:
-            raise DataFetchError(f"get weibo detail err: {response.text}")
-        # The page assigns a JSON array to $render_data inside a script.
-        match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
-        if not match:
-            utils.logger.info("[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
-            return dict()
-        render_data = json.loads(match.group(1))
-        return {"mblog": render_data[0].get("status")}
-
-    async def get_note_image(self, image_url: str) -> Optional[bytes]:
-        """Download an image in its ``large`` variant through the image agent host.
-
-        :param image_url: original image URL including scheme
-        :return: the image bytes, or ``None`` when the download fails
-        """
-        # Drop the scheme, then replace the segment right after the host with "large".
-        # URLs without such a segment are left unchanged.
-        path_parts = image_url.split("://", 1)[-1].split("/")
-        if len(path_parts) > 2:
-            path_parts[1] = "large"
-        agent_path = "/".join(path_parts)
-        # The Weibo image host applies hotlink protection, so the request goes
-        # through the agent host, which expects the original host + path appended.
-        final_uri = f"{self._image_agent_host}{agent_path}"
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            try:
-                response = await client.request("GET", final_uri, timeout=self.timeout)
-                response.raise_for_status()
-            except httpx.HTTPError as exc:
-                # Covers transport failures and error status codes. The URL is logged
-                # from the local variable because ``exc.request`` is not always set.
-                utils.logger.error(f"[WeiboClient.get_note_image] {exc.__class__.__name__} for {final_uri} - {exc}")
-                return None
-        if response.reason_phrase != "OK":
-            utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
-            return None
-        return response.content
-
-    async def get_creator_container_info(self, creator_id: str) -> Dict:
-        """Read a creator's container IDs from the ``M_WEIBOCN_PARAMS`` response cookie.
-
-        Args:
-            creator_id: creator (user) ID
-
-        Returns:
-            ``{"fid_container_id": ..., "lfid_container_id": ...}``; a value is ``""``
-            when the cookie lacks the corresponding ``fid`` / ``lfid`` entry.
-
-        Raises:
-            DataFetchError: when the cookie is missing from the response.
-        """
-        response = await self.get(f"/u/{creator_id}", return_response=True)
-        m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
-        if not m_weibocn_params:
-            raise DataFetchError("get containerid failed")
-        cookie_fields = parse_qs(unquote(m_weibocn_params))
-        return {
-            "fid_container_id": cookie_fields.get("fid", [""])[0],
-            "lfid_container_id": cookie_fields.get("lfid", [""])[0],
-        }
-
-    async def get_creator_info_by_id(self, creator_id: str) -> Dict:
-        """Fetch a creator's profile data.
-
-        Args:
-            creator_id: creator (user) ID
-
-        Returns:
-            The unwrapped API response, extended with ``fid_container_id`` and
-            ``lfid_container_id``.
-
-        Raises:
-            DataFetchError: when either container ID cannot be determined.
-        """
-        uri = "/api/container/getIndex"
-        container_info = await self.get_creator_container_info(creator_id)
-        if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
-            utils.logger.error("[WeiboClient.get_creator_info_by_id] get containerid failed")
-            raise DataFetchError("get containerid failed")
-        params = {
-            "jumpfrom": "weibocom",
-            "type": "uid",
-            "value": creator_id,
-            "containerid": container_info["fid_container_id"],
-        }
-
-        user_res = await self.get(uri, params)
-
-        # When the profile lists a "weibo" tab, its container ID replaces the
-        # list container ID taken from the cookie.
-        if user_res.get("tabsInfo"):
-            tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
-            for tab in tabs:
-                if tab.get("tabKey") == "weibo":
-                    container_info["lfid_container_id"] = tab.get("containerid")
-                    break
-
-        user_res.update(container_info)
-        return user_res
-
-    async def get_notes_by_creator(
-        self,
-        creator: str,
-        container_id: str,
-        since_id: str = "0",
-    ) -> Dict:
-        """Fetch one page of a creator's notes.
-
-        Args:
-            creator: creator (user) ID
-            container_id: container ID of the creator's note list
-            since_id: pagination cursor taken from the previous page
-
-        Returns:
-            The unwrapped API response.
-        """
-        uri = "/api/container/getIndex"
-        params = {
-            "jumpfrom": "weibocom",
-            "type": "uid",
-            "value": creator,
-            "containerid": container_id,
-            "since_id": since_id,
-        }
-        return await self.get(uri, params)
-
-    async def get_all_notes_by_creator_id(
-        self,
-        creator_id: str,
-        container_id: str,
-        crawl_interval: float = 1.0,
-        callback: Optional[Callable] = None,
-    ) -> List[Dict]:
-        """Page through all notes of a creator.
-
-        Args:
-            creator_id: creator (user) ID
-            container_id: container ID of the creator's note list
-            crawl_interval: seconds to sleep after each page
-            callback: optional coroutine function called with each page's notes
-
-        Returns:
-            All cards with ``card_type == 9`` that were collected.
-        """
-        result = []
-        notes_has_more = True
-        since_id = ""
-        crawler_total_count = 0
-        while notes_has_more:
-            notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
-            if not notes_res:
-                utils.logger.error("[WeiboClient.get_notes_by_creator] The current creator may have been banned by weibo, so they cannot access the data.")
-                break
-            since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
-            if "cards" not in notes_res:
-                utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] No 'cards' key found in response: {notes_res}")
-                break
-
-            notes = notes_res["cards"]
-            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
-            notes = [note for note in notes if note.get("card_type") == 9]
-            if callback:
-                await callback(notes)
-            await asyncio.sleep(crawl_interval)
-            result.extend(notes)
-            # Progress is counted in fixed steps of 10 per page, not by the number of cards received.
-            crawler_total_count += 10
-            notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
-        return result
@@ -1,373 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/23 15:41
-# @Desc    : 微博爬虫主流程代码
-
-import asyncio
-import os
-import random
-from asyncio import Task
-from typing import Dict, List, Optional, Tuple
-
-from playwright.async_api import (
-    BrowserContext,
-    BrowserType,
-    Page,
-    Playwright,
-    async_playwright,
-)
-
-import config
-from base.base_crawler import AbstractCrawler
-from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
-from store import weibo as weibo_store
-from tools import utils
-from tools.cdp_browser import CDPBrowserManager
-from var import crawler_type_var, source_keyword_var
-
-from .client import WeiboClient
-from .exception import DataFetchError
-from .field import SearchType
-from .help import filter_search_result_card
-from .login import WeiboLogin
-
-
-class WeiboCrawler(AbstractCrawler):
-    """Weibo crawler: launches a browser, makes sure a login exists, then crawls
-    by keyword search, by specified note IDs or by creator, as selected in ``config``."""
-
-    context_page: Page
-    wb_client: WeiboClient
-    browser_context: BrowserContext
-    cdp_manager: Optional[CDPBrowserManager]
-
-    def __init__(self):
-        """Set the site URLs and user agents; the browser is only launched in ``start``."""
-        self.index_url = "https://www.weibo.com"
-        self.mobile_index_url = "https://m.weibo.cn"
-        self.user_agent = utils.get_user_agent()
-        self.mobile_user_agent = utils.get_mobile_user_agent()
-        self.cdp_manager = None
-
-    async def start(self):
-        """Run one crawl: launch the browser, log in if needed, dispatch on ``config.CRAWLER_TYPE``."""
-        playwright_proxy_format, httpx_proxy_format = None, None
-        if config.ENABLE_IP_PROXY:
-            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
-            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
-            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
-
-        async with async_playwright() as playwright:
-            # Pick the launch mode from the configuration.
-            if config.ENABLE_CDP_MODE:
-                utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
-                self.browser_context = await self.launch_browser_with_cdp(
-                    playwright,
-                    playwright_proxy_format,
-                    self.mobile_user_agent,
-                    headless=config.CDP_HEADLESS,
-                )
-            else:
-                utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
-                # NOTE(review): standard mode passes None instead of playwright_proxy_format,
-                # so the browser itself never uses the IP proxy - confirm this is intended.
-                chromium = playwright.chromium
-                self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
-            # stealth.min.js is a js script to prevent the website from detecting the crawler.
-            await self.browser_context.add_init_script(path="libs/stealth.min.js")
-            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(self.mobile_index_url)
-
-            # Create the API client for the mobile Weibo site and verify the login state.
-            self.wb_client = await self.create_weibo_client(httpx_proxy_format)
-            if not await self.wb_client.pong():
-                login_obj = WeiboLogin(
-                    login_type=config.LOGIN_TYPE,
-                    login_phone="",  # your phone number
-                    browser_context=self.browser_context,
-                    context_page=self.context_page,
-                    cookie_str=config.COOKIES,
-                )
-                await login_obj.begin()
-
-                # After logging in, go back to the mobile site and hand its cookies to the client.
-                utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
-                await self.context_page.goto(self.mobile_index_url)
-                await asyncio.sleep(2)
-                await self.wb_client.update_cookies(browser_context=self.browser_context)
-
-            crawler_type_var.set(config.CRAWLER_TYPE)
-            if config.CRAWLER_TYPE == "search":
-                # Search notes by keyword and retrieve their comments.
-                await self.search()
-            elif config.CRAWLER_TYPE == "detail":
-                # Get the information and comments of the specified notes.
-                await self.get_specified_notes()
-            elif config.CRAWLER_TYPE == "creator":
-                # Get creators' information and their notes and comments.
-                await self.get_creators_and_notes()
-            # Any other crawler type is silently ignored.
-            utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
-
-    async def search(self):
-        """Search notes for every keyword in ``config.KEYWORDS`` and store notes, images and comments."""
-        utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
-        weibo_limit_count = 10  # fixed number of results per page
-        # At least one full page is always crawled; note that this writes back to the config.
-        if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
-            config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        start_page = config.START_PAGE
-
-        # Map the configured search type name onto the enum.
-        search_type_by_name = {
-            "default": SearchType.DEFAULT,
-            "real_time": SearchType.REAL_TIME,
-            "popular": SearchType.POPULAR,
-            "video": SearchType.VIDEO,
-        }
-        search_type = search_type_by_name.get(config.WEIBO_SEARCH_TYPE)
-        if search_type is None:
-            utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
-            return
-
-        for keyword in config.KEYWORDS.split(","):
-            source_keyword_var.set(keyword)
-            utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
-            page = 1
-            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                if page < start_page:
-                    utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
-                    page += 1
-                    continue
-                utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
-                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
-                note_id_list: List[str] = []
-                # A response without "cards" yields None; treat it as an empty page.
-                note_list = filter_search_result_card(search_res.get("cards") or [])
-                for note_item in note_list:
-                    if not note_item:
-                        continue
-                    mblog: Dict = note_item.get("mblog")
-                    if mblog:
-                        note_id_list.append(mblog.get("id"))
-                        await weibo_store.update_weibo_note(note_item)
-                        await self.get_note_images(mblog)
-
-                page += 1
-                await self.batch_get_notes_comments(note_id_list)
-
-    async def get_specified_notes(self):
-        """Fetch and store the notes listed in ``config.WEIBO_SPECIFIED_ID_LIST``, then their comments."""
-        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-        task_list = [self.get_note_info_task(note_id=note_id, semaphore=semaphore) for note_id in config.WEIBO_SPECIFIED_ID_LIST]
-        note_details = await asyncio.gather(*task_list)
-        for note_item in note_details:
-            if note_item:
-                await weibo_store.update_weibo_note(note_item)
-        await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
-
-    async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
-        """Fetch one note's detail while holding ``semaphore``.
-
-        :param note_id: note ID
-        :param semaphore: limits the number of concurrent fetches
-        :return: the note detail, or ``None`` when a ``DataFetchError`` or ``KeyError`` occurred
-        """
-        async with semaphore:
-            try:
-                return await self.wb_client.get_note_info_by_id(note_id)
-            except DataFetchError as ex:
-                utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
-                return None
-            except KeyError as ex:
-                utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
-                return None
-
-    async def batch_get_notes_comments(self, note_id_list: List[str]):
-        """Fetch the comments of several notes concurrently.
-
-        Does nothing when ``config.ENABLE_GET_COMMENTS`` is off.
-
-        :param note_id_list: IDs of the notes whose comments are fetched
-        """
-        if not config.ENABLE_GET_COMMENTS:
-            utils.logger.info("[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
-            return
-
-        utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
-        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-        task_list: List[Task] = [
-            asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
-            for note_id in note_id_list
-        ]
-        await asyncio.gather(*task_list)
-
-    async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
-        """Fetch and store all comments of one note; failures are logged, not raised.
-
-        :param note_id: note ID
-        :param semaphore: limits the number of concurrent fetches
-        """
-        async with semaphore:
-            try:
-                utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
-                await self.wb_client.get_note_all_comments(
-                    note_id=note_id,
-                    crawl_interval=random.randint(1, 3),  # the API is rate limited heavily, so wait longer
-                    callback=weibo_store.batch_update_weibo_note_comments,
-                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
-                )
-            except DataFetchError as ex:
-                utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
-            except Exception as e:
-                # Deliberately broad so one failing note does not abort the whole batch.
-                utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
-
-    async def get_note_images(self, mblog: Dict):
-        """Download and store the images attached to a note.
-
-        Does nothing when ``config.ENABLE_GET_MEIDAS`` is off.
-
-        :param mblog: note data; its ``pics`` entry is read
-        """
-        if not config.ENABLE_GET_MEIDAS:
-            utils.logger.info("[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
-            return
-
-        pics: Optional[List[Dict]] = mblog.get("pics")
-        if not pics:
-            return
-        for pic in pics:
-            url = pic.get("url")
-            if not url:
-                continue
-            content = await self.wb_client.get_note_image(url)
-            await asyncio.sleep(random.random())
-            # The client returns None for a failed download.
-            if content is not None:
-                extension_file_name = url.split(".")[-1]
-                await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
-
-    async def get_creators_and_notes(self) -> None:
-        """Store each creator in ``config.WEIBO_CREATOR_ID_LIST`` with their notes and comments.
-
-        Raises:
-            DataFetchError: when a creator response carries no ``userInfo``.
-        """
-        utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
-        for user_id in config.WEIBO_CREATOR_ID_LIST:
-            creator_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
-            if not creator_info_res:
-                utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
-                continue
-
-            creator_info: Dict = creator_info_res.get("userInfo", {})
-            utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
-            if not creator_info:
-                raise DataFetchError("Get creator info error")
-            await weibo_store.save_creator(user_id, user_info=creator_info)
-
-            # Get all notes of the creator; each page is stored through the callback.
-            all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
-                creator_id=user_id,
-                container_id=creator_info_res.get("lfid_container_id"),
-                crawl_interval=0,
-                callback=weibo_store.batch_update_weibo_notes,
-            )
-
-            note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
-            await self.batch_get_notes_comments(note_ids)
-
-    async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
-        """Create the Weibo API client, seeded with the browser context's cookies."""
-        utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
-        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
-        weibo_client_obj = WeiboClient(
-            proxy=httpx_proxy,
-            headers={
-                # NOTE(review): this asks for a user agent again instead of reusing
-                # self.mobile_user_agent given to the browser - confirm both may differ.
-                "User-Agent": utils.get_mobile_user_agent(),
-                "Cookie": cookie_str,
-                "Origin": "https://m.weibo.cn",
-                "Referer": "https://m.weibo.cn",
-                "Content-Type": "application/json;charset=UTF-8",
-            },
-            playwright_page=self.context_page,
-            cookie_dict=cookie_dict,
-        )
-        return weibo_client_obj
-
-    async def launch_browser(
-        self,
-        chromium: BrowserType,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
-    ) -> BrowserContext:
-        """Launch Chromium and return a browser context.
-
-        With ``config.SAVE_LOGIN_STATE`` a persistent context stored under
-        ``browser_data`` is used; otherwise a fresh browser and context are created.
-        """
-        utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
-        if config.SAVE_LOGIN_STATE:
-            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
-            browser_context = await chromium.launch_persistent_context(
-                user_data_dir=user_data_dir,
-                accept_downloads=True,
-                headless=headless,
-                proxy=playwright_proxy,  # type: ignore
-                viewport={
-                    "width": 1920,
-                    "height": 1080
-                },
-                user_agent=user_agent,
-            )
-            return browser_context
-
-        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
-        browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
-        return browser_context
-
-    async def launch_browser_with_cdp(
-        self,
-        playwright: Playwright,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
-    ) -> BrowserContext:
-        """Launch the browser through the CDP manager, falling back to the standard launch on any error."""
-        try:
-            self.cdp_manager = CDPBrowserManager()
-            browser_context = await self.cdp_manager.launch_and_connect(
-                playwright=playwright,
-                playwright_proxy=playwright_proxy,
-                user_agent=user_agent,
-                headless=headless,
-            )
-
-            # Log the browser information.
-            browser_info = await self.cdp_manager.get_browser_info()
-            utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
-
-            return browser_context
-
-        except Exception as e:
-            utils.logger.error(f"[WeiboCrawler] CDP模式启动失败，回退到标准模式: {e}")
-            # Fall back to the standard launch mode.
-            chromium = playwright.chromium
-            return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
-
-    async def close(self):
-        """Close the browser: clean up the CDP manager if one is set, else close the context."""
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            # browser_context is only assigned in start(); tolerate close() before that.
-            browser_context = getattr(self, "browser_context", None)
-            if browser_context is not None:
-                await browser_context.close()
-        utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
@@ -1,25 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
-# 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/2 18:44
-# @Desc    :
-
-from httpx import RequestError
-
-
-class DataFetchError(RequestError):
-    """Raised when fetching data from the remote site fails."""
-
-
-class IPBlockError(RequestError):
-    """Raised when requests were sent so fast that the server blocked our IP."""
@@ -1,30 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
-# 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/23 15:41
-# @Desc    :
-from enum import Enum
-
-
-class SearchType(Enum):
-    """Weibo search categories; each value is the type code used when building a search request."""
-
-    DEFAULT = "1"  # comprehensive
-    REAL_TIME = "61"  # real-time
-    POPULAR = "60"  # popular
-    VIDEO = "64"  # video
@@ -1,36 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
-# 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/24 17:37
-# @Desc    :
-
-from typing import Dict, List
-
-
-def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
-    """
-    Keep only the cards whose card_type is 9 from a Weibo search result, looking at
-    top-level cards and at each card's "card_group"; a null group counts as empty.
-    :param card_list: card dicts from the search response
-    :return: matching cards in encounter order (a card precedes its group's items)
-    """
-    note_list: List[Dict] = []
-    for card_item in card_list:
-        if card_item.get("card_type") == 9:
-            note_list.append(card_item)
-        # `or []` rather than a .get default: the default does not cover an explicit None
-        for card_group_item in card_item.get("card_group") or []:
-            if card_group_item.get("card_type") == 9:
-                note_list.append(card_group_item)
-
-    return note_list
@@ -1,123 +0,0 @@
-# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：  
-# 1. 不得用于任何商业用途。  
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。  
-# 3. 不得进行大规模爬取或对平台造成运营干扰。  
-# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。   
-# 5. 不得用于任何非法或不当的用途。
-#   
-# 详细许可条款请参阅项目根目录下的LICENSE文件。  
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。  
-
-
-# -*- coding: utf-8 -*-
-# @Author  : relakkes@gmail.com
-# @Time    : 2023/12/23 15:42
-# @Desc    : 微博登录实现
-
-import asyncio
-import functools
-import sys
-from typing import Optional
-
-from playwright.async_api import BrowserContext, Page
-from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
-                      wait_fixed)
-
-import config
-from base.base_crawler import AbstractLogin
-from tools import utils
-
-
-class WeiboLogin(AbstractLogin):
-    """Log a Playwright browser context into Weibo by QR code or by cookies."""
-
-    LOGIN_CHECK_MAX_ATTEMPTS = 600  # check_login_state tries, spaced one second apart
-
-    def __init__(self,
-                 login_type: str,
-                 browser_context: BrowserContext,
-                 context_page: Page,
-                 login_phone: Optional[str] = "",
-                 cookie_str: str = ""
-                 ):
-        """Store the login settings; note that this overwrites the global config.LOGIN_TYPE."""
-        config.LOGIN_TYPE = login_type
-        self.browser_context = browser_context
-        self.context_page = context_page
-        self.login_phone = login_phone
-        self.cookie_str = cookie_str
-        self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
-
-    async def begin(self):
-        """Run the login flow chosen by config.LOGIN_TYPE; raise ValueError for an unknown type."""
-        utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
-        if config.LOGIN_TYPE == "qrcode":
-            await self.login_by_qrcode()
-        elif config.LOGIN_TYPE == "phone":
-            await self.login_by_mobile()
-        elif config.LOGIN_TYPE == "cookie":
-            await self.login_by_cookies()
-        else:
-            raise ValueError(
-                "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
-
-    @retry(stop=stop_after_attempt(LOGIN_CHECK_MAX_ATTEMPTS), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
-    async def check_login_state(self, no_logged_in_session: str) -> bool:
-        """
-        Report whether the context holds a logged-in session: SSOLoginState is set, or WBPSESS
-        differs from no_logged_in_session (the WBPSESS value captured before login).
-        Retried every second while False; tenacity raises RetryError after LOGIN_CHECK_MAX_ATTEMPTS tries.
-        """
-        _, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
-        if cookie_dict.get("SSOLoginState"):
-            return True
-        return cookie_dict.get("WBPSESS") != no_logged_in_session
-
-    async def login_by_qrcode(self):
-        """Show the SSO login QR code and wait for it to be scanned; exit the process on failure."""
-        utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
-        await self.context_page.goto(self.weibo_sso_login_url)
-        # find login qrcode
-        qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
-        base64_qrcode_img = await utils.find_login_qrcode(
-            self.context_page,
-            selector=qrcode_img_selector
-        )
-        if not base64_qrcode_img:
-            utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
-            sys.exit()
-
-        # show login qrcode in the default executor; not awaited, so the login wait below is not blocked
-        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
-        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
-
-        utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is {self.LOGIN_CHECK_MAX_ATTEMPTS}s")
-
-        # remember the pre-login WBPSESS so check_login_state can detect a change
-        _, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
-        no_logged_in_session = cookie_dict.get("WBPSESS")
-
-        try:
-            await self.check_login_state(no_logged_in_session)
-        except RetryError:
-            utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
-            sys.exit()
-
-        wait_redirect_seconds = 5
-        utils.logger.info(
-            f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
-        await asyncio.sleep(wait_redirect_seconds)
-
-    async def login_by_mobile(self):
-        """Phone login is not implemented; log a warning instead of silently doing nothing."""
-        utils.logger.warning("[WeiboLogin.login_by_mobile] Login by phone is not implemented, nothing was done ...")
-
-    async def login_by_cookies(self):
-        """Add every name/value pair parsed from cookie_str to the context for domain .weibo.cn."""
-        utils.logger.info("[WeiboLogin.login_by_cookies] Begin login weibo by cookie ...")
-        cookies = [
-            {'name': key, 'value': value, 'domain': ".weibo.cn", 'path': "/"}
-            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
-        ]
-        if cookies:  # one batched call instead of one awaited call per cookie
-            await self.browser_context.add_cookies(cookies)