The framework has been restructured again, and the Flask framework has been abandoned.

This commit is contained in:
戒酒的李白
2025-08-22 13:52:05 +08:00
parent 15b3a3343b
commit 0c31be4287
279 changed files with 2725 additions and 1648837 deletions
@@ -1,18 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:40
# @Desc :
from .client import WeiboClient
from .core import WeiboCrawler
from .login import WeiboLogin
@@ -1,381 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:40
# @Desc : 微博爬虫 API 请求 client
import asyncio
import copy
import json
import re
from typing import Callable, Dict, List, Optional, Union
from urllib.parse import parse_qs, unquote, urlencode
import httpx
from httpx import Response
from playwright.async_api import BrowserContext, Page
import config
from tools import utils
from .exception import DataFetchError
from .field import SearchType
class WeiboClient:
def __init__(
self,
timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
proxy=None,
*,
headers: Dict[str, str],
playwright_page: Page,
cookie_dict: Dict[str, str],
):
self.proxy = proxy
self.timeout = timeout
self.headers = headers
self._host = "https://m.weibo.cn"
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
self._image_agent_host = "https://i1.wp.com/"
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
enable_return_response = kwargs.pop("return_response", False)
async with httpx.AsyncClient(proxy=self.proxy) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
if enable_return_response:
return response
data: Dict = response.json()
ok_code = data.get("ok")
if ok_code == 0: # response error
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "response error"))
elif ok_code != 1: # unknown error
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "unknown error"))
else: # response right
return data.get("data", {})
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
final_uri = uri
if isinstance(params, dict):
final_uri = (f"{uri}?"
f"{urlencode(params)}")
if headers is None:
headers = self.headers
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
async def post(self, uri: str, data: dict) -> Dict:
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)
async def pong(self) -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
ping_flag = False
try:
uri = "/api/config"
resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
if resp_data.get("login"):
ping_flag = True
else:
utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...")
except Exception as e:
utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
ping_flag = False
return ping_flag
async def update_cookies(self, browser_context: BrowserContext):
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict
async def get_note_by_keyword(
self,
keyword: str,
page: int = 1,
search_type: SearchType = SearchType.DEFAULT,
) -> Dict:
"""
search note by keyword
:param keyword: 微博搜搜的关键词
:param page: 分页参数 -当前页码
:param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType
:return:
"""
uri = "/api/container/getIndex"
containerid = f"100103type={search_type.value}&q={keyword}"
params = {
"containerid": containerid,
"page_type": "searchall",
"page": page,
}
return await self.get(uri, params)
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
"""get notes comments
:param mid_id: 微博ID
:param max_id: 分页参数ID
:param max_id_type: 分页参数ID类型
:return:
"""
uri = "/comments/hotflow"
params = {
"id": mid_id,
"mid": mid_id,
"max_id_type": max_id_type,
}
if max_id > 0:
params.update({"max_id": max_id})
referer_url = f"https://m.weibo.cn/detail/{mid_id}"
headers = copy.copy(self.headers)
headers["Referer"] = referer_url
return await self.get(uri, params, headers=headers)
async def get_note_all_comments(
self,
note_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
max_count: int = 10,
):
"""
get note all comments include sub comments
:param note_id:
:param crawl_interval:
:param callback:
:param max_count:
:return:
"""
result = []
is_end = False
max_id = -1
max_id_type = 0
while not is_end and len(result) < max_count:
comments_res = await self.get_note_comments(note_id, max_id, max_id_type)
max_id: int = comments_res.get("max_id")
max_id_type: int = comments_res.get("max_id_type")
comment_list: List[Dict] = comments_res.get("data", [])
is_end = max_id == 0
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
if callback: # 如果有回调函数,就执行回调函数
await callback(note_id, comment_list)
await asyncio.sleep(crawl_interval)
result.extend(comment_list)
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
result.extend(sub_comment_result)
return result
@staticmethod
async def get_comments_all_sub_comments(
note_id: str,
comment_list: List[Dict],
callback: Optional[Callable] = None,
) -> List[Dict]:
"""
获取评论的所有子评论
Args:
note_id:
comment_list:
callback:
Returns:
"""
if not config.ENABLE_GET_SUB_COMMENTS:
utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
return []
res_sub_comments = []
for comment in comment_list:
sub_comments = comment.get("comments")
if sub_comments and isinstance(sub_comments, list):
await callback(note_id, sub_comments)
res_sub_comments.extend(sub_comments)
return res_sub_comments
async def get_note_info_by_id(self, note_id: str) -> Dict:
"""
根据帖子ID获取详情
:param note_id:
:return:
"""
url = f"{self._host}/detail/{note_id}"
async with httpx.AsyncClient(proxy=self.proxy) as client:
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
if response.status_code != 200:
raise DataFetchError(f"get weibo detail err: {response.text}")
match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
if match:
render_data_json = match.group(1)
render_data_dict = json.loads(render_data_json)
note_detail = render_data_dict[0].get("status")
note_item = {"mblog": note_detail}
return note_item
else:
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
return dict()
async def get_note_image(self, image_url: str) -> bytes:
image_url = image_url[8:] # 去掉 https://
sub_url = image_url.split("/")
image_url = ""
for i in range(len(sub_url)):
if i == 1:
image_url += "large/" # 都获取高清大图
elif i == len(sub_url) - 1:
image_url += sub_url[i]
else:
image_url += sub_url[i] + "/"
# 微博图床对外存在防盗链,所以需要代理访问
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
final_uri = (f"{self._image_agent_host}"
f"{image_url}")
async with httpx.AsyncClient(proxy=self.proxy) as client:
try:
response = await client.request("GET", final_uri, timeout=self.timeout)
response.raise_for_status()
if not response.reason_phrase == "OK":
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
return None
else:
return response.content
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
return None
async def get_creator_container_info(self, creator_id: str) -> Dict:
"""
获取用户的容器ID, 容器信息代表着真实请求的API路径
fid_container_id:用户的微博详情API的容器ID
lfid_container_id:用户的微博列表API的容器ID
Args:
creator_id:
Returns: {
"""
response = await self.get(f"/u/{creator_id}", return_response=True)
m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
if not m_weibocn_params:
raise DataFetchError("get containerid failed")
m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
return {"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0], "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]}
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
"""
根据用户ID获取用户详情
Args:
creator_id:
Returns:
"""
uri = "/api/container/getIndex"
container_info = await self.get_creator_container_info(creator_id)
if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
raise DataFetchError("get containerid failed")
params = {
"jumpfrom": "weibocom",
"type": "uid",
"value": creator_id,
"containerid": container_info["fid_container_id"],
}
user_res = await self.get(uri, params)
if user_res.get("tabsInfo"):
tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
for tab in tabs:
if tab.get("tabKey") == "weibo":
container_info["lfid_container_id"] = tab.get("containerid")
break
user_res.update(container_info)
return user_res
async def get_notes_by_creator(
self,
creator: str,
container_id: str,
since_id: str = "0",
) -> Dict:
"""
获取博主的笔记
Args:
creator: 博主ID
container_id: 容器ID
since_id: 上一页最后一条笔记的ID
Returns:
"""
uri = "/api/container/getIndex"
params = {
"jumpfrom": "weibocom",
"type": "uid",
"value": creator,
"containerid": container_id,
"since_id": since_id,
}
return await self.get(uri, params)
async def get_all_notes_by_creator_id(
self,
creator_id: str,
container_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
) -> List[Dict]:
"""
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
Args:
creator_id:
container_id:
crawl_interval:
callback:
Returns:
"""
result = []
notes_has_more = True
since_id = ""
crawler_total_count = 0
while notes_has_more:
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
if not notes_res:
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
break
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
if "cards" not in notes_res:
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
break
notes = notes_res["cards"]
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
notes = [note for note in notes if note.get("card_type") == 9]
if callback:
await callback(notes)
await asyncio.sleep(crawl_interval)
result.extend(notes)
crawler_total_count += 10
notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
return result
@@ -1,373 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:41
# @Desc : 微博爬虫主流程代码
import asyncio
import os
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import weibo as weibo_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var
from .client import WeiboClient
from .exception import DataFetchError
from .field import SearchType
from .help import filter_search_result_card
from .login import WeiboLogin
class WeiboCrawler(AbstractCrawler):
context_page: Page
wb_client: WeiboClient
browser_context: BrowserContext
cdp_manager: Optional[CDPBrowserManager]
def __init__(self):
self.index_url = "https://www.weibo.com"
self.mobile_index_url = "https://m.weibo.cn"
self.user_agent = utils.get_user_agent()
self.mobile_user_agent = utils.get_mobile_user_agent()
self.cdp_manager = None
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
async with async_playwright() as playwright:
# 根据配置选择启动模式
if config.ENABLE_CDP_MODE:
utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
self.browser_context = await self.launch_browser_with_cdp(
playwright,
playwright_proxy_format,
self.mobile_user_agent,
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.mobile_index_url)
# Create a client to interact with the xiaohongshu website.
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
if not await self.wb_client.pong():
login_obj = WeiboLogin(
login_type=config.LOGIN_TYPE,
login_phone="", # your phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES,
)
await login_obj.begin()
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
await self.context_page.goto(self.mobile_index_url)
await asyncio.sleep(2)
await self.wb_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
else:
pass
utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
async def search(self):
"""
search weibo note with keywords
:return:
"""
utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
weibo_limit_count = 10 # weibo limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
start_page = config.START_PAGE
# Set the search type based on the configuration for weibo
if config.WEIBO_SEARCH_TYPE == "default":
search_type = SearchType.DEFAULT
elif config.WEIBO_SEARCH_TYPE == "real_time":
search_type = SearchType.REAL_TIME
elif config.WEIBO_SEARCH_TYPE == "popular":
search_type = SearchType.POPULAR
elif config.WEIBO_SEARCH_TYPE == "video":
search_type = SearchType.VIDEO
else:
utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
return
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
page = 1
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
page += 1
continue
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
note_id_list: List[str] = []
note_list = filter_search_result_card(search_res.get("cards"))
for note_item in note_list:
if note_item:
mblog: Dict = note_item.get("mblog")
if mblog:
note_id_list.append(mblog.get("id"))
await weibo_store.update_weibo_note(note_item)
await self.get_note_images(mblog)
page += 1
await self.batch_get_notes_comments(note_id_list)
async def get_specified_notes(self):
"""
get specified notes info
:return:
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_note_info_task(note_id=note_id, semaphore=semaphore) for note_id in config.WEIBO_SPECIFIED_ID_LIST]
video_details = await asyncio.gather(*task_list)
for note_item in video_details:
if note_item:
await weibo_store.update_weibo_note(note_item)
await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
"""
Get note detail task
:param note_id:
:param semaphore:
:return:
"""
async with semaphore:
try:
result = await self.wb_client.get_note_info_by_id(note_id)
return result
except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
return None
async def batch_get_notes_comments(self, note_id_list: List[str]):
"""
batch get notes comments
:param note_id_list:
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return
utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for note_id in note_id_list:
task = asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
task_list.append(task)
await asyncio.gather(*task_list)
async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
"""
get comment for note id
:param note_id:
:param semaphore:
:return:
"""
async with semaphore:
try:
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
await self.wb_client.get_note_all_comments(
note_id=note_id,
crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重,所以延时提高一些
callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
except Exception as e:
utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
async def get_note_images(self, mblog: Dict):
"""
get note images
:param mblog:
:return:
"""
if not config.ENABLE_GET_MEIDAS:
utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
return
pics: Dict = mblog.get("pics")
if not pics:
return
for pic in pics:
url = pic.get("url")
if not url:
continue
content = await self.wb_client.get_note_image(url)
await asyncio.sleep(random.random())
if content != None:
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
async def get_creators_and_notes(self) -> None:
"""
Get creator's information and their notes and comments
Returns:
"""
utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
for user_id in config.WEIBO_CREATOR_ID_LIST:
createor_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
if createor_info_res:
createor_info: Dict = createor_info_res.get("userInfo", {})
utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}")
if not createor_info:
raise DataFetchError("Get creator info error")
await weibo_store.save_creator(user_id, user_info=createor_info)
# Get all note information of the creator
all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
creator_id=user_id,
container_id=createor_info_res.get("lfid_container_id"),
crawl_interval=0,
callback=weibo_store.batch_update_weibo_notes,
)
note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
await self.batch_get_notes_comments(note_ids)
else:
utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
"""Create xhs client"""
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
weibo_client_obj = WeiboClient(
proxy=httpx_proxy,
headers={
"User-Agent": utils.get_mobile_user_agent(),
"Cookie": cookie_str,
"Origin": "https://m.weibo.cn",
"Referer": "https://m.weibo.cn",
"Content-Type": "application/json;charset=UTF-8",
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return weibo_client_obj
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent,
)
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
return browser_context
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
"""
try:
self.cdp_manager = CDPBrowserManager()
browser_context = await self.cdp_manager.launch_and_connect(
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless,
)
# 显示浏览器信息
browser_info = await self.cdp_manager.get_browser_info()
utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
return browser_context
except Exception as e:
utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
# 回退到标准模式
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
async def close(self):
"""Close browser context"""
# 如果使用CDP模式,需要特殊处理
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
else:
await self.browser_context.close()
utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
@@ -1,25 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 18:44
# @Desc :
from httpx import RequestError
class DataFetchError(RequestError):
"""something error when fetch"""
class IPBlockError(RequestError):
"""fetch so fast that the server block us ip"""
@@ -1,30 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:41
# @Desc :
from enum import Enum
class SearchType(Enum):
# 综合
DEFAULT = "1"
# 实时
REAL_TIME = "61"
# 热门
POPULAR = "60"
# 视频
VIDEO = "64"
@@ -1,36 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/24 17:37
# @Desc :
from typing import Dict, List
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
"""
过滤微博搜索的结果,只保留card_type为9类型的数据
:param card_list:
:return:
"""
note_list: List[Dict] = []
for card_item in card_list:
if card_item.get("card_type") == 9:
note_list.append(card_item)
if len(card_item.get("card_group", [])) > 0:
card_group = card_item.get("card_group")
for card_group_item in card_group:
if card_group_item.get("card_type") == 9:
note_list.append(card_group_item)
return note_list
@@ -1,123 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:42
# @Desc : 微博登录实现
import asyncio
import functools
import sys
from typing import Optional
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from tools import utils
class WeiboLogin(AbstractLogin):
def __init__(self,
login_type: str,
browser_context: BrowserContext,
context_page: Page,
login_phone: Optional[str] = "",
cookie_str: str = ""
):
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
self.cookie_str = cookie_str
self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
async def begin(self):
"""Start login weibo"""
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError(
"[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self, no_logged_in_session: str) -> bool:
"""
Check if the current login status is successful and return True otherwise return False
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
if max retry times reached, raise RetryError
"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
if cookie_dict.get("SSOLoginState"):
return True
current_web_session = cookie_dict.get("WBPSESS")
if current_web_session != no_logged_in_session:
return True
return False
async def login_by_qrcode(self):
"""login weibo website and keep webdriver login state"""
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
await self.context_page.goto(self.weibo_sso_login_url)
# find login qrcode
qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
sys.exit()
# show login qrcode
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
# get not logged session
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("WBPSESS")
try:
await self.check_login_state(no_logged_in_session)
except RetryError:
utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
sys.exit()
wait_redirect_seconds = 5
utils.logger.info(
f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_mobile(self):
pass
async def login_by_cookies(self):
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".weibo.cn",
'path': "/"
}])