The framework has been restructured again, and the Flask framework has been abandoned.
This commit is contained in:
@@ -1,18 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc :
|
||||
from .client import WeiboClient
|
||||
from .core import WeiboCrawler
|
||||
from .login import WeiboLogin
|
||||
@@ -1,381 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc : 微博爬虫 API 请求 client
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import parse_qs, unquote, urlencode
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
|
||||
|
||||
class WeiboClient:
|
||||
|
||||
def __init__(
    self,
    timeout=60,  # weibo images need a longer timeout when media crawling is enabled
    proxy=None,
    *,
    headers: Dict[str, str],
    playwright_page: Page,
    cookie_dict: Dict[str, str],
):
    """Store the connection settings and session state shared by all requests.

    :param timeout: timeout handed to every httpx request
    :param proxy: proxy handed to ``httpx.AsyncClient`` (``None`` means no proxy)
    :param headers: default headers sent with API requests
    :param playwright_page: Playwright page associated with this client
    :param cookie_dict: cookies of the current session as a name -> value mapping
    """
    # Fixed endpoints: the mobile API host and the image relay host.
    self._host = "https://m.weibo.cn"
    self._image_agent_host = "https://i1.wp.com/"

    # Per-instance transport settings.
    self.timeout = timeout
    self.proxy = proxy
    self.headers = headers

    # Browser-side session state.
    self.cookie_dict = cookie_dict
    self.playwright_page = playwright_page
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
    """Send one HTTP request and unwrap the weibo JSON envelope.

    :param method: HTTP method name
    :param url: absolute URL to request
    :param kwargs: forwarded to ``httpx.AsyncClient.request``; the extra key
        ``return_response`` (default ``False``) asks for the raw response
    :return: the raw response when ``return_response`` is true, otherwise the
        ``data`` member of the JSON body (``{}`` when absent)
    :raises DataFetchError: when the ``ok`` field of the JSON body is not 1
    """
    want_raw_response = kwargs.pop("return_response", False)
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)

    if want_raw_response:
        return response

    payload: Dict = response.json()
    status = payload.get("ok")
    if status == 1:  # success
        return payload.get("data", {})

    # ok == 0 is an explicit API error; any other value is an unknown failure.
    fallback_message = "response error" if status == 0 else "unknown error"
    utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{payload}")
    raise DataFetchError(payload.get("msg", fallback_message))
|
||||
|
||||
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
    """Issue a GET request against the mobile API host.

    :param uri: path appended to the host
    :param params: when a dict, url-encoded and appended as the query string
    :param headers: headers for this call; the client defaults are used when ``None``
    :param kwargs: forwarded unchanged to :meth:`request`
    :return: whatever :meth:`request` returns
    """
    query_string = f"?{urlencode(params)}" if isinstance(params, dict) else ""
    effective_headers = self.headers if headers is None else headers
    target_url = f"{self._host}{uri}{query_string}"
    return await self.request(method="GET", url=target_url, headers=effective_headers, **kwargs)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
    """POST ``data`` as a compact JSON document to the mobile API host.

    :param uri: path appended to the host
    :param data: JSON-serialisable payload
    :return: whatever :meth:`request` returns
    """
    json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
    # The body is already-serialised text, so it goes through httpx's ``content=``
    # parameter; ``data=`` is meant for form fields and is deprecated for raw text.
    return await self.request(method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers)
|
||||
|
||||
async def pong(self) -> bool:
    """Check whether the current cookies belong to a logged-in session.

    Requests ``/api/config`` and looks at its ``login`` flag. Any exception
    raised on the way is logged and reported as "not logged in".

    :return: ``True`` when the API reports a logged-in session, else ``False``
    """
    utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
    config_url = f"{self._host}/api/config"
    try:
        resp_data: Dict = await self.request(method="GET", url=config_url, headers=self.headers)
        logged_in = bool(resp_data.get("login"))
    except Exception as e:
        utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
        return False

    if not logged_in:
        utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...")
    return logged_in
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """Copy the browser context's current cookies into this client.

    Both the ``Cookie`` request header and ``cookie_dict`` are replaced.

    :param browser_context: Playwright browser context to read cookies from
    """
    browser_cookies = await browser_context.cookies()
    cookie_header, cookie_map = utils.convert_cookies(browser_cookies)
    self.cookie_dict = cookie_map
    self.headers["Cookie"] = cookie_header
|
||||
|
||||
async def get_note_by_keyword(
    self,
    keyword: str,
    page: int = 1,
    search_type: SearchType = SearchType.DEFAULT,
) -> Dict:
    """
    Search notes by keyword.

    :param keyword: keyword to search weibo for
    :param page: pagination parameter - the page number to fetch
    :param search_type: kind of search, one of the ``SearchType`` enum members
    :return: the ``data`` payload of the search response
    """
    query = {
        # The search type and the keyword are both packed into the container id.
        "containerid": f"100103type={search_type.value}&q={keyword}",
        "page_type": "searchall",
        "page": page,
    }
    return await self.get("/api/container/getIndex", query)
|
||||
|
||||
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
    """Fetch one page of comments for a note.

    :param mid_id: weibo (note) id
    :param max_id: pagination cursor; only sent when greater than 0
    :param max_id_type: type of the pagination cursor
    :return: the ``data`` payload of the comments response
    """
    query = {"id": mid_id, "mid": mid_id, "max_id_type": max_id_type}
    if max_id > 0:
        query["max_id"] = max_id

    # Work on a copy so the per-note Referer does not leak into the shared headers.
    request_headers = copy.copy(self.headers)
    request_headers["Referer"] = f"https://m.weibo.cn/detail/{mid_id}"

    return await self.get("/comments/hotflow", query, headers=request_headers)
|
||||
|
||||
async def get_note_all_comments(
    self,
    note_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 10,
):
    """
    Collect the comments of a note page by page, including sub comments.

    :param note_id: id of the note whose comments are fetched
    :param crawl_interval: delay awaited after every page
    :param callback: optional coroutine function called as ``callback(note_id, comments)``
        for every page of comments
    :param max_count: maximum number of top-level comments to collect; sub comments
        are appended on top of that, so the result may be longer
    :return: list of collected comments and sub comments
    """
    result = []
    is_end = False
    max_id = -1
    max_id_type = 0
    while not is_end and len(result) < max_count:
        comments_res = await self.get_note_comments(note_id, max_id, max_id_type)
        # A missing or null cursor means there is no further page. Normalising it
        # to 0 ends the loop instead of sending None back into get_note_comments,
        # where the ``max_id > 0`` comparison would raise TypeError.
        max_id = comments_res.get("max_id") or 0
        max_id_type = comments_res.get("max_id_type") or 0
        comment_list: List[Dict] = comments_res.get("data") or []
        is_end = max_id == 0

        # Never hand out more top-level comments than requested.
        remaining = max_count - len(result)
        if len(comment_list) > remaining:
            comment_list = comment_list[:remaining]

        if callback:  # run the callback when one was supplied
            await callback(note_id, comment_list)
        await asyncio.sleep(crawl_interval)

        result.extend(comment_list)
        sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
        result.extend(sub_comment_result)
    return result
|
||||
|
||||
@staticmethod
async def get_comments_all_sub_comments(
    note_id: str,
    comment_list: List[Dict],
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Collect the sub comments embedded in a list of comments.

    Args:
        note_id: id of the note the comments belong to
        comment_list: comments whose ``comments`` member holds their sub comments
        callback: optional coroutine function called as ``callback(note_id, sub_comments)``
            for every comment that carries sub comments

    Returns:
        All sub comments found, or an empty list when
        ``config.ENABLE_GET_SUB_COMMENTS`` is off.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    res_sub_comments = []
    for comment in comment_list:
        sub_comments = comment.get("comments")
        # Skip comments whose ``comments`` member is empty or not a list.
        if sub_comments and isinstance(sub_comments, list):
            # callback defaults to None; awaiting it unconditionally raised TypeError.
            if callback:
                await callback(note_id, sub_comments)
            res_sub_comments.extend(sub_comments)
    return res_sub_comments
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
    """
    Fetch the detail of a note from its detail page.

    The detail page embeds the note as JSON in a ``$render_data`` script
    variable; that JSON is extracted with a regular expression.

    :param note_id: id of the note
    :return: ``{"mblog": <status>}``, or an empty dict when ``$render_data`` is not found
    :raises DataFetchError: when the page does not answer with HTTP 200
    """
    detail_url = f"{self._host}/detail/{note_id}"
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request("GET", detail_url, timeout=self.timeout, headers=self.headers)

    if response.status_code != 200:
        raise DataFetchError(f"get weibo detail err: {response.text}")

    found = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
    if not found:
        utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
        return dict()

    render_data = json.loads(found.group(1))
    return {"mblog": render_data[0].get("status")}
|
||||
|
||||
async def get_note_image(self, image_url: str) -> Optional[bytes]:
    """Download the large variant of a weibo image through the image relay host.

    The second path segment of the URL (the size variant) is replaced with
    ``large``. Weibo's image hosting has hotlink protection, so the request is
    sent to ``self._image_agent_host`` with the scheme-less image URL appended.

    :param image_url: original image URL
    :return: the image bytes, or ``None`` when the download fails
    """
    # Drop the scheme whatever it is (the original only handled "https://").
    path_parts = image_url.split("://", 1)[-1].split("/")
    # Always fetch the high-resolution image. Only swap the size segment when
    # something follows it, so a two-segment URL keeps its file name.
    if len(path_parts) > 2:
        path_parts[1] = "large"
    relay_path = "/".join(path_parts)
    final_uri = f"{self._image_agent_host}{relay_path}"

    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", final_uri, timeout=self.timeout)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Covers connection errors as well as error status codes. The URL is
            # taken from final_uri because exc.request raises RuntimeError when
            # no request is attached. The class name is kept to ease debugging.
            utils.logger.error(f"[WeiboClient.get_note_image] {exc.__class__.__name__} for {final_uri} - {exc}")
            return None

    if response.reason_phrase != "OK":
        utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
        return None
    return response.content
|
||||
|
||||
async def get_creator_container_info(self, creator_id: str) -> Dict:
    """
    Look up the container ids of a creator.

    The ids are read from the ``M_WEIBOCN_PARAMS`` cookie set by the creator's
    profile page. Per the original notes, container ids determine the real API
    path: ``fid_container_id`` is for the creator detail API and
    ``lfid_container_id`` for the creator's note list API.

    Args:
        creator_id: id of the creator

    Returns:
        ``{"fid_container_id": ..., "lfid_container_id": ...}``; a value is ``""``
        when the cookie does not carry it.

    Raises:
        DataFetchError: when the cookie is missing or empty.
    """
    response = await self.get(f"/u/{creator_id}", return_response=True)
    raw_params = response.cookies.get("M_WEIBOCN_PARAMS")
    if not raw_params:
        raise DataFetchError("get containerid failed")

    # The cookie value is a url-encoded query string.
    parsed = parse_qs(unquote(raw_params))
    fid = parsed.get("fid", [""])[0]
    lfid = parsed.get("lfid", [""])[0]
    return {"fid_container_id": fid, "lfid_container_id": lfid}
|
||||
|
||||
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
    """
    Fetch the profile of a creator by id.

    Args:
        creator_id: id of the creator

    Returns:
        The API payload for the creator, extended with ``fid_container_id`` and
        ``lfid_container_id``. When the payload lists a tab whose ``tabKey`` is
        ``"weibo"``, that tab's container id replaces ``lfid_container_id``.

    Raises:
        DataFetchError: when either container id is an empty string.
    """
    container_info = await self.get_creator_container_info(creator_id)
    if "" in (container_info.get("fid_container_id"), container_info.get("lfid_container_id")):
        utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
        raise DataFetchError("get containerid failed")

    query = {
        "jumpfrom": "weibocom",
        "type": "uid",
        "value": creator_id,
        "containerid": container_info["fid_container_id"],
    }
    user_res = await self.get("/api/container/getIndex", query)

    if user_res.get("tabsInfo"):
        tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
        # Use the first tab keyed "weibo" as the note list container.
        weibo_tab = next((tab for tab in tabs if tab.get("tabKey") == "weibo"), None)
        if weibo_tab is not None:
            container_info["lfid_container_id"] = weibo_tab.get("containerid")

    user_res.update(container_info)
    return user_res
|
||||
|
||||
async def get_notes_by_creator(
    self,
    creator: str,
    container_id: str,
    since_id: str = "0",
) -> Dict:
    """
    Fetch one page of a creator's notes.

    Args:
        creator: id of the creator
        container_id: container id of the creator's note list
        since_id: id of the last note of the previous page

    Returns:
        The ``data`` payload of the note list response.
    """
    query = dict(
        jumpfrom="weibocom",
        type="uid",
        value=creator,
        containerid=container_id,
        since_id=since_id,
    )
    return await self.get("/api/container/getIndex", query)
|
||||
|
||||
async def get_all_notes_by_creator_id(
    self,
    creator_id: str,
    container_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Fetch every note a creator has posted, following pagination until exhausted.

    Args:
        creator_id: id of the creator
        container_id: container id of the creator's note list
        crawl_interval: delay awaited after every page
        callback: optional coroutine function called as ``callback(notes)`` per page

    Returns:
        All cards with ``card_type`` 9 found across the pages.
    """
    page_size = 10  # number of notes counted per fetched page
    result = []
    notes_has_more = True
    since_id = ""
    crawler_total_count = 0
    while notes_has_more:
        notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
        if not notes_res:
            utils.logger.error(f"[WeiboClient.get_all_notes_by_creator_id] The current creator may have been banned by weibo, so they cannot access the data.")
            break

        # ``or {}`` also covers a response carrying ``"cardlistInfo": null``.
        card_list_info = notes_res.get("cardlistInfo") or {}
        since_id = card_list_info.get("since_id", "0")
        if "cards" not in notes_res:
            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] No 'cards' key found in response: {notes_res}")
            break

        notes = notes_res["cards"]
        utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] got user_id:{creator_id} notes len : {len(notes)}")
        # Only cards of type 9 are kept as notes.
        notes = [note for note in notes if note.get("card_type") == 9]
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)

        crawler_total_count += page_size
        notes_has_more = card_list_info.get("total", 0) > crawler_total_count
    return result
|
||||
@@ -1,373 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc : 微博爬虫主流程代码
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import weibo as weibo_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import WeiboClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
from .help import filter_search_result_card
|
||||
from .login import WeiboLogin
|
||||
|
||||
|
||||
class WeiboCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
wb_client: WeiboClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
    """Set the site entry URLs and user agents; no browser is started yet."""
    # Desktop entry point and its user agent.
    self.index_url = "https://www.weibo.com"
    self.user_agent = utils.get_user_agent()

    # Mobile entry point and its user agent.
    self.mobile_index_url = "https://m.weibo.cn"
    self.mobile_user_agent = utils.get_mobile_user_agent()

    # Set by launch_browser_with_cdp when CDP mode is used.
    self.cdp_manager = None
|
||||
|
||||
async def start(self):
    """Run the crawler: launch the browser, ensure a login, then crawl.

    The crawl mode is taken from ``config.CRAWLER_TYPE`` (``search``, ``detail``
    or ``creator``); any other value crawls nothing.
    """
    # NOTE(review): SOURCE lost its indentation; the nesting below is reconstructed.
    playwright_proxy_format, httpx_proxy_format = None, None
    if config.ENABLE_IP_PROXY:
        ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
        playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

    async with async_playwright() as playwright:
        # Pick the launch mode from the configuration.
        if config.ENABLE_CDP_MODE:
            utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
            self.browser_context = await self.launch_browser_with_cdp(
                playwright,
                playwright_proxy_format,
                self.mobile_user_agent,
                headless=config.CDP_HEADLESS,
            )
        else:
            utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
            # Standard mode is launched without a browser proxy (None is passed).
            self.browser_context = await self.launch_browser(
                playwright.chromium,
                None,
                self.mobile_user_agent,
                headless=config.HEADLESS,
            )

        # stealth.min.js keeps the website from detecting the crawler.
        # NOTE(review): assumed to apply to both launch modes - confirm.
        await self.browser_context.add_init_script(path="libs/stealth.min.js")
        self.context_page = await self.browser_context.new_page()
        await self.context_page.goto(self.mobile_index_url)

        # Create the API client and log in when the session is not valid.
        self.wb_client = await self.create_weibo_client(httpx_proxy_format)
        if not await self.wb_client.pong():
            login_obj = WeiboLogin(
                login_type=config.LOGIN_TYPE,
                login_phone="",  # your phone number
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES,
            )
            await login_obj.begin()

            # After login, go back to the mobile site and pick up its cookies.
            utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
            await self.context_page.goto(self.mobile_index_url)
            await asyncio.sleep(2)
            await self.wb_client.update_cookies(browser_context=self.browser_context)

        crawler_type_var.set(config.CRAWLER_TYPE)
        crawl_entry_points = {
            "search": self.search,  # search notes and their comments
            "detail": self.get_specified_notes,  # specified notes and their comments
            "creator": self.get_creators_and_notes,  # creators, their notes and comments
        }
        crawl = crawl_entry_points.get(config.CRAWLER_TYPE)
        if crawl is not None:
            await crawl()

        utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
|
||||
|
||||
async def search(self):
    """
    Search weibo notes for every configured keyword and store the results.

    Keywords come from ``config.KEYWORDS`` (comma separated). Paging starts at
    ``config.START_PAGE`` and continues while the page budget derived from
    ``config.CRAWLER_MAX_NOTES_COUNT`` allows. For every result page the notes
    are stored, their images fetched and their comments crawled.

    :return: None
    """
    utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
    weibo_limit_count = 10  # weibo limit page fixed value
    # The budget is raised to at least one page (this writes back to config).
    if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
    start_page = config.START_PAGE

    # Map the configured search type name onto the SearchType enum.
    search_types = {
        "default": SearchType.DEFAULT,
        "real_time": SearchType.REAL_TIME,
        "popular": SearchType.POPULAR,
        "video": SearchType.VIDEO,
    }
    search_type = search_types.get(config.WEIBO_SEARCH_TYPE)
    if search_type is None:
        utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
        return

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
        page = 1
        while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
                page += 1
                continue

            utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
            search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
            note_id_list: List[str] = []
            # A page without results may carry no "cards" at all; treat it as empty
            # instead of handing None to the filter.
            note_list = filter_search_result_card(search_res.get("cards") or [])
            for note_item in note_list:
                if not note_item:
                    continue
                mblog: Dict = note_item.get("mblog")
                if not mblog:
                    continue
                note_id_list.append(mblog.get("id"))
                await weibo_store.update_weibo_note(note_item)
                await self.get_note_images(mblog)

            page += 1
            await self.batch_get_notes_comments(note_id_list)
|
||||
|
||||
async def get_specified_notes(self):
    """
    Fetch and store the notes listed in ``config.WEIBO_SPECIFIED_ID_LIST``.

    Note details are fetched concurrently, limited by
    ``config.MAX_CONCURRENCY_NUM``; afterwards the comments of all listed
    notes are crawled.

    :return: None
    """
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending = []
    for note_id in config.WEIBO_SPECIFIED_ID_LIST:
        pending.append(self.get_note_info_task(note_id=note_id, semaphore=limiter))

    note_details = await asyncio.gather(*pending)
    for note_item in note_details:
        if not note_item:  # failed lookups come back as None or an empty dict
            continue
        await weibo_store.update_weibo_note(note_item)

    await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
    """
    Fetch the detail of one note while holding the concurrency semaphore.

    :param note_id: id of the note
    :param semaphore: semaphore bounding concurrent fetches
    :return: the note detail, or ``None`` when a ``DataFetchError`` or
        ``KeyError`` occurred (the error is logged)
    """
    async with semaphore:
        try:
            return await self.wb_client.get_note_info_by_id(note_id)
        except DataFetchError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
        except KeyError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
        return None
|
||||
|
||||
async def batch_get_notes_comments(self, note_id_list: List[str]):
    """
    Crawl the comments of several notes concurrently.

    Does nothing when ``config.ENABLE_GET_COMMENTS`` is off. Concurrency is
    bounded by ``config.MAX_CONCURRENCY_NUM``.

    :param note_id_list: ids of the notes whose comments are crawled
    :return: None
    """
    if not config.ENABLE_GET_COMMENTS:
        utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
        return

    utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    # One task per note, named after the note id.
    task_list: List[Task] = [
        asyncio.create_task(self.get_note_comments(note_id, limiter), name=note_id)
        for note_id in note_id_list
    ]
    await asyncio.gather(*task_list)
|
||||
|
||||
async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
    """
    Crawl and store all comments of one note.

    Errors are logged and swallowed so one failing note does not stop the
    other tasks.

    :param note_id: id of the note
    :param semaphore: semaphore bounding concurrent crawls
    :return: None
    """
    async with semaphore:
        try:
            utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
            # Weibo throttles its API heavily, so a longer delay of 1-3 is used.
            page_delay = random.randint(1, 3)
            await self.wb_client.get_note_all_comments(
                note_id=note_id,
                crawl_interval=page_delay,
                callback=weibo_store.batch_update_weibo_note_comments,
                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
        except Exception as e:
            utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
|
||||
|
||||
async def get_note_images(self, mblog: Dict):
    """
    Download and store the images attached to a note.

    Does nothing when ``config.ENABLE_GET_MEIDAS`` is off or the note has no
    ``pics``.

    :param mblog: note payload; its ``pics`` member is iterated as a list of
        dicts carrying ``url`` and ``pid``
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
        return

    pics: List[Dict] = mblog.get("pics")
    if not pics:
        return
    for pic in pics:
        url = pic.get("url")
        if not url:
            continue
        content = await self.wb_client.get_note_image(url)
        # Random pause of up to one second between downloads.
        await asyncio.sleep(random.random())
        if content is not None:
            # The text after the last dot of the URL is used as the file extension.
            extension_file_name = url.split(".")[-1]
            await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """
    Crawl every creator in ``config.WEIBO_CREATOR_ID_LIST``.

    For each creator the profile is stored, then all notes are fetched and
    stored, then the comments of those notes are crawled.

    Raises:
        DataFetchError: when a creator response carries no ``userInfo``.
    """
    utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
    for user_id in config.WEIBO_CREATOR_ID_LIST:
        creator_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
        if not creator_res:
            utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
            continue

        creator_profile: Dict = creator_res.get("userInfo", {})
        utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_profile}")
        if not creator_profile:
            raise DataFetchError("Get creator info error")
        await weibo_store.save_creator(user_id, user_info=creator_profile)

        # Fetch every note of the creator; each page is stored via the callback.
        all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
            creator_id=user_id,
            container_id=creator_res.get("lfid_container_id"),
            crawl_interval=0,
            callback=weibo_store.batch_update_weibo_notes,
        )

        # Collect the ids of the notes that actually carry one.
        note_ids = []
        for note_item in all_notes_list:
            note_id = note_item.get("mblog", {}).get("id")
            if note_id:
                note_ids.append(note_id)
        await self.batch_get_notes_comments(note_ids)
|
||||
|
||||
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
    """Build a weibo API client that reuses the browser context's cookies.

    :param httpx_proxy: proxy handed to the client (``None`` for no proxy)
    :return: the configured ``WeiboClient``
    """
    utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
    cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
    default_headers = {
        "User-Agent": utils.get_mobile_user_agent(),
        "Cookie": cookie_str,
        "Origin": "https://m.weibo.cn",
        "Referer": "https://m.weibo.cn",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return WeiboClient(
        proxy=httpx_proxy,
        headers=default_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch Chromium and return a browser context with a 1920x1080 viewport.

    With ``config.SAVE_LOGIN_STATE`` on, a persistent context stored under
    ``browser_data/`` in the current working directory is used; otherwise a
    fresh browser and context are created.

    :param chromium: Playwright browser type to launch
    :param playwright_proxy: proxy settings for the browser, or ``None``
    :param user_agent: user agent of the context
    :param headless: whether to run without a visible window
    :return: the created browser context
    """
    utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)

    # The profile directory is derived from the configured platform name.
    profile_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport={"width": 1920, "height": 1080},
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to standard mode on failure.

    :param playwright: running Playwright instance
    :param playwright_proxy: proxy settings for the browser, or ``None``
    :param user_agent: user agent of the context
    :param headless: whether to run without a visible window
    :return: the created browser context
    """
    try:
        self.cdp_manager = CDPBrowserManager()
        cdp_context = await self.cdp_manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information.
        browser_info = await self.cdp_manager.get_browser_info()
        utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
        return cdp_context
    except Exception as e:
        utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
        # Fall back to standard mode.
        return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
    """Release the browser: clean up the CDP manager if one exists, else close the context."""
    if not self.cdp_manager:
        await self.browser_context.close()
    else:
        # CDP mode needs its own cleanup.
        await self.cdp_manager.cleanup()
        self.cdp_manager = None
    # NOTE(review): indentation was lost in SOURCE; this log is assumed to run in both cases.
    utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
|
||||
@@ -1,25 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc :
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
@@ -1,30 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc :
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchType(Enum):
    """Search modes; each value is the type code placed in the search container id."""

    # comprehensive (default)
    DEFAULT = "1"

    # real time
    REAL_TIME = "61"

    # popular
    POPULAR = "60"

    # video
    VIDEO = "64"
|
||||
@@ -1,36 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/24 17:37
|
||||
# @Desc :
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
    """
    Filter weibo search results, keeping only cards whose ``card_type`` is 9.

    Cards nested one level deep in a card's ``card_group`` are considered too.
    A top-level card comes before the matches found in its own group.

    :param card_list: cards of a search response; ``None`` is treated as empty
    :return: the matching cards, in encounter order
    """
    note_list: List[Dict] = []
    # ``or []`` tolerates a missing card list and a null ``card_group``.
    for card_item in card_list or []:
        if card_item.get("card_type") == 9:
            note_list.append(card_item)
        note_list.extend(
            grouped_card
            for grouped_card in card_item.get("card_group") or []
            if grouped_card.get("card_type") == 9
        )
    return note_list
|
||||
@@ -1,123 +0,0 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:42
|
||||
# @Desc : 微博登录实现
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class WeiboLogin(AbstractLogin):
    """Log a Playwright browser context in to Weibo via QR code or cookies."""

    # check_login_state is polled once per second, so this is also roughly the
    # number of seconds the user has to complete the QR code scan.
    _LOGIN_CHECK_MAX_ATTEMPTS = 600

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        :param login_type: one of "qrcode", "phone" or "cookie"; note that it is
            written to the global ``config.LOGIN_TYPE``
        :param browser_context: Playwright browser context whose cookies are
            read and written during login
        :param context_page: Playwright page used to open the login URL
        :param login_phone: phone number; stored but not used by any method in this class
        :param cookie_str: cookie string used by the "cookie" login type
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str
        self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"

    async def begin(self):
        """
        Start the Weibo login using the method selected by ``config.LOGIN_TYPE``.

        :raises ValueError: if the login type is not qrcode, phone or cookie
        """
        utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
                "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(stop=stop_after_attempt(_LOGIN_CHECK_MAX_ATTEMPTS), wait=wait_fixed(1),
           retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """
        Check whether the browser context is now logged in.

        Returns True when the ``SSOLoginState`` cookie is set, or when the
        ``WBPSESS`` cookie differs from the value captured before login;
        otherwise returns False.

        The retry decorator re-runs this check while it returns False, waiting
        1 second between attempts, for at most ``_LOGIN_CHECK_MAX_ATTEMPTS``
        (600) attempts; once they are exhausted a ``RetryError`` is raised.

        :param no_logged_in_session: ``WBPSESS`` cookie value observed before login
        :return: True if logged in, False otherwise
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        if cookie_dict.get("SSOLoginState"):
            return True
        current_web_session = cookie_dict.get("WBPSESS")
        return current_web_session != no_logged_in_session

    async def login_by_qrcode(self):
        """
        Open the Weibo SSO page, show its login QR code and wait for the scan.

        Exits the process via ``sys.exit()`` when the QR code cannot be found
        or when the login check runs out of retries.
        """
        utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
        await self.context_page.goto(self.weibo_sso_login_url)

        # find login qrcode
        qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode in the default executor so the event loop is not
        # blocked; the returned future is intentionally not awaited
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # The wait is bounded by the retry policy on check_login_state
        # (one attempt per second), not by a separate timer.
        utils.logger.info(
            f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is {self._LOGIN_CHECK_MAX_ATTEMPTS}s")

        # capture the session cookie before login so a change can be detected
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("WBPSESS")

        try:
            await self.check_login_state(no_logged_in_session)
        except RetryError:
            utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(
            f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented; this method is a no-op."""
        pass

    async def login_by_cookies(self):
        """Add every cookie parsed from ``self.cookie_str`` to the browser context for ``.weibo.cn``."""
        utils.logger.info("[WeiboLogin.login_by_cookies] Begin login weibo by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".weibo.cn",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # add all cookies in one call; skip the call entirely when there are none
        if cookies:
            await self.browser_context.add_cookies(cookies)
|
||||
Reference in New Issue
Block a user