Uploading the AI Crawler System: MindSpider
This commit is contained in:
@@ -0,0 +1,11 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:36
|
||||
# @Desc :
|
||||
|
||||
from .core import *
|
||||
@@ -0,0 +1,553 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibili 请求客户端
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import CommentOrderType, SearchOrderType
|
||||
from .help import BilibiliSign
|
||||
|
||||
|
||||
class BilibiliClient(AbstractApiClient):
|
||||
|
||||
def __init__(
    self,
    timeout=60,
    proxy=None,
    *,
    headers: Dict[str, str],
    playwright_page: Page,
    cookie_dict: Dict[str, str],
):
    """Remember the connection settings and browser handles for later calls.

    :param timeout: timeout value forwarded to every httpx request; the
        default is generous because long videos need more time when media
        download is enabled
    :param proxy: proxy handed to ``httpx.AsyncClient`` (``None`` by default)
    :param headers: HTTP headers sent with API requests
    :param playwright_page: page whose ``localStorage`` is read for WBI keys
    :param cookie_dict: cookies of the current session
    """
    # Root that every relative API path is appended to.
    self._host = "https://api.bilibili.com"
    self.timeout, self.proxy = timeout, proxy
    self.headers = headers
    self.cookie_dict = cookie_dict
    self.playwright_page = playwright_page
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
    """Send one HTTP request and unwrap the ``data`` field of the JSON reply.

    :param method: HTTP verb, e.g. ``"GET"``
    :param url: absolute URL to call
    :param kwargs: forwarded unchanged to ``httpx.AsyncClient.request``
    :return: the ``data`` member of the response envelope, or ``{}`` when it
        is missing or ``null``
    :raises DataFetchError: when the body is not JSON, is not a JSON object,
        or carries a ``code`` other than ``0``
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)
    try:
        data: Dict = response.json()
    except json.JSONDecodeError as exc:
        utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
        # Keep the decoder error attached so the original cause stays visible.
        raise DataFetchError(f"Failed to decode JSON, content: {response.text}") from exc
    if not isinstance(data, dict):
        # A JSON array/scalar has no ``code``; fail with the domain error
        # instead of an AttributeError on ``.get``.
        raise DataFetchError(f"Unexpected JSON payload, content: {response.text}")
    if data.get("code") != 0:
        raise DataFetchError(data.get("message", "unknown error"))
    payload = data.get("data")
    # An explicit ``"data": null`` would otherwise reach callers as None and
    # break their ``.get(...)`` calls.
    return {} if payload is None else payload
|
||||
|
||||
async def pre_request_data(self, req_data: Dict) -> Dict:
    """Sign request parameters with the current WBI keys.

    An empty or missing ``req_data`` short-circuits to ``{}`` without
    fetching any keys.

    :param req_data: parameters to sign
    :return: the result of ``BilibiliSign(img_key, sub_key).sign(req_data)``
    """
    if req_data:
        primary_key, secondary_key = await self.get_wbi_keys()
        signer = BilibiliSign(primary_key, secondary_key)
        return signer.sign(req_data)
    return {}
|
||||
|
||||
async def get_wbi_keys(self) -> Tuple[str, str]:
    """Return the current ``(img_key, sub_key)`` pair used for WBI signing.

    The page's ``localStorage`` is consulted first: either the combined
    ``wbi_img_urls`` entry or the separate ``wbi_img_url`` / ``wbi_sub_url``
    entries joined with a hyphen.  When that yields no hyphen-joined value,
    the ``/x/web-interface/nav`` endpoint is queried instead.  Each key is
    the last path segment of its URL, cut at the first dot.

    :return: tuple ``(img_key, sub_key)``
    """
    storage = await self.playwright_page.evaluate("() => window.localStorage")

    joined = storage.get("wbi_img_urls", "")
    if not joined:
        img_part = storage.get("wbi_img_url")
        sub_part = storage.get("wbi_sub_url")
        if img_part and sub_part:
            joined = f"{img_part}-{sub_part}"

    if not (joined and "-" in joined):
        # Nothing usable in localStorage: ask the API for the image URLs.
        nav = await self.request(method="GET", url=self._host + "/x/web-interface/nav")
        img_url = nav['wbi_img']['img_url']
        sub_url = nav['wbi_img']['sub_url']
    else:
        img_url, sub_url = joined.split("-")

    def _stem(link: str) -> str:
        # File name of the URL without its extension.
        return link.rsplit('/', 1)[1].split('.')[0]

    return _stem(img_url), _stem(sub_url)
|
||||
|
||||
async def get(self, uri: str, params=None, enable_params_sign: bool = True) -> Dict:
    """Issue a GET request against the API host.

    :param uri: path relative to the API host
    :param params: query parameters; passed through ``pre_request_data``
        first when ``enable_params_sign`` is true
    :param enable_params_sign: whether to sign ``params`` before encoding
    :return: whatever ``request`` returns for the final URL
    """
    if enable_params_sign:
        params = await self.pre_request_data(params)
    final_uri = uri
    # Only append a query string when there is something to encode, so a
    # parameterless call (e.g. the login probe) no longer ends in a bare "?".
    if isinstance(params, dict) and params:
        final_uri = f"{uri}?{urlencode(params)}"
    return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
    """Issue a POST request whose body is the signed payload as compact JSON.

    :param uri: path relative to the API host
    :param data: payload; passed through ``pre_request_data`` before being
        serialised
    :return: whatever ``request`` returns
    """
    data = await self.pre_request_data(data)
    json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
    # httpx takes raw text/bytes bodies through ``content=``; ``data=`` is
    # meant for form fields.
    return await self.request(method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers)
|
||||
|
||||
async def pong(self) -> bool:
    """Probe the nav endpoint to find out whether the session is logged in.

    :return: ``True`` when the response reports ``isLogin``; ``False`` when
        it does not, or when anything raises during the probe
    """
    utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
    try:
        nav_info = await self.get("/x/web-interface/nav")
        if not nav_info.get("isLogin"):
            return False
        utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
        return True
    except Exception as e:
        # Best effort: any failure is reported as "not logged in".
        utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
        return False
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """Refresh the stored cookies from a browser context.

    The cookies are converted with ``utils.convert_cookies``; the string
    form goes into the ``Cookie`` header and the dict form replaces
    ``cookie_dict``.

    :param browser_context: context whose cookies are read
    """
    browser_cookies = await browser_context.cookies()
    self.headers["Cookie"], self.cookie_dict = utils.convert_cookies(browser_cookies)
|
||||
|
||||
async def search_video_by_keyword(
    self,
    keyword: str,
    page: int = 1,
    page_size: int = 20,
    order: SearchOrderType = SearchOrderType.DEFAULT,
    pubtime_begin_s: int = 0,
    pubtime_end_s: int = 0,
) -> Dict:
    """Search Bilibili videos by keyword.

    :param keyword: search keyword
    :param page: page number to fetch
    :param page_size: number of results per page
    :param order: result ordering; its ``value`` is sent to the API
    :param pubtime_begin_s: publish-time lower bound (timestamp)
    :param pubtime_end_s: publish-time upper bound (timestamp)
    :return: the response of the signed GET request
    """
    query = dict(
        search_type="video",
        keyword=keyword,
        page=page,
        page_size=page_size,
        order=order.value,
        pubtime_begin_s=pubtime_begin_s,
        pubtime_end_s=pubtime_end_s,
    )
    return await self.get("/x/web-interface/wbi/search/type", query)
|
||||
|
||||
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
    """Fetch the detail of one video, identified by ``aid`` or ``bvid``.

    ``aid`` wins when both are given.  The request is sent unsigned.

    :param aid: video avid
    :param bvid: video bvid
    :return: the response of the GET request
    :raises ValueError: when neither identifier is provided
    """
    if not (aid or bvid):
        raise ValueError("请提供 aid 或 bvid 中的至少一个参数")

    params = {"aid": aid} if aid else {"bvid": bvid}
    return await self.get("/x/web-interface/view/detail", params, enable_params_sign=False)
|
||||
|
||||
async def get_video_play_url(self, aid: int, cid: int) -> Dict:
    """Fetch the play-URL information of a video part.

    :param aid: video avid
    :param cid: cid of the part
    :return: the response of the signed GET request
    :raises ValueError: when ``aid`` or ``cid`` is missing or not positive
    """
    ids_missing = not aid or not cid
    if ids_missing or aid <= 0 or cid <= 0:
        raise ValueError("aid 和 cid 必须存在")

    query = dict(avid=aid, cid=cid, qn=80, fourk=1, fnval=1, platform="pc")
    return await self.get("/x/player/wbi/playurl", query, enable_params_sign=True)
|
||||
|
||||
async def get_video_media(self, url: str) -> Union[bytes, None]:
    """Download a media resource and return its raw bytes.

    :param url: absolute URL of the media resource
    :return: the response body, or ``None`` when the request fails, the
        status is an error, or the reason phrase is not ``"OK"``
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Log the URL we were asked for instead of ``exc.request.url``:
            # the ``request`` property raises RuntimeError when httpx has not
            # attached a request to the exception.  The exception class name
            # is kept in the message to ease debugging.
            utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {url} - {exc}")
            return None
        else:
            if response.reason_phrase != "OK":
                utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
                return None
            return response.content
|
||||
|
||||
async def get_video_comments(
    self,
    video_id: str,
    order_mode: CommentOrderType = CommentOrderType.DEFAULT,
    next: int = 0,
) -> Dict:
    """Fetch one page of top-level comments of a video.

    :param video_id: video ID, sent as ``oid``
    :param order_mode: ordering; its ``value`` is sent as ``mode``
    :param next: page cursor
    :return: the response of the signed GET request
    """
    query = dict(oid=video_id, mode=order_mode.value, type=1, ps=20, next=next)
    return await self.get("/x/v2/reply/wbi/main", query)
|
||||
|
||||
async def get_video_all_comments(
    self,
    video_id: str,
    crawl_interval: float = 1.0,
    is_fetch_sub_comments=False,
    callback: Optional[Callable] = None,
    max_count: int = 10,
):
    """Page through the top-level comments of a video, optionally with replies.

    Each page is fetched with up to three attempts (exponential back-off with
    jitter on ``DataFetchError``).  Paging stops when the API reports the
    end, when the cursor is missing or malformed, when a page cannot be
    fetched, or once ``max_count`` top-level comments were collected.

    :param video_id: video ID
    :param crawl_interval: pause after each page
    :param is_fetch_sub_comments: also crawl replies of comments whose
        ``rcount`` is positive
    :param callback: awaited with ``(video_id, comment_list)`` for each page
    :param max_count: maximum number of top-level comments to collect; this
        limit now also applies when ``is_fetch_sub_comments`` is true
    :return: the collected top-level comments
    """
    result = []
    is_end = False
    next_page = 0
    max_retries = 3
    while not is_end and len(result) < max_count:
        comments_res = None
        for attempt in range(max_retries):
            try:
                comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
                break  # Success
            except DataFetchError as e:
                if attempt < max_retries - 1:
                    delay = 5 * (2**attempt) + random.uniform(0, 1)
                    utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})")
                    await asyncio.sleep(delay)
                else:
                    utils.logger.error(f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}")
        if not comments_res:
            # Either every attempt failed or the page came back empty.
            break

        cursor_info: Dict = comments_res.get("cursor")
        if not cursor_info:
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
            break

        # ``replies`` may be present but null; treat that as an empty page
        # instead of crashing on len()/iteration.
        comment_list: List[Dict] = comments_res.get("replies") or []

        # Make sure both cursor fields are present before trusting them.
        if "is_end" not in cursor_info or "next" not in cursor_info:
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
            is_end = True
        else:
            is_end = cursor_info.get("is_end")
            next_page = cursor_info.get("next")

        if not isinstance(is_end, bool):
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
            is_end = True

        # Trim before doing any further work so that neither the callback nor
        # the reply crawl goes beyond max_count.
        remaining = max_count - len(result)
        if len(comment_list) > remaining:
            comment_list = comment_list[:remaining]

        if is_fetch_sub_comments:
            for comment in comment_list:
                if comment.get("rcount", 0) > 0:
                    await self.get_video_all_level_two_comments(video_id, comment['rpid'], CommentOrderType.DEFAULT, 10, crawl_interval, callback)

        if callback:  # hand the page to the caller-supplied coroutine
            await callback(video_id, comment_list)
        await asyncio.sleep(crawl_interval)
        # Always count the page; previously this was skipped when replies
        # were being fetched, which disabled the max_count limit entirely.
        result.extend(comment_list)
    return result
|
||||
|
||||
async def get_video_all_level_two_comments(
    self,
    video_id: str,
    level_one_comment_id: int,
    order_mode: CommentOrderType,
    ps: int = 10,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> Dict:
    """Page through every reply of one top-level comment.

    Pages are requested starting at 1 until ``page.count`` of the response
    is covered by ``pn * ps``.  Nothing is collected here; each page is only
    handed to ``callback``.

    :param video_id: video ID
    :param level_one_comment_id: ID of the top-level comment
    :param order_mode: ordering passed through to the page request
    :param ps: page size
    :param crawl_interval: pause after each page
    :param callback: awaited with ``(video_id, replies)`` for each page
    :return: nothing is returned explicitly
    """
    page_no = 0
    while True:
        page_no += 1
        page_data = await self.get_video_level_two_comments(video_id, level_one_comment_id, page_no, ps, order_mode)
        replies: List[Dict] = page_data.get("replies", [])
        if callback:
            await callback(video_id, replies)
        await asyncio.sleep(crawl_interval)
        total_replies = int(page_data["page"]["count"])
        if total_replies <= page_no * ps:
            return
|
||||
|
||||
async def get_video_level_two_comments(
    self,
    video_id: str,
    level_one_comment_id: int,
    pn: int,
    ps: int,
    order_mode: CommentOrderType,
) -> Dict:
    """Fetch one page of replies to a top-level comment.

    :param video_id: video ID, sent as ``oid``
    :param level_one_comment_id: ID of the top-level comment, sent as ``root``
    :param pn: page number
    :param ps: page size
    :param order_mode: ordering; its ``value`` is sent as ``mode``
    :return: the response of the signed GET request
    """
    query = dict(
        oid=video_id,
        mode=order_mode.value,
        type=1,
        ps=ps,
        pn=pn,
        root=level_one_comment_id,
    )
    return await self.get("/x/v2/reply/reply", query)
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
    """Fetch one page of a creator's videos.

    :param creator_id: creator ID, sent as ``mid``
    :param pn: page number
    :param ps: page size
    :param order_mode: ordering; an enum member is sent by its ``value``
    :return: the response of the signed GET request
    """
    uri = "/x/space/wbi/arc/search"
    post_data = {
        "mid": creator_id,
        "pn": pn,
        "ps": ps,
        # Send the member's value, as the keyword search does; passing the
        # member itself would be stringified as "SearchOrderType.<NAME>".
        # A plain string passed by a caller is forwarded untouched.
        "order": getattr(order_mode, "value", order_mode),
    }
    return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
    """Fetch the profile information of a creator.

    :param creator_id: creator ID, sent as ``mid``
    :return: the response of the signed GET request
    """
    return await self.get("/x/space/wbi/acc/info", dict(mid=creator_id))
|
||||
|
||||
async def get_creator_fans(
    self,
    creator_id: int,
    pn: int,
    ps: int = 24,
) -> Dict:
    """Fetch one page of a creator's fans.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number
    :param ps: page size
    :return: the response of the signed GET request
    """
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get("/x/relation/fans", query)
|
||||
|
||||
async def get_creator_followings(
    self,
    creator_id: int,
    pn: int,
    ps: int = 24,
) -> Dict:
    """Fetch one page of the accounts a creator follows.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number
    :param ps: page size
    :return: the response of the signed GET request
    """
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get("/x/relation/followings", query)
|
||||
|
||||
async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
    """Fetch one page of a creator's dynamics feed.

    :param creator_id: creator ID, sent as ``host_mid``
    :param offset: paging offset taken from the previous response
    :return: the response of the signed GET request
    """
    query = dict(offset=offset, host_mid=creator_id, platform="web")
    return await self.get("/x/polymer/web-dynamic/v1/feed/space", query)
|
||||
|
||||
async def get_creator_all_fans(
    self,
    creator_info: Dict,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 100,
) -> List:
    """Collect a creator's fans page by page, up to ``max_count`` entries.

    Paging starts at ``config.START_CONTACTS_PAGE`` and stops at the first
    empty page or once ``max_count`` entries were gathered.

    :param creator_info: creator record; its ``id`` is used for the requests
    :param crawl_interval: pause after each page
    :param callback: awaited with ``(creator_info, fans_page)`` for each
        page, including a final empty one
    :param max_count: maximum number of fans to collect
    :return: the collected fan entries
    """
    creator_id = creator_info["id"]
    collected = []
    page_no = config.START_CONTACTS_PAGE
    while len(collected) < max_count:
        response: Dict = await self.get_creator_fans(creator_id, pn=page_no)
        page_no += 1
        batch: List[Dict] = response.get("list", [])

        # Cut the page so the total never exceeds max_count.
        room_left = max_count - len(collected)
        if len(batch) > room_left:
            batch = batch[:room_left]
        if callback:
            await callback(creator_info, batch)
        await asyncio.sleep(crawl_interval)
        if not batch:
            break
        collected.extend(batch)
    return collected
|
||||
|
||||
async def get_creator_all_followings(
    self,
    creator_info: Dict,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 100,
) -> List:
    """Collect the accounts a creator follows, up to ``max_count`` entries.

    Paging starts at ``config.START_CONTACTS_PAGE`` and stops at the first
    empty page or once ``max_count`` entries were gathered.

    :param creator_info: creator record; its ``id`` is used for the requests
    :param crawl_interval: pause after each page
    :param callback: awaited with ``(creator_info, followings_page)`` for
        each page, including a final empty one
    :param max_count: maximum number of followings to collect
    :return: the collected following entries
    """
    creator_id = creator_info["id"]
    collected = []
    page_no = config.START_CONTACTS_PAGE
    while len(collected) < max_count:
        response: Dict = await self.get_creator_followings(creator_id, pn=page_no)
        page_no += 1
        batch: List[Dict] = response.get("list", [])

        # Cut the page so the total never exceeds max_count.
        room_left = max_count - len(collected)
        if len(batch) > room_left:
            batch = batch[:room_left]
        if callback:
            await callback(creator_info, batch)
        await asyncio.sleep(crawl_interval)
        if not batch:
            break
        collected.extend(batch)
    return collected
|
||||
|
||||
async def get_creator_all_dynamics(
    self,
    creator_info: Dict,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 20,
) -> List:
    """Collect a creator's dynamics, up to ``max_count`` entries.

    The feed is followed through the ``offset`` of each response until
    ``has_more`` is false or ``max_count`` entries were gathered.

    :param creator_info: creator record; its ``id`` is used for the requests
    :param crawl_interval: pause after each page
    :param callback: awaited with ``(creator_info, dynamics_page)`` per page
    :param max_count: maximum number of dynamics to collect
    :return: the collected dynamics entries
    """
    creator_id = creator_info["id"]
    collected = []
    cursor = ""
    more_pages = True
    while more_pages and len(collected) < max_count:
        page_data = await self.get_creator_dynamics(creator_id, cursor)
        batch: List[Dict] = page_data["items"]
        more_pages = page_data["has_more"]
        cursor = page_data["offset"]

        # Cut the page so the total never exceeds max_count.
        room_left = max_count - len(collected)
        if len(batch) > room_left:
            batch = batch[:room_left]
        if callback:
            await callback(creator_info, batch)
        await asyncio.sleep(crawl_interval)
        collected.extend(batch)
    return collected
|
||||
@@ -0,0 +1,657 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : B站爬虫
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from datetime import datetime, timedelta
|
||||
import pandas as pd
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
from playwright._impl._errors import TargetClosedError
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import bilibili as bilibili_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import BilibiliClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchOrderType
|
||||
from .login import BilibiliLogin
|
||||
|
||||
|
||||
class BilibiliCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
bili_client: BilibiliClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
    """Set the site entry URL, pick a user agent and clear the CDP manager."""
    self.cdp_manager = None
    self.user_agent = utils.get_user_agent()
    self.index_url = "https://www.bilibili.com"
|
||||
|
||||
async def start(self):
    """Run one crawl: launch the browser, make sure we are logged in, then
    dispatch on ``config.CRAWLER_TYPE``.

    When ``config.ENABLE_IP_PROXY`` is set, a proxy is taken from a freshly
    created pool and formatted for both playwright and httpx.
    """
    playwright_proxy_format = httpx_proxy_format = None
    if config.ENABLE_IP_PROXY:
        ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
        playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

    async with async_playwright() as playwright:
        # Pick the launch mode from the configuration.
        if not config.ENABLE_CDP_MODE:
            utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
            # NOTE(review): None is passed as the proxy on this path, so
            # playwright_proxy_format is only used in CDP mode — confirm
            # this is intentional.
            self.browser_context = await self.launch_browser(playwright.chromium, None, self.user_agent, headless=config.HEADLESS)
            # stealth.min.js is injected to make the crawler harder to detect.
            # NOTE(review): nesting was reconstructed; it is assumed this
            # script belongs to the standard launch path only — confirm.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
        else:
            utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
            self.browser_context = await self.launch_browser_with_cdp(
                playwright,
                playwright_proxy_format,
                self.user_agent,
                headless=config.CDP_HEADLESS,
            )
        self.context_page = await self.browser_context.new_page()
        await self.context_page.goto(self.index_url)

        # Build the API client for the Bilibili site and log in if needed.
        self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
        logged_in = await self.bili_client.pong()
        if not logged_in:
            login_obj = BilibiliLogin(
                login_type=config.LOGIN_TYPE,
                login_phone="",  # your phone number
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES,
            )
            await login_obj.begin()
            await self.bili_client.update_cookies(browser_context=self.browser_context)

        crawler_type_var.set(config.CRAWLER_TYPE)
        if config.CRAWLER_TYPE == "search":
            await self.search()
        elif config.CRAWLER_TYPE == "detail":
            # Information and comments of the explicitly listed videos.
            await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
        elif config.CRAWLER_TYPE == "creator":
            if not config.CREATOR_MODE:
                await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
            else:
                for creator_id in config.BILI_CREATOR_ID_LIST:
                    await self.get_creator_videos(int(creator_id))
        # Any other crawler type is silently ignored.
        utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
|
||||
|
||||
async def search(self):
    """Run the keyword search selected by ``config.BILI_SEARCH_MODE``.

    ``"normal"`` searches without a time range; ``"all_in_time_range"`` and
    ``"daily_limit_in_time_range"`` search day by day with ``daily_limit``
    false or true respectively.  Any other value only logs a warning.
    """
    mode = config.BILI_SEARCH_MODE
    if mode == "normal":
        await self.search_by_keywords()
        return
    if mode == "all_in_time_range":
        await self.search_by_keywords_in_time_range(daily_limit=False)
        return
    if mode == "daily_limit_in_time_range":
        await self.search_by_keywords_in_time_range(daily_limit=True)
        return
    utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
|
||||
|
||||
@staticmethod
async def get_pubtime_datetime(
    start: str = config.START_DAY,
    end: str = config.END_DAY,
) -> Tuple[str, str]:
    """Turn a publish-date range into the two timestamps used by the search.

    The range is inclusive on both sides: the first timestamp is midnight at
    the beginning of ``start`` and the second is the last second of ``end``
    (``end`` + 1 day - 1 second).  Searching a single day therefore means
    passing the same date twice.

    Both dates are parsed as naive datetimes, so ``timestamp()`` interprets
    them in the local time zone of the running machine.

    :param start: first day of the range, ``YYYY-MM-DD``
    :param end: last day of the range, ``YYYY-MM-DD``
    :return: ``(pubtime_begin_s, pubtime_end_s)`` as decimal strings
    :raises ValueError: when ``start`` is later than ``end``
    """
    date_format = "%Y-%m-%d"
    first_day: datetime = datetime.strptime(start, date_format)
    last_day: datetime = datetime.strptime(end, date_format)
    if first_day > last_day:
        raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
    # Same-day and multi-day ranges both end at the last second of ``end``.
    last_second = last_day + timedelta(days=1) - timedelta(seconds=1)
    return str(int(first_day.timestamp())), str(int(last_second.timestamp()))
|
||||
|
||||
async def search_by_keywords(self):
    """Search videos for every configured keyword without a time range.

    For each keyword, result pages are walked from page 1; pages before
    ``config.START_PAGE`` are only logged as skipped.  Every video of a page
    is fetched, stored, optionally downloaded, and finally the comments of
    the page's videos are crawled.  ``config.CRAWLER_MAX_NOTES_COUNT`` is
    raised to one page size when it is configured lower than that.
    """
    utils.logger.info("[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords")
    bili_limit_count = 20  # fixed page size of the search endpoint
    if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
    start_page = config.START_PAGE
    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}")
        page = 1
        while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Skip page: {page}")
                page += 1
                continue

            utils.logger.info(f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}")
            videos_res = await self.bili_client.search_video_by_keyword(
                keyword=keyword,
                page=page,
                page_size=bili_limit_count,
                order=SearchOrderType.DEFAULT,
                pubtime_begin_s=0,  # no lower bound on the publish time
                pubtime_end_s=0,  # no upper bound on the publish time
            )
            video_list: List[Dict] = videos_res.get("result")
            if not video_list:
                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword.")
                break

            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
            task_list = []
            try:
                task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
            except Exception as e:
                # On failure the page contributes no videos.
                utils.logger.warning(f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}")

            video_id_list: List[str] = []
            for video_item in await asyncio.gather(*task_list):
                if not video_item:
                    continue
                video_id_list.append(video_item.get("View").get("aid"))
                await bilibili_store.update_bilibili_video(video_item)
                await bilibili_store.update_up_info(video_item)
                await self.get_bilibili_video(video_item, semaphore)
            page += 1
            await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
async def search_by_keywords_in_time_range(self, daily_limit: bool):
    """Search videos for every configured keyword, one day at a time.

    The days from ``config.START_DAY`` to ``config.END_DAY`` are searched
    separately.  Per keyword the crawl stops at
    ``config.CRAWLER_MAX_NOTES_COUNT`` videos and per day at
    ``config.MAX_NOTES_PER_DAY``.  An error while handling a page ends the
    current day.

    :param daily_limit: in the code as written this only decides whether
        hitting the per-keyword limit inside a day is logged; both limits
        are enforced regardless of its value
    """
    utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}")
    bili_limit_count = 20
    start_page = config.START_PAGE  # read here but not used by this mode

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
        total_notes_crawled_for_keyword = 0

        for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq="D"):
            if total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
                utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
                break

            pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime("%Y-%m-%d"), end=day.strftime("%Y-%m-%d"))
            page = 1
            notes_count_this_day = 0

            while True:
                if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
                    utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
                    break
                if total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT:
                    # Only the daily-limit mode reports this stop.
                    if daily_limit:
                        utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
                    break

                try:
                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
                    video_id_list: List[str] = []
                    videos_res = await self.bili_client.search_video_by_keyword(
                        keyword=keyword,
                        page=page,
                        page_size=bili_limit_count,
                        order=SearchOrderType.DEFAULT,
                        pubtime_begin_s=pubtime_begin_s,
                        pubtime_end_s=pubtime_end_s,
                    )
                    video_list: List[Dict] = videos_res.get("result")
                    if not video_list:
                        utils.logger.info(f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
                        break

                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                    task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
                    video_items = await asyncio.gather(*task_list)

                    for video_item in video_items:
                        if not video_item:
                            continue
                        # Stop consuming this page once either limit is hit.
                        if (
                            total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT
                            or notes_count_this_day >= config.MAX_NOTES_PER_DAY
                        ):
                            break
                        notes_count_this_day += 1
                        total_notes_crawled_for_keyword += 1
                        video_id_list.append(video_item.get("View").get("aid"))
                        await bilibili_store.update_bilibili_video(video_item)
                        await bilibili_store.update_up_info(video_item)
                        await self.get_bilibili_video(video_item, semaphore)

                    page += 1
                    await self.batch_get_video_comments(video_id_list)

                except Exception as e:
                    utils.logger.error(f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}")
                    break
|
||||
|
||||
async def batch_get_video_comments(self, video_id_list: List[str]):
    """
    Crawl the comments of every video in ``video_id_list`` concurrently.

    Only logs and returns when ``config.ENABLE_GET_COMMENTS`` is falsy.
    Otherwise one task per video is created; all tasks share a semaphore
    sized by ``config.MAX_CONCURRENCY_NUM`` and are awaited together.

    :param video_id_list: ids of the videos whose comments are fetched
    :return: None
    """
    if not config.ENABLE_GET_COMMENTS:
        # The tag used to say batch_get_note_comments, which does not match this method.
        utils.logger.info("[BilibiliCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
        return

    utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        asyncio.create_task(self.get_comments(video_id, semaphore), name=video_id)
        for video_id in video_id_list
    ]
    await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
    """
    Crawl the comments of a single video while holding a concurrency permit.

    :param video_id: id forwarded to the API client as ``video_id``
    :param semaphore: semaphore bounding the number of concurrent crawls
    :return: None
    :raises Exception: errors other than DataFetchError are logged and re-raised
    """
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
            # Random pause before hitting the comment API.
            warmup_delay = random.uniform(0.5, 1.5)
            await asyncio.sleep(warmup_delay)
            page_pause = random.random()
            await self.bili_client.get_video_all_comments(
                video_id=video_id,
                crawl_interval=page_pause,
                is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                callback=bilibili_store.batch_update_bilibili_video_comments,
                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            # Fetch failures are logged and swallowed.
            utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {fetch_err}")
        except Exception as unexpected:
            utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{unexpected}")
            # Let the caller see anything that is not a plain fetch failure.
            raise
|
||||
|
||||
async def get_creator_videos(self, creator_id: int):
    """
    Page through a creator's video list and process every page.

    Each page's ``bvid`` values are handed to ``get_specified_videos``.
    Paging stops once the reported total is covered by the pages fetched.

    :param creator_id: id of the creator whose videos are listed
    :return: None
    """
    page_size = 30
    page_no = 1
    more_pages = True
    while more_pages:
        listing = await self.bili_client.get_creator_videos(creator_id, page_no, page_size)
        bvids = []
        for entry in listing["list"]["vlist"]:
            bvids.append(entry["bvid"])
        await self.get_specified_videos(bvids)

        total_videos = int(listing["page"]["count"])
        more_pages = total_videos > page_no * page_size
        if more_pages:
            # Short random pause between pages.
            await asyncio.sleep(random.random())
            page_no += 1
|
||||
|
||||
async def get_specified_videos(self, bvids_list: List[str]):
    """
    Fetch the details of the given videos, store them, then crawl their comments.

    :param bvids_list: bvid values of the videos to fetch
    :return: None
    """
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    detail_jobs = []
    for bvid in bvids_list:
        detail_jobs.append(self.get_video_info_task(aid=0, bvid=bvid, semaphore=semaphore))
    fetched_details = await asyncio.gather(*detail_jobs)

    collected_aids = []
    for detail in fetched_details:
        # get_video_info_task yields None for videos it could not fetch.
        if detail is None:
            continue
        aid = detail.get("View").get("aid")
        if aid:
            collected_aids.append(aid)
        await bilibili_store.update_bilibili_video(detail)
        await bilibili_store.update_up_info(detail)
        await self.get_bilibili_video(detail, semaphore)

    await self.batch_get_video_comments(collected_aids)
|
||||
|
||||
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
    """
    Fetch one video's detail under the shared concurrency limit.

    :param aid: video aid forwarded to the API client
    :param bvid: video bvid forwarded to the API client
    :param semaphore: semaphore bounding concurrent requests
    :return: the client's result, or None when a DataFetchError or KeyError occurred
    """
    detail: Optional[Dict] = None
    async with semaphore:
        try:
            detail = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {fetch_err}")
        except KeyError as key_err:
            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {key_err}")
    return detail
|
||||
|
||||
async def get_video_play_url_task(self, aid: int, cid: int, semaphore: asyncio.Semaphore) -> Union[Dict, None]:
    """
    Fetch the play-url payload of one video under the shared concurrency limit.

    :param aid: video aid forwarded to the API client
    :param cid: video cid forwarded to the API client
    :param semaphore: semaphore bounding concurrent requests
    :return: the client's result, or None when a DataFetchError or KeyError occurred
    """
    play_info: Union[Dict, None] = None
    async with semaphore:
        try:
            play_info = await self.bili_client.get_video_play_url(aid=aid, cid=cid)
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {fetch_err}")
        except KeyError as key_err:
            utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {key_err}")
    return play_info
|
||||
|
||||
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
    """
    Build a BilibiliClient that reuses the browser context's cookies.

    :param httpx_proxy: proxy passed to the client as ``proxy``
    :return: the configured bilibili API client
    """
    utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)
    request_headers = {
        "User-Agent": self.user_agent,
        "Cookie": cookie_str,
        "Origin": "https://www.bilibili.com",
        "Referer": "https://www.bilibili.com",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return BilibiliClient(
        proxy=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch Chromium and return a browser context with a 1920x1080 viewport.

    With ``config.SAVE_LOGIN_STATE`` set, a persistent context stored under
    ``<cwd>/browser_data/`` is used so the login state is kept between runs
    (feat issue #14); otherwise a fresh browser and context are created.

    :param chromium: chromium browser type
    :param playwright_proxy: playwright proxy settings
    :param user_agent: user agent for the context
    :param headless: whether to run headless
    :return: browser context
    """
    utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)

    profile_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport={"width": 1920, "height": 1080},
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to ``launch_browser`` on any error.

    :param playwright: playwright instance
    :param playwright_proxy: playwright proxy settings
    :param user_agent: user agent for the context
    :param headless: whether to run headless
    :return: browser context
    """
    try:
        manager = CDPBrowserManager()
        self.cdp_manager = manager
        cdp_context = await manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log what the CDP manager reports about the browser.
        browser_info = await manager.get_browser_info()
        utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}")
        return cdp_context
    except Exception as cdp_err:
        utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {cdp_err}")
        # Fall back to the standard launch path.
        return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
    """
    Close the browser context.

    A CDP manager, when present, is cleaned up and cleared; otherwise the
    browser context (if any) is closed. Errors are logged, never raised.
    """
    try:
        cdp = self.cdp_manager
        if cdp:
            # CDP mode needs its own cleanup instead of a plain context close.
            await cdp.cleanup()
            self.cdp_manager = None
        elif self.browser_context:
            await self.browser_context.close()
        utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
    except TargetClosedError:
        utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.")
    except Exception as close_err:
        utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {close_err}")
|
||||
|
||||
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
    """
    Download a video's media and store it, when media crawling is enabled.

    The entry of the play-url payload's ``durl`` list with the largest
    ``size`` is downloaded and stored as ``video.mp4``.

    :param video_item: video detail dict; ``View.aid`` and ``View.cid`` are read
    :param semaphore: semaphore forwarded to ``get_video_play_url_task``
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
        return

    video_item_view: Dict = video_item.get("View")
    aid = video_item_view.get("aid")
    cid = video_item_view.get("cid")
    result = await self.get_video_play_url_task(aid, cid, semaphore)
    if result is None:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
        return

    # A payload without "durl" is treated as "no candidates" instead of
    # raising TypeError when iterating None.
    durl_list = result.get("durl") or []
    max_size = -1
    video_url = ""
    for durl in durl_list:
        # A missing size counts as 0 so the comparison cannot fail on None.
        size = durl.get("size") or 0
        if size > max_size:
            max_size = size
            video_url = durl.get("url") or ""
    if video_url == "":
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
        return

    content = await self.bili_client.get_video_media(video_url)
    # Short random pause after each media download.
    await asyncio.sleep(random.random())
    if content is None:
        return
    extension_file_name = "video.mp4"
    await bilibili_store.store_video(aid, content, extension_file_name)
|
||||
|
||||
async def get_all_creator_details(self, creator_id_list: List[int]):
    """
    Crawl the details of every creator in ``creator_id_list`` concurrently.

    One ``get_creator_details`` task is created per creator; all tasks share
    a semaphore sized by ``config.MAX_CONCURRENCY_NUM``.

    :param creator_id_list: ids of the creators to crawl
    :return: None
    """
    # Log tags now name this method (they said get_creator_details) and the
    # "detalis" typo is corrected.
    utils.logger.info("[BilibiliCrawler.get_all_creator_details] Crawling the details of creator")
    utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = []
    try:
        for creator_id in creator_id_list:
            task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
            task_list.append(task)
    except Exception as e:
        # Tasks created before the failure are still awaited below.
        utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")

    await asyncio.gather(*task_list)
|
||||
|
||||
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
    """
    Fetch one creator's profile, then crawl their fans, followings and dynamics.

    :param creator_id: id of the creator
    :param semaphore: semaphore shared with the follow-up crawls
    :return: None
    """
    async with semaphore:
        raw_info: Dict = await self.bili_client.get_creator_info(creator_id)
        creator_info: Dict = dict(
            id=creator_id,
            name=raw_info.get("name"),
            sign=raw_info.get("sign"),
            avatar=raw_info.get("face"),
        )
    # The follow-up crawls each acquire the semaphore themselves, so they run
    # after this method's own permit has been released.
    for crawl in (self.get_fans, self.get_followings, self.get_dynamics):
        await crawl(creator_info, semaphore)
|
||||
|
||||
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the fans of one creator; failures are logged, not raised.

    :param creator_info: creator dict; its ``"id"`` is used for logging
    :param semaphore: semaphore bounding concurrent crawls
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
            page_pause = random.random()
            await self.bili_client.get_creator_all_fans(
                creator_info=creator_info,
                crawl_interval=page_pause,
                callback=bilibili_store.batch_update_bilibili_creator_fans,
                max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {fetch_err}")
        except Exception as unexpected:
            utils.logger.error(f"[BilibiliCrawler.get_fans] may be been blocked, err:{unexpected}")
|
||||
|
||||
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the followings of one creator; failures are logged, not raised.

    :param creator_info: creator dict; its ``"id"`` is used for logging
    :param semaphore: semaphore bounding concurrent crawls
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
            page_pause = random.random()
            await self.bili_client.get_creator_all_followings(
                creator_info=creator_info,
                crawl_interval=page_pause,
                callback=bilibili_store.batch_update_bilibili_creator_followings,
                max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {fetch_err}")
        except Exception as unexpected:
            utils.logger.error(f"[BilibiliCrawler.get_followings] may be been blocked, err:{unexpected}")
|
||||
|
||||
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the dynamics of one creator; failures are logged, not raised.

    :param creator_info: creator dict; its ``"id"`` is used for logging
    :param semaphore: semaphore bounding concurrent crawls
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
            page_pause = random.random()
            await self.bili_client.get_creator_all_dynamics(
                creator_info=creator_info,
                crawl_interval=page_pause,
                callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {fetch_err}")
        except Exception as unexpected:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{unexpected}")
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc :
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
@@ -0,0 +1,45 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/3 16:20
|
||||
# @Desc :
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchOrderType(Enum):
    """Sort orders that can be requested for a video search."""

    # Comprehensive (default) ranking
    DEFAULT = ""

    # Most clicks
    MOST_CLICK = "click"

    # Most recently published
    LAST_PUBLISH = "pubdate"

    # Most danmaku (bullet comments)
    MOST_DANMU = "dm"

    # Most favorited
    MOST_MARK = "stow"
|
||||
|
||||
|
||||
class CommentOrderType(Enum):
    """Sort orders that can be requested for a comment listing."""

    # By popularity only
    DEFAULT = 0

    # By popularity and time
    MIXED = 1

    # By time
    TIME = 2
|
||||
@@ -0,0 +1,81 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 23:26
|
||||
# @Desc : bilibili 请求参数签名
|
||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
import urllib.parse
|
||||
from hashlib import md5
|
||||
from typing import Dict
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BilibiliSign:
    """Compute the ``w_rid`` signature (plus ``wts`` timestamp) for request parameters."""

    def __init__(self, img_key: str, sub_key: str):
        """
        :param img_key: first half of the key material
        :param sub_key: second half of the key material
        """
        self.img_key = img_key
        self.sub_key = sub_key
        # Index table used to shuffle img_key + sub_key into the salt.
        self.map_table = [
            46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
            33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
            61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
            36, 20, 34, 44, 52
        ]

    def get_salt(self) -> str:
        """
        Build the salt used for signing.

        The characters of ``img_key + sub_key`` are reordered by ``map_table``
        and the first 32 characters of the result are returned.

        :return: 32-character salt
        :raises IndexError: if the combined key is shorter than the table requires
        """
        mixin_key = self.img_key + self.sub_key
        # The whole table is walked (not just 32 entries) so a too-short key
        # still fails exactly as before.
        return "".join(mixin_key[index] for index in self.map_table)[:32]

    def sign(self, req_data: Dict) -> Dict:
        """
        Return a signed copy of the request parameters.

        The current timestamp is added as ``wts``, keys are sorted, the
        characters ``!'()*`` are stripped from every value (values become
        strings), and the md5 of the url-encoded query plus the salt is
        stored as ``w_rid``.

        The caller's dict is no longer modified; only the returned dict
        carries ``wts`` and ``w_rid``.

        :param req_data: request parameters to sign
        :return: new dict with stringified values plus ``wts`` and ``w_rid``
        """
        params = dict(req_data)
        params["wts"] = utils.get_unix_timestamp()
        signed = {
            key: "".join(ch for ch in str(value) if ch not in "!'()*")
            for key, value in sorted(params.items())
        }
        query = urllib.parse.urlencode(signed)
        signed["w_rid"] = md5((query + self.get_salt()).encode()).hexdigest()
        return signed
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual demo: parse a sample search query string, then sign a small payload.
    _img_key = "7cd084941338484aae1ad9425b84077c"
    _sub_key = "4932caff0ff746eab6f01bf08b70ac45"
    _search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"
    _req_data = {
        _pair[0]: _pair[1]
        for _pair in (_param.split("=") for _param in _search_url.split("&"))
    }
    print("pre req_data", _req_data)
    # Note: the payload signed here is a fixed one, not the parsed query above.
    _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
    print(_req_data)
|
||||
@@ -0,0 +1,118 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibli登录实现类
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BilibiliLogin(AbstractLogin):
    """Log a Playwright browser context into bilibili by QR code or by cookies."""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        :param login_type: "qrcode", "phone" or "cookie"; also written to ``config.LOGIN_TYPE``
        :param browser_context: browser context whose cookies are read and written
        :param context_page: page on which the login UI is driven
        :param login_phone: phone number (stored only)
        :param cookie_str: cookie string used by the cookie login
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """
        Start the login flow selected by ``config.LOGIN_TYPE``.

        :raises ValueError: if the login type is not qrcode, phone or cookie
        """
        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
                "[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
        Report whether the browser context currently holds a login cookie.

        The retry decorator calls this again while it returns False: up to
        600 attempts, 1 second apart. When the attempts are exhausted a
        RetryError is raised.

        :return: True if a ``SESSDATA`` or ``DedeUserID`` cookie is set, else False
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # Must be an exact bool: the retry predicate tests ``value is False``.
        return bool(cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"))

    async def login_by_qrcode(self):
        """Open the login dialog, show its QR code and wait until the scan logs in."""
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")

        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//div[@class='right-entry__outside go-login-btn']//div"
        )
        await login_button_ele.click()
        await asyncio.sleep(1)

        # find login qrcode
        qrcode_img_selector = "//div[@class='login-scan-box']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode in the default executor so the event loop is not blocked
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # The message used to say 20s; check_login_state polls for up to 600 attempts, 1s apart.
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 600s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(
            f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented; this does nothing."""
        pass

    async def login_by_cookies(self):
        """Install every cookie from ``cookie_str`` on the ``.bilibili.com`` domain."""
        # The tag used to say login_by_qrcode.
        utils.logger.info("[BilibiliLogin.login_by_cookies] Begin login bilibili by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".bilibili.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # One add_cookies call for the whole batch instead of one per cookie.
        if cookies:
            await self.browser_context.add_cookies(cookies)
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .core import DouYinCrawler
|
||||
@@ -0,0 +1,326 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import urllib.parse
|
||||
from typing import Any, Callable, Dict, Union, Optional
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from var import request_keyword_var
|
||||
|
||||
from .exception import *
|
||||
from .field import *
|
||||
from .help import *
|
||||
|
||||
|
||||
class DouYinClient(AbstractApiClient):
|
||||
|
||||
def __init__(
    self,
    timeout=60,  # media crawling of douyin short videos needs a longer timeout
    proxy=None,
    *,
    headers: Dict,
    playwright_page: Optional[Page],
    cookie_dict: Dict,
):
    """
    Store the settings used for every request to ``https://www.douyin.com``.

    :param timeout: request timeout passed to httpx
    :param proxy: proxy passed to httpx
    :param headers: default request headers
    :param playwright_page: page used to read localStorage and to sign requests
    :param cookie_dict: cookies as a dict
    """
    self._host = "https://www.douyin.com"
    self.timeout = timeout
    self.proxy = proxy
    self.headers = headers
    self.cookie_dict = cookie_dict
    self.playwright_page = playwright_page
|
||||
|
||||
async def __process_req_params(
    self,
    uri: str,
    params: Optional[Dict] = None,
    headers: Optional[Dict] = None,
    request_method="GET",
):
    """
    Add the common web parameters and the ``a_bogus`` signature to ``params`` in place.

    Nothing happens when ``params`` is empty or None.

    :param uri: request path, forwarded to the signer
    :param params: request parameters; mutated in place
    :param headers: headers whose ``User-Agent`` is used for signing; defaults to ``self.headers``
    :param request_method: "POST" makes the parameters double as the signed post data
    :return: None
    """
    if not params:
        return

    effective_headers = headers or self.headers
    local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")  # type: ignore
    params.update({
        "device_platform": "webapp",
        "aid": "6383",
        "channel": "channel_pc_web",
        "version_code": "190600",
        "version_name": "19.6.0",
        "update_version_code": "170400",
        "pc_client_type": "1",
        "cookie_enabled": "true",
        "browser_language": "zh-CN",
        "browser_platform": "MacIntel",
        "browser_name": "Chrome",
        "browser_version": "125.0.0.0",
        "browser_online": "true",
        "engine_name": "Blink",
        "os_name": "Mac OS",
        "os_version": "10.15.7",
        "cpu_core_num": "8",
        "device_memory": "8",
        "engine_version": "109.0",
        "platform": "PC",
        "screen_width": "2560",
        "screen_height": "1440",
        'effective_type': '4g',
        "round_trip_time": "50",
        "webid": get_web_id(),
        "msToken": local_storage.get("xmst"),
    })
    query_string = urllib.parse.urlencode(params)

    # 20240927: a-bogus update (JS version)
    post_data = params if request_method == "POST" else {}
    params["a_bogus"] = await get_a_bogus(uri, query_string, post_data, effective_headers["User-Agent"], self.playwright_page)
|
||||
|
||||
async def request(self, method, url, **kwargs):
    """
    Send one HTTP request and return the decoded JSON body.

    :param method: HTTP method
    :param url: absolute URL
    :param kwargs: extra arguments forwarded to ``httpx.AsyncClient.request``
    :return: parsed JSON response
    :raises DataFetchError: if the body is empty, is "blocked", or is not valid JSON
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)
    try:
        body = response.text
        if body in ("", "blocked"):
            utils.logger.error(f"request params incrr, response.text: {body}")
            raise Exception("account blocked")
        return response.json()
    except Exception as e:
        # Every failure, including the explicit one above, surfaces as DataFetchError.
        raise DataFetchError(f"{e}, {response.text}")
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
    """
    Sign ``params`` and send a GET request to ``uri`` on the douyin host.

    :param uri: request path appended to the host
    :param params: query parameters; signed in place
    :param headers: request headers; defaults to ``self.headers``
    :return: parsed JSON response
    """
    await self.__process_req_params(uri, params, headers)
    target_url = f"{self._host}{uri}"
    request_headers = headers or self.headers
    return await self.request(method="GET", url=target_url, params=params, headers=request_headers)
|
||||
|
||||
async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
    """
    Sign ``data`` and send a POST request to ``uri`` on the douyin host.

    :param uri: request path appended to the host
    :param data: form data; signed in place
    :param headers: request headers; defaults to ``self.headers``
    :return: parsed JSON response
    """
    # NOTE(review): request_method is left at its "GET" default here, so the
    # signer receives an empty post_data — confirm whether "POST" should be passed.
    await self.__process_req_params(uri, data, headers)
    target_url = f"{self._host}{uri}"
    request_headers = headers or self.headers
    return await self.request(method="POST", url=target_url, data=data, headers=request_headers)
|
||||
|
||||
async def pong(self, browser_context: BrowserContext) -> bool:
    """
    Report whether the session looks logged in.

    localStorage ``HasUserLogin == "1"`` is checked first; otherwise the
    ``LOGIN_STATUS`` cookie must equal "1".

    :param browser_context: browser context whose cookies are inspected
    :return: True when either check indicates a logged-in session
    """
    storage = await self.playwright_page.evaluate("() => window.localStorage")
    logged_in = storage.get("HasUserLogin", "") == "1"
    if not logged_in:
        browser_cookies = await browser_context.cookies()
        cookie_dict = utils.convert_cookies(browser_cookies)[1]
        logged_in = cookie_dict.get("LOGIN_STATUS") == "1"
    return logged_in
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """
    Refresh the client's ``Cookie`` header and cookie dict from the browser context.

    :param browser_context: browser context whose cookies are read
    :return: None
    """
    browser_cookies = await browser_context.cookies()
    cookie_header, cookie_map = utils.convert_cookies(browser_cookies)
    self.headers["Cookie"] = cookie_header
    self.cookie_dict = cookie_map
|
||||
|
||||
async def search_info_by_keyword(
    self,
    keyword: str,
    offset: int = 0,
    search_channel: SearchChannelType = SearchChannelType.GENERAL,
    sort_type: SearchSortType = SearchSortType.GENERAL,
    publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
    search_id: str = "",
):
    """
    Call the DouYin web search API.

    :param keyword: search keyword
    :param offset: result offset
    :param search_channel: search channel
    :param sort_type: sort order; a non-default value enables the filter
    :param publish_time: publish-time filter; a non-default value enables the filter
    :param search_id: search id sent with the request
    :return: parsed JSON response
    """
    query_params = {
        'search_channel': search_channel.value,
        'enable_history': '1',
        'keyword': keyword,
        'search_source': 'tab_search',
        'query_correct_type': '1',
        'is_filter_search': '0',
        'from_group_id': '7378810571505847586',
        'offset': offset,
        'count': '15',
        'need_filter_settings': '1',
        'list_type': 'multi',
        'search_id': search_id,
    }

    default_sort = sort_type.value == SearchSortType.GENERAL.value
    default_time = publish_time.value == PublishTimeType.UNLIMITED.value
    if not (default_sort and default_time):
        selected_filter = {"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)}
        query_params["filter_selected"] = json.dumps(selected_filter)
        query_params["is_filter_search"] = 1
        query_params["search_source"] = "tab_search"

    referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
    request_headers = copy.copy(self.headers)
    request_headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
    return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=request_headers)
|
||||
|
||||
async def get_video_by_id(self, aweme_id: str) -> Any:
    """
    Call the DouYin video detail API.

    The request is sent with a copy of the client headers from which
    ``Origin`` has been removed.

    :param aweme_id: id of the video
    :return: the response's ``aweme_detail`` value, or ``{}`` when absent
    """
    params = {"aweme_id": aweme_id}
    headers = copy.copy(self.headers)
    # pop() rather than del: headers without "Origin" must not raise KeyError.
    headers.pop("Origin", None)
    res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
    return res.get("aweme_detail", {})
|
||||
|
||||
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
    """
    Fetch one page (20 items) of top-level comments for a video.

    :param aweme_id: id of the video
    :param cursor: paging cursor
    :return: parsed JSON response
    """
    uri = "/aweme/v1/web/comment/list/"
    params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
    keywords = request_keyword_var.get()
    referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
    headers = copy.copy(self.headers)
    headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
    # The Referer headers built above were previously discarded; send them.
    return await self.get(uri, params, headers=headers)
|
||||
|
||||
async def get_sub_comments(self, aweme_id: str, comment_id: str, cursor: int = 0):
    """
    Fetch one page (20 items) of replies to a comment.

    :param aweme_id: id of the video, sent as ``item_id``
    :param comment_id: id of the parent comment
    :param cursor: paging cursor
    :return: parsed JSON response
    """
    uri = "/aweme/v1/web/comment/list/reply/"
    params = {
        'comment_id': comment_id,
        "cursor": cursor,
        "count": 20,
        "item_type": 0,
        "item_id": aweme_id,
    }
    keywords = request_keyword_var.get()
    referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
    headers = copy.copy(self.headers)
    headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
    # The Referer headers built above were previously discarded; send them.
    return await self.get(uri, params, headers=headers)
|
||||
|
||||
async def get_aweme_all_comments(
    self,
    aweme_id: str,
    crawl_interval: float = 1.0,
    is_fetch_sub_comments=False,
    callback: Optional[Callable] = None,
    max_count: int = 10,
):
    """
    Collect the comments of a video, optionally including replies.

    :param aweme_id: ID of the video.
    :param crawl_interval: Seconds to sleep after each processed page.
    :param is_fetch_sub_comments: Whether replies to each comment are fetched too.
    :param callback: Optional coroutine function called as
        ``callback(aweme_id, comments)`` for every batch that was collected.
    :param max_count: Maximum number of top-level comments to collect
        (replies are appended without being counted against a page cut-off).
    :return: List of collected comment dicts (top-level comments and replies).
    """
    result = []
    comments_has_more = 1
    comments_cursor = 0
    while comments_has_more and len(result) < max_count:
        comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
        comments_has_more = comments_res.get("has_more", 0)
        comments_cursor = comments_res.get("cursor", 0)
        comments = comments_res.get("comments", [])
        if not comments:
            continue
        # Trim the page so the collected total never exceeds max_count.
        if len(result) + len(comments) > max_count:
            comments = comments[:max_count - len(result)]
        result.extend(comments)
        if callback:
            await callback(aweme_id, comments)

        await asyncio.sleep(crawl_interval)
        if not is_fetch_sub_comments:
            continue

        # Second-level comments (replies).
        for comment in comments:
            # The field may be missing or null; treat both as "no replies"
            # instead of comparing None with an int.
            reply_comment_total = comment.get("reply_comment_total") or 0
            if reply_comment_total <= 0:
                continue

            comment_id = comment.get("cid")
            sub_comments_has_more = 1
            sub_comments_cursor = 0
            while sub_comments_has_more:
                sub_comments_res = await self.get_sub_comments(aweme_id, comment_id, sub_comments_cursor)
                sub_comments_has_more = sub_comments_res.get("has_more", 0)
                sub_comments_cursor = sub_comments_res.get("cursor", 0)
                sub_comments = sub_comments_res.get("comments", [])

                if not sub_comments:
                    continue
                result.extend(sub_comments)
                if callback:
                    await callback(aweme_id, sub_comments)
                await asyncio.sleep(crawl_interval)
    return result
|
||||
|
||||
async def get_user_info(self, sec_user_id: str):
    """
    Request the profile of another user.

    :param sec_user_id: The ``sec_user_id`` of the user to look up.
    :return: Whatever ``self.get`` returns for the profile endpoint.
    """
    query = dict(
        sec_user_id=sec_user_id,
        publish_video_strategy_type=2,
        personal_center_strategy=1,
    )
    profile_res = await self.get("/aweme/v1/web/user/profile/other/", query)
    return profile_res
|
||||
|
||||
async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict:
    """
    Request one page of a user's posted videos.

    :param sec_user_id: The ``sec_user_id`` of the user.
    :param max_cursor: Pagination cursor; the empty string requests the first page.
    :return: Whatever ``self.get`` returns for the post-list endpoint.
    """
    # The same fixed fingerprint value is sent as both 'verifyFp' and 'fp'.
    fingerprint = 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU'
    query = {}
    query["sec_user_id"] = sec_user_id
    query["count"] = 18
    query["max_cursor"] = max_cursor
    query["locate_query"] = "false"
    query["publish_video_strategy_type"] = 2
    query['verifyFp'] = fingerprint
    query['fp'] = fingerprint
    posts_res = await self.get("/aweme/v1/web/aweme/post/", query)
    return posts_res
|
||||
|
||||
async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
    """
    Page through all posted videos of a user.

    :param sec_user_id: The ``sec_user_id`` of the user.
    :param callback: Optional coroutine function called with each page's
        video list (possibly empty) before it is added to the result.
    :return: List with the videos of every fetched page.
    """
    collected = []
    next_cursor = ""
    # At least one page is always requested; paging stops as soon as the
    # response's "has_more" is anything other than 1.
    while True:
        page_res = await self.get_user_aweme_posts(sec_user_id, next_cursor)
        has_more = page_res.get("has_more", 0)
        next_cursor = page_res.get("max_cursor")
        aweme_list = page_res.get("aweme_list") or []
        utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
        if callback:
            await callback(aweme_list)
        collected.extend(aweme_list)
        if has_more != 1:
            break
    return collected
|
||||
|
||||
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
    """
    Download a media file (image or video) and return its raw bytes.

    :param url: Absolute URL of the media file; redirects are followed.
    :return: The response body, or ``None`` when the request fails or the
        final response is not a plain 200.
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Connection errors, timeouts and error status codes all land
            # here; keep the original exception class name for debugging.
            utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}")
            return None

        # Decide on the numeric status instead of the reason phrase: the
        # phrase is free text chosen by the server, so a valid 200 response
        # with a different (or empty) phrase must not be discarded.
        if response.status_code != httpx.codes.OK:
            utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
            return None
        return response.content
|
||||
@@ -0,0 +1,393 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import douyin as douyin_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import DouYinClient
|
||||
from .exception import DataFetchError
|
||||
from .field import PublishTimeType
|
||||
from .login import DouYinLogin
|
||||
|
||||
|
||||
class DouYinCrawler(AbstractCrawler):
    """
    Douyin crawler driven by a Playwright browser session.

    ``start`` launches the browser, makes sure the API client is logged in
    and then runs the mode selected by ``config.CRAWLER_TYPE``
    ("search", "detail" or "creator").
    """

    context_page: Page  # page opened on the Douyin index URL
    dy_client: DouYinClient  # API client built from the browser's cookies and user agent
    browser_context: BrowserContext  # context returned by one of the launch helpers
    cdp_manager: Optional[CDPBrowserManager]  # set only when the CDP launch path was taken

    def __init__(self) -> None:
        self.index_url = "https://www.douyin.com"
        self.cdp_manager = None

    async def start(self) -> None:
        """Launch the browser, log in if necessary and run the configured crawl mode."""
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

        async with async_playwright() as playwright:
            # Choose the launch mode from the configuration.
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    None,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
                # Launch a browser context.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium,
                    playwright_proxy_format,
                    user_agent=None,
                    headless=config.HEADLESS,
                )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url)

            self.dy_client = await self.create_douyin_client(httpx_proxy_format)
            # pong() reports whether the current session is usable; log in
            # and refresh the client's cookies when it is not.
            if not await self.dy_client.pong(browser_context=self.browser_context):
                login_obj = DouYinLogin(
                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.dy_client.update_cookies(browser_context=self.browser_context)
            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_awemes()
            elif config.CRAWLER_TYPE == "creator":
                # Get the information and comments of the specified creator
                await self.get_creators_and_videos()

            utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")

    async def search(self) -> None:
        """Search every keyword in ``config.KEYWORDS``, store the hits and crawl their comments."""
        utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
        dy_limit_count = 10  # douyin limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
        start_page = config.START_PAGE  # start page number
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
            aweme_list: List[str] = []
            page = 0
            dy_search_id = ""
            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                    page += 1
                    continue
                try:
                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                    posts_res = await self.dy_client.search_info_by_keyword(
                        keyword=keyword,
                        offset=page * dy_limit_count - dy_limit_count,
                        publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                        search_id=dy_search_id,
                    )
                    # A missing, null or empty "data" ends paging for this keyword.
                    if posts_res.get("data") is None or posts_res.get("data") == []:
                        utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                        break
                except DataFetchError:
                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                    break

                page += 1
                # The log id of this response is sent as search_id with the next page.
                dy_search_id = posts_res.get("extra", {}).get("logid", "")
                for post_item in posts_res.get("data"):
                    try:
                        aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
                    except (TypeError, IndexError, AttributeError):
                        # Neither a video nor a usable mix entry (mix_items is
                        # null/empty or aweme_mix_info is null): skip the item
                        # rather than abort the whole search.
                        continue
                    aweme_list.append(aweme_info.get("aweme_id", ""))
                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                    await self.get_aweme_media(aweme_item=aweme_info)
            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
            await self.batch_get_note_comments(aweme_list)

    async def get_specified_awemes(self):
        """Get the information and comments of the posts listed in ``config.DY_SPECIFIED_ID_LIST``."""
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
        aweme_details = await asyncio.gather(*task_list)
        for aweme_detail in aweme_details:
            # None means the lookup failed; an empty dict means the response
            # carried no detail. Neither is worth storing.
            if not aweme_detail:
                continue
            await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
            await self.get_aweme_media(aweme_item=aweme_detail)
        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)

    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
        """
        Fetch one video's detail under the concurrency semaphore.

        Returns the detail from the client, or ``None`` when the client raises
        ``DataFetchError`` or ``KeyError``.
        """
        async with semaphore:
            try:
                return await self.dy_client.get_video_by_id(aweme_id)
            except DataFetchError as ex:
                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                return None
            except KeyError as ex:
                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
                return None

    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
        """
        Crawl the comments of every video ID in ``aweme_list`` concurrently.

        Does nothing (besides logging) when ``config.ENABLE_GET_COMMENTS`` is off.
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info("[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
            return

        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = [
            asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
            for aweme_id in aweme_list
        ]
        # asyncio.wait() rejects an empty collection, hence the guard.
        if task_list:
            await asyncio.wait(task_list)

    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
        """Crawl and store the comments of one video; a ``DataFetchError`` is logged, not raised."""
        async with semaphore:
            try:
                await self.dy_client.get_aweme_all_comments(
                    aweme_id=aweme_id,
                    crawl_interval=random.random(),
                    is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                    callback=douyin_store.batch_update_dy_aweme_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
                utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
            except DataFetchError as e:
                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

    async def get_creators_and_videos(self) -> None:
        """Get the profile, videos and video comments of every creator in ``config.DY_CREATOR_ID_LIST``."""
        utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
        for user_id in config.DY_CREATOR_ID_LIST:
            creator_info: Dict = await self.dy_client.get_user_info(user_id)
            if creator_info:
                await douyin_store.save_creator(user_id, creator=creator_info)

            # Get all video information of the creator; each page is handed to
            # fetch_creator_video_detail as it arrives.
            all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)

            video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
            await self.batch_get_note_comments(video_ids)

    async def fetch_creator_video_detail(self, video_list: List[Dict]):
        """Concurrently fetch the detail of every post in ``video_list`` and save the data."""
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]

        note_details = await asyncio.gather(*task_list)
        for aweme_item in note_details:
            # Skip failed lookups (None) and empty details ({}).
            if not aweme_item:
                continue
            await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
            await self.get_aweme_media(aweme_item=aweme_item)

    async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
        """Create the API client from the browser context's cookies and the page's user agent."""
        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())  # type: ignore
        douyin_client = DouYinClient(
            proxy=httpx_proxy,
            headers={
                "User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
                "Cookie": cookie_str,
                "Host": "www.douyin.com",
                "Origin": "https://www.douyin.com/",
                "Referer": "https://www.douyin.com/",
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return douyin_client

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch Chromium and return a browser context.

        With ``config.SAVE_LOGIN_STATE`` a persistent context backed by a
        directory under ``browser_data`` is used; otherwise a fresh context.
        """
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={
                    "width": 1920,
                    "height": 1080
                },
                user_agent=user_agent,
            )  # type: ignore
            return browser_context
        else:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
            return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch the browser through the CDP manager.

        Any failure is logged and the standard ``launch_browser`` path is
        used instead.
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )

            # Add the anti-detection script.
            await self.cdp_manager.add_stealth_script()

            # Log the browser information.
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")

            return browser_context

        except Exception as e:
            utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
            # Fall back to the standard mode.
            chromium = playwright.chromium
            return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)

    async def close(self) -> None:
        """Close the browser context (via the CDP manager when one is set)."""
        # CDP mode needs its own cleanup.
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")

    async def get_aweme_media(self, aweme_item: Dict):
        """
        Download the media of a post, choosing between images and video.

        Args:
            aweme_item (Dict): Douyin post detail.
        """
        if not config.ENABLE_GET_MEIDAS:
            utils.logger.info("[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
            return
        # Image URLs of the note; per the original author's note this list is
        # empty for short-video posts, which is what selects the video path.
        note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
        # TODO: Douyin does not separate audio from video, so audio could be
        # split from the original video; it is not extracted for now.
        if note_download_url:
            await self.get_aweme_images(aweme_item)
        else:
            await self.get_aweme_video(aweme_item)

    async def get_aweme_images(self, aweme_item: Dict):
        """
        Download and store the images of a post. Prefer ``get_aweme_media``.

        Args:
            aweme_item (Dict): Douyin post detail.
        """
        if not config.ENABLE_GET_MEIDAS:
            return
        aweme_id = aweme_item.get("aweme_id")
        note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)

        if not note_download_url:
            return
        # Counts stored images only, so file names stay consecutive even when
        # individual downloads are skipped or fail.
        pic_num = 0
        for url in note_download_url:
            if not url:
                continue
            content = await self.dy_client.get_aweme_media(url)
            await asyncio.sleep(random.random())
            if content is None:
                continue
            extension_file_name = f"{pic_num:>03d}.jpeg"
            pic_num += 1
            await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)

    async def get_aweme_video(self, aweme_item: Dict):
        """
        Download and store the video of a post. Prefer ``get_aweme_media``.

        Args:
            aweme_item (Dict): Douyin post detail.
        """
        if not config.ENABLE_GET_MEIDAS:
            return
        aweme_id = aweme_item.get("aweme_id")

        video_download_url: str = douyin_store._extract_video_download_url(aweme_item)

        if not video_download_url:
            return
        content = await self.dy_client.get_aweme_media(video_download_url)
        await asyncio.sleep(random.random())
        if content is None:
            return
        extension_file_name = "video.mp4"
        await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data from the platform fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
@@ -0,0 +1,34 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchChannelType(Enum):
    """Channel a search request is issued against."""

    # Combined results.
    GENERAL = "aweme_general"
    # Videos only.
    VIDEO = "aweme_video_web"
    # Users only.
    USER = "aweme_user_web"
    # Live streams only.
    LIVE = "aweme_live"
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Ordering applied to search results."""

    # Default combined ranking.
    GENERAL = 0
    # Most liked first.
    MOST_LIKE = 1
    # Most recently published first.
    LATEST = 2
|
||||
|
||||
class PublishTimeType(Enum):
    """Publish-time window used to filter search results."""

    # No restriction.
    UNLIMITED = 0
    # Within one day.
    ONE_DAY = 1
    # Within one week.
    ONE_WEEK = 7
    # Within half a year.
    SIX_MONTH = 180
|
||||
@@ -0,0 +1,85 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
|
||||
import random
|
||||
|
||||
import execjs
|
||||
from playwright.async_api import Page
|
||||
|
||||
# Compile the signing script once at import time. The file is read inside a
# ``with`` block so the handle is closed deterministically instead of being
# left for the garbage collector.
with open('libs/douyin.js', encoding='utf-8-sig') as _douyin_js_file:
    douyin_sign_obj = execjs.compile(_douyin_js_file.read())
|
||||
|
||||
def get_web_id():
    """
    Generate a random web id.

    Returns:
        A 19-character string of decimal digits.
    """
    # Template "10000000-1000-4000-8000-100000000000"; every '0', '1' and '8'
    # in it is replaced by a randomised number, other characters are kept.
    template = ''.join(
        [str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
    )

    def scramble(digit):
        noise = int(16 * random.random()) >> (digit // 4)
        return str(digit ^ noise)

    pieces = []
    for ch in template:
        if ch in '018':
            pieces.append(scramble(int(ch)))
        else:
            pieces.append(ch)
    joined = ''.join(pieces)
    return joined.replace('-', '')[:19]
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
    """
    Compute the a_bogus signature for a request.

    Signing POST bodies is not supported at the moment: ``post_data`` and
    ``page`` are accepted but not used, and the signature comes from the
    JS implementation.
    """
    a_bogus = get_a_bogus_from_js(url, params, user_agent)
    return a_bogus
|
||||
|
||||
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
    """
    Compute the a_bogus signature by calling into the compiled JS script.

    Args:
        url: Request URL; only used to choose the JS signing function.
        params: Query string to sign.
        user_agent: User agent passed to the signing function.

    Returns:
        The value returned by the JS signing function.
    """
    # URLs containing "/reply" use a dedicated signing function.
    sign_js_name = "sign_reply" if "/reply" in url else "sign_datail"
    return douyin_sign_obj.call(sign_js_name, params, user_agent)
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
    """
    Compute the a_bogus signature inside the browser page via Playwright.

    The original author marks this Playwright variant as no longer working.

    Returns:
        The value produced by evaluating the signing expression in ``page``.
    """
    sign_expression = (
        "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])"
    )
    # A falsy body is passed to the page as an empty string.
    payload = post_data if post_data else ""
    return await page.evaluate(sign_expression, [params, payload, user_agent])
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from cache.cache_factory import CacheFactory
|
||||
from tools import utils
|
||||
|
||||
|
||||
class DouYinLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
             login_type: str,
             browser_context: BrowserContext,  # type: ignore
             context_page: Page,  # type: ignore
             login_phone: Optional[str] = "",
             cookie_str: Optional[str] = ""
             ):
    """
    Store the login parameters.

    Note that ``login_type`` is written to the global ``config.LOGIN_TYPE``
    rather than kept on the instance.
    """
    config.LOGIN_TYPE = login_type
    self.browser_context, self.context_page = browser_context, context_page
    self.login_phone = login_phone
    self.scan_qrcode_time = 60
    self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
    """
    Run the Douyin login flow.

    The slider check on the intermediate captcha page is not very reliable;
    unless there is a special need, the original author recommends not
    enabling Douyin login, or logging in with cookies instead.
    """
    # Make sure the login dialog is shown.
    await self.popup_login_dialog()

    # Dispatch on the configured login type.
    login_type = config.LOGIN_TYPE
    if login_type == "qrcode":
        await self.login_by_qrcode()
    elif login_type == "phone":
        await self.login_by_mobile()
    elif login_type == "cookie":
        await self.login_by_cookies()
    else:
        raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    # If the page was redirected to the slider-captcha page, the slider has
    # to be solved once more.
    await asyncio.sleep(6)
    page_title = await self.context_page.title()
    if "验证码中间页" in page_title:
        await self.check_page_display_slider(move_step=3, slider_level="hard")

    # Poll the login state; exhausted retries end the program.
    utils.logger.info("[DouYinLogin.begin] login finished then check login state ...")
    try:
        await self.check_login_state()
    except RetryError:
        utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
        sys.exit()

    # Give the site time to redirect after a successful login.
    wait_redirect_seconds = 5
    utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
    await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self):
    """
    Report whether the browser session is logged in.

    Returns True when any open page has ``HasUserLogin == "1"`` in its
    localStorage or the ``LOGIN_STATUS`` cookie equals "1", otherwise
    False. The decorator calls this again, one second apart and up to 600
    times, for as long as it returns False.
    """
    cookies = await self.browser_context.cookies()
    cookie_dict = utils.convert_cookies(cookies)[1]

    for open_page in self.browser_context.pages:
        try:
            storage = await open_page.evaluate("() => window.localStorage")
            if storage.get("HasUserLogin", "") == "1":
                return True
        except Exception:
            # Best effort: a page that cannot be evaluated is skipped after
            # a short pause.
            await asyncio.sleep(0.1)

    return cookie_dict.get("LOGIN_STATUS") == "1"
|
||||
|
||||
async def popup_login_dialog(self):
    """Wait for the login dialog; if it does not appear within 10 seconds, click the login button."""
    login_panel = "xpath=//div[@id='login-panel-new']"
    wait_ms = 1000 * 10
    try:
        # The dialog normally pops up by itself.
        await self.context_page.wait_for_selector(login_panel, timeout=wait_ms)
    except Exception as e:
        utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
        utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
        await self.context_page.locator("xpath=//p[text() = '登录']").click()
        await asyncio.sleep(0.5)
|
||||
|
||||
async def login_by_qrcode(self):
    """Find the login QR code on the page and display it; exits the program when none is found."""
    utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
    base64_qrcode_img = await utils.find_login_qrcode(
        self.context_page,
        selector="xpath=//div[@id='animate_qrcode_container']//img",
    )
    if not base64_qrcode_img:
        utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
        sys.exit()

    # Show the QR code from the default executor so the event loop is not
    # blocked; the returned future is intentionally not awaited.
    show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
    loop = asyncio.get_running_loop()
    loop.run_in_executor(executor=None, func=show_qrcode)
    await asyncio.sleep(2)
|
||||
|
||||
async def login_by_mobile(self):
    """
    Log in with a phone number and an SMS code.

    Requests the code through the page, then polls the cache once per second
    for up to two minutes under the key ``dy_<phone>``; when a code shows
    up it is typed in and the form is submitted.
    """
    page = self.context_page
    utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
    await page.locator("xpath=//li[text() = '验证码登录']").click()
    await page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
    await page.locator("xpath=//input[@placeholder='手机号']").fill(self.login_phone)
    await asyncio.sleep(0.5)
    await page.locator("xpath=//span[text() = '获取验证码']").click()

    # A slider captcha may appear after requesting the code.
    await self.check_page_display_slider(move_step=10, slider_level="easy")
    cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
    sms_code_key = f"dy_{self.login_phone}"
    # Wait at most two minutes for the code, counting down once per second.
    for max_get_sms_code_time in range(60 * 2, 0, -1):
        utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
        await asyncio.sleep(1)
        sms_code_value = cache_client.get(sms_code_key)
        if not sms_code_value:
            continue

        code_input = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
        await code_input.fill(value=sms_code_value.decode())
        await asyncio.sleep(0.5)
        submit_button = self.context_page.locator("xpath=//button[@class='web-login-button']")
        await submit_button.click()  # submit the login form
        # TODO: the entered code may be wrong; its correctness should still be checked.
        break
|
||||
|
||||
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
    """
    Solve the slider captcha if the page shows one.

    Waits up to 30 seconds for the captcha image; returns immediately when
    none appears. Otherwise the slider is attempted until the captcha
    disappears. After 20 failed attempts the program exits.

    :param move_step: Forwarded to ``move_slider``.
    :param slider_level: Forwarded to ``move_slider``.
    :return: None
    """
    # Wait for the slider captcha to show up.
    back_selector = "#captcha-verify-image"
    try:
        await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
    except PlaywrightTimeoutError:  # no slider captcha, nothing to do
        return

    gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
    max_slider_try_times = 20
    slider_verify_success = False
    while not slider_verify_success:
        if max_slider_try_times <= 0:
            utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
            sys.exit()
        try:
            await self.move_slider(back_selector, gap_selector, move_step, slider_level)
            await asyncio.sleep(1)

            # A slow or failed drag makes the page ask for a retry; click the
            # refresh button to get a new puzzle.
            page_content = await self.context_page.content()
            if "操作过慢" in page_content or "提示重新操作" in page_content:
                utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
                await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
                # This is a failed attempt too: count it, otherwise a page
                # that keeps asking for a retry would loop forever.
                max_slider_try_times -= 1
                continue

            # After a successful drag the captcha disappears. If it is still
            # there after one second, wait_for_selector raises and the
            # handler below schedules another attempt.
            await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
            utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
            slider_verify_success = True
        except Exception as e:
            utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
            await asyncio.sleep(1)
            max_slider_try_times -= 1
            utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
            continue
|
||||
|
||||
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
    """
    Drag the captcha slider by the distance computed from the two captcha images.

    :param back_selector: selector of the captcha background image
    :param gap_selector: selector of the slider (gap) image
    :param move_step: passed as ``steps`` to every ``mouse.move`` call of the drag;
        a larger value makes each move slower
    :param slider_level: slider difficulty ("easy" / "hard"), passed through to ``utils.get_tracks``
    :return: None
    """
    page = self.context_page
    wait_ms = 1000 * 10  # each image gets up to 10 seconds to show up

    # Read the ``src`` of the background image, then of the gap image.
    back_handle = await page.wait_for_selector(selector=back_selector, timeout=wait_ms)
    back_src = str(await back_handle.get_property("src"))  # type: ignore
    gap_handle = await page.wait_for_selector(selector=gap_selector, timeout=wait_ms)
    gap_src = str(await gap_handle.get_property("src"))  # type: ignore

    # Let the image matcher work out how far the slider has to travel.
    distance = utils.Slide(gap=gap_src, bg=back_src).discern()

    # Build the movement track and correct its last step so that all steps
    # together add up to exactly ``distance``.
    tracks = utils.get_tracks(distance, slider_level)
    tracks[-1] = tracks[-1] - (sum(tracks) - distance)

    # Put the mouse on the centre of the slider element and press the button.
    slider = await page.query_selector(gap_selector)
    box = await slider.bounding_box()  # type: ignore
    center_x = box["x"] + box["width"] / 2  # type: ignore
    center_y = box["y"] + box["height"] / 2  # type: ignore
    await page.mouse.move(center_x, center_y)
    await slider.hover()  # type: ignore
    await page.mouse.down()

    # Replay the track step by step, accumulating the x coordinate.
    x = center_x
    for track in tracks:
        x += track
        await page.mouse.move(x, 0, steps=move_step)
    await page.mouse.up()
|
||||
|
||||
async def login_by_cookies(self):
    """
    Log in by copying the cookies from ``self.cookie_str`` into the browser context.

    Each name/value pair is added on its own, scoped to ``.douyin.com`` and path ``/``.
    """
    utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
    cookie_pairs = utils.convert_str_cookie_to_dict(self.cookie_str)
    for cookie_name in cookie_pairs:
        cookie = {
            "name": cookie_name,
            "value": cookie_pairs[cookie_name],
            "domain": ".douyin.com",
            "path": "/",
        }
        await self.browser_context.add_cookies([cookie])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import KuaishouCrawler
|
||||
@@ -0,0 +1,313 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .graphql import KuaiShouGraphQL
|
||||
|
||||
|
||||
class KuaiShouClient(AbstractApiClient):
    """
    Async client for the Kuaishou web GraphQL endpoint.

    Every API method builds a payload made of an operation name, its variables and
    the query text served by ``KuaiShouGraphQL``, and POSTs it to ``self._host``.
    ``request`` returns the ``data`` member of the response, so results are keyed
    by operation name (for example ``res["visionCommentList"]``).
    """

    def __init__(
        self,
        timeout=10,
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
    ):
        """
        :param timeout: timeout passed to every httpx request
        :param proxy: proxy passed to ``httpx.AsyncClient``
        :param headers: headers sent with every request; ``update_cookies`` rewrites its ``Cookie`` entry
        :param playwright_page: page object stored on the client
        :param cookie_dict: cookies as a dict, stored on the client
        """
        self.proxy = proxy
        self.timeout = timeout
        self.headers = headers
        self._host = "https://www.kuaishou.com/graphql"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
        self.graphql = KuaiShouGraphQL()

    async def request(self, method, url, **kwargs) -> Any:
        """
        Send one HTTP request and unwrap the GraphQL envelope.

        :param method: HTTP method
        :param url: absolute URL
        :param kwargs: extra arguments forwarded to ``httpx.AsyncClient.request``
        :return: the ``data`` member of the JSON response (an empty dict when it is missing or null)
        :raises DataFetchError: when the response carries a non-empty ``errors`` member
        """
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        data: Dict = response.json()
        errors = data.get("errors")
        if errors:
            raise DataFetchError(errors)
        # A null "data" member would make every caller crash on ``.get``.
        return data.get("data") or {}

    async def get(self, uri: str, params=None) -> Dict:
        """
        Issue a GET request against ``self._host``.

        :param uri: path appended to the host
        :param params: optional dict that is url-encoded into the query string
        :return: unwrapped response data
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        return await self.request(
            method="GET", url=f"{self._host}{final_uri}", headers=self.headers
        )

    async def post(self, uri: str, data: dict) -> Dict:
        """
        Issue a POST request with ``data`` serialized as compact JSON.

        :param uri: path appended to the host
        :param data: JSON-serializable payload
        :return: unwrapped response data
        """
        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        # ``content=`` is httpx's parameter for a raw text/bytes body; ``data=`` is
        # meant for form data.
        return await self.request(
            method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers
        )

    async def pong(self) -> bool:
        """
        Check whether the current login state is usable.

        :return: True when the ``visionProfileUserList`` query answers with ``result == 1``;
            False otherwise, including when the request raises
        """
        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
        ping_flag = False
        try:
            post_data = {
                "operationName": "visionProfileUserList",
                "variables": {
                    "ftype": 1,
                },
                "query": self.graphql.get("vision_profile_user_list"),
            }
            res = await self.post("", post_data)
            if res.get("visionProfileUserList", {}).get("result") == 1:
                ping_flag = True
        except Exception as e:
            utils.logger.error(
                f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..."
            )
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """
        Refresh the ``Cookie`` header and ``cookie_dict`` from the browser context.

        :param browser_context: context whose cookies are read
        """
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def search_info_by_keyword(
        self, keyword: str, pcursor: str, search_session_id: str = ""
    ):
        """
        Kuaishou web search API.

        :param keyword: search keyword
        :param pcursor: page cursor
        :param search_session_id: search session id
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": pcursor,
                "page": "search",
                "searchSessionId": search_session_id,
            },
            "query": self.graphql.get("search_query"),
        }
        return await self.post("", post_data)

    async def get_video_info(self, photo_id: str) -> Dict:
        """
        Kuaishou web video detail API.

        :param photo_id: id of the video
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "visionVideoDetail",
            "variables": {"photoId": photo_id, "page": "search"},
            "query": self.graphql.get("video_detail"),
        }
        return await self.post("", post_data)

    async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
        """
        Fetch one page of root comments of a video.

        :param photo_id: id of the video
        :param pcursor: cursor returned by the previous page, defaults to ""
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "commentListQuery",
            "variables": {"photoId": photo_id, "pcursor": pcursor},
            "query": self.graphql.get("comment_list"),
        }
        return await self.post("", post_data)

    async def get_video_sub_comments(
        self, photo_id: str, rootCommentId: str, pcursor: str = ""
    ) -> Dict:
        """
        Fetch one page of replies to a root comment.

        :param photo_id: id of the video
        :param rootCommentId: id of the root comment
        :param pcursor: cursor returned by the previous page, defaults to ""
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "visionSubCommentList",
            "variables": {
                "photoId": photo_id,
                "pcursor": pcursor,
                "rootCommentId": rootCommentId,
            },
            "query": self.graphql.get("vision_sub_comment_list"),
        }
        return await self.post("", post_data)

    async def get_creator_profile(self, userId: str) -> Dict:
        """
        Run the ``visionProfile`` query for a user.

        :param userId: id of the creator
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "visionProfile",
            "variables": {"userId": userId},
            "query": self.graphql.get("vision_profile"),
        }
        return await self.post("", post_data)

    async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
        """
        Fetch one page of a creator's videos.

        :param userId: id of the creator
        :param pcursor: cursor returned by the previous page, defaults to ""
        :return: unwrapped response data
        """
        post_data = {
            "operationName": "visionProfilePhotoList",
            "variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
            "query": self.graphql.get("vision_profile_photo_list"),
        }
        return await self.post("", post_data)

    async def get_video_all_comments(
        self,
        photo_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
    ):
        """
        Page through the comments of a video, sub comments included.

        :param photo_id: id of the video
        :param crawl_interval: seconds to sleep after every page
        :param callback: optional coroutine function called as ``callback(photo_id, comments)`` per page
        :param max_count: paging stops once this many entries have been collected;
            root comments of a page are cut to fit, sub comments are not
        :return: list of collected root and sub comments
        """
        result = []
        pcursor = ""

        while pcursor != "no_more" and len(result) < max_count:
            comments_res = await self.get_video_comments(photo_id, pcursor)
            comment_page = comments_res.get("visionCommentList") or {}
            # A missing or empty cursor means there is nothing left to page through.
            # Falling back to "" here would request the first page again, forever.
            pcursor = comment_page.get("pcursor") or "no_more"
            comments = comment_page.get("rootComments") or []
            remaining = max_count - len(result)
            if len(comments) > remaining:
                comments = comments[:remaining]
            if callback:
                await callback(photo_id, comments)
            result.extend(comments)
            await asyncio.sleep(crawl_interval)
            sub_comments = await self.get_comments_all_sub_comments(
                comments, photo_id, crawl_interval, callback
            )
            result.extend(sub_comments)
        return result

    async def get_comments_all_sub_comments(
        self,
        comments: List[Dict],
        photo_id,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """
        Fetch every sub comment below the given root comments.

        Args:
            comments: root comments
            photo_id: id of the video
            crawl_interval: seconds to sleep after every page
            callback: optional coroutine function called as ``callback(photo_id, sub_comments)``

        Returns:
            The sub comments fetched through the sub comment API. An empty list when
            ``config.ENABLE_GET_SUB_COMMENTS`` is off.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            utils.logger.info(
                "[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
            )
            return []

        result = []
        for comment in comments:
            # Sub comments that were delivered inline with the root comment.
            inline_sub_comments = comment.get("subComments")
            if inline_sub_comments and callback:
                await callback(photo_id, inline_sub_comments)

            if comment.get("subCommentsPcursor") == "no_more":
                continue

            root_comment_id = comment.get("commentId")
            sub_comment_pcursor = ""

            while sub_comment_pcursor != "no_more":
                comments_res = await self.get_video_sub_comments(
                    photo_id, root_comment_id, sub_comment_pcursor
                )
                sub_comment_page = comments_res.get("visionSubCommentList") or {}
                sub_comment_pcursor = sub_comment_page.get("pcursor") or "no_more"

                # Use a separate name so the ``comments`` parameter is not overwritten.
                page_sub_comments = sub_comment_page.get("subComments") or []
                if callback:
                    await callback(photo_id, page_sub_comments)
                await asyncio.sleep(crawl_interval)
                result.extend(page_sub_comments)
        return result

    async def get_creator_info(self, user_id: str) -> Dict:
        """
        Fetch the profile of a creator.

        eg: https://www.kuaishou.com/profile/3x4jtnbfter525a

        :param user_id: id of the creator
        :return: the ``userProfile`` object of the ``visionProfile`` answer, or None when absent
        """
        profile_res = await self.get_creator_profile(user_id)
        # The response data is keyed by operation name, so ``userProfile`` sits below
        # ``visionProfile``. The top-level lookup is kept as a fallback.
        vision_profile = profile_res.get("visionProfile") or {}
        return vision_profile.get("userProfile") or profile_res.get("userProfile")

    async def get_all_videos_by_creator(
        self,
        user_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """
        Page through every video published by a creator.

        Args:
            user_id: id of the creator
            crawl_interval: seconds to sleep after every page
            callback: optional coroutine function called as ``callback(videos)`` per page

        Returns:
            All feed items collected.
        """
        result = []
        pcursor = ""

        while pcursor != "no_more":
            videos_res = await self.get_video_by_creater(user_id, pcursor)
            if not videos_res:
                utils.logger.error(
                    "[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
                )
                break

            photo_page = videos_res.get("visionProfilePhotoList") or {}
            # A missing or empty cursor ends the paging instead of restarting it.
            pcursor = photo_page.get("pcursor") or "no_more"

            videos = photo_page.get("feeds") or []
            utils.logger.info(
                f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
            )

            if callback:
                await callback(videos)
            await asyncio.sleep(crawl_interval)
            result.extend(videos)
        return result
|
||||
@@ -0,0 +1,396 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import kuaishou as kuaishou_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import comment_tasks_var, crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import KuaiShouClient
|
||||
from .exception import DataFetchError
|
||||
from .login import KuaishouLogin
|
||||
|
||||
|
||||
class KuaishouCrawler(AbstractCrawler):
    """
    Crawler for kuaishou.com.

    ``start`` launches a browser, makes sure the API client is logged in and then
    runs the mode selected by ``config.CRAWLER_TYPE`` ("search", "detail" or "creator").
    """

    context_page: Page
    ks_client: KuaiShouClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()
        self.cdp_manager = None

    async def start(self):
        """Launch the browser, log in when needed and run the configured crawl mode."""
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
                ip_proxy_info
            )

        async with async_playwright() as playwright:
            # Pick the launch mode from the configuration.
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    self.user_agent,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器")
                # Launch a browser context.
                # NOTE(review): the proxy argument is None here although
                # playwright_proxy_format is passed in CDP mode - confirm this is intended.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium, None, self.user_agent, headless=config.HEADLESS
                )
                # stealth.min.js is a js script to prevent the website from detecting the crawler.
                await self.browser_context.add_init_script(path="libs/stealth.min.js")

            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(f"{self.index_url}?isHome=1")

            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
            if not await self.ks_client.pong():
                login_obj = KuaishouLogin(
                    login_type=config.LOGIN_TYPE,
                    # No phone number is configured for this platform; the proxy
                    # string that used to be passed here is not a phone number.
                    login_phone="",
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for videos and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their videos and comments
                await self.get_creators_and_videos()

            utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")

    async def search(self):
        """
        Search every keyword from ``config.KEYWORDS`` page by page, store the videos
        found and fetch their comments.

        Paging for a keyword stops at the first page that returns no usable data.
        """
        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
        ks_limit_count = 20  # kuaishou limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            search_session_id = ""
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[KuaishouCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
            while (
                page - start_page + 1
            ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
                    page += 1
                    continue
                utils.logger.info(
                    f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}"
                )
                video_id_list: List[str] = []
                videos_res = await self.ks_client.search_info_by_keyword(
                    keyword=keyword,
                    pcursor=str(page),
                    search_session_id=search_session_id,
                )
                if not videos_res:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data"
                    )
                    # ``continue`` without advancing ``page`` would request the same
                    # page forever; give up on this keyword instead.
                    break

                vision_search_photo: Dict = videos_res.get("visionSearchPhoto") or {}
                if vision_search_photo.get("result") != 1:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data "
                    )
                    break
                search_session_id = vision_search_photo.get("searchSessionId", "")
                for video_detail in vision_search_photo.get("feeds") or []:
                    video_id_list.append(video_detail.get("photo", {}).get("id"))
                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)

                # batch fetch video comments
                page += 1
                await self.batch_get_video_comments(video_id_list)

    async def get_specified_videos(self):
        """Get the information and comments of the videos in ``config.KS_SPECIFIED_ID_LIST``."""
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(video_id=video_id, semaphore=semaphore)
            for video_id in config.KS_SPECIFIED_ID_LIST
        ]
        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)
        await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)

    async def get_video_info_task(
        self, video_id: str, semaphore: asyncio.Semaphore
    ) -> Optional[Dict]:
        """
        Fetch the detail of one video while holding ``semaphore``.

        :param video_id: id of the video
        :param semaphore: limits how many of these tasks run at once
        :return: the ``visionVideoDetail`` object, or None when fetching failed
        """
        async with semaphore:
            try:
                result = await self.ks_client.get_video_info(video_id)
                utils.logger.info(
                    f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
                )
                return result.get("visionVideoDetail")
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}"
                )
                return None
            except KeyError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}"
                )
                return None

    async def batch_get_video_comments(self, video_id_list: List[str]):
        """
        Fetch the comments of several videos concurrently.

        Does nothing when ``config.ENABLE_GET_COMMENTS`` is off. The created tasks are
        published through ``comment_tasks_var``.

        :param video_id_list: ids of the videos
        :return: None
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info(
                "[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled"
            )
            return

        utils.logger.info(
            f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}"
        )
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for video_id in video_id_list:
            task = asyncio.create_task(
                self.get_comments(video_id, semaphore), name=video_id
            )
            task_list.append(task)

        comment_tasks_var.set(task_list)
        await asyncio.gather(*task_list)

    async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
        """
        Fetch and store all comments of one video while holding ``semaphore``.

        :param video_id: id of the video
        :param semaphore: limits how many of these tasks run at once
        :return: None
        """
        async with semaphore:
            try:
                utils.logger.info(
                    f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                )
                await self.ks_client.get_video_all_comments(
                    photo_id=video_id,
                    crawl_interval=random.random(),
                    callback=kuaishou_store.batch_update_ks_video_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
                )
            except Exception as e:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}"
                )
                # Kuaishou may have blocked the requests: cancel the running comment
                # tasks, pause, then reload the home page and refresh the cookies.
                # time.sleep is used on purpose (instead of asyncio.sleep) so the
                # whole event loop is paused.
                # NOTE(review): the task list set by batch_get_video_comments contains
                # the task running this handler, so the awaits below may be interrupted
                # by that cancellation - confirm this is intended.
                current_running_tasks = comment_tasks_var.get()
                for task in current_running_tasks:
                    task.cancel()
                time.sleep(20)
                await self.context_page.goto(f"{self.index_url}?isHome=1")
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

    async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
        """
        Build the API client from the cookies currently held by the browser context.

        :param httpx_proxy: proxy handed to the client
        :return: the new client
        """
        utils.logger.info(
            "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
        )
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        ks_client_obj = KuaiShouClient(
            proxy=httpx_proxy,
            headers={
                "User-Agent": self.user_agent,
                "Cookie": cookie_str,
                "Origin": self.index_url,
                "Referer": self.index_url,
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return ks_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch chromium and create a browser context.

        With ``config.SAVE_LOGIN_STATE`` a persistent context stored below
        ``browser_data`` in the working directory is used; otherwise a fresh one.

        :param chromium: playwright browser type to launch
        :param playwright_proxy: proxy settings for playwright, may be None
        :param user_agent: user agent for the context
        :param headless: whether to run headless
        :return: the browser context
        """
        utils.logger.info(
            "[KuaishouCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
            )
            return browser_context

        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        browser_context = await browser.new_context(
            viewport={"width": 1920, "height": 1080}, user_agent=user_agent
        )
        return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch the browser in CDP mode, falling back to ``launch_browser`` on any error.

        :param playwright: playwright instance
        :param playwright_proxy: proxy settings for playwright, may be None
        :param user_agent: user agent for the context
        :param headless: whether to run headless
        :return: the browser context
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )

            # Log the browser information.
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}")

            return browser_context

        except Exception as e:
            utils.logger.error(
                f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}"
            )
            # Fall back to the standard launch mode.
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
            )

    async def get_creators_and_videos(self) -> None:
        """Store every creator in ``config.KS_CREATOR_ID_LIST`` with their videos and comments."""
        utils.logger.info(
            "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
        )
        for user_id in config.KS_CREATOR_ID_LIST:
            # Fetch and store the creator profile.
            createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
            if createor_info:
                await kuaishou_store.save_creator(user_id, creator=createor_info)

            # Get all video information of the creator
            all_video_list = await self.ks_client.get_all_videos_by_creator(
                user_id=user_id,
                crawl_interval=random.random(),
                callback=self.fetch_creator_video_detail,
            )

            video_ids = [
                video_item.get("photo", {}).get("id") for video_item in all_video_list
            ]
            await self.batch_get_video_comments(video_ids)

    async def fetch_creator_video_detail(self, video_list: List[Dict]):
        """
        Concurrently fetch the detail of every video in ``video_list`` and store it.

        :param video_list: feed items; the video id is read from ``item["photo"]["id"]``
        """
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore)
            for post_item in video_list
        ]

        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)

    async def close(self):
        """Close the browser context, going through the CDP manager when one is in use."""
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data from the platform fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests came in so fast that the server blocked our IP."""
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,33 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 快手的数据传输是基于GraphQL实现的
|
||||
# 这个类负责获取一些GraphQL的schema
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class KuaiShouGraphQL:
    """Loads the Kuaishou GraphQL query documents from disk and serves them by name."""

    # Class-level default, kept so the attribute still exists on the class.
    # load_graphql_queries gives every instance its own dict, so this one is never mutated.
    graphql_queries: Dict[str, str] = {}

    def __init__(self):
        self.graphql_dir = "media_platform/kuaishou/graphql/"
        self.load_graphql_queries()

    def load_graphql_queries(self):
        """
        Read every known ``.graphql`` file from ``self.graphql_dir``.

        The queries are stored on the instance under the file name up to its first dot.

        :raises OSError: when one of the files cannot be opened
        """
        graphql_files = [
            "search_query.graphql",
            "video_detail.graphql",
            "comment_list.graphql",
            "vision_profile.graphql",
            "vision_profile_photo_list.graphql",
            "vision_profile_user_list.graphql",
            "vision_sub_comment_list.graphql",
        ]

        loaded: Dict[str, str] = {}
        for file in graphql_files:
            # Explicit encoding so the result does not depend on the platform default.
            with open(self.graphql_dir + file, mode="r", encoding="utf-8") as f:
                query_name = file.split(".")[0]
                loaded[query_name] = f.read()
        self.graphql_queries = loaded

    def get(self, query_name: str) -> str:
        """
        Return the text of a loaded query.

        :param query_name: file name without extension, e.g. ``"search_query"``
        :return: the query text, or the string ``"Query not found"`` for an unknown name
        """
        return self.graphql_queries.get(query_name, "Query not found")
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
# Root comments of a video, one page per request; each root comment carries
# its inline sub comments and a cursor for the remaining ones.
query commentListQuery($photoId: String, $pcursor: String) {
  visionCommentList(photoId: $photoId, pcursor: $pcursor) {
    commentCount
    pcursor
    rootComments {
      commentId
      authorId
      authorName
      content
      headurl
      timestamp
      likedCount
      realLikedCount
      liked
      status
      authorLiked
      subCommentCount
      subCommentsPcursor
      subComments {
        commentId
        authorId
        authorName
        content
        headurl
        timestamp
        likedCount
        realLikedCount
        liked
        status
        authorLiked
        replyToUserName
        replyTo
        __typename
      }
      __typename
    }
    __typename
  }
}
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
# Fields selected for a photo of type PhotoEntity.
fragment photoContent on PhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
}
|
||||
|
||||
# Same field selection as photoContent, for a photo of type recoPhotoEntity.
fragment recoPhotoFragment on recoPhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
}
|
||||
|
||||
# One feed item: author, photo (through the two photo fragments) and tags.
fragment feedContent on Feed {
  type
  author {
    id
    name
    headerUrl
    following
    headerUrls {
      url
      __typename
    }
    __typename
  }
  photo {
    ...photoContent
    ...recoPhotoFragment
    __typename
  }
  canAddComment
  llsid
  status
  currentPcursor
  tags {
    type
    name
    __typename
  }
  __typename
}
|
||||
|
||||
# Keyword search; returns feed items plus the session id and cursor.
query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
    result
    llsid
    webPageArea
    feeds {
      ...feedContent
      __typename
    }
    searchSessionId
    pcursor
    aladdinBanner {
      imgUrl
      link
      __typename
    }
    __typename
  }
}
|
||||
+80
@@ -0,0 +1,80 @@
|
||||
# Detail query for a single photo/video identified by `photoId`.
# Selects the author, the photo with its nested manifest
# (adaptationSet -> representation), tags and the comment limit.
query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {
  visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {
    status
    type
    author {
      id
      name
      following
      headerUrl
      __typename
    }
    photo {
      id
      duration
      caption
      likeCount
      realLikeCount
      coverUrl
      photoUrl
      liked
      timestamp
      expTag
      llsid
      viewCount
      videoRatio
      stereoType
      musicBlocked
      manifest {
        mediaType
        businessType
        version
        adaptationSet {
          id
          duration
          representation {
            id
            defaultSelect
            backupUrl
            codecs
            url
            height
            width
            avgBitrate
            maxBitrate
            m3u8Slice
            qualityType
            qualityLabel
            frameRate
            featureP2sp
            hidden
            disableAdaptive
            __typename
          }
          __typename
        }
        __typename
      }
      manifestH265
      photoH265Url
      coronaCropManifest
      coronaCropManifestH265
      croppedPhotoH265Url
      croppedPhotoUrl
      videoResource
      __typename
    }
    tags {
      type
      name
      __typename
    }
    commentLimit {
      canAddComment
      __typename
    }
    llsid
    danmakuSwitch
    __typename
  }
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
# Profile query for the user identified by `userId`: the ownerCount counters,
# the basic profile fields and the isFollowing flag.
query visionProfile($userId: String) {
  visionProfile(userId: $userId) {
    result
    hostName
    userProfile {
      ownerCount {
        fan
        photo
        follow
        photo_public
        __typename
      }
      profile {
        gender
        user_name
        user_id
        headurl
        user_text
        user_profile_bg_url
        __typename
      }
      isFollowing
      __typename
    }
    __typename
  }
}
|
||||
+110
@@ -0,0 +1,110 @@
|
||||
# Fields selected from a PhotoEntity object, including the two riskTag fields.
# Spread into the `photo` selection of the feedContent fragment.
fragment photoContent on PhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}
|
||||
|
||||
# Fields selected from a recoPhotoEntity object, including the two riskTag
# fields. Spread into the `photo` selection of the feedContent fragment.
fragment recoPhotoFragment on recoPhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}
|
||||
|
||||
# One feed entry: its author, its photo (via the photoContent and
# recoPhotoFragment spreads), comment/status flags and tags.
fragment feedContent on Feed {
  type
  author {
    id
    name
    headerUrl
    following
    headerUrls {
      url
      __typename
    }
    __typename
  }
  photo {
    ...photoContent
    ...recoPhotoFragment
    __typename
  }
  canAddComment
  llsid
  status
  currentPcursor
  tags {
    type
    name
    __typename
  }
  __typename
}
|
||||
|
||||
# Feed list for the user identified by `userId`. `pcursor` is accepted as a
# variable and also returned in the result.
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
    result
    llsid
    webPageArea
    feeds {
      ...feedContent
      __typename
    }
    hostName
    pcursor
    __typename
  }
}
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
# User list query filtered by the integer `ftype`; each `fols` entry carries
# the basic user fields. `pcursor` is accepted as a variable and returned.
query visionProfileUserList($pcursor: String, $ftype: Int) {
  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
    result
    fols {
      user_name
      headurl
      user_text
      isFollowing
      user_id
      __typename
    }
    hostName
    pcursor
    __typename
  }
}
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
# Sub-comment list for the root comment `rootCommentId` of photo `photoId`.
# Declared as a `mutation` operation in this document.
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
  visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
    pcursor
    subComments {
      commentId
      authorId
      authorName
      content
      headurl
      timestamp
      likedCount
      realLikedCount
      liked
      status
      authorLiked
      replyToUserName
      replyTo
      __typename
    }
    __typename
  }
}
|
||||
@@ -0,0 +1,113 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class KuaishouLogin(AbstractLogin):
    """Log a Playwright browser context in to kuaishou.

    QR-code and cookie login are implemented; phone login is declared but
    has no implementation.
    """

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """Store the login inputs.

        Args:
            login_type: "qrcode", "phone" or "cookie".
            browser_context: browser context whose cookies carry the login.
            context_page: page on which the login UI is driven.
            login_phone: phone number (stored, not used by any method here).
            cookie_str: cookie string used by the cookie login.
        """
        # The login type is written to the shared config module; begin()
        # reads it back from there.
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """Start the kuaishou login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: if the login type is not qrcode, phone or cookie.
        """
        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """Return True once the browser context holds a ``passToken`` cookie.

        The retry decorator re-runs this check while it returns False, up to
        600 attempts with a 1 second pause between attempts; when the
        attempts are exhausted a RetryError is raised.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # bool() yields the False singleton the retry predicate tests for.
        return bool(cookie_dict.get("passToken"))

    async def login_by_qrcode(self):
        """Open the login dialog, show its QR code and wait for the scan.

        Exits the process if the QR code cannot be found or if the login is
        not detected before the retries of check_login_state run out.
        """
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")

        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
        await login_button_ele.click()

        # find login qrcode
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode in the default executor so the event loop is
        # not blocked while the image is displayed
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # The wait is bounded by check_login_state: 600 attempts, 1 s apart.
        utils.logger.info("[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented; report that instead of silently doing nothing."""
        utils.logger.error("[KuaishouLogin.login_by_mobile] Phone login is not implemented for kuaishou ...")

    async def login_by_cookies(self):
        """Copy every cookie of ``self.cookie_str`` into the browser context."""
        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".kuaishou.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # One call for the whole list instead of one awaited call per cookie.
        if cookies:
            await self.browser_context.add_cookies(cookies)
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import TieBaCrawler
|
||||
@@ -0,0 +1,385 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
|
||||
|
||||
class BaiduTieBaClient(AbstractApiClient):
    """HTTP client for tieba.baidu.com pages and JSON endpoints.

    HTML pages are fetched as text and handed to a TieBaExtractor; JSON
    endpoints are returned decoded.
    """

    def __init__(
        self,
        timeout=10,
        ip_pool=None,
        default_ip_proxy=None,
    ):
        """Create the client.

        Args:
            timeout: per-request timeout passed to httpx.
            ip_pool: optional proxy pool used by get() after retries fail.
            default_ip_proxy: proxy used when a request names none.
        """
        self.ip_pool: Optional[ProxyIpPool] = ip_pool
        self.timeout = timeout
        self.headers = {
            "User-Agent": utils.get_user_agent(),
            # NOTE(review): the standard HTTP request header is "Cookie";
            # this key is "Cookies" and nothing in this class fills it.
            "Cookies": "",
        }
        self._host = "https://tieba.baidu.com"
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
        """Send one HTTP request through httpx and post-process the response.

        The retry decorator re-runs the call on any exception, up to 3
        attempts with 1 second between them.

        Args:
            method: HTTP method.
            url: absolute request URL.
            return_ori_content: return the raw response text instead of JSON.
            proxy: proxy for this request; falls back to default_ip_proxy.
            **kwargs: extra arguments forwarded to httpx (body, params, ...).

        Returns:
            The response text when return_ori_content is true, otherwise the
            decoded JSON body.
        """
        actual_proxy = proxy if proxy else self.default_ip_proxy
        async with httpx.AsyncClient(proxy=actual_proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)

        if response.status_code != 200:
            utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
            utils.logger.error(f"Request failed, response: {response.text}")
            raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

        if response.text == "" or response.text == "blocked":
            utils.logger.error(f"request params incrr, response.text: {response.text}")
            raise Exception("account blocked")

        if return_ori_content:
            return response.text

        return response.json()

    async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
        """GET ``uri`` on the tieba host, switching proxy once if retries fail.

        Args:
            uri: request path.
            params: optional dict, URL-encoded into the query string.
            return_ori_content: return the raw response text instead of JSON.

        Returns:
            Whatever request() returns.

        Raises:
            Exception: when the retries are exhausted and no proxy pool is
                configured.
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        url = f"{self._host}{final_uri}"
        try:
            return await self.request(method="GET", url=url, return_ori_content=return_ori_content, **kwargs)
        except RetryError as e:
            if self.ip_pool:
                # Retry once more through a fresh proxy and keep it as the
                # default only if that request succeeded.
                proxie_model = await self.ip_pool.get_proxy()
                _, proxy = utils.format_proxy_info(proxie_model)
                res = await self.request(method="GET", url=url, return_ori_content=return_ori_content, proxy=proxy, **kwargs)
                self.default_ip_proxy = proxy
                return res

            utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
            raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") from e

    async def post(self, uri: str, data: dict, **kwargs) -> Dict:
        """POST ``data`` as a compact JSON string to ``uri`` on the tieba host.

        Args:
            uri: request path.
            data: request body, serialised with json.dumps.

        Returns:
            Whatever request() returns.
        """
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)

    async def pong(self) -> bool:
        """Check whether the login state is still valid.

        Returns:
            True when /mo/q/sync answers with ``no == 0``; False otherwise,
            including when the request raises.
        """
        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
        try:
            uri = "/mo/q/sync"
            res: Dict = await self.get(uri)
            utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
            if res and res.get("no") == 0:
                ping_flag = True
            else:
                utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
                ping_flag = False
        except Exception as e:
            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """Cookie-update hook of the API client; currently a no-op.

        Args:
            browser_context: browser context object.
        """
        pass

    async def get_notes_by_keyword(
            self,
            keyword: str,
            page: int = 1,
            page_size: int = 10,
            sort: SearchSortType = SearchSortType.TIME_DESC,
            note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
    ) -> List[TiebaNote]:
        """Search tieba posts by keyword.

        Args:
            keyword: search keyword.
            page: page number.
            page_size: results per page.
            sort: result ordering.
            note_type: post type (threads only | threads and replies mixed).

        Returns:
            The notes extracted from the search result page.
        """
        uri = "/f/search/res"
        params = {
            "isnew": 1,
            "qw": keyword,
            "rn": page_size,
            "pn": page,
            "sm": sort.value,
            "only_thread": note_type.value,
        }
        page_content = await self.get(uri, params=params, return_ori_content=True)
        return self._page_extractor.extract_search_note_list(page_content)

    async def get_note_by_id(self, note_id: str) -> TiebaNote:
        """Fetch the page ``/p/<note_id>`` and extract the post detail from it.

        Args:
            note_id: post id.

        Returns:
            The extracted post detail.
        """
        uri = f"/p/{note_id}"
        page_content = await self.get(uri, return_ori_content=True)
        return self._page_extractor.extract_note_detail(page_content)

    async def get_note_all_comments(
            self,
            note_detail: TiebaNote,
            crawl_interval: float = 1.0,
            callback: Optional[Callable] = None,
            max_count: int = 10,
    ) -> List[TiebaComment]:
        """Collect first-level comments of a post, page by page.

        Sub-comments of each fetched page are requested as well and reach
        the caller through ``callback`` only.

        Args:
            note_detail: post detail object.
            crawl_interval: pause in seconds after each page.
            callback: awaited with (note_id, comments) for every batch.
            max_count: maximum number of first-level comments to collect.

        Returns:
            The collected first-level comments.
        """
        uri = f"/p/{note_detail.note_id}"
        result: List[TiebaComment] = []
        current_page = 1
        while note_detail.total_replay_page >= current_page and len(result) < max_count:
            params = {
                "pn": current_page,
            }
            page_content = await self.get(uri, params=params, return_ori_content=True)
            comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
            if not comments:
                break
            # Trim the last batch so the total never exceeds max_count.
            if len(result) + len(comments) > max_count:
                comments = comments[:max_count - len(result)]
            if callback:
                await callback(note_detail.note_id, comments)
            result.extend(comments)
            # fetch all sub-comments of this batch
            await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
            await asyncio.sleep(crawl_interval)
            current_page += 1
        return result

    async def get_comments_all_sub_comments(
            self,
            comments: List[TiebaComment],
            crawl_interval: float = 1.0,
            callback: Optional[Callable] = None,
    ) -> List[TiebaComment]:
        """Collect the sub-comments of every comment in ``comments``.

        Args:
            comments: parent comments.
            crawl_interval: pause in seconds after each page.
            callback: awaited with (note_id, sub_comments) for every page.

        Returns:
            All sub-comments found; an empty list when
            config.ENABLE_GET_SUB_COMMENTS is off.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            return []

        uri = "/p/comment"
        sub_page_size = 10  # page size the page-count computation is based on
        all_sub_comments: List[TiebaComment] = []
        for parment_comment in comments:
            if parment_comment.sub_comment_count == 0:
                continue

            current_page = 1
            # Ceiling division: an exact multiple of the page size must not
            # trigger a request for one extra page.
            max_sub_page_num = -(-parment_comment.sub_comment_count // sub_page_size)
            while max_sub_page_num >= current_page:
                params = {
                    "tid": parment_comment.note_id,  # post id
                    "pid": parment_comment.comment_id,  # parent comment id
                    "fid": parment_comment.tieba_id,  # tieba (forum) id
                    "pn": current_page  # page number
                }
                page_content = await self.get(uri, params=params, return_ori_content=True)
                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)

                if not sub_comments:
                    break
                if callback:
                    await callback(parment_comment.note_id, sub_comments)
                all_sub_comments.extend(sub_comments)
                await asyncio.sleep(crawl_interval)
                current_page += 1
        return all_sub_comments

    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
        """Fetch the post list of a tieba by its name.

        Args:
            tieba_name: tieba name.
            page_num: value sent as the ``pn`` query parameter.

        Returns:
            The notes extracted from the list page.
        """
        # Pass the values as params so get() URL-encodes them; a name that
        # contains "&", "#" or spaces would otherwise corrupt the query.
        params = {"kw": tieba_name, "pn": page_num}
        page_content = await self.get("/f", params=params, return_ori_content=True)
        return self._page_extractor.extract_tieba_note_list(page_content)

    async def get_creator_info_by_url(self, creator_url: str) -> str:
        """Fetch a creator's home page.

        Args:
            creator_url: absolute URL of the creator's home page.

        Returns:
            The raw page content.
        """
        page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
        return page_content

    async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
        """Fetch one page of a creator's threads from /home/get/getthread.

        Args:
            user_name: creator user name.
            page_number: page number.

        Returns:
            The decoded JSON response.
        """
        uri = "/home/get/getthread"
        params = {
            "un": user_name,
            "pn": page_number,
            # NOTE(review): possibly meant to be "ie" (input encoding) — confirm.
            "id": "utf-8",
            "_": utils.get_current_timestamp(),
        }
        return await self.get(uri, params=params)

    async def get_all_notes_by_creator_user_name(
            self,
            user_name: str,
            crawl_interval: float = 1.0,
            callback: Optional[Callable] = None,
            max_note_count: int = 0,
            creator_page_html_content: str = None,
    ) -> List[TiebaNote]:
        """Collect the posts of a creator.

        Args:
            user_name: creator user name.
            crawl_interval: pause in seconds after each API page.
            callback: awaited with each batch of note details.
            max_note_count: maximum number of posts requested through the
                paged API (counted in steps of 20 per page); 0 means all.
            creator_page_html_content: HTML of the creator's home page.

        Returns:
            The details of every post collected.
        """
        # The first posts are rendered directly on the home page and are not
        # returned by the API, so they are extracted from the HTML first.
        result: List[TiebaNote] = []
        if creator_page_html_content:
            thread_id_list = (self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content))
            utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}")
            note_detail_task = [self.get_note_by_id(thread_id) for thread_id in thread_id_list]
            notes = await asyncio.gather(*note_detail_task)
            if callback:
                await callback(notes)
            result.extend(notes)

        notes_has_more = 1
        page_number = 1
        page_per_count = 20
        total_get_count = 0
        while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
            notes_res = await self.get_notes_by_creator(user_name, page_number)
            if not notes_res or notes_res.get("no") != 0:
                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
                break
            # Tolerate a missing "data" / "thread_list" instead of crashing.
            notes_data = notes_res.get("data") or {}
            notes_has_more = notes_data.get("has_more")
            notes = notes_data.get("thread_list") or []
            utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
            if not notes:
                # An empty page with has_more still set would otherwise loop
                # forever when max_note_count is 0.
                break

            note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
            notes = await asyncio.gather(*note_detail_task)
            if callback:
                await callback(notes)
            await asyncio.sleep(crawl_interval)
            result.extend(notes)
            page_number += 1
            total_get_count += page_per_count
        return result
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_baidu_tieba import TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import tieba as tieba_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import BaiduTieBaClient
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
from .login import BaiduTieBaLogin
|
||||
|
||||
|
||||
class TieBaCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
tieba_client: BaiduTieBaClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
    """Initialise the crawler's static state: index URL, user agent, page extractor and an unset CDP manager."""
    self.index_url = "https://tieba.baidu.com"
    agent = utils.get_user_agent()
    extractor = TieBaExtractor()
    self.user_agent, self._page_extractor = agent, extractor
    self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
    """Build the tieba client (optionally behind a proxy) and run the crawl mode named by ``config.CRAWLER_TYPE``."""
    proxy_pool = None
    default_proxy = None
    if config.ENABLE_IP_PROXY:
        utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...")
        proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        first_proxy: IpInfoModel = await proxy_pool.get_proxy()
        _, default_proxy = utils.format_proxy_info(first_proxy)
        utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {default_proxy}")

    # Create a client to interact with the baidutieba website.
    self.tieba_client = BaiduTieBaClient(ip_pool=proxy_pool, default_ip_proxy=default_proxy)
    crawler_type_var.set(config.CRAWLER_TYPE)

    mode = config.CRAWLER_TYPE
    if mode == "search":
        # Keyword search first, then the configured tieba names.
        await self.search()
        await self.get_specified_tieba_notes()
    elif mode == "detail":
        # Information and comments of the specified posts.
        await self.get_specified_notes()
    elif mode == "creator":
        # Creators' information plus their notes and comments.
        await self.get_creators_and_notes()
    # Any other mode runs nothing.

    utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
    """Search every configured keyword page by page and crawl the posts found.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the page size (10) when
    it is smaller. Pages below ``config.START_PAGE`` are skipped. A keyword
    stops at the first empty result page or the first error.
    """
    utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords")
    page_size = 10  # tieba limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < page_size:
        config.CRAWLER_MAX_NOTES_COUNT = page_size
    first_page = config.START_PAGE

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
        current_page = 1
        while True:
            if (current_page - first_page + 1) * page_size > config.CRAWLER_MAX_NOTES_COUNT:
                break
            if current_page < first_page:
                utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {current_page}")
                current_page += 1
                continue
            try:
                utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {current_page}")
                found_notes: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword(
                    keyword=keyword,
                    page=current_page,
                    page_size=page_size,
                    sort=SearchSortType.TIME_DESC,
                    note_type=SearchNoteType.FIXED_THREAD,
                )
                if not found_notes:
                    utils.logger.info("[BaiduTieBaCrawler.search] Search note list is empty")
                    break
                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(found_notes)}")
                found_ids = [found.note_id for found in found_notes]
                await self.get_specified_notes(note_id_list=found_ids)
            except Exception as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.search] Search keywords error, current page: {current_page}, current keyword: {keyword}, err: {ex}"
                )
                break
            else:
                # Reached only when the page was crawled without error.
                current_page += 1
|
||||
|
||||
async def get_specified_tieba_notes(self):
    """Crawl the posts of every tieba named in ``config.TIEBA_NAME_LIST``.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the step size (50) when
    it is smaller. For each tieba, list pages are requested at offsets
    0, 50, 100, ... until the offset reaches the maximum or a page comes
    back empty.
    """
    tieba_limit_count = 50
    if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
    for tieba_name in config.TIEBA_NAME_LIST:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}"
        )
        page_number = 0
        # Strict "<": the offset starts at 0, so "<=" would request one
        # page beyond the configured maximum.
        while page_number < config.CRAWLER_MAX_NOTES_COUNT:
            note_list: List[TiebaNote] = (
                await self.tieba_client.get_notes_by_tieba_name(
                    tieba_name=tieba_name, page_num=page_number
                )
            )
            if not note_list:
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty"
                )
                break

            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
            )
            await self.get_specified_notes([note.note_id for note in note_list])
            page_number += tieba_limit_count
|
||||
|
||||
async def get_specified_notes(
    self, note_id_list: Optional[List[str]] = None
):
    """Fetch, store and collect comments for the given posts.

    Args:
        note_id_list: post ids to crawl. When omitted (None), the current
            value of ``config.TIEBA_SPECIFIED_ID_LIST`` is used.
    """
    # Resolve the default at call time: a default bound in the signature is
    # evaluated once at definition time, so it would share one list object
    # across calls and miss later reassignments of the config value.
    if note_id_list is None:
        note_id_list = config.TIEBA_SPECIFIED_ID_LIST

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
        for note_id in note_id_list
    ]
    note_details = await asyncio.gather(*task_list)
    note_details_model: List[TiebaNote] = []
    for note_detail in note_details:
        # Failed lookups come back as None and are dropped.
        if note_detail is not None:
            note_details_model.append(note_detail)
            await tieba_store.update_tieba_note(note_detail)
    await self.batch_get_note_comments(note_details_model)
|
||||
|
||||
async def get_note_detail_async_task(
    self, note_id: str, semaphore: asyncio.Semaphore
) -> Optional[TiebaNote]:
    """Fetch one post detail while holding the concurrency semaphore.

    Args:
        note_id: baidu tieba note id.
        semaphore: asyncio semaphore limiting concurrent fetches.

    Returns:
        The post detail, or None when it is empty or any error occurred
        (errors are logged, not raised).
    """
    async with semaphore:
        try:
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
            )
            note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
            if not note_detail:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
                )
                return None
            return note_detail
        # KeyError must come before the generic handler; listed after
        # ``except Exception`` it could never be reached.
        except KeyError as ex:
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}"
            )
            return None
        except Exception as ex:
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}"
            )
            return None
|
||||
|
||||
async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
    """Crawl the comments of every given post concurrently.

    Does nothing when ``config.ENABLE_GET_COMMENTS`` is off.

    Args:
        note_detail_list: posts whose comments are fetched.
    """
    if not config.ENABLE_GET_COMMENTS:
        return

    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    # One task per post, named after its note id.
    pending: List[Task] = [
        asyncio.create_task(
            self.get_comments_async_task(note, limiter),
            name=note.note_id,
        )
        for note in note_detail_list
    ]
    await asyncio.gather(*pending)
|
||||
|
||||
async def get_comments_async_task(
    self, note_detail: TiebaNote, semaphore: asyncio.Semaphore
):
    """Fetch all comments of one post while holding the semaphore.

    Args:
        note_detail: post whose comments are fetched.
        semaphore: asyncio semaphore limiting concurrent fetches.
    """
    async with semaphore:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
        )
        # A random pause below one second is used between page requests.
        fetch_options = dict(
            note_detail=note_detail,
            crawl_interval=random.random(),
            callback=tieba_store.batch_update_tieba_note_comments,
            max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        )
        await self.tieba_client.get_note_all_comments(**fetch_options)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """
    Crawl each configured creator: profile, notes, then note comments.

    For every URL in ``config.TIEBA_CREATOR_URL_LIST`` the creator page is
    downloaded and parsed. When no creator info can be extracted the URL is
    logged as an error and skipped; otherwise the creator is stored, all of
    the creator's notes are fetched (stored through the
    ``tieba_store.batch_update_tieba_notes`` callback) and the comments of
    those notes are crawled.

    Returns:
        None
    """
    utils.logger.info(
        "[BaiduTieBaCrawler.get_creators_and_notes] Begin get tieba creators"
    )
    for creator_url in config.TIEBA_CREATOR_URL_LIST:
        creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
            creator_url=creator_url
        )
        creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
            creator_page_html_content
        )
        if not creator_info:
            # Guard clause replaces the former if/else; the old nested
            # "if not creator_info: raise" could never execute.
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
            )
            continue

        utils.logger.info(
            f"[BaiduTieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
        )
        await tieba_store.save_creator(user_info=creator_info)

        # Get all note information of the creator
        all_notes_list = (
            await self.tieba_client.get_all_notes_by_creator_user_name(
                user_name=creator_info.user_name,
                crawl_interval=0,
                callback=tieba_store.batch_update_tieba_notes,
                max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
                creator_page_html_content=creator_page_html_content,
            )
        )

        await self.batch_get_note_comments(all_notes_list)
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Start Chromium and hand back a browser context.

    When ``config.SAVE_LOGIN_STATE`` is truthy a persistent context is
    launched on a user-data directory under ``<cwd>/browser_data`` so the
    login state is kept between runs; otherwise a fresh browser is launched
    and a new context is created on it.

    Args:
        chromium: Playwright browser type used for launching.
        playwright_proxy: proxy settings passed to Playwright, or None.
        user_agent: user agent applied to the context, or None.
        headless: whether the browser runs headless.

    Returns:
        The created browser context.
    """
    utils.logger.info(
        "[BaiduTieBaCrawler.launch_browser] Begin create browser context ..."
    )
    window_size = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport=window_size, user_agent=user_agent)

    # feat issue #14
    # persist the profile so the user does not need to log in on every run
    profile_dir_name = config.USER_DATA_DIR % config.PLATFORM
    user_data_dir = os.path.join(os.getcwd(), "browser_data", profile_dir_name)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=window_size,
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to the standard launcher.

    A ``CDPBrowserManager`` is created and asked to launch and connect. If
    any step raises, the error is logged, the failed manager is cleaned up
    and dropped, and ``launch_browser`` is used instead.

    Args:
        playwright: Playwright entry object; its ``chromium`` is used for
            the fallback.
        playwright_proxy: proxy settings passed through, or None.
        user_agent: user agent passed through, or None.
        headless: whether the browser runs headless.

    Returns:
        The browser context from CDP mode, or from the fallback launcher.
    """
    try:
        self.cdp_manager = CDPBrowserManager()
        browser_context = await self.cdp_manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information reported by the CDP manager
        browser_info = await self.cdp_manager.get_browser_info()
        utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")

        return browser_context

    except Exception as e:
        utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}")

        # Drop the failed manager. close() picks its branch from
        # self.cdp_manager, so leaving it set would make close() skip
        # closing the standard-mode context created below.
        failed_manager = getattr(self, "cdp_manager", None)
        self.cdp_manager = None
        if failed_manager is not None:
            try:
                await failed_manager.cleanup()
            except Exception as cleanup_error:
                # Best effort only: the fallback must still be attempted.
                utils.logger.error(
                    f"[TieBaCrawler] CDP cleanup after failed launch raised: {cleanup_error}"
                )

        # Fall back to the standard launch mode
        chromium = playwright.chromium
        return await self.launch_browser(
            chromium, playwright_proxy, user_agent, headless
        )
|
||||
|
||||
async def close(self):
    """
    Shut down the browser resources owned by this crawler.

    With an active CDP manager, the manager is cleaned up and the reference
    cleared; otherwise the browser context is closed directly.

    Returns:
        None
    """
    manager = self.cdp_manager
    if not manager:
        await self.browser_context.close()
    else:
        # CDP mode is torn down through the manager rather than the context
        await manager.cleanup()
        self.cdp_manager = None
    utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,29 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Ordering options for search results (values are string codes)."""

    TIME_DESC = "1"  # newest first
    TIME_ASC = "0"  # oldest first
    RELEVANCE_ORDER = "2"  # ordered by relevance
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
    """Which kind of posts a search returns (values are string codes)."""

    MAIN_THREAD = "1"  # main thread posts only
    FIXED_THREAD = "0"  # mixed mode: threads plus replies
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
from urllib.parse import parse_qs, unquote
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from tools import utils
|
||||
|
||||
GENDER_MALE = "sex_male"
GENDER_FEMALE = "sex_female"


class TieBaExtractor:
    """Turn Baidu Tieba HTML pages into note, comment and creator models."""

    # Compiled once at class creation instead of on every call.
    _PUB_TIME_PATTERN = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
    _IP_PATTERN = re.compile(r'IP属地:(\S+)</span>')
    _CONCERN_NUM_PATTERN = re.compile(r'<span class="concern_num">\(<a[^>]*>(\d+)</a>\)</span>')
    _REGISTRATION_PATTERN = re.compile(r'<span>吧龄:(\S+)</span>')

    def __init__(self):
        """The extractor keeps no state."""

    @staticmethod
    def _first(node, query: str, strip: bool = True) -> str:
        """Return the first XPath match under ``node`` ('' when none), optionally stripped."""
        value = node.xpath(query).get(default='')
        return value.strip() if strip else value

    @staticmethod
    def extract_search_note_list(page_content: str) -> List[TiebaNote]:
        """
        Extract the notes listed on a keyword-search result page.

        The search page does not carry reply counts or reply page counts, so
        those fields are not filled here.

        Args:
            page_content: HTML of the search result page.

        Returns:
            One TiebaNote per ``div.s_post`` element, in page order.
        """
        first = TieBaExtractor._first
        result: List[TiebaNote] = []
        for post in Selector(text=page_content).xpath("//div[@class='s_post']"):
            result.append(TiebaNote(
                note_id=first(post, ".//span[@class='p_title']/a/@data-tid"),
                title=first(post, ".//span[@class='p_title']/a/text()"),
                desc=first(post, ".//div[@class='p_content']/text()"),
                note_url=const.TIEBA_URL + first(post, ".//span[@class='p_title']/a/@href", strip=False),
                user_nickname=first(post, ".//a[starts-with(@href, '/home/main')]/font/text()"),
                user_link=const.TIEBA_URL + first(post, ".//a[starts-with(@href, '/home/main')]/@href", strip=False),
                tieba_name=first(post, ".//a[@class='p_forum']/font/text()"),
                tieba_link=const.TIEBA_URL + first(post, ".//a[@class='p_forum']/@href", strip=False),
                publish_time=first(post, ".//font[@class='p_green p_date']/text()"),
            ))
        return result

    def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
        """
        Extract the notes listed on a forum (tieba) page.

        Args:
            page_content: HTML of the forum page.

        Returns:
            One TiebaNote per thread ``li`` that carries a usable
            ``data-field`` attribute.
        """
        # Remove comment openers so markup wrapped in HTML comments is parsed
        content_selector = Selector(text=page_content.replace('<!--', ""))
        # The forum name/link are page-wide, so look them up once, not per post
        tieba_name = self._first(content_selector, "//a[@class='card_title_fname']/text()")
        tieba_link = const.TIEBA_URL + self._first(
            content_selector, "//a[@class='card_title_fname']/@href", strip=False)

        result: List[TiebaNote] = []
        for post_selector in content_selector.xpath("//ul[@id='thread_list']/li"):
            post_field_value: Dict = self.extract_data_field_value(post_selector)
            if not post_field_value:
                continue
            note_id = str(post_field_value.get("id"))
            # The key was previously misspelled "authoer_nickname", so the
            # nickname lookup never matched and only author_name was used.
            user_nickname = (post_field_value.get("author_nickname")
                             or post_field_value.get("author_name")
                             or "")
            result.append(TiebaNote(
                note_id=note_id,
                title=self._first(post_selector, ".//a[@class='j_th_tit ']/text()"),
                desc=self._first(
                    post_selector,
                    ".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()"),
                note_url=const.TIEBA_URL + f"/p/{note_id}",
                user_link=const.TIEBA_URL + self._first(
                    post_selector, ".//a[@class='frs-author-name j_user_card ']/@href"),
                user_nickname=user_nickname,
                tieba_name=tieba_name,
                tieba_link=tieba_link,
                total_replay_num=post_field_value.get("reply_num", 0),
            ))
        return result

    def extract_note_detail(self, page_content: str) -> TiebaNote:
        """
        Extract the detail of one note from its thread page.

        Args:
            page_content: HTML of the thread page.

        Returns:
            The TiebaNote built from the page. The reply count and reply
            page count fall back to 0 when the page has no such counters.
        """
        content_selector = Selector(text=page_content)
        first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]")
        only_view_author_link = self._first(content_selector, "//*[@id='lzonly_cntn']/@href")
        note_id = only_view_author_link.split("?")[0].split("/")[-1]
        # Reply count and reply page count
        thread_num_infos = content_selector.xpath(
            "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")

        def reply_stat(index: int):
            # Indexing a missing counter used to raise IndexError
            if index >= len(thread_num_infos):
                return 0
            return self._first(thread_num_infos[index], "./text()") or 0

        # IP location and publish time
        other_info_content = content_selector.xpath(
            ".//div[@class='post-tail-wrap']").get(default="").strip()
        ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
        note = TiebaNote(
            note_id=note_id,
            title=self._first(content_selector, "//title/text()"),
            desc=self._first(content_selector, "//meta[@name='description']/@content"),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + self._first(
                first_floor_selector, ".//a[@class='p_author_face ']/@href"),
            user_nickname=self._first(
                first_floor_selector, ".//a[@class='p_author_name j_user_card']/text()"),
            user_avatar=self._first(
                first_floor_selector, ".//a[@class='p_author_face ']/img/@src"),
            tieba_name=self._first(content_selector, "//a[@class='card_title_fname']/text()"),
            tieba_link=const.TIEBA_URL + self._first(
                content_selector, "//a[@class='card_title_fname']/@href", strip=False),
            ip_location=ip_location,
            publish_time=publish_time,
            total_replay_num=reply_stat(0),
            total_replay_page=reply_stat(1),
        )
        # Drop the forum suffix that the <title> element carries
        note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "")
        return note

    def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
        """
        Extract the first-level comments of a note.

        Args:
            page_content: HTML of the thread page.
            note_id: id of the note the comments belong to.

        Returns:
            One TiebaComment per post element whose ``data-field`` holds a
            ``content`` object; other elements are skipped.
        """
        page_selector = Selector(text=page_content)
        # Page-wide lookup, independent of the individual comment
        tieba_name = self._first(page_selector, "//a[@class='card_title_fname']/text()")
        xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"

        result: List[TiebaComment] = []
        for comment_selector in page_selector.xpath(xpath_selector):
            comment_field_value: Dict = self.extract_data_field_value(comment_selector)
            if not comment_field_value:
                continue
            content_field = comment_field_value.get("content")
            if not isinstance(content_field, dict):
                # A data-field without "content" used to raise AttributeError
                continue
            other_info_content = comment_selector.xpath(
                ".//div[@class='post-tail-wrap']").get(default="").strip()
            ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
            result.append(TiebaComment(
                comment_id=str(content_field.get("post_id")),
                sub_comment_count=content_field.get("comment_num"),
                content=utils.extract_text_from_html(content_field.get("content")),
                note_url=const.TIEBA_URL + f"/p/{note_id}",
                user_link=const.TIEBA_URL + self._first(
                    comment_selector, ".//a[@class='p_author_face ']/@href"),
                user_nickname=self._first(
                    comment_selector, ".//a[@class='p_author_name j_user_card']/text()"),
                user_avatar=self._first(
                    comment_selector, ".//a[@class='p_author_face ']/img/@src"),
                tieba_id=str(content_field.get("forum_id", "")),
                tieba_name=tieba_name,
                tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
                ip_location=ip_location,
                publish_time=publish_time,
                note_id=note_id,
            ))
        return result

    def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
        """
        Extract the second-level comments (replies to a comment).

        Args:
            page_content: HTML fragment holding the reply list.
            parent_comment: the first-level comment the replies belong to;
                its note and forum fields are copied onto every reply.

        Returns:
            The replies, first-reply elements before the remaining ones.
        """
        selector = Selector(text=page_content)
        comments = []
        comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
        comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
        for comment_ele in comment_ele_list:
            comment_value = self.extract_data_field_value(comment_ele)
            if not comment_value:
                continue
            # Slice instead of [0]: a reply without the user anchor gives
            # empty link/avatar values rather than raising IndexError.
            comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[:1]
            content = utils.extract_text_from_html(
                comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
            comments.append(TiebaComment(
                comment_id=str(comment_value.get("spid")),
                content=content,
                user_link=comment_user_a_selector.xpath("./@href").get(default=""),
                user_nickname=comment_value.get("showname"),
                user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
                publish_time=self._first(comment_ele, ".//span[@class='lzl_time']/text()"),
                parent_comment_id=parent_comment.comment_id,
                note_id=parent_comment.note_id,
                note_url=parent_comment.note_url,
                tieba_id=parent_comment.tieba_id,
                tieba_name=parent_comment.tieba_name,
                tieba_link=parent_comment.tieba_link,
            ))

        return comments

    def extract_creator_info(self, html_content: str) -> TiebaCreator:
        """
        Extract creator (user) information from a user home page.

        Args:
            html_content: HTML of the user home page.

        Returns:
            A TiebaCreator. ``user_name`` and ``user_id`` come from the
            ``un`` and ``id`` query parameters of the profile link and are
            '' when absent; follows/fans stay 0 unless exactly two
            ``concern_num`` spans are found.
        """
        selector = Selector(text=html_content)
        user_link: str = selector.xpath("//p[@class='space']/a").xpath("./@href").get(default='')
        user_link_params: Dict = parse_qs(unquote(user_link.split("?")[-1]))

        def first_param(key: str) -> str:
            values = user_link_params.get(key)
            return values[0] if values else ""

        follow_fans_selector = selector.xpath("//span[@class='concern_num']")
        follows, fans = 0, 0
        if len(follow_fans_selector) == 2:
            follows, fans = self.extract_follow_and_fans(follow_fans_selector)
        user_content = selector.xpath("//div[@class='userinfo_userdata']").get(default='')
        return TiebaCreator(
            user_id=first_param("id"),
            user_name=first_param("un"),
            nickname=self._first(selector, ".//span[@class='userinfo_username ']/text()"),
            avatar=self._first(selector, ".//div[@class='userinfo_left_head']//img/@src"),
            gender=self.extract_gender(user_content),
            ip_location=self.extract_ip(user_content),
            follows=follows,
            fans=fans,
            registration_duration=self.extract_registration_duration(user_content),
        )

    @staticmethod
    def extract_tieba_thread_id_list_from_creator_page(
        html_content: str
    ) -> List[str]:
        """
        Extract the thread ids listed on a creator's home page.

        Args:
            html_content: HTML of the creator home page.

        Returns:
            The last path segment (query string removed) of every thread
            link, in page order.
        """
        xpath_selector = (
            "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
        )
        thread_url_list = Selector(text=html_content).xpath(xpath_selector).getall()
        return [thread_url.split("?")[0].split("/")[-1] for thread_url in thread_url_list]

    def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
        """
        Extract the IP location and the publish time from a post tail.

        Args:
            html_content: HTML of the ``post-tail-wrap`` element.

        Returns:
            ``(ip_location, publish_time)``; either is '' when not found.
        """
        time_match = self._PUB_TIME_PATTERN.search(html_content)
        pub_time = time_match.group(1) if time_match else ""
        return self.extract_ip(html_content), pub_time

    @staticmethod
    def extract_ip(html_content: str) -> str:
        """
        Extract the IP location.

        Args:
            html_content: HTML that may contain an "IP属地:..." span.

        Returns:
            The location text, or '' when not found.
        """
        ip_match = TieBaExtractor._IP_PATTERN.search(html_content)
        return ip_match.group(1) if ip_match else ""

    @staticmethod
    def extract_gender(html_content: str) -> str:
        """
        Extract the gender from the user-data HTML.

        Args:
            html_content: HTML checked for the gender marker substrings.

        Returns:
            '男' when the male marker is present, '女' when the female
            marker is present, otherwise '未知'.
        """
        if GENDER_MALE in html_content:
            return '男'
        if GENDER_FEMALE in html_content:
            return '女'
        return '未知'

    @staticmethod
    def extract_follow_and_fans(selectors: List[Selector]) -> Tuple[str, str]:
        """
        Extract the follow count and the fan count.

        Args:
            selectors: two ``concern_num`` span selectors, follows first.

        Returns:
            ``(follows, fans)`` as the matched digit strings. NOTE(review):
            an unmatched value falls back to the int 0, which differs from
            the annotated ``str``; kept for backward compatibility.
        """
        pattern = TieBaExtractor._CONCERN_NUM_PATTERN
        follow_match = pattern.findall(selectors[0].get())
        fans_match = pattern.findall(selectors[1].get())
        follows = follow_match[0] if follow_match else 0
        fans = fans_match[0] if fans_match else 0
        return follows, fans

    @staticmethod
    def extract_registration_duration(html_content: str) -> str:
        """
        Extract the forum age, e.g. "<span>吧龄:1.9年</span>" -> "1.9年".

        Args:
            html_content: HTML that may contain the forum-age span.

        Returns:
            The matched text, or '' when not found.
        """
        match = TieBaExtractor._REGISTRATION_PATTERN.search(html_content)
        return match.group(1) if match else ""

    @staticmethod
    def extract_data_field_value(selector: Selector) -> Dict:
        """
        Decode the JSON stored in an element's ``data-field`` attribute.

        Args:
            selector: selector of the element carrying ``data-field``.

        Returns:
            The decoded value, or {} when the attribute is missing, empty,
            "{}" or cannot be decoded.
        """
        data_field_value = selector.xpath("./@data-field").get(default='').strip()
        if not data_field_value or data_field_value == "{}":
            return {}
        try:
            # Unescape HTML entities first, then parse the JSON text
            return json.loads(html.unescape(data_field_value))
        except (TypeError, ValueError) as ex:
            # json.JSONDecodeError is a ValueError. Report through the
            # project logger instead of print so the failure lands in logs.
            utils.logger.error(f"extract_data_field_value,错误信息:{ex}, 尝试使用其他方式解析")
            return {}
|
||||
|
||||
|
||||
def test_extract_search_note_list():
    """Run the search-result extractor on the saved fixture and print the notes."""
    with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    notes = TieBaExtractor().extract_search_note_list(page_html)
    print(notes)
|
||||
|
||||
|
||||
def test_extract_note_detail():
    """Run the note-detail extractor on the saved fixture and print the dumped model."""
    with open("test_data/note_detail.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    note = TieBaExtractor().extract_note_detail(page_html)
    print(note.model_dump())
|
||||
|
||||
|
||||
def test_extract_tieba_note_parment_comments():
    """Run the first-level comment extractor on the saved fixture and print the comments."""
    with open("test_data/note_comments.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    comments = TieBaExtractor().extract_tieba_note_parment_comments(page_html, "123456")
    print(comments)
|
||||
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
    """Run the reply extractor on the saved fixture with a placeholder parent comment."""
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    # Placeholder parent: only its ids/links are copied onto the replies
    parent_fields = {
        "comment_id": "123456",
        "content": "content",
        "user_link": "user_link",
        "user_nickname": "user_nickname",
        "user_avatar": "user_avatar",
        "publish_time": "publish_time",
        "parent_comment_id": "parent_comment_id",
        "note_id": "note_id",
        "note_url": "note_url",
        "tieba_id": "tieba_id",
        "tieba_name": "tieba_name",
    }
    fake_parent = TiebaComment(**parent_fields)
    replies = TieBaExtractor().extract_tieba_note_sub_comments(page_html, fake_parent)
    print(replies)
|
||||
|
||||
|
||||
def test_extract_tieba_note_list():
    """Run the forum note-list extractor on the saved fixture and print the notes."""
    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    notes = TieBaExtractor().extract_tieba_note_list(page_html)
    print(notes)
|
||||
|
||||
|
||||
def test_extract_creator_info():
    """Run the creator extractor on the saved fixture and print the model as JSON."""
    with open("test_data/creator_info.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    creator = TieBaExtractor().extract_creator_info(page_html)
    print(creator.model_dump_json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual entry point: enable the fixture run you want to inspect.
    # test_extract_search_note_list()
    # test_extract_note_detail()
    # test_extract_tieba_note_parment_comments()
    # test_extract_tieba_note_list()
    test_extract_creator_info()
|
||||
@@ -0,0 +1,123 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BaiduTieBaLogin(AbstractLogin):
    """Log in to Baidu Tieba through a Playwright browser context."""

    # Polling budget of check_login_state. The same numbers feed the retry
    # policy and the "remaining time" log line so the two cannot drift apart.
    _LOGIN_POLL_ATTEMPTS = 600
    _LOGIN_POLL_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Store the login parameters.

        Args:
            login_type: login method; written to ``config.LOGIN_TYPE`` and
                read back by ``begin``.
            browser_context: context whose cookies are read and written.
            context_page: page used to locate the QR code and login button.
            login_phone: phone number kept on the instance.
            cookie_str: cookie string used by ``login_by_cookies``.
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(_LOGIN_POLL_ATTEMPTS),
           wait=wait_fixed(_LOGIN_POLL_INTERVAL_SECONDS),
           retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
        Poll the browser cookies until a login token appears.

        Retried while the result is False, up to the configured number of
        attempts with a fixed wait in between.

        Returns:
            True when the STOKEN or PTOKEN cookie has a value, else False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # Must be a real bool: the retry predicate tests "value is False"
        return bool(cookie_dict.get("STOKEN") or cookie_dict.get("PTOKEN"))

    async def begin(self):
        """
        Start the login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: when the login type is not qrcode, phone or cookie.
        """
        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def login_by_mobile(self):
        """Login baidutieba by mobile (not implemented: only logs a warning)."""
        # Previously a silent no-op; make the missing implementation visible
        # without changing the outcome for callers.
        utils.logger.warning("[BaiduTieBaLogin.login_by_mobile] Mobile login is not implemented, nothing was done ...")

    async def login_by_qrcode(self):
        """
        Log in by scanning the QR code shown on the page.

        The QR code image is looked up; when it is not found the login
        button is clicked and the lookup repeated. The process exits when
        the QR code still cannot be found or the login is not detected
        within the polling budget.
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            # the site did not pop up the login dialog by itself, so click
            # the login button manually and look for the QR code again
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
                sys.exit()

        # show login qrcode
        # fix issue #12
        # show_qrcode runs in the default executor through a partial so the
        # current asyncio event loop is not blocked
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # The message used to say 120s although the policy polls much longer
        wait_budget_seconds = self._LOGIN_POLL_ATTEMPTS * self._LOGIN_POLL_INTERVAL_SECONDS
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is {wait_budget_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """
        Log in by injecting the configured cookies into the browser context.

        Every key/value pair parsed from ``self.cookie_str`` is added for
        the ``.baidu.com`` domain with path ``/``.
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".baidu.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # One call for the whole batch instead of one call per cookie;
        # skipped entirely when there is nothing to add.
        if cookies:
            await self.browser_context.add_cookies(cookies)
|
||||
+874
File diff suppressed because one or more lines are too long
+839
File diff suppressed because one or more lines are too long
+189
@@ -0,0 +1,189 @@
|
||||
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":150726504693,"showname":"heinzfrentzen","user_name":"heinzfrentzen","portrait":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}'>
|
||||
<a rel="noopener" name="150726504693"></a>
|
||||
<a rel="noopener" data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:11</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726506822,"showname":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","user_name":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","portrait":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}'>
|
||||
<a rel="noopener" name="150726506822"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
|
||||
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726508024,"showname":"\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d","user_name":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","portrait":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}'>
|
||||
<a rel="noopener" name="150726508024"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
|
||||
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726509762,"showname":"\u8317\u82b1\u5c11\u5e05","user_name":"\u8317\u82b1\u5c11\u5e05","portrait":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}'>
|
||||
<a rel="noopener" name="150726509762"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1421248220","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1421248220","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
|
||||
:<span class="lzl_content_main" data-username="">你就只看水花,不看空中姿态吗 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726510645,"showname":"\u4e1c\u534e\u6b66\u5170","user_name":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","portrait":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}'>
|
||||
<a rel="noopener" name="150726510645"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1644033630","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1644033630","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
|
||||
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726514057,"showname":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","user_name":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","portrait":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}'>
|
||||
<a rel="noopener" name="150726514057"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
|
||||
:<span class="lzl_content_main" data-username="">额,分数正常吧 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:13</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726520372,"showname":"\u9759\u770b\u8682\u8681\u4e0a\u6811","user_name":"\u9759\u770b\u8682\u8681\u4e0a\u6811","portrait":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}'>
|
||||
<a rel="noopener" name="150726520372"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
|
||||
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:14</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726524963,"showname":"\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c","user_name":"\u9ec4\u5c0f\u6e2forz","portrait":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}'>
|
||||
<a rel="noopener" name="150726524963"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:15</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726535666,"showname":"\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf","user_name":"\u5feb\u770b\u5361\u5361\u5361\u5361","portrait":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}'>
|
||||
<a rel="noopener" name="150726535666"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1539783937","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1539783937","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
|
||||
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了,空中姿态明显全红婵更好 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726536076,"showname":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36","user_name":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","portrait":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}'>
|
||||
<a rel="noopener" name="150726536076"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":null,"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
|
||||
:你不会看起跳高度和空中姿态?
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":16,"total_page":2}'>
|
||||
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
|
||||
<i class="icon-reply"></i>
|
||||
我也说一句
|
||||
</a>
|
||||
<p class="j_pager l_pager pager_theme_2">
|
||||
<span class="tP">1</span>
|
||||
<a href="#2">2</a>
|
||||
<a href="#2">下一页</a>
|
||||
<a href="#2">尾页</a>
|
||||
</p>
|
||||
</li>
|
||||
+96
@@ -0,0 +1,96 @@
|
||||
<div class="s_post_list">
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
|
||||
href="/p/9117888152?pid=150718967291&cid=0#150718967291"
|
||||
target="_blank">武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠</a></span>
|
||||
<div class="p_content">
|
||||
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。
|
||||
项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、
|
||||
</div>
|
||||
贴吧:<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
|
||||
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者:<a
|
||||
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
|
||||
<font class="p_green p_date">2024-08-05 16:45</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
|
||||
href="/p/9114743782?pid=150705176739&cid=0#150705176739"
|
||||
target="_blank">请各位急用玛尼的小心,骗子最多</a></span>
|
||||
<div class="p_content">
|
||||
这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收
|
||||
</div>
|
||||
贴吧:<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者:<a
|
||||
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
|
||||
<font class="p_green p_date">2024-08-03 07:35</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
|
||||
href="/p/9095684158?pid=150616716870&cid=0#150616716870"
|
||||
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
|
||||
<div class="p_content">**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
|
||||
展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司
|
||||
人员跟团观展补贴!为您节省成本,寻找适合您的市场:
|
||||
本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市
|
||||
</div>
|
||||
贴吧:<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
|
||||
class="p_violet">国际展会</font></a>作者:<a href="/home/main?un=zhaot_188" target="_blank"><font
|
||||
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
|
||||
href="/p/9093564752?pid=150606964195&cid=0#150606964195"
|
||||
target="_blank">京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承</a></span>
|
||||
<div class="p_content">来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫
|
||||
京湘楼,KING
|
||||
HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品
|
||||
</div>
|
||||
贴吧:<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
|
||||
class="p_violet">京湘楼</font></a>作者:<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
|
||||
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
|
||||
href="/p/9088419293?pid=150582471307&cid=0#150582471307"
|
||||
target="_blank">广州能争取到迪士尼与环球落户吗?</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
|
||||
class="p_violet">地理</font></a>作者:<a href="/home/main?un=SeaRoutes" target="_blank"><font
|
||||
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
|
||||
href="/p/9088416365?pid=150582456551&cid=0#150582456551"
|
||||
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者:<a
|
||||
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
|
||||
class="p_green p_date">2024-07-13 20:14</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
|
||||
href="/p/9087419039?pid=150577861626&cid=0#150577861626"
|
||||
target="_blank">云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
|
||||
<div class="p_content">
|
||||
7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省
|
||||
</div>
|
||||
贴吧:<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
|
||||
class="p_violet">昆明</font></a>作者:<a href="/home/main?un=%8F%EC" target="_blank"><font
|
||||
class="p_violet">忟</font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
|
||||
href="/p/9085102046?pid=150567555367&cid=0#150567555367"
|
||||
target="_blank">寻找弟弟,很久没跟家里联系</a></span>
|
||||
<div class="p_content">Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团</div>
|
||||
贴吧:<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
|
||||
class="p_violet">东南亚</font></a>作者:<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
|
||||
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
|
||||
<font class="p_green p_date">2024-07-11 07:53</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
|
||||
href="/p/9083888071?pid=150562129935&cid=0#150562129935"
|
||||
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧?</a></span>
|
||||
<div class="p_content">拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
|
||||
跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。
|
||||
</div>
|
||||
贴吧:<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
|
||||
class="p_violet">历史</font></a>作者:<a href="/home/main?un=yoursagain" target="_blank"><font
|
||||
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
|
||||
href="/p/9071937582?pid=150510120873&cid=0#150510120873"
|
||||
target="_blank">东南亚,园区【 工 价 低 】</a></span>
|
||||
<div class="p_content"></div>
|
||||
贴吧:<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
|
||||
class="p_violet">园区招商</font></a>作者:<a href="/home/main?un=QQ59052966" target="_blank"><font
|
||||
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
|
||||
</div>
|
||||
+3627
File diff suppressed because one or more lines are too long
@@ -0,0 +1,18 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc :
|
||||
from .client import WeiboClient
|
||||
from .core import WeiboCrawler
|
||||
from .login import WeiboLogin
|
||||
@@ -0,0 +1,381 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc : 微博爬虫 API 请求 client
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import parse_qs, unquote, urlencode
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
|
||||
|
||||
class WeiboClient:
|
||||
|
||||
def __init__(
        self,
        timeout=60,  # with media crawling enabled, weibo images need a longer timeout
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
):
    """Store the connection settings used by every request of this client.

    :param timeout: timeout handed to each httpx request
    :param proxy: proxy handed to ``httpx.AsyncClient`` (``None`` for no proxy)
    :param headers: default request headers (keyword-only)
    :param playwright_page: Playwright page associated with this client (keyword-only)
    :param cookie_dict: current cookies as a name -> value mapping (keyword-only)
    """
    # Fixed endpoints.
    self._host = "https://m.weibo.cn"
    # NOTE(review): presumably an image proxy prefix used when downloading pictures - confirm.
    self._image_agent_host = "https://i1.wp.com/"

    # Transport settings.
    self.timeout = timeout
    self.proxy = proxy
    self.headers = headers

    # Browser/session state.
    self.cookie_dict = cookie_dict
    self.playwright_page = playwright_page
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
    """Send one HTTP request and unwrap the weibo JSON envelope.

    :param method: HTTP method name
    :param url: absolute URL to request
    :param kwargs: forwarded to ``httpx.AsyncClient.request``; the special key
        ``return_response`` (default ``False``) is consumed here and, when
        truthy, makes the method return the raw response object untouched
    :return: the raw response, or the ``data`` member of the JSON body
        (an empty dict when ``data`` is absent)
    :raises DataFetchError: when the ``ok`` field of the JSON body is not 1
    """
    want_raw_response = kwargs.pop("return_response", False)
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)

    if want_raw_response:
        return response

    payload: Dict = response.json()
    status = payload.get("ok")
    if status == 1:  # success
        return payload.get("data", {})

    # ok == 0 is an explicit API error; anything else is an unexpected envelope.
    fallback_msg = "response error" if status == 0 else "unknown error"
    utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{payload}")
    raise DataFetchError(payload.get("msg", fallback_msg))
|
||||
|
||||
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
    """Issue a GET request against the weibo host.

    :param uri: path relative to ``self._host``
    :param params: query parameters; only a ``dict`` is encoded into the URL,
        any other value is ignored
    :param headers: request headers, defaults to ``self.headers`` when ``None``
    :param kwargs: forwarded unchanged to :meth:`request`
    :return: whatever :meth:`request` returns
    """
    # A dict (even an empty one) always appends "?" followed by the encoded query.
    query_suffix = f"?{urlencode(params)}" if isinstance(params, dict) else ""
    request_headers = self.headers if headers is None else headers
    target_url = f"{self._host}{uri}{query_suffix}"
    return await self.request(method="GET", url=target_url, headers=request_headers, **kwargs)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
    """Issue a POST request against the weibo host.

    :param uri: path relative to ``self._host``
    :param data: payload, serialized to compact JSON without ASCII escaping
    :return: whatever :meth:`request` returns
    """
    body = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
    target_url = f"{self._host}{uri}"
    return await self.request(method="POST", url=target_url, data=body, headers=self.headers)
|
||||
|
||||
async def pong(self) -> bool:
    """Check whether the current login state is still valid.

    Requests ``/api/config`` and inspects the ``login`` member of the result.

    :return: ``True`` when ``login`` is truthy; ``False`` when it is falsy or
        when anything in the check raises (the failure is logged, not raised)
    """
    utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
    config_url = f"{self._host}/api/config"
    try:
        resp_data: Dict = await self.request(method="GET", url=config_url, headers=self.headers)
        if resp_data.get("login"):
            return True
        utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...")
        return False
    except Exception as e:
        # Any failure is treated as "not logged in" so the caller can re-login.
        utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
        return False
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """Refresh this client's cookies from the given browser context.

    :param browser_context: Playwright browser context whose cookies are read;
        they replace the ``Cookie`` header and ``self.cookie_dict``
    """
    browser_cookies = await browser_context.cookies()
    cookie_header, cookie_map = utils.convert_cookies(browser_cookies)
    self.headers["Cookie"] = cookie_header
    self.cookie_dict = cookie_map
|
||||
|
||||
async def get_note_by_keyword(
        self,
        keyword: str,
        page: int = 1,
        search_type: SearchType = SearchType.DEFAULT,
) -> Dict:
    """Search notes by keyword.

    :param keyword: search keyword
    :param page: pagination parameter - the current page number
    :param search_type: kind of search to run, a member of the ``SearchType`` enum
    :return: whatever :meth:`get` returns for the search endpoint
    """
    query = {
        "containerid": f"100103type={search_type.value}&q={keyword}",
        "page_type": "searchall",
        "page": page,
    }
    return await self.get("/api/container/getIndex", query)
|
||||
|
||||
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
    """Fetch one page of comments for a note.

    :param mid_id: weibo note ID
    :param max_id: pagination cursor; only sent when greater than zero
    :param max_id_type: pagination cursor type
    :return: whatever :meth:`get` returns for the comments endpoint
    """
    query = {"id": mid_id, "mid": mid_id, "max_id_type": max_id_type}
    if max_id > 0:
        query["max_id"] = max_id

    # Work on a copy so the Referer never leaks into the shared default headers.
    request_headers = copy.copy(self.headers)
    request_headers.update({"Referer": f"https://m.weibo.cn/detail/{mid_id}"})

    return await self.get("/comments/hotflow", query, headers=request_headers)
|
||||
|
||||
async def get_note_all_comments(
        self,
        note_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
):
    """Collect the comments of a note, including their sub-comments.

    Pages are requested through :meth:`get_note_comments` until the API
    reports the last page or ``max_count`` items have been gathered.

    :param note_id: weibo note ID
    :param crawl_interval: seconds to sleep after each page
    :param callback: optional coroutine function called as
        ``callback(note_id, comment_list)`` for every page
    :param max_count: stop paging once this many items have been collected;
        each page of first-level comments is truncated to fit, but the
        sub-comments appended afterwards are not
    :return: list of comment dicts (first-level comments followed by the
        sub-comments of each page)
    """
    result = []
    is_end = False
    max_id = -1
    max_id_type = 0
    while not is_end and len(result) < max_count:
        comments_res = await self.get_note_comments(note_id, max_id, max_id_type)
        # A missing/None cursor is treated as "last page" (0). Feeding None back
        # into get_note_comments would fail on its `max_id > 0` comparison.
        max_id = comments_res.get("max_id") or 0
        max_id_type = comments_res.get("max_id_type") or 0
        # Tolerate an explicit null "data" member as an empty page.
        comment_list: List[Dict] = comments_res.get("data") or []
        is_end = max_id == 0

        remaining = max_count - len(result)
        if len(comment_list) > remaining:
            comment_list = comment_list[:remaining]

        if callback:  # run the callback when one was supplied
            await callback(note_id, comment_list)
        await asyncio.sleep(crawl_interval)

        result.extend(comment_list)
        sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
        result.extend(sub_comment_result)
    return result
|
||||
|
||||
@staticmethod
async def get_comments_all_sub_comments(
        note_id: str,
        comment_list: List[Dict],
        callback: Optional[Callable] = None,
) -> List[Dict]:
    """Gather the sub-comments embedded in a list of comments.

    No request is made here: sub-comments are read from the ``comments``
    member of each comment dict. Nothing is gathered when
    ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.

    Args:
        note_id: weibo note ID, passed through to ``callback``
        comment_list: first-level comment dicts
        callback: optional coroutine function called as
            ``callback(note_id, sub_comments)`` for each comment that has
            a non-empty list of sub-comments

    Returns:
        flat list of every sub-comment found (empty when the feature is off)
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    res_sub_comments = []
    for comment in comment_list:
        sub_comments = comment.get("comments")
        if sub_comments and isinstance(sub_comments, list):
            # callback defaults to None; awaiting it unconditionally raised TypeError.
            if callback:
                await callback(note_id, sub_comments)
            res_sub_comments.extend(sub_comments)
    return res_sub_comments
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
    """Fetch a post's detail page and extract the post data embedded in it.

    :param note_id: ID of the post; appended to ``{self._host}/detail/``.
    :return: ``{"mblog": <status>}`` built from the page's ``$render_data``
        script variable, or an empty dict when that variable is not found.
    :raises DataFetchError: when the page does not answer with HTTP 200.
    """
    detail_url = f"{self._host}/detail/{note_id}"
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request("GET", detail_url, timeout=self.timeout, headers=self.headers)
        if response.status_code != 200:
            raise DataFetchError(f"get weibo detail err: {response.text}")

        # The page inlines its data as ``var $render_data = [...][0]``; capture the JSON array.
        found = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
        if not found:
            utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
            return dict()

        render_data = json.loads(found.group(1))
        return {"mblog": render_data[0].get("status")}
|
||||
|
||||
async def get_note_image(self, image_url: str) -> Optional[bytes]:
    """Download the ``large`` rendition of a Weibo image through the image proxy host.

    The path segment right after the host is replaced with ``large`` and the
    scheme-less URL is appended to ``self._image_agent_host``; the Weibo image
    host applies hotlink protection, so the image is requested via that proxy.

    :param image_url: absolute URL of the image.
    :return: the response body, or ``None`` when the request fails or the
        response reason phrase is not ``"OK"``.
    """
    # Strip the scheme whatever it is (the old ``[8:]`` slice assumed "https://").
    segments = image_url.split("://", 1)[-1].split("/")
    last_index = len(segments) - 1
    pieces = []
    for index, segment in enumerate(segments):
        if index == 1:
            pieces.append("large/")  # always fetch the high-resolution image
        elif index == last_index:
            pieces.append(segment)
        else:
            pieces.append(segment + "/")
    final_uri = f"{self._image_agent_host}{''.join(pieces)}"

    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", final_uri, timeout=self.timeout)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Connection errors and non-2xx statuses land here. Log the URL we built
            # rather than ``exc.request.url``: ``exc.request`` raises when unset.
            utils.logger.error(f"[WeiboClient.get_note_image] {exc.__class__.__name__} for {final_uri} - {exc}")
            return None

        if response.reason_phrase != "OK":
            utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
            return None
        return response.content
|
||||
|
||||
async def get_creator_container_info(self, creator_id: str) -> Dict:
    """Read a creator's container IDs from the ``M_WEIBOCN_PARAMS`` response cookie.

    The cookie returned for ``/u/{creator_id}`` is URL-decoded and parsed as a
    query string; its ``fid`` and ``lfid`` entries are the container IDs used
    by the creator detail and creator post-list requests respectively.

    Args:
        creator_id: ID of the creator.

    Returns:
        ``{"fid_container_id": ..., "lfid_container_id": ...}``; a value is an
        empty string when the cookie lacks the corresponding entry.

    Raises:
        DataFetchError: when the cookie is missing or empty.
    """
    response = await self.get(f"/u/{creator_id}", return_response=True)
    raw_params = response.cookies.get("M_WEIBOCN_PARAMS")
    if not raw_params:
        raise DataFetchError("get containerid failed")

    parsed = parse_qs(unquote(raw_params))
    fid = parsed.get("fid", [""])[0]
    lfid = parsed.get("lfid", [""])[0]
    return {"fid_container_id": fid, "lfid_container_id": lfid}
|
||||
|
||||
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
    """Fetch a creator's profile data and merge the creator's container IDs into it.

    Args:
        creator_id: ID of the creator.

    Returns:
        The ``/api/container/getIndex`` response, updated with
        ``fid_container_id`` and ``lfid_container_id``. When the response lists
        a tab whose ``tabKey`` is ``"weibo"``, that tab's ``containerid``
        replaces ``lfid_container_id``.

    Raises:
        DataFetchError: when either container ID is an empty string.
    """
    container_info = await self.get_creator_container_info(creator_id)
    if "" in (container_info.get("fid_container_id"), container_info.get("lfid_container_id")):
        utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
        raise DataFetchError("get containerid failed")

    query = {
        "jumpfrom": "weibocom",
        "type": "uid",
        "value": creator_id,
        "containerid": container_info["fid_container_id"],
    }
    user_res = await self.get("/api/container/getIndex", query)

    tabs_info = user_res.get("tabsInfo")
    if tabs_info:
        weibo_tab = next((tab for tab in tabs_info.get("tabs", []) if tab.get("tabKey") == "weibo"), None)
        if weibo_tab is not None:
            container_info["lfid_container_id"] = weibo_tab.get("containerid")

    user_res.update(container_info)
    return user_res
|
||||
|
||||
async def get_notes_by_creator(
    self,
    creator: str,
    container_id: str,
    since_id: str = "0",
) -> Dict:
    """Request one page of a creator's posts from ``/api/container/getIndex``.

    Args:
        creator: ID of the creator.
        container_id: container ID sent as ``containerid``.
        since_id: ID of the last post of the previous page.

    Returns:
        The value returned by ``self.get`` for this request.
    """
    query = dict(
        jumpfrom="weibocom",
        type="uid",
        value=creator,
        containerid=container_id,
        since_id=since_id,
    )
    return await self.get("/api/container/getIndex", query)
|
||||
|
||||
async def get_all_notes_by_creator_id(
    self,
    creator_id: str,
    container_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Page through all posts of a creator and return the post cards.

    Args:
        creator_id: ID of the creator.
        container_id: container ID of the creator's post list.
        crawl_interval: seconds to sleep after each page.
        callback: optional coroutine function called as ``callback(notes)``
            with the filtered cards of each page.

    Returns:
        All cards whose ``card_type`` is 9, across the fetched pages.
    """
    result = []
    notes_has_more = True
    since_id = ""
    crawler_total_count = 0
    while notes_has_more:
        notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
        if not notes_res:
            # The previous message blamed "xhs" and carried another method's tag.
            utils.logger.error("[WeiboClient.get_all_notes_by_creator_id] The current creator may have been banned by weibo, so they cannot access the data.")
            break

        since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
        if "cards" not in notes_res:
            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] No 'cards' key found in response: {notes_res}")
            break

        notes = notes_res["cards"]
        utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] got user_id:{creator_id} notes len : {len(notes)}")
        notes = [note for note in notes if note.get("card_type") == 9]
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)

        # Progress is counted in fixed steps of 10 per page and compared with the
        # ``total`` reported by the API to decide whether another page is requested.
        crawler_total_count += 10
        notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
    return result
|
||||
@@ -0,0 +1,373 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc : 微博爬虫主流程代码
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import weibo as weibo_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import WeiboClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
from .help import filter_search_result_card
|
||||
from .login import WeiboLogin
|
||||
|
||||
|
||||
class WeiboCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
wb_client: WeiboClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
    """Set the Weibo URLs and user agents; no browser is started here."""
    # Filled in by launch_browser_with_cdp when a CDP browser manager is created.
    self.cdp_manager = None

    self.index_url = "https://www.weibo.com"
    self.mobile_index_url = "https://m.weibo.cn"

    self.user_agent = utils.get_user_agent()
    self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
|
||||
async def start(self):
    """Run one crawl: start a browser, log in if needed, then run the configured mode.

    The crawl mode is chosen by ``config.CRAWLER_TYPE`` (``"search"``,
    ``"detail"`` or ``"creator"``); any other value runs nothing.
    """
    playwright_proxy_format = None
    httpx_proxy_format = None
    if config.ENABLE_IP_PROXY:
        ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
        playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

    async with async_playwright() as playwright:
        # Pick the browser start-up mode from the configuration.
        if not config.ENABLE_CDP_MODE:
            utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
            self.browser_context = await self.launch_browser(
                playwright.chromium,
                None,
                self.mobile_user_agent,
                headless=config.HEADLESS,
            )
        else:
            utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
            self.browser_context = await self.launch_browser_with_cdp(
                playwright,
                playwright_proxy_format,
                self.mobile_user_agent,
                headless=config.CDP_HEADLESS,
            )

        # stealth.min.js is a script that keeps the website from detecting the crawler.
        # NOTE(review): applied for both start-up modes here; confirm this was not
        # meant to run for the standard mode only.
        await self.browser_context.add_init_script(path="libs/stealth.min.js")
        self.context_page = await self.browser_context.new_page()
        await self.context_page.goto(self.mobile_index_url)

        # Create the API client and log in only when the session is not usable yet.
        self.wb_client = await self.create_weibo_client(httpx_proxy_format)
        logged_in = await self.wb_client.pong()
        if not logged_in:
            login_obj = WeiboLogin(
                login_type=config.LOGIN_TYPE,
                login_phone="",  # your phone number
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES,
            )
            await login_obj.begin()

            # After logging in, go back to the mobile site and refresh the client's cookies.
            utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
            await self.context_page.goto(self.mobile_index_url)
            await asyncio.sleep(2)
            await self.wb_client.update_cookies(browser_context=self.browser_context)

        crawler_type_var.set(config.CRAWLER_TYPE)
        handlers = {
            "search": self.search,  # posts matching keywords, plus their comments
            "detail": self.get_specified_notes,  # specified posts and their comments
            "creator": self.get_creators_and_notes,  # creators, their posts and comments
        }
        handler = handlers.get(config.CRAWLER_TYPE)
        if handler is not None:
            await handler()
        utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
|
||||
|
||||
async def search(self):
    """Search Weibo posts for every configured keyword and store the results.

    For each keyword in ``config.KEYWORDS`` (comma separated) the result pages
    are fetched from ``config.START_PAGE`` on; every post card is stored, its
    images are fetched, and the comments of each page's posts are crawled.
    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the page size when smaller.

    :return: None
    """
    utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
    weibo_limit_count = 10  # weibo limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
    start_page = config.START_PAGE

    # Map the configured search type onto the SearchType enum.
    search_types = {
        "default": SearchType.DEFAULT,
        "real_time": SearchType.REAL_TIME,
        "popular": SearchType.POPULAR,
        "video": SearchType.VIDEO,
    }
    search_type = search_types.get(config.WEIBO_SEARCH_TYPE)
    if search_type is None:
        utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
        return

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
        page = 1
        while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
                page += 1
                continue

            utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
            search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
            # A response without "cards" would hand None to the filter, which iterates it.
            cards = search_res.get("cards") or []
            note_id_list: List[str] = []
            for note_item in filter_search_result_card(cards):
                if not note_item:
                    continue
                mblog = note_item.get("mblog")
                if mblog:
                    note_id_list.append(mblog.get("id"))
                    await weibo_store.update_weibo_note(note_item)
                    await self.get_note_images(mblog)

            page += 1
            await self.batch_get_notes_comments(note_id_list)
|
||||
|
||||
async def get_specified_notes(self):
    """Fetch and store the posts listed in ``config.WEIBO_SPECIFIED_ID_LIST``, then crawl their comments.

    :return: None
    """
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending = []
    for note_id in config.WEIBO_SPECIFIED_ID_LIST:
        pending.append(self.get_note_info_task(note_id=note_id, semaphore=semaphore))

    note_details = await asyncio.gather(*pending)
    for note_item in note_details:
        # Failed lookups come back as None or an empty dict and are skipped.
        if not note_item:
            continue
        await weibo_store.update_weibo_note(note_item)
    await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
    """Fetch one post's detail while holding the concurrency semaphore.

    :param note_id: ID of the post to fetch.
    :param semaphore: semaphore held for the duration of the request.
    :return: the result of ``wb_client.get_note_info_by_id``, or ``None`` when
        a ``DataFetchError`` or ``KeyError`` is raised (the error is logged).
    """
    async with semaphore:
        try:
            return await self.wb_client.get_note_info_by_id(note_id)
        except DataFetchError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
        except KeyError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
        return None
|
||||
|
||||
async def batch_get_notes_comments(self, note_id_list: List[str]):
    """Crawl the comments of several posts concurrently.

    Does nothing except log when ``config.ENABLE_GET_COMMENTS`` is falsy.

    :param note_id_list: IDs of the posts whose comments are crawled.
    :return: None
    """
    if not config.ENABLE_GET_COMMENTS:
        utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
        return

    utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
    # One task per post; the semaphore is passed to each task to cap concurrency.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list: List[Task] = [
        asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
        for note_id in note_id_list
    ]
    await asyncio.gather(*task_list)
|
||||
|
||||
async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
    """Crawl and store all comments of one post; errors are logged, not raised.

    :param note_id: ID of the post.
    :param semaphore: semaphore held while the comments are fetched.
    :return: None
    """
    async with semaphore:
        try:
            utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
            # Weibo rate-limits its API heavily, hence the longer random delay.
            delay = random.randint(1, 3)
            await self.wb_client.get_note_all_comments(
                note_id=note_id,
                crawl_interval=delay,
                callback=weibo_store.batch_update_weibo_note_comments,
                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as ex:
            utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
        except Exception as e:
            utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
|
||||
|
||||
async def get_note_images(self, mblog: Dict):
    """Download and store the images attached to one post.

    Does nothing except log when ``config.ENABLE_GET_MEIDAS`` is falsy.

    :param mblog: post dict; its ``"pics"`` entry is iterated as a sequence of
        dicts carrying ``"url"`` and ``"pid"``.
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
        return

    pics: Optional[List[Dict]] = mblog.get("pics")
    if not pics:
        return
    for pic in pics:
        url = pic.get("url")
        if not url:
            continue
        content = await self.wb_client.get_note_image(url)
        await asyncio.sleep(random.random())
        # The client returns None on a failed download; compare by identity.
        if content is not None:
            extension_file_name = url.split(".")[-1]
            await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """Store each configured creator, then crawl the creator's posts and their comments.

    Iterates ``config.WEIBO_CREATOR_ID_LIST``.

    Raises:
        DataFetchError: when a creator response carries no ``userInfo``.
    """
    utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
    for user_id in config.WEIBO_CREATOR_ID_LIST:
        creator_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
        if not creator_res:
            utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
            continue

        creator_info: Dict = creator_res.get("userInfo", {})
        utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
        if not creator_info:
            raise DataFetchError("Get creator info error")
        await weibo_store.save_creator(user_id, user_info=creator_info)

        # Fetch every post of the creator; each page is stored through the callback.
        all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
            creator_id=user_id,
            container_id=creator_res.get("lfid_container_id"),
            crawl_interval=0,
            callback=weibo_store.batch_update_weibo_notes,
        )

        note_ids = []
        for note_item in all_notes_list:
            note_id = note_item.get("mblog", {}).get("id")
            if note_id:
                note_ids.append(note_id)
        await self.batch_get_notes_comments(note_ids)
|
||||
|
||||
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
    """Build a WeiboClient that carries the browser context's current cookies.

    :param httpx_proxy: proxy setting handed to the client, or ``None``.
    :return: the new client instance.
    """
    utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)
    request_headers = {
        "User-Agent": utils.get_mobile_user_agent(),
        "Cookie": cookie_str,
        "Origin": "https://m.weibo.cn",
        "Referer": "https://m.weibo.cn",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return WeiboClient(
        proxy=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch Chromium and return a browser context with a 1920x1080 viewport.

    When ``config.SAVE_LOGIN_STATE`` is truthy a persistent context is launched
    with a user-data directory under ``./browser_data``; otherwise a fresh
    browser and context are created.

    :param chromium: Playwright browser type used for launching.
    :param playwright_proxy: proxy settings passed to the launch call, or ``None``.
    :param user_agent: user agent for the context, or ``None``.
    :param headless: whether to launch without a visible window.
    :return: the created browser context.
    """
    utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)

    user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport={"width": 1920, "height": 1080},
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Start the browser through a CDPBrowserManager, falling back to ``launch_browser``.

    :param playwright: Playwright instance.
    :param playwright_proxy: proxy settings, or ``None``.
    :param user_agent: user agent, or ``None``.
    :param headless: whether to run without a visible window.
    :return: the browser context from the CDP manager, or the one returned by
        ``launch_browser`` when any exception is raised on the CDP path.
    """
    try:
        manager = CDPBrowserManager()
        self.cdp_manager = manager
        browser_context = await manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information reported by the manager.
        browser_info = await manager.get_browser_info()
        utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
        return browser_context
    except Exception as e:
        utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
        # Fall back to the standard launch mode.
        return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
    """Release the browser: clean up the CDP manager if one exists, else close the context."""
    if not self.cdp_manager:
        await self.browser_context.close()
    else:
        # A CDP manager was created, so shutdown goes through its cleanup.
        await self.cdp_manager.cleanup()
        self.cdp_manager = None
    utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc :
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
@@ -0,0 +1,30 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc :
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchType(Enum):
    """Weibo search categories; each member's value is a numeric string code."""

    DEFAULT = "1"  # comprehensive
    REAL_TIME = "61"  # real-time
    POPULAR = "60"  # popular
    VIDEO = "64"  # video
|
||||
@@ -0,0 +1,36 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/24 17:37
|
||||
# @Desc :
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
    """Keep only the cards whose ``card_type`` is 9 from a Weibo search result.

    Both the top-level cards and the cards nested one level down under a
    card's ``"card_group"`` are inspected; matches keep their original order.

    :param card_list: cards of a search response; ``None`` is treated as empty.
    :return: list of the matching card dicts.
    """
    note_list: List[Dict] = []
    for card_item in card_list or []:
        if card_item.get("card_type") == 9:
            note_list.append(card_item)
        # "card_group" may be missing or explicitly null; both mean no nested cards.
        for card_group_item in card_item.get("card_group") or []:
            if card_group_item.get("card_type") == 9:
                note_list.append(card_group_item)
    return note_list
|
||||
@@ -0,0 +1,123 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:42
|
||||
# @Desc : 微博登录实现
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class WeiboLogin(AbstractLogin):
|
||||
def __init__(self,
             login_type: str,
             browser_context: BrowserContext,
             context_page: Page,
             login_phone: Optional[str] = "",
             cookie_str: str = ""
             ):
    """Store the login inputs and record the login type in ``config.LOGIN_TYPE``.

    :param login_type: login method; written to the global ``config.LOGIN_TYPE``.
    :param browser_context: browser context whose cookies are read and written.
    :param context_page: page used to open the login URL.
    :param login_phone: phone number kept on the instance.
    :param cookie_str: cookie string used by the cookie login.
    """
    # The login type lives on the shared config module, not on the instance.
    config.LOGIN_TYPE = login_type

    self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
    self.browser_context = browser_context
    self.context_page = context_page
    self.login_phone = login_phone
    self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
    """Run the login method selected by ``config.LOGIN_TYPE``.

    :raises ValueError: when the login type is not ``qrcode``, ``phone`` or ``cookie``.
    """
    utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
    login_methods = {
        "qrcode": self.login_by_qrcode,
        "phone": self.login_by_mobile,
        "cookie": self.login_by_cookies,
    }
    login_method = login_methods.get(config.LOGIN_TYPE)
    if login_method is None:
        raise ValueError(
            "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
    await login_method()
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self, no_logged_in_session: str) -> bool:
    """Report whether the browser context's cookies indicate a logged-in session.

    The session counts as logged in when the ``SSOLoginState`` cookie is set or
    when ``WBPSESS`` differs from the value captured before login. The retry
    decorator calls this again while it returns ``False``: up to 600 attempts,
    one second apart.

    :param no_logged_in_session: ``WBPSESS`` value captured before logging in.
    :return: ``True`` when logged in, ``False`` otherwise.
    """
    cookies = await self.browser_context.cookies()
    _, cookie_dict = utils.convert_cookies(cookies)
    if cookie_dict.get("SSOLoginState"):
        return True
    return cookie_dict.get("WBPSESS") != no_logged_in_session
|
||||
|
||||
async def login_by_qrcode(self):
    """Log in by showing the SSO page's QR code and waiting for it to be scanned.

    Exits the process (``sys.exit``) when the QR code cannot be found or the
    login is not detected before ``check_login_state`` runs out of retries.
    """
    utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
    await self.context_page.goto(self.weibo_sso_login_url)

    # Locate the QR code image on the login page.
    qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
    base64_qrcode_img = await utils.find_login_qrcode(
        self.context_page,
        selector=qrcode_img_selector
    )
    if not base64_qrcode_img:
        utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
        sys.exit()

    # Show the QR code in the default executor so the event loop is not blocked.
    partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
    asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

    # check_login_state retries up to 600 times, one second apart; the old
    # message promised only 20s.
    utils.logger.info("[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 600s")

    # Remember the session cookie value from before login so a change can be detected.
    current_cookie = await self.browser_context.cookies()
    _, cookie_dict = utils.convert_cookies(current_cookie)
    no_logged_in_session = cookie_dict.get("WBPSESS")

    try:
        await self.check_login_state(no_logged_in_session)
    except RetryError:
        utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
        sys.exit()

    wait_redirect_seconds = 5
    utils.logger.info(
        f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
    await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_mobile(self):
    """Phone login is not implemented; this method does nothing."""
|
||||
|
||||
async def login_by_cookies(self):
    """Log in by adding the configured cookies to the browser context.

    Every cookie parsed from ``self.cookie_str`` is added for domain
    ``.weibo.cn`` and path ``/``.
    """
    # The old log line carried the QR-code method's tag.
    utils.logger.info("[WeiboLogin.login_by_cookies] Begin login weibo by cookie ...")
    cookies = [
        {"name": key, "value": value, "domain": ".weibo.cn", "path": "/"}
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
    ]
    # One call for the whole batch instead of one browser round trip per cookie.
    if cookies:
        await self.browser_context.add_cookies(cookies)
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .core import XiaoHongShuCrawler
|
||||
from .field import *
|
||||
@@ -0,0 +1,592 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from html import unescape
|
||||
|
||||
from .exception import DataFetchError, IPBlockError
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import get_search_id, sign
|
||||
|
||||
|
||||
class XiaoHongShuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
    self,
    timeout=60,  # with media crawling enabled, long xhs videos need a longer timeout
    proxy=None,
    *,
    headers: Dict[str, str],
    playwright_page: Page,
    cookie_dict: Dict[str, str],
):
    """Store the request settings, hosts and the error constants of the client.

    :param timeout: timeout passed to every HTTP request.
    :param proxy: proxy passed to ``httpx.AsyncClient``.
    :param headers: base request headers; updated in place when requests are signed.
    :param playwright_page: page used to evaluate the signing script.
    :param cookie_dict: cookies; the ``a1`` entry is read when signing.
    """
    # Request settings.
    self.timeout = timeout
    self.proxy = proxy
    self.headers = headers
    self.playwright_page = playwright_page
    self.cookie_dict = cookie_dict

    # Hosts.
    self._host = "https://edith.xiaohongshu.com"
    self._domain = "https://www.xiaohongshu.com"

    # Error code / message pairs.
    self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
    self.IP_ERROR_CODE = 300012
    self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
    self.NOTE_ABNORMAL_CODE = -510001
|
||||
|
||||
async def _pre_headers(self, url: str, data=None) -> Dict:
    """Sign a request and merge the signature headers into ``self.headers``.

    Args:
        url: request route (including the query string, if any) to sign.
        data: request body to sign, or ``None``.

    Returns:
        ``self.headers`` itself, updated in place with ``X-S``, ``X-T``,
        ``x-S-Common`` and ``X-B3-Traceid``.
    """
    page = self.playwright_page
    # The page's own ``window._webmsxyw`` function produces the raw signature values.
    encrypt_params = await page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
    local_storage = await page.evaluate("() => window.localStorage")

    signs = sign(
        a1=self.cookie_dict.get("a1", ""),
        b1=local_storage.get("b1", ""),
        x_s=encrypt_params.get("X-s", ""),
        x_t=str(encrypt_params.get("X-t", "")),
    )
    self.headers.update({
        "X-S": signs["x-s"],
        "X-T": signs["x-t"],
        "x-S-Common": signs["x-s-common"],
        "X-B3-Traceid": signs["x-b3-traceid"],
    })
    return self.headers
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, **kwargs) -> Union[str, Any]:
    """Send an HTTP request through httpx and unwrap the API response.

    The retry decorator re-runs the call on any exception: up to 3 attempts,
    one second apart.

    Args:
        method: HTTP method.
        url: full request URL.
        **kwargs: extra arguments for ``httpx.AsyncClient.request`` (headers,
            body, ...). ``return_response=True`` is consumed here and makes
            the method return the raw response text.

    Returns:
        The response text when ``return_response`` is set; otherwise the
        ``data`` field of a successful JSON response (or its ``success`` value
        when ``data`` is absent).

    Raises:
        Exception: when the status code is 471 or 461 (captcha required).
        IPBlockError: when the response code equals ``self.IP_ERROR_CODE``.
        DataFetchError: for any other unsuccessful response.
    """
    want_text = kwargs.pop("return_response", False)
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)

    if response.status_code in (471, 461):
        # someday someone maybe will bypass captcha
        verify_type = response.headers["Verifytype"]
        verify_uuid = response.headers["Verifyuuid"]
        msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
        utils.logger.error(msg)
        raise Exception(msg)

    if want_text:
        return response.text

    payload: Dict = response.json()
    if payload["success"]:
        return payload.get("data", payload.get("success", {}))
    if payload["code"] == self.IP_ERROR_CODE:
        raise IPBlockError(self.IP_ERROR_STR)
    raise DataFetchError(payload.get("msg", None))
|
||||
|
||||
async def get(self, uri: str, params=None) -> Dict:
    """Issue a signed GET request against the API host.

    Args:
        uri: Request route (path only).
        params: Query parameters; only a ``dict`` is encoded into the URL.

    Returns:
        Whatever ``self.request`` returns for the call.
    """
    # The query string is part of what gets signed, so build it first.
    signed_uri = f"{uri}?{urlencode(params)}" if isinstance(params, dict) else uri
    signed_headers = await self._pre_headers(signed_uri)
    return await self.request(method="GET", url=f"{self._host}{signed_uri}", headers=signed_headers)
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
    """Issue a signed POST request against the API host.

    Args:
        uri: Request route (path only).
        data: Request body; serialized as compact JSON without ASCII escaping.
        **kwargs: Extra arguments forwarded to ``self.request``.

    Returns:
        Whatever ``self.request`` returns for the call.
    """
    signed_headers = await self._pre_headers(uri, data)
    payload = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    request_args = dict(
        method="POST",
        url=f"{self._host}{uri}",
        data=payload,
        headers=signed_headers,
    )
    return await self.request(**request_args, **kwargs)
|
||||
|
||||
async def get_note_media(self, url: str) -> Union[bytes, None]:
    """Download a media file (image/video) belonging to a note.

    Args:
        url: Absolute URL of the media resource.

    Returns:
        The raw response body, or ``None`` when the request fails, the
        status is not 2xx, or the reason phrase is not ``"OK"``.
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Connection errors and non-2xx statuses end up here. Log the URL we were
            # given rather than exc.request.url: the .request property itself raises
            # when httpx has not attached a request to the exception.
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] {exc.__class__.__name__} for {url} - {exc}")
            return None

        if response.reason_phrase != "OK":
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
            return None
        return response.content
|
||||
|
||||
async def pong(self) -> bool:
    """Check whether the current login state is still valid.

    Runs a keyword search and treats a non-empty ``items`` list as proof
    that the session works.

    Returns:
        ``True`` when the search returned items, ``False`` otherwise
        (including when the search raised).
    """
    utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
    try:
        search_result: Dict = await self.get_note_by_keyword(keyword="小红书")
        alive = bool(search_result.get("items"))
    except Exception as e:
        utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
        alive = False
    return alive
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """Refresh the client's cookies from the browser context.

    Normally called after a successful login.

    Args:
        browser_context: Browser context whose cookies are read.
    """
    browser_cookies = await browser_context.cookies()
    cookie_header, cookie_map = utils.convert_cookies(browser_cookies)
    self.headers["Cookie"] = cookie_header
    self.cookie_dict = cookie_map
|
||||
|
||||
async def get_note_by_keyword(
    self,
    keyword: str,
    search_id: Optional[str] = None,
    page: int = 1,
    page_size: int = 20,
    sort: SearchSortType = SearchSortType.GENERAL,
    note_type: SearchNoteType = SearchNoteType.ALL,
) -> Dict:
    """Search notes by keyword.

    Args:
        keyword: Search keyword.
        search_id: Search session id. When omitted a fresh id is generated
            for this call (a ``get_search_id()`` default in the signature
            would be evaluated once and then shared by every call).
        page: 1-based page number.
        page_size: Number of results per page.
        sort: Result ordering.
        note_type: Kind of notes to search for.

    Returns:
        The search result payload returned by ``self.post``.
    """
    if search_id is None:
        search_id = get_search_id()

    uri = "/api/sns/web/v1/search/notes"
    data = {
        "keyword": keyword,
        "page": page,
        "page_size": page_size,
        "search_id": search_id,
        "sort": sort.value,
        "note_type": note_type.value,
    }
    return await self.post(uri, data)
|
||||
|
||||
async def get_note_by_id(
    self,
    note_id: str,
    xsec_source: str,
    xsec_token: str,
) -> Dict:
    """Fetch the detail of a single note through the feed API.

    Args:
        note_id: Note id.
        xsec_source: Channel source; an empty string is replaced by ``"pc_search"``.
        xsec_token: Token returned alongside the note in search results.

    Returns:
        The ``note_card`` of the first returned item, or an empty dict when
        the response carries no items.
    """
    payload = {
        "source_note_id": note_id,
        "image_formats": ["jpg", "webp", "avif"],
        "extra": {"need_body_topic": 1},
        "xsec_source": xsec_source if xsec_source != "" else "pc_search",
        "xsec_token": xsec_token,
    }
    res = await self.post("/api/sns/web/v1/feed", payload)
    if not (res and res.get("items")):
        # When crawling too frequently some notes come back empty while others do not.
        utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
        return dict()
    note_card: Dict = res["items"][0]["note_card"]
    return note_card
|
||||
|
||||
async def get_note_comments(
    self,
    note_id: str,
    xsec_token: str,
    cursor: str = "",
) -> Dict:
    """Fetch one page of top-level comments for a note.

    Args:
        note_id: Note id.
        xsec_token: Verification token.
        cursor: Pagination cursor (empty for the first page).

    Returns:
        The comment page payload returned by ``self.get``.
    """
    query = dict(
        note_id=note_id,
        cursor=cursor,
        top_comment_id="",
        image_formats="jpg,webp,avif",
        xsec_token=xsec_token,
    )
    return await self.get("/api/sns/web/v2/comment/page", query)
|
||||
|
||||
async def get_note_sub_comments(
    self,
    note_id: str,
    root_comment_id: str,
    xsec_token: str,
    num: int = 10,
    cursor: str = "",
):
    """Fetch one page of replies under a top-level comment.

    Args:
        note_id: Id of the note the comment belongs to.
        root_comment_id: Id of the top-level comment.
        xsec_token: Verification token.
        num: Page size.
        cursor: Pagination cursor (empty for the first page).

    Returns:
        The reply page payload returned by ``self.get``.
    """
    query = dict(
        note_id=note_id,
        root_comment_id=root_comment_id,
        num=num,
        cursor=cursor,
        image_formats="jpg,webp,avif",
        top_comment_id="",
        xsec_token=xsec_token,
    )
    return await self.get("/api/sns/web/v2/comment/sub/page", query)
|
||||
|
||||
async def get_note_all_comments(
    self,
    note_id: str,
    xsec_token: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 10,
) -> List[Dict]:
    """Page through a note's top-level comments (and their replies).

    Paging stops when the API reports no more pages, a page lacks the
    ``comments`` key, or the collected total reaches ``max_count``.

    Args:
        note_id: Note id.
        xsec_token: Verification token.
        crawl_interval: Seconds to sleep after each page.
        callback: Optional coroutine called as ``callback(note_id, comments)``
            for every page of comments.
        max_count: Upper bound checked before each page is requested.

    Returns:
        Collected top-level comments followed, page by page, by the replies
        returned from ``get_comments_all_sub_comments``.
    """
    collected = []
    has_more, cursor = True, ""
    while has_more and len(collected) < max_count:
        page = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=cursor)
        has_more = page.get("has_more", False)
        cursor = page.get("cursor", "")
        if "comments" not in page:
            utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {page}")
            break

        batch = page["comments"]
        budget = max_count - len(collected)
        if len(batch) > budget:
            # Trim the page so top-level comments never push past max_count.
            batch = batch[:budget]

        if callback:
            await callback(note_id, batch)
        await asyncio.sleep(crawl_interval)
        collected.extend(batch)

        replies = await self.get_comments_all_sub_comments(
            comments=batch,
            xsec_token=xsec_token,
            crawl_interval=crawl_interval,
            callback=callback,
        )
        collected.extend(replies)
    return collected
|
||||
|
||||
async def get_comments_all_sub_comments(
    self,
    comments: List[Dict],
    xsec_token: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Fetch every remaining reply under the given top-level comments.

    Does nothing (returns ``[]``) unless ``config.ENABLE_GET_SUB_COMMENTS``
    is set.

    Args:
        comments: Top-level comments as returned by the comment page API.
        xsec_token: Verification token.
        crawl_interval: Seconds to sleep after each page of replies.
        callback: Optional coroutine called as ``callback(note_id, replies)``,
            once for replies already embedded in a comment and once per
            fetched page.

    Returns:
        The replies fetched page by page. Replies already embedded in a
        comment are only passed to ``callback``, not included here.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info("[XiaoHongShuClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    result = []
    for comment in comments:
        note_id = comment.get("note_id")
        embedded_replies = comment.get("sub_comments")
        if embedded_replies and callback:
            await callback(note_id, embedded_replies)

        sub_comment_has_more = comment.get("sub_comment_has_more")
        if not sub_comment_has_more:
            continue

        root_comment_id = comment.get("id")
        sub_comment_cursor = comment.get("sub_comment_cursor")

        while sub_comment_has_more:
            comments_res = await self.get_note_sub_comments(
                note_id=note_id,
                root_comment_id=root_comment_id,
                xsec_token=xsec_token,
                num=10,
                cursor=sub_comment_cursor,
            )

            if comments_res is None:
                utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
                # Give up on this comment: retrying with the same cursor and no
                # delay would spin forever while the API keeps answering empty.
                break

            sub_comment_has_more = comments_res.get("has_more", False)
            sub_comment_cursor = comments_res.get("cursor", "")
            if "comments" not in comments_res:
                utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
                break

            # Separate name so the `comments` parameter being iterated is not shadowed.
            reply_page = comments_res["comments"]
            if callback:
                await callback(note_id, reply_page)
            await asyncio.sleep(crawl_interval)
            result.extend(reply_page)
    return result
|
||||
|
||||
async def get_creator_info(self, user_id: str) -> Dict:
    """Read a creator's profile summary from the web profile page.

    The PC profile page embeds its state in ``window.__INITIAL_STATE__``;
    that JSON is extracted and its ``user.userPageData`` member returned.
    eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217

    Args:
        user_id: Creator id.

    Returns:
        The ``userPageData`` dict, or an empty dict when the page has no
        embedded state or the state lacks the expected members.
    """
    uri = f"/user/profile/{user_id}"
    html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
    match = re.search(r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html_content, re.M)
    if match is None:
        return {}

    # The embedded state is a JS object literal; `undefined` is not valid JSON.
    info = json.loads(match.group(1).replace(":undefined", ":null"), strict=False)
    if not isinstance(info, dict):
        return {}

    # Guard each level so a missing "user"/"userPageData" yields {} rather than
    # an AttributeError or a None that contradicts the declared return type.
    user = info.get("user")
    if not isinstance(user, dict):
        return {}
    return user.get("userPageData") or {}
|
||||
|
||||
async def get_notes_by_creator(
    self,
    creator: str,
    cursor: str,
    page_size: int = 30,
) -> Dict:
    """Fetch one page of notes posted by a creator.

    Args:
        creator: Creator id.
        cursor: Id of the last note of the previous page.
        page_size: Number of notes per page.

    Returns:
        The notes page payload returned by ``self.get``.
    """
    query = dict(
        user_id=creator,
        cursor=cursor,
        num=page_size,
        image_formats="jpg,webp,avif",
    )
    return await self.get("/api/sns/web/v1/user_posted", query)
|
||||
|
||||
async def get_all_notes_by_creator(
    self,
    user_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Page through all notes posted by a creator.

    Paging stops when the API reports no more pages, a page is empty or
    lacks the ``notes`` key, or ``config.CRAWLER_MAX_NOTES_COUNT`` notes
    have been collected.

    Args:
        user_id: Creator id.
        crawl_interval: Seconds to sleep after each page.
        callback: Optional coroutine called with each accepted page of notes.

    Returns:
        The collected notes, at most ``config.CRAWLER_MAX_NOTES_COUNT``.
    """
    collected = []
    has_more, cursor = True, ""
    while has_more and len(collected) < config.CRAWLER_MAX_NOTES_COUNT:
        page = await self.get_notes_by_creator(user_id, cursor)
        if not page:
            utils.logger.error("[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
            break

        has_more = page.get("has_more", False)
        cursor = page.get("cursor", "")
        if "notes" not in page:
            utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {page}")
            break

        notes = page["notes"]
        utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")

        quota = config.CRAWLER_MAX_NOTES_COUNT - len(collected)
        if quota <= 0:
            break

        # Only keep as many notes as the configured limit still allows.
        accepted = notes[:quota]
        if callback:
            await callback(accepted)

        collected.extend(accepted)
        await asyncio.sleep(crawl_interval)

    utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(collected)}")
    return collected
|
||||
|
||||
async def get_note_short_url(self, note_id: str) -> Dict:
    """Request a short link for a note.

    Args:
        note_id: Note id.

    Returns:
        Whatever ``self.post`` returns; the call passes
        ``return_response=True``, i.e. it asks for the raw response text.
    """
    original_url = f"{self._domain}/discovery/item/{note_id}"
    return await self.post(
        "/api/sns/web/short_url",
        data={"original_url": original_url},
        return_response=True,
    )
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def get_note_by_id_from_html(
    self,
    note_id: str,
    xsec_source: str,
    xsec_token: str,
    enable_cookie: bool = False,
) -> Optional[Dict]:
    """Get a note's detail by parsing the web note page HTML.

    The page request may fail, so the whole call is attempted up to three
    times, one second apart.
    copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
    thanks for ReaJason

    Args:
        note_id: Note id.
        xsec_source: Channel source, sent as a query parameter.
        xsec_token: Verification token, sent as a query parameter.
        enable_cookie: Send the client's ``Cookie`` header when true.

    Returns:
        The note dict with keys converted to snake_case, ``{}`` when the
        embedded state is empty, or ``None`` when the page cannot be parsed.
    """

    def camel_to_underscore(key):
        return re.sub(r"(?<!^)(?=[A-Z])", "_", key).lower()

    def transform_keys(obj):
        # Work on parsed objects directly instead of re-serializing each nested
        # dict to JSON and parsing it again at every level.
        converted = {}
        for key, value in obj.items():
            new_key = camel_to_underscore(key)
            if value and isinstance(value, dict):
                converted[new_key] = transform_keys(value)
            elif value and isinstance(value, list):
                converted[new_key] = [(transform_keys(item) if (item and isinstance(item, dict)) else item) for item in value]
            else:
                converted[new_key] = value
        return converted

    url = ("https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}")
    copy_headers = self.headers.copy()
    if not enable_cookie:
        # pop() tolerates headers that carry no Cookie entry at all.
        copy_headers.pop("Cookie", None)

    html = await self.request(method="GET", url=url, return_response=True, headers=copy_headers)

    try:
        state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
        if state == "{}":
            return {}
        note_dict = transform_keys(json.loads(state))
        return note_dict["note"]["note_detail_map"][note_id]["note"]
    except Exception as ex:
        # Parsing is best-effort (missing state, unexpected layout, bad JSON);
        # report it instead of swallowing silently, and never trap BaseException.
        utils.logger.warning(f"[XiaoHongShuClient.get_note_by_id_from_html] parse note {note_id} from html failed: {ex!r}")
        return None
|
||||
@@ -0,0 +1,485 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
from tenacity import RetryError
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import xhs as xhs_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import XiaoHongShuClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchSortType
|
||||
from .help import parse_note_info_from_note_url, get_search_id
|
||||
from .login import XiaoHongShuLogin
|
||||
|
||||
|
||||
class XiaoHongShuCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
xhs_client: XiaoHongShuClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
    """Set the landing URL, the browser user agent and an empty CDP manager."""
    # self.user_agent = utils.get_user_agent()
    self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
    self.index_url = "https://www.xiaohongshu.com"
    # Only populated when the browser is launched in CDP mode.
    self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
    """Launch the browser, make sure the session is logged in, and run the crawl.

    The crawl mode is chosen by ``config.CRAWLER_TYPE`` (``search``,
    ``detail`` or ``creator``); any other value runs nothing.
    """
    playwright_proxy_format, httpx_proxy_format = None, None
    if config.ENABLE_IP_PROXY:
        ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
        playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)

    async with async_playwright() as playwright:
        # Pick the launch mode from configuration.
        if config.ENABLE_CDP_MODE:
            utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器")
            self.browser_context = await self.launch_browser_with_cdp(
                playwright,
                playwright_proxy_format,
                self.user_agent,
                headless=config.CDP_HEADLESS,
            )
        else:
            utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
            # Launch a browser context.
            self.browser_context = await self.launch_browser(
                playwright.chromium,
                playwright_proxy_format,
                self.user_agent,
                headless=config.HEADLESS,
            )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            # NOTE(review): the original's indentation is lost; this call is assumed to
            # belong to the standard-mode branch only - confirm against the real file.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
        self.context_page = await self.browser_context.new_page()
        await self.context_page.goto(self.index_url)

        # Create a client to interact with the xiaohongshu website.
        self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
        logged_in = await self.xhs_client.pong()
        if not logged_in:
            login_obj = XiaoHongShuLogin(
                login_type=config.LOGIN_TYPE,
                login_phone="",  # input your phone number
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES,
            )
            await login_obj.begin()
            await self.xhs_client.update_cookies(browser_context=self.browser_context)

        crawler_type_var.set(config.CRAWLER_TYPE)
        crawl_modes = {
            # Search for notes and retrieve their comment information.
            "search": self.search,
            # Get the information and comments of the specified post
            "detail": self.get_specified_notes,
            # Get creator's information and their notes and comments
            "creator": self.get_creators_and_notes,
        }
        run_mode = crawl_modes.get(config.CRAWLER_TYPE)
        if run_mode is not None:
            await run_mode()

        utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
    """Search notes for every configured keyword, then store them and their comments.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to one page (20) when it is
    smaller. For each keyword, pages before ``config.START_PAGE`` are
    skipped; paging stops when the page budget is used up, the API reports
    no more results, or a ``DataFetchError`` is raised.
    """
    utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
    xhs_limit_count = 20  # xhs limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
    start_page = config.START_PAGE

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
        page = 1
        search_id = get_search_id()
        while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
                page += 1
                continue

            try:
                utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
                note_ids: List[str] = []
                xsec_tokens: List[str] = []
                notes_res = await self.xhs_client.get_note_by_keyword(
                    keyword=keyword,
                    search_id=search_id,
                    page=page,
                    sort=(SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != "" else SearchSortType.GENERAL),
                )
                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
                if not notes_res or not notes_res.get("has_more", False):
                    utils.logger.info("No more content!")
                    break

                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                detail_tasks = []
                for post_item in notes_res.get("items", {}):
                    # Recommendation / hot-query entries are not notes.
                    if post_item.get("model_type") in ("rec_query", "hot_query"):
                        continue
                    detail_tasks.append(
                        self.get_note_detail_async_task(
                            note_id=post_item.get("id"),
                            xsec_source=post_item.get("xsec_source"),
                            xsec_token=post_item.get("xsec_token"),
                            semaphore=semaphore,
                        )
                    )
                note_details = await asyncio.gather(*detail_tasks)

                for note_detail in note_details:
                    if not note_detail:
                        continue
                    await xhs_store.update_xhs_note(note_detail)
                    await self.get_notice_media(note_detail)
                    note_ids.append(note_detail.get("note_id"))
                    xsec_tokens.append(note_detail.get("xsec_token"))

                page += 1
                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
                await self.batch_get_note_comments(note_ids, xsec_tokens)
            except DataFetchError:
                utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
                break
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """Crawl every configured creator: profile, notes, and the notes' comments."""
    utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
    for user_id in config.XHS_CREATOR_ID_LIST:
        # get creator detail info from web html content
        creator_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
        if creator_info:
            await xhs_store.save_creator(user_id, creator=creator_info)

        # When proxy is not enabled, increase the crawling interval
        crawl_interval = random.random() if config.ENABLE_IP_PROXY else random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)

        # Get all note information of the creator
        all_notes_list = await self.xhs_client.get_all_notes_by_creator(
            user_id=user_id,
            crawl_interval=crawl_interval,
            callback=self.fetch_creator_notes_detail,
        )

        note_ids = [note_item.get("note_id") for note_item in all_notes_list]
        xsec_tokens = [note_item.get("xsec_token") for note_item in all_notes_list]
        await self.batch_get_note_comments(note_ids, xsec_tokens)
|
||||
|
||||
async def fetch_creator_notes_detail(self, note_list: List[Dict]):
    """Concurrently fetch the detail of each listed note and store the results.

    Args:
        note_list: Note entries carrying ``note_id``, ``xsec_source`` and
            ``xsec_token``.
    """
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    detail_tasks = []
    for post_item in note_list:
        detail_tasks.append(
            self.get_note_detail_async_task(
                note_id=post_item.get("note_id"),
                xsec_source=post_item.get("xsec_source"),
                xsec_token=post_item.get("xsec_token"),
                semaphore=semaphore,
            )
        )

    for note_detail in await asyncio.gather(*detail_tasks):
        if not note_detail:
            continue
        await xhs_store.update_xhs_note(note_detail)
        await self.get_notice_media(note_detail)
|
||||
|
||||
async def get_specified_notes(self):
    """Crawl the notes listed in ``config.XHS_SPECIFIED_NOTE_URL_LIST``.

    Each URL must carry note_id, xsec_source and xsec_token. The notes'
    details are fetched concurrently, stored, and then their comments are
    fetched.
    """
    # One semaphore shared by all tasks: creating a new one per task would
    # leave concurrency unbounded, since no two tasks would contend for it.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    get_note_detail_task_list = []
    for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
        note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
        utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
        crawler_task = self.get_note_detail_async_task(
            note_id=note_url_info.note_id,
            xsec_source=note_url_info.xsec_source,
            xsec_token=note_url_info.xsec_token,
            semaphore=semaphore,
        )
        get_note_detail_task_list.append(crawler_task)

    need_get_comment_note_ids = []
    xsec_tokens = []
    note_details = await asyncio.gather(*get_note_detail_task_list)
    for note_detail in note_details:
        if note_detail:
            need_get_comment_note_ids.append(note_detail.get("note_id", ""))
            xsec_tokens.append(note_detail.get("xsec_token", ""))
            await xhs_store.update_xhs_note(note_detail)
            await self.get_notice_media(note_detail)
    await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
|
||||
|
||||
async def get_note_detail_async_task(
    self,
    note_id: str,
    xsec_source: str,
    xsec_token: str,
    semaphore: asyncio.Semaphore,
) -> Optional[Dict]:
    """Fetch one note's detail, trying the API first and the HTML page second.

    Args:
        note_id: Note id.
        xsec_source: Channel source of the note.
        xsec_token: Verification token of the note.
        semaphore: Limits how many detail fetches run at once.

    Returns:
        The note detail with ``xsec_token`` / ``xsec_source`` added, or
        ``None`` when a ``DataFetchError`` or ``KeyError`` occurs.

    Raises:
        Exception: Neither the API nor the HTML page produced a detail.
    """
    async with semaphore:
        try:
            utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")

            detail = None
            try:
                detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
            except RetryError:
                # The API path exhausted its retries; fall through to the HTML path.
                pass

            if not detail:
                detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
            if not detail:
                raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

            detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
            return detail

        except DataFetchError as ex:
            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
            return None
        except KeyError as ex:
            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}")
            return None
|
||||
|
||||
async def batch_get_note_comments(self, note_list: List[str], xsec_tokens: List[str]):
    """Fetch the comments of several notes concurrently.

    Does nothing unless ``config.ENABLE_GET_COMMENTS`` is set.

    Args:
        note_list: Note ids.
        xsec_tokens: Verification tokens, index-aligned with ``note_list``.
    """
    if not config.ENABLE_GET_COMMENTS:
        utils.logger.info("[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
        return

    utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    comment_tasks: List[Task] = [
        asyncio.create_task(
            self.get_comments(note_id=note_id, xsec_token=xsec_tokens[position], semaphore=semaphore),
            name=note_id,
        )
        for position, note_id in enumerate(note_list)
    ]
    await asyncio.gather(*comment_tasks)
|
||||
|
||||
async def get_comments(self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore):
    """Fetch and store the comments of one note, bounded by the semaphore.

    Args:
        note_id: Note id.
        xsec_token: Verification token of the note.
        semaphore: Limits how many notes are processed at once.
    """
    async with semaphore:
        utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
        # When proxy is not enabled, increase the crawling interval
        crawl_interval = random.random() if config.ENABLE_IP_PROXY else random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
        await self.xhs_client.get_note_all_comments(
            note_id=note_id,
            xsec_token=xsec_token,
            crawl_interval=crawl_interval,
            callback=xhs_store.batch_update_xhs_note_comments,
            max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        )
|
||||
|
||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
    """Build the API client from the browser context's current cookies.

    Args:
        httpx_proxy: Proxy for the client's HTTP requests, or ``None``.

    Returns:
        A ``XiaoHongShuClient`` bound to ``self.context_page``.
    """
    utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)

    request_headers = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "content-type": "application/json;charset=UTF-8",
        "origin": "https://www.xiaohongshu.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://www.xiaohongshu.com/",
        "sec-ch-ua": '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "Cookie": cookie_str,
    }
    return XiaoHongShuClient(
        proxy=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch Chromium and return a browser context.

    With ``config.SAVE_LOGIN_STATE`` a persistent context under
    ``browser_data/`` is used so the login survives between runs;
    otherwise a fresh browser and context are created.

    Args:
        chromium: Playwright Chromium browser type.
        playwright_proxy: Proxy settings for the browser, or ``None``.
        user_agent: User agent string for the context.
        headless: Whether to run without a visible window.

    Returns:
        The created browser context.
    """
    utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
    viewport = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport=viewport, user_agent=user_agent)

    # feat issue #14
    # we will save login state to avoid login every time
    user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=viewport,
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch the browser in CDP mode, falling back to the standard launch on failure.

    A new ``CDPBrowserManager`` is stored on ``self.cdp_manager`` and asked to
    launch and connect. If anything in that path raises, the error is logged
    and ``launch_browser`` is used with ``playwright.chromium`` instead.

    :return: the browser context from the CDP manager or from the fallback.
    """
    try:
        manager = CDPBrowserManager()
        self.cdp_manager = manager
        cdp_context = await manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information reported by the CDP manager
        browser_info = await manager.get_browser_info()
        utils.logger.info(f"[XiaoHongShuCrawler] CDP浏览器信息: {browser_info}")
        return cdp_context
    except Exception as e:
        utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败,回退到标准模式: {e}")
        # Fall back to the standard (non-CDP) launch
        return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
    """Release the browser: clean up the CDP manager if one is set, else close the context."""
    if not self.cdp_manager:
        await self.browser_context.close()
    else:
        # CDP mode is torn down through its manager, which is then dropped
        await self.cdp_manager.cleanup()
        self.cdp_manager = None
    utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
|
||||
async def get_notice_media(self, note_detail: Dict):
    """Download the images and then the videos of a note when media crawling is enabled.

    :param note_detail: note dictionary forwarded to the image and video helpers.
    """
    if config.ENABLE_GET_MEIDAS:
        await self.get_note_images(note_detail)
        await self.get_notice_video(note_detail)
        return
    utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
|
||||
|
||||
async def get_note_images(self, note_item: Dict):
    """
    Download and store every image of a note. Please call get_notice_media instead.

    Each image's ``url_default`` (when it is a non-empty value) replaces its
    ``url``; images without a usable URL or whose download returns None are
    skipped. Stored files are named ``0.jpg``, ``1.jpg``, ... counting only
    the images that were actually saved.

    :param note_item: note dictionary; ``note_id`` and ``image_list`` are read.
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        return
    note_id = note_item.get("note_id")
    # ``or []`` also covers an explicit ``"image_list": None``
    image_list: List[Dict] = note_item.get("image_list") or []
    if not image_list:
        return

    for img in image_list:
        default_url = img.get("url_default")
        # Only a real value may override ``url``; a missing key (None) used to
        # pass the ``!= ""`` check and wipe out an otherwise valid ``url``.
        if default_url:
            img["url"] = default_url

    saved_count = 0
    for pic in image_list:
        url = pic.get("url")
        if not url:
            continue
        content = await self.xhs_client.get_note_media(url)
        # Random sub-second pause between downloads
        await asyncio.sleep(random.random())
        if content is None:
            continue
        extension_file_name = f"{saved_count}.jpg"
        saved_count += 1
        await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||
|
||||
async def get_notice_video(self, note_item: Dict):
    """
    Download and store every video of a note. Please call get_notice_media instead.

    Video URLs come from ``xhs_store.get_video_url_arr``; downloads that return
    None are skipped and saved files are named ``0.mp4``, ``1.mp4``, ...

    :param note_item: note dictionary; ``note_id`` is read from it.
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        return
    note_id = note_item.get("note_id")
    video_urls = xhs_store.get_video_url_arr(note_item)
    if not video_urls:
        return

    saved_count = 0
    for video_url in video_urls:
        payload = await self.xhs_client.get_note_media(video_url)
        # Random sub-second pause between downloads
        await asyncio.sleep(random.random())
        if payload is not None:
            file_name = f"{saved_count}.mp4"
            saved_count += 1
            await xhs_store.update_xhs_note_video(note_id, payload, file_name)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data from the platform fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
@@ -0,0 +1,83 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
class FeedType(Enum):
    """Home-feed channel identifiers.

    ``FASION`` and ``HOURSE`` are historical misspellings kept as the canonical
    members so existing callers keep working. ``FASHION`` and ``HOUSE`` are
    enum aliases for them (same value, same member object) and do not show up
    when iterating over the enum.
    """
    # Recommended
    RECOMMEND = "homefeed_recommend"
    # Fashion / outfits
    FASION = "homefeed.fashion_v3"
    FASHION = "homefeed.fashion_v3"  # correctly spelled alias of FASION
    # Food
    FOOD = "homefeed.food_v3"
    # Cosmetics
    COSMETICS = "homefeed.cosmetics_v3"
    # Film and TV
    MOVIE = "homefeed.movie_and_tv_v3"
    # Career
    CAREER = "homefeed.career_v3"
    # Relationships
    EMOTION = "homefeed.love_v3"
    # Home and household
    HOURSE = "homefeed.household_product_v3"
    HOUSE = "homefeed.household_product_v3"  # correctly spelled alias of HOURSE
    # Gaming
    GAME = "homefeed.gaming_v3"
    # Travel
    TRAVEL = "homefeed.travel_v3"
    # Fitness
    FITNESS = "homefeed.fitness_v3"
|
||||
|
||||
|
||||
class NoteType(Enum):
    """Kind of note: a normal note or a video note."""
    NORMAL = "normal"
    VIDEO = "video"
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Sort order for search results."""
    GENERAL = "general"  # default ordering
    MOST_POPULAR = "popularity_descending"  # most popular first
    LATEST = "time_descending"  # newest first
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
    """Note-type filter for search results."""
    ALL = 0  # default: no filtering
    VIDEO = 1  # video notes only
    IMAGE = 2  # image notes only
|
||||
|
||||
|
||||
class Note(NamedTuple):
    """Immutable record describing one note."""
    # Identity and text
    note_id: str
    title: str
    desc: str
    type: str
    # Author and media
    user: dict
    img_urls: list
    video_url: str
    # Tags and mentions
    tag_list: list
    at_user_list: list
    # Interaction counters (declared as strings)
    collected_count: str
    comment_count: str
    liked_count: str
    share_count: str
    # Timestamps (declared as integers)
    time: int
    last_update_time: int
|
||||
@@ -0,0 +1,316 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import ctypes
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from tools.crawler_util import extract_url_params_to_dict
|
||||
|
||||
|
||||
def sign(a1="", b1="", x_s="", x_t=""):
    """
    Build the signature headers for a request.

    The given ``x_s`` / ``x_t`` values are passed through unchanged, a
    ``x-s-common`` value is derived by JSON-encoding a fixed descriptor
    (which embeds ``a1``, ``b1``, ``x_s``, ``x_t`` and a checksum of
    ``x_t + x_s + b1``) and encoding it with the custom base64 alphabet, and
    a random ``x-b3-traceid`` is added.

    Args:
        a1: value of the ``a1`` cookie
        b1: value of ``localStorage.getItem("b1")``
        x_s: the ``x-s`` signature value
        x_t: the ``x-t`` value

    Returns:
        dict with the keys "x-s", "x-t", "x-s-common" and "x-b3-traceid"
    """
    descriptor = {
        "s0": 3,  # getPlatformCode
        "s1": "",
        "x0": "1",  # localStorage.getItem("b1b1")
        "x1": "3.7.8-2",  # version
        "x2": "Mac OS",
        "x3": "xhs-pc-web",
        "x4": "4.27.2",
        "x5": a1,  # cookie of a1
        "x6": x_t,
        "x7": x_s,
        "x8": b1,  # localStorage.getItem("b1")
        "x9": mrc(x_t + x_s + b1),
        "x10": 154,  # getSigCount
    }
    # Compact separators: the JSON text is emitted without any whitespace
    descriptor_json = json.dumps(descriptor, separators=(',', ':'))
    headers = {
        "x-s": x_s,
        "x-t": x_t,
        "x-s-common": b64Encode(encodeUtf8(descriptor_json)),
    }
    headers["x-b3-traceid"] = get_b3_trace_id()
    return headers
|
||||
|
||||
|
||||
def get_b3_trace_id():
    """Return a random 16-character string of lowercase hexadecimal digits."""
    hex_chars = "abcdef0123456789"
    last_index = len(hex_chars) - 1
    picked = [hex_chars[random.randint(0, last_index)] for _ in range(16)]
    return "".join(picked)
|
||||
|
||||
|
||||
def mrc(e):
    """
    Compute a table-driven 32-bit checksum over the first 57 characters of ``e``.

    The register starts at -1 and is updated once per character position; the
    result is ``register ^ -1 ^ 3988292384``, a negative Python integer.

    Inputs shorter than 57 characters are treated as if padded with character
    code 0 (previously they raised IndexError, which made ``sign()`` fail with
    its default arguments). Character codes above 255 contribute only their
    low byte instead of indexing past the table.
    NOTE(review): the padding/masking choice mirrors what the JavaScript this
    appears to be ported from would do (``charCodeAt`` past the end, index
    masked with 255) - confirm against the upstream script.

    Args:
        e: input string; only ``e[:57]`` influences the result

    Returns:
        int checksum
    """
    # NOTE(review): looks like the standard CRC-32 lookup table; not verified entry by entry.
    ie = [
        0, 1996959894, 3993919788, 2567524794, 124634137, 1886057615, 3915621685,
        2657392035, 249268274, 2044508324, 3772115230, 2547177864, 162941995,
        2125561021, 3887607047, 2428444049, 498536548, 1789927666, 4089016648,
        2227061214, 450548861, 1843258603, 4107580753, 2211677639, 325883990,
        1684777152, 4251122042, 2321926636, 335633487, 1661365465, 4195302755,
        2366115317, 997073096, 1281953886, 3579855332, 2724688242, 1006888145,
        1258607687, 3524101629, 2768942443, 901097722, 1119000684, 3686517206,
        2898065728, 853044451, 1172266101, 3705015759, 2882616665, 651767980,
        1373503546, 3369554304, 3218104598, 565507253, 1454621731, 3485111705,
        3099436303, 671266974, 1594198024, 3322730930, 2970347812, 795835527,
        1483230225, 3244367275, 3060149565, 1994146192, 31158534, 2563907772,
        4023717930, 1907459465, 112637215, 2680153253, 3904427059, 2013776290,
        251722036, 2517215374, 3775830040, 2137656763, 141376813, 2439277719,
        3865271297, 1802195444, 476864866, 2238001368, 4066508878, 1812370925,
        453092731, 2181625025, 4111451223, 1706088902, 314042704, 2344532202,
        4240017532, 1658658271, 366619977, 2362670323, 4224994405, 1303535960,
        984961486, 2747007092, 3569037538, 1256170817, 1037604311, 2765210733,
        3554079995, 1131014506, 879679996, 2909243462, 3663771856, 1141124467,
        855842277, 2852801631, 3708648649, 1342533948, 654459306, 3188396048,
        3373015174, 1466479909, 544179635, 3110523913, 3462522015, 1591671054,
        702138776, 2966460450, 3352799412, 1504918807, 783551873, 3082640443,
        3233442989, 3988292384, 2596254646, 62317068, 1957810842, 3939845945,
        2647816111, 81470997, 1943803523, 3814918930, 2489596804, 225274430,
        2053790376, 3826175755, 2466906013, 167816743, 2097651377, 4027552580,
        2265490386, 503444072, 1762050814, 4150417245, 2154129355, 426522225,
        1852507879, 4275313526, 2312317920, 282753626, 1742555852, 4189708143,
        2394877945, 397917763, 1622183637, 3604390888, 2714866558, 953729732,
        1340076626, 3518719985, 2797360999, 1068828381, 1219638859, 3624741850,
        2936675148, 906185462, 1090812512, 3747672003, 2825379669, 829329135,
        1181335161, 3412177804, 3160834842, 628085408, 1382605366, 3423369109,
        3138078467, 570562233, 1426400815, 3317316542, 2998733608, 733239954,
        1555261956, 3268935591, 3050360625, 752459403, 1541320221, 2607071920,
        3965973030, 1969922972, 40735498, 2617837225, 3943577151, 1913087877,
        83908371, 2512341634, 3803740692, 2075208622, 213261112, 2463272603,
        3855990285, 2094854071, 198958881, 2262029012, 4057260610, 1759359992,
        534414190, 2176718541, 4139329115, 1873836001, 414664567, 2282248934,
        4279200368, 1711684554, 285281116, 2405801727, 4167216745, 1634467795,
        376229701, 2685067896, 3608007406, 1308918612, 956543938, 2808555105,
        3495958263, 1231636301, 1047427035, 2932959818, 3654703836, 1088359270,
        936918000, 2847714899, 3736837829, 1202900863, 817233897, 3183342108,
        3401237130, 1404277552, 615818150, 3134207493, 3453421203, 1423857449,
        601450431, 3009837614, 3294710456, 1567103746, 711928724, 3020668471,
        3272380065, 1510334235, 755167117,
    ]
    window = 57  # number of character positions folded into the checksum
    size = len(e)
    register = -1
    for pos in range(window):
        code = ord(e[pos]) if pos < size else 0
        # ``& 0xFFFFFFFF`` reinterprets the register as unsigned 32-bit before
        # shifting, i.e. an unsigned right shift by 8 bits.
        register = ie[(register ^ code) & 255] ^ ((register & 0xFFFFFFFF) >> 8)
    return register ^ -1 ^ 3988292384
|
||||
|
||||
|
||||
# Custom 64-character alphabet used by the base64-style encoder in this module
# (index 0 is "Z", index 63 is "5").
lookup = list("ZmserbBoHQtNP+wOcza/LpngG8yJq42KWYj0DSfdikx3VT16IlUAFM97hECvuRX5")
|
||||
|
||||
|
||||
def tripletToBase64(e):
    """Encode a 24-bit integer as four characters of the custom alphabet, high bits first."""
    sextets = [(e >> shift) & 63 for shift in (18, 12, 6, 0)]
    return "".join(lookup[value] for value in sextets)
|
||||
|
||||
|
||||
def encodeChunk(e, t, r):
    """
    Encode ``e[t:r]`` three byte values at a time into custom-alphabet text.

    Args:
        e: sequence of integer byte values
        t: start index (inclusive)
        r: end index (exclusive); callers pass a range whose length is a multiple of 3

    Returns:
        the encoded string for that range
    """
    pieces = []
    for start in range(t, r, 3):
        high = (e[start] << 16) & 0xFF0000
        middle = (e[start + 1] << 8) & 0xFF00
        low = e[start + 2] & 0xFF
        pieces.append(tripletToBase64(high + middle + low))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def b64Encode(e):
    """
    Base64-style encode a sequence of byte values using the custom ``lookup`` alphabet.

    Full 3-byte groups are encoded in slices of at most 16383 values; a
    1- or 2-byte remainder is encoded separately and padded with "==" or "=".

    Args:
        e: sequence of integer byte values

    Returns:
        the encoded string
    """
    total = len(e)
    remainder = total % 3
    aligned_end = total - remainder
    step = 16383  # slice size, itself a multiple of 3
    parts = [
        encodeChunk(e, start, min(start + step, aligned_end))
        for start in range(0, aligned_end, step)
    ]
    if remainder == 1:
        tail = e[total - 1]
        parts.append(lookup[tail >> 2] + lookup[(tail << 4) & 63] + "==")
    elif remainder == 2:
        tail = (e[total - 2] << 8) + e[total - 1]
        parts.append(
            lookup[tail >> 10] + lookup[(tail >> 4) & 63] + lookup[(tail << 2) & 63] + "="
        )
    return "".join(parts)
|
||||
|
||||
|
||||
def encodeUtf8(e):
    """
    Return the UTF-8 byte values of ``e`` as a list of integers.

    The previous implementation percent-encoded the text with
    ``urllib.parse.quote`` and then decoded every ``%XX`` escape back into a
    number. For any input that yields exactly the UTF-8 bytes, so the text is
    now encoded directly.

    Args:
        e: text to encode; ``bytes`` / ``bytearray`` are accepted as well and
            returned as their own byte values

    Returns:
        list of integers in the range 0..255
    """
    if isinstance(e, (bytes, bytearray)):
        return list(e)
    return list(e.encode("utf-8"))
|
||||
|
||||
|
||||
def base36encode(number, alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """
    Convert an integer to its string form in the base given by ``alphabet``.

    Args:
        number: integer to convert; negative values get a leading "-"
        alphabet: digit characters, lowest value first (36 by default)

    Returns:
        the encoded string

    Raises:
        TypeError: if ``number`` is not an integer
    """
    if not isinstance(number, int):
        raise TypeError('number must be an integer')

    prefix = '-' if number < 0 else ''
    magnitude = -number if number < 0 else number
    radix = len(alphabet)

    # Single-digit values (including zero) map straight to one character
    if magnitude < radix:
        return prefix + alphabet[magnitude]

    digits = []
    while magnitude != 0:
        magnitude, remainder = divmod(magnitude, radix)
        digits.append(alphabet[remainder])
    return prefix + ''.join(reversed(digits))
|
||||
|
||||
|
||||
def base36decode(number):
    """Parse a base-36 string and return the integer it represents."""
    decoded = int(number, 36)
    return decoded
|
||||
|
||||
|
||||
def get_search_id():
    """
    Generate a search id: the current time in milliseconds shifted left by
    64 bits, plus a random integer, rendered in base 36.
    """
    millis = int(time.time() * 1000)
    noise = int(random.uniform(0, 2147483646))
    return base36encode((millis << 64) + noise)
|
||||
|
||||
|
||||
# Image CDN base URLs; they differ only in the node suffix of the host name.
img_cdns = [f"https://sns-img-{node}.xhscdn.com" for node in ("qc", "hw", "bd", "qn")]
|
||||
|
||||
def get_img_url_by_trace_id(trace_id: str, format_type: str = "png"):
    """Build the image URL for ``trace_id`` on one randomly chosen CDN from ``img_cdns``."""
    cdn_base = random.choice(img_cdns)
    return f"{cdn_base}/{trace_id}?imageView2/format/{format_type}"
|
||||
|
||||
|
||||
def get_img_urls_by_trace_id(trace_id: str, format_type: str = "png"):
    """Build the image URL for ``trace_id`` on every CDN in ``img_cdns``, in list order."""
    suffix = f"/{trace_id}?imageView2/format/{format_type}"
    urls = []
    for cdn in img_cdns:
        urls.append(f"{cdn}{suffix}")
    return urls
|
||||
|
||||
|
||||
def get_trace_id(img_url: str):
    """
    Extract the trace id (last path segment) from an image URL.

    Images uploaded from the browser carry an extra ``/spectrum/`` path
    component; for URLs containing "spectrum" the id is returned with a
    ``spectrum/`` prefix.
    """
    last_segment = img_url.split("/")[-1]
    if "spectrum" in img_url:
        return f"spectrum/{last_segment}"
    return last_segment
|
||||
|
||||
|
||||
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
    """
    Parse the note information out of a xiaohongshu note URL.

    Args:
        url: e.g. "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

    Returns:
        NoteUrlInfo with the note id (last path segment without the query
        string) and the ``xsec_token`` / ``xsec_source`` query parameters,
        each defaulting to "" when absent.
    """
    last_segment = url.split("/")[-1]
    note_id = last_segment.split("?")[0]
    query = extract_url_params_to_dict(url)
    return NoteUrlInfo(
        note_id=note_id,
        xsec_token=query.get("xsec_token", ""),
        xsec_source=query.get("xsec_source", ""),
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual demo: rebuild an image URL on a randomly chosen CDN from its trace id.
    # get_img_urls_by_trace_id(...) would return the URL on every CDN instead.
    _img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
    final_img_url = get_img_url_by_trace_id(
        get_trace_id(_img_url)
    )
    print(final_img_url)
|
||||
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from cache.cache_factory import CacheFactory
|
||||
from tools import utils
|
||||
|
||||
|
||||
class XiaoHongShuLogin(AbstractLogin):
    """Drive the xiaohongshu web login through a QR code, phone + SMS code, or a cookie."""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: "qrcode", "phone" or "cookie"; written to ``config.LOGIN_TYPE``
            browser_context: browser context whose cookies are read and written
            context_page: page on which the login dialog is operated
            login_phone: phone number used by the phone login
            cookie_str: raw cookie string used by the cookie login
        """
        # begin() dispatches on config.LOGIN_TYPE, so the choice is stored there
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """
        Return True once the ``web_session`` cookie differs from ``no_logged_in_session``.

        The retry decorator calls this again while it returns False, up to
        600 attempts with a 1 second wait between attempts; when the attempts
        are exhausted a RetryError is raised.

        Args:
            no_logged_in_session: ``web_session`` value observed before logging in
        """
        if "请通过验证" in await self.context_page.content():
            utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")

        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
        return current_web_session != no_logged_in_session

    async def begin(self):
        """Start the login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: if the login type is not "qrcode", "phone" or "cookie"
        """
        utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[XiaoHongShuLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    async def login_by_mobile(self):
        """Log in with the phone number and an SMS code read from the cache.

        The SMS code is polled from the cache under ``xhs_<phone>`` once per
        second for up to two minutes. If no code arrives the process exits,
        exactly like the other failed-login paths, instead of continuing and
        reporting a successful login.
        """
        utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
        await asyncio.sleep(1)
        try:
            # After opening the home page the login dialog may not pop up by
            # itself, so the login button is clicked manually.
            login_button_ele = await self.context_page.wait_for_selector(
                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
                timeout=5000
            )
            await login_button_ele.click()
            # The dialog has two shapes: one shows the phone and code inputs
            # directly, the other needs a click to switch to phone login.
            element = await self.context_page.wait_for_selector(
                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
                timeout=5000
            )
            await element.click()
        except Exception:
            # Best effort: the dialog may already be in the expected state
            utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")

        await asyncio.sleep(1)
        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
        input_ele = await login_container_ele.query_selector("label.phone > input")
        await input_ele.fill(self.login_phone)
        await asyncio.sleep(0.5)

        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
        await send_btn_ele.click()  # request the SMS code
        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
        cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
        sms_code_key = f"xhs_{self.login_phone}"
        sms_code_value = None
        max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
        while max_get_sms_code_time > 0:
            utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
            await asyncio.sleep(1)
            sms_code_value = cache_client.get(sms_code_key)
            if sms_code_value:
                break
            max_get_sms_code_time -= 1

        if not sms_code_value:
            # Without this guard the code below would compare the session
            # against "" and treat the timeout as a successful login.
            utils.logger.error("[XiaoHongShuLogin.login_by_mobile] Timed out waiting for the sms code, login failed ...")
            sys.exit()

        # Remember the pre-login session so a change can be detected afterwards
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")

        await sms_code_input_ele.fill(value=sms_code_value.decode())  # type in the SMS code
        await asyncio.sleep(0.5)
        agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
        await agree_privacy_ele.click()  # accept the privacy agreement
        await asyncio.sleep(0.5)

        await submit_btn_ele.click()  # submit the login form

        # todo ... the SMS code itself is not validated; a wrong code is only
        # noticed when the login state never changes.
        try:
            await self.check_login_state(no_logged_in_session)
        except RetryError:
            utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_qrcode(self):
        """Log in by showing the site's QR code and waiting for it to be scanned."""
        utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                sys.exit()

        # get not logged session
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")

        # show login qrcode
        # fix issue #12
        # show_qrcode is run in the default executor through a partial so the
        # current asyncio event loop is not blocked while the image is shown
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # check_login_state allows 600 attempts spaced 1 second apart
        utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
        try:
            await self.check_login_state(no_logged_in_session)
        except RetryError:
            utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """Log in by injecting the ``web_session`` cookie taken from ``self.cookie_str``."""
        utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            if key != "web_session":  # only set web_session cookie attr
                continue
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".xiaohongshu.com",
                'path': "/"
            }])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import ZhihuCrawler
|
||||
@@ -0,0 +1,568 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from constant import zhihu as zhihu_constant
|
||||
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError, ForbiddenError
|
||||
from .field import SearchSort, SearchTime, SearchType
|
||||
from .help import ZhihuExtractor, sign
|
||||
|
||||
|
||||
class ZhiHuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
    self,
    timeout=10,
    proxy=None,
    *,
    headers: Dict[str, str],
    playwright_page: Page,
    cookie_dict: Dict[str, str],
):
    """
    Store the request settings of the client.

    Args:
        timeout: timeout passed to every HTTP request
        proxy: proxy passed to the HTTP client
        headers: default request headers
        playwright_page: accepted as part of the signature; not stored by this constructor
        cookie_dict: cookies as a name -> value mapping
    """
    # Request configuration
    self.default_headers = headers
    self.cookie_dict = cookie_dict
    self.timeout = timeout
    self.proxy = proxy
    # Helper that turns raw API responses into model objects
    self._extractor = ZhihuExtractor()
|
||||
|
||||
async def _pre_headers(self, url: str) -> Dict:
    """
    Build signed request headers for ``url``.

    Args:
        url: request URL; it must already contain the query parameters

    Returns:
        a copy of the default headers with ``x-zst-81`` and ``x-zse-96`` set

    Raises:
        Exception: if the ``d_c0`` cookie is missing
    """
    if not self.cookie_dict.get("d_c0"):
        raise Exception("d_c0 not found in cookies")
    signature = sign(url, self.default_headers["cookie"])
    signed_headers = self.default_headers.copy()
    signed_headers.update({
        'x-zst-81': signature["x-zst-81"],
        'x-zse-96': signature["x-zse-96"],
    })
    return signed_headers
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, **kwargs) -> Union[str, Any]:
    """
    Shared httpx request wrapper that post-processes the response.

    Args:
        method: HTTP method
        url: request URL
        **kwargs: extra request arguments such as headers or body; the
            special key ``return_response`` (default False) is removed and,
            when truthy, makes the method return the raw response text

    Returns:
        ``{}`` for a 404 response, the response text when ``return_response``
        is truthy, otherwise the decoded JSON body

    Raises:
        ForbiddenError: on HTTP 403
        DataFetchError: on any other non-200 status, on a body with a truthy
            "error" field, or when the body is not valid JSON
    """
    return_response = kwargs.pop('return_response', False)

    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)

    status = response.status_code
    if status != 200:
        utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
        if status == 404:  # a content without comments also answers 404
            return {}
        error_cls = ForbiddenError if status == 403 else DataFetchError
        raise error_cls(response.text)

    if return_response:
        return response.text

    try:
        data: Dict = response.json()
    except json.JSONDecodeError:
        utils.logger.error(f"[ZhiHuClient.request] Request error: {response.text}")
        raise DataFetchError(response.text)

    error_info = data.get("error")
    if error_info:
        utils.logger.error(f"[ZhiHuClient.request] Request error: {data}")
        raise DataFetchError(error_info.get("message"))
    return data
|
||||
|
||||
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
    """
    Send a signed GET request.

    Args:
        uri: request route
        params: query parameters; appended to the route only when given as a dict
        **kwargs: forwarded to ``request``

    Returns:
        whatever ``request`` returns
    """
    signed_uri = uri + '?' + urlencode(params) if isinstance(params, dict) else uri
    headers = await self._pre_headers(signed_uri)
    # Routes containing "/p/" are served from the column (zhuanlan) host
    if "/p/" in uri:
        base_url = zhihu_constant.ZHIHU_ZHUANLAN_URL
    else:
        base_url = zhihu_constant.ZHIHU_URL
    return await self.request(method="GET", url=base_url + signed_uri, headers=headers, **kwargs)
|
||||
|
||||
async def pong(self) -> bool:
    """
    Check whether the login state is still valid.

    Returns:
        True when the current-user endpoint answers with both ``uid`` and
        ``name``; False otherwise, including when the request raises.
    """
    utils.logger.info("[ZhiHuClient.pong] Begin to pong zhihu...")
    try:
        res = await self.get_current_user_info()
        if res.get("uid") and res.get("name"):
            utils.logger.info("[ZhiHuClient.pong] Ping zhihu successfully")
            return True
        utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed, response data: {res}")
    except Exception as e:
        utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed: {e}, and try to login again...")
    return False
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """
    Refresh the client's cookies from the browser context; normally called after a successful login.

    Args:
        browser_context: browser context whose cookies are read
    """
    browser_cookies = await browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)
    # Both the header string and the dict form are kept in sync
    self.default_headers["cookie"] = cookie_str
    self.cookie_dict = cookie_dict
|
||||
|
||||
async def get_current_user_info(self) -> Dict:
    """
    Fetch the profile of the currently logged-in user.

    Returns:
        the response of ``/api/v4/me``
    """
    include_fields = ",".join(["email", "is_active", "is_bind_phone"])
    return await self.get("/api/v4/me", {"include": include_fields})
|
||||
|
||||
async def get_note_by_keyword(
    self,
    keyword: str,
    page: int = 1,
    page_size: int = 20,
    sort: SearchSort = SearchSort.DEFAULT,
    note_type: SearchType = SearchType.DEFAULT,
    search_time: SearchTime = SearchTime.DEFAULT,
) -> List[ZhihuContent]:
    """
    Search contents by keyword.

    Args:
        keyword: search keyword
        page: 1-based page number
        page_size: number of results per page
        sort: result ordering
        note_type: kind of results to return
        search_time: time window of the results

    Returns:
        the contents extracted from the search response
    """
    # Offset of the first result on the requested page
    start = (page - 1) * page_size
    params = {
        "gk_version": "gz-gaokao",
        "t": "general",
        "q": keyword,
        "correction": 1,
        "offset": start,
        "limit": page_size,
        "filter_fields": "",
        "lc_idx": start,
        "show_all_topics": 0,
        "search_source": "Filter",
        "time_interval": search_time.value,
        "sort": sort.value,
        "vertical": note_type.value,
    }
    search_res = await self.get("/api/v4/search_v3", params)
    utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
    return self._extractor.extract_contents_from_search(search_res)
|
||||
|
||||
async def get_root_comments(
    self,
    content_id: str,
    content_type: str,
    offset: str = "",
    limit: int = 10,
    order_by: str = "score",
) -> Dict:
    """
    Fetch one page of top-level comments of a content.

    Args:
        content_id: content ID
        content_type: content type (answer, article, zvideo)
        offset: paging offset sent as the ``offset`` query parameter
        limit: page size
        order_by: value sent as the ``order`` query parameter

    Returns:
        the response of the ``comment_v5`` root-comment endpoint
    """
    query = dict(order=order_by, offset=offset, limit=limit)
    return await self.get(
        f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment",
        query,
    )
|
||||
|
||||
async def get_child_comments(
    self,
    root_comment_id: str,
    offset: str = "",
    limit: int = 10,
    order_by: str = "sort",
) -> Dict:
    """
    Fetch one page of replies (child comments) under a root comment.

    Args:
        root_comment_id: id of the root comment
        offset: paging cursor, sent as ``offset``
        limit: page size, sent as ``limit``
        order_by: ordering, sent as ``order``

    Returns:
        The result of ``self.get`` for the comment_v5 child_comment endpoint.
    """
    endpoint = "/api/v4/comment_v5/comment/{}/child_comment".format(root_comment_id)
    query = dict(order=order_by, offset=offset, limit=limit)
    return await self.get(endpoint, query)
|
||||
|
||||
async def get_note_all_comments(
    self,
    content: ZhihuContent,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[ZhihuComment]:
    """
    Page through every root comment of a piece of content.

    For each page of root comments the optional callback is awaited, the
    sub-comments are crawled via ``get_comments_all_sub_comments``, and the
    coroutine sleeps ``crawl_interval`` seconds before the next page.

    Args:
        content: content detail object (answer | article | video)
        crawl_interval: delay in seconds between two page requests
        callback: awaited with each non-empty page of extracted comments

    Returns:
        All extracted root comments (sub-comments are not included in the
        returned list).
    """
    collected: List[ZhihuComment] = []
    cursor: str = ""
    page_size: int = 10
    while True:
        page = await self.get_root_comments(content.content_id, content.content_type, cursor, page_size)
        if not page:
            break
        paging = page.get("paging", {})
        finished = paging.get("is_end")
        cursor = self._extractor.extract_offset(paging)
        page_comments = self._extractor.extract_comments(content, page.get("data"))

        # An empty page ends the crawl even when the API did not flag the end.
        if not page_comments:
            break

        if callback:
            await callback(page_comments)

        collected.extend(page_comments)
        await self.get_comments_all_sub_comments(
            content, page_comments, crawl_interval=crawl_interval, callback=callback
        )
        await asyncio.sleep(crawl_interval)
        if finished:
            break
    return collected
|
||||
|
||||
async def get_comments_all_sub_comments(
    self,
    content: ZhihuContent,
    comments: List[ZhihuComment],
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[ZhihuComment]:
    """
    Page through the replies of every comment in ``comments``.

    Does nothing (returns an empty list) when
    ``config.ENABLE_GET_SUB_COMMENTS`` is falsy. Comments whose
    ``sub_comment_count`` is 0 are skipped.

    Args:
        content: content detail object (answer | article | video)
        comments: root comments whose replies should be fetched
        crawl_interval: delay in seconds after each page request
        callback: awaited with each non-empty page of extracted replies

    Returns:
        All extracted replies across all given comments.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        return []

    replies: List[ZhihuComment] = []
    page_size: int = 10
    for parent_comment in comments:
        if parent_comment.sub_comment_count == 0:
            continue

        cursor: str = ""
        while True:
            page = await self.get_child_comments(parent_comment.comment_id, cursor, page_size)
            if not page:
                break
            paging = page.get("paging", {})
            finished = paging.get("is_end")
            cursor = self._extractor.extract_offset(paging)
            page_replies = self._extractor.extract_comments(content, page.get("data"))

            # An empty page ends this comment's crawl without sleeping.
            if not page_replies:
                break

            if callback:
                await callback(page_replies)

            replies.extend(page_replies)
            await asyncio.sleep(crawl_interval)
            if finished:
                break
    return replies
|
||||
|
||||
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
    """
    Fetch a creator's profile page and extract the creator record from it.

    Args:
        url_token: the creator's url token (the ``/people/<token>`` path part)

    Returns:
        Whatever ``self._extractor.extract_creator`` builds from the page.
    """
    profile_path = "/people/{}".format(url_token)
    # return_response=True: the result is handed to the extractor as HTML text.
    profile_html: str = await self.get(profile_path, return_response=True)
    creator = self._extractor.extract_creator(url_token, profile_html)
    return creator
|
||||
|
||||
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
    """
    Fetch one page of a creator's answers, ordered by creation time.

    Args:
        url_token: the creator's url token
        offset: index of the first answer to return
        limit: page size

    Returns:
        The result of ``self.get`` for the members/answers endpoint.
    """
    # Field selector sent verbatim as the ``include`` query parameter.
    include_fields = "data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship"
    query = dict(include=include_fields, offset=offset, limit=limit, order_by="created")
    endpoint = "/api/v4/members/{}/answers".format(url_token)
    return await self.get(endpoint, query)
|
||||
|
||||
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
    """
    Fetch one page of a creator's articles, ordered by creation time.

    Args:
        url_token: the creator's url token
        offset: index of the first article to return
        limit: page size

    Returns:
        The result of ``self.get`` for the members/articles endpoint.
    """
    # Field selector sent verbatim as the ``include`` query parameter.
    include_fields = "data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;"
    query = dict(include=include_fields, offset=offset, limit=limit, order_by="created")
    endpoint = "/api/v4/members/{}/articles".format(url_token)
    return await self.get(endpoint, query)
|
||||
|
||||
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
    """
    Fetch one page of a creator's videos (zvideos).

    Args:
        url_token: the creator's url token
        offset: index of the first video to return
        limit: page size

    Returns:
        The result of ``self.get`` for the members/zvideos endpoint.
    """
    query = dict(
        include="similar_zvideo,creation_relationship,reaction_instruction",
        offset=offset,
        limit=limit,
        similar_aggregation="true",
    )
    endpoint = "/api/v4/members/{}/zvideos".format(url_token)
    return await self.get(endpoint, query)
|
||||
|
||||
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
    """
    Page through all answers of a creator.

    Args:
        creator: creator record; only ``creator.url_token`` is read
        crawl_interval: delay in seconds after each page request
        callback: awaited with the extracted contents of each page

    Returns:
        All contents extracted from every fetched page.
    """
    all_contents: List[ZhihuContent] = []
    is_end: bool = False
    offset: int = 0
    limit: int = 20
    while not is_end:
        res = await self.get_creator_answers(creator.url_token, offset, limit)
        if not res:
            break
        utils.logger.info(f"[ZhiHuClient.get_all_anwser_by_creator] Get creator {creator.url_token} answers: {res}")
        paging_info = res.get("paging") or {}
        # A response without paging info (e.g. an error payload) must end the
        # loop; otherwise ``is_end`` stays None and paging never terminates.
        is_end = paging_info.get("is_end")
        if is_end is None:
            is_end = True
        contents = self._extractor.extract_content_list_from_creator(res.get("data"))
        if callback:
            await callback(contents)
        all_contents.extend(contents)
        offset += limit
        await asyncio.sleep(crawl_interval)
    return all_contents
|
||||
|
||||
async def get_all_articles_by_creator(
    self,
    creator: ZhihuCreator,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[ZhihuContent]:
    """
    Page through all articles of a creator.

    Args:
        creator: creator record; only ``creator.url_token`` is read
        crawl_interval: delay in seconds after each page request
        callback: awaited with the extracted contents of each page

    Returns:
        All contents extracted from every fetched page.
    """
    all_contents: List[ZhihuContent] = []
    is_end: bool = False
    offset: int = 0
    limit: int = 20
    while not is_end:
        res = await self.get_creator_articles(creator.url_token, offset, limit)
        if not res:
            break
        paging_info = res.get("paging") or {}
        # A response without paging info (e.g. an error payload) must end the
        # loop; otherwise ``is_end`` stays None and paging never terminates.
        is_end = paging_info.get("is_end")
        if is_end is None:
            is_end = True
        contents = self._extractor.extract_content_list_from_creator(res.get("data"))
        if callback:
            await callback(contents)
        all_contents.extend(contents)
        offset += limit
        await asyncio.sleep(crawl_interval)
    return all_contents
|
||||
|
||||
async def get_all_videos_by_creator(
    self,
    creator: ZhihuCreator,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[ZhihuContent]:
    """
    Page through all videos of a creator.

    Args:
        creator: creator record; only ``creator.url_token`` is read
        crawl_interval: delay in seconds after each page request
        callback: awaited with the extracted contents of each page

    Returns:
        All contents extracted from every fetched page.
    """
    all_contents: List[ZhihuContent] = []
    is_end: bool = False
    offset: int = 0
    limit: int = 20
    while not is_end:
        res = await self.get_creator_videos(creator.url_token, offset, limit)
        if not res:
            break
        paging_info = res.get("paging") or {}
        # A response without paging info (e.g. an error payload) must end the
        # loop; otherwise ``is_end`` stays None and paging never terminates.
        is_end = paging_info.get("is_end")
        if is_end is None:
            is_end = True
        contents = self._extractor.extract_content_list_from_creator(res.get("data"))
        if callback:
            await callback(contents)
        all_contents.extend(contents)
        offset += limit
        await asyncio.sleep(crawl_interval)
    return all_contents
|
||||
|
||||
async def get_answer_info(
    self,
    question_id: str,
    answer_id: str,
) -> Optional[ZhihuContent]:
    """
    Fetch an answer page and extract its content record.

    Args:
        question_id: id of the question the answer belongs to
        answer_id: id of the answer

    Returns:
        Whatever ``self._extractor.extract_answer_content_from_html`` builds
        from the page.
    """
    answer_path = "/question/{}/answer/{}".format(question_id, answer_id)
    page_html = await self.get(answer_path, return_response=True)
    return self._extractor.extract_answer_content_from_html(page_html)
|
||||
|
||||
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
    """
    Fetch an article page and extract its content record.

    Args:
        article_id: id of the article (the ``/p/<id>`` path part)

    Returns:
        Whatever ``self._extractor.extract_article_content_from_html`` builds
        from the page.
    """
    article_path = "/p/{}".format(article_id)
    page_html = await self.get(article_path, return_response=True)
    return self._extractor.extract_article_content_from_html(page_html)
|
||||
|
||||
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
    """
    Fetch a video (zvideo) page and extract its content record.

    Args:
        video_id: id of the video (the ``/zvideo/<id>`` path part)

    Returns:
        Whatever ``self._extractor.extract_zvideo_content_from_html`` builds
        from the page.
    """
    video_path = "/zvideo/{}".format(video_id)
    page_html = await self.get(video_path, return_response=True)
    return self._extractor.extract_zvideo_content_from_html(page_html)
|
||||
@@ -0,0 +1,455 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple, cast
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from constant import zhihu as constant
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_zhihu import ZhihuContent, ZhihuCreator
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import zhihu as zhihu_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import ZhiHuClient
|
||||
from .exception import DataFetchError
|
||||
from .help import ZhihuExtractor, judge_zhihu_url
|
||||
from .login import ZhiHuLogin
|
||||
|
||||
|
||||
class ZhihuCrawler(AbstractCrawler):
    """
    Playwright-based crawler for zhihu.

    ``start`` launches a browser (CDP or standard mode), logs in when the API
    client's ``pong`` check fails, and then runs one of three crawl modes
    chosen by ``config.CRAWLER_TYPE``: keyword search, specified content
    detail, or creator homepage.
    """

    context_page: Page
    zhihu_client: ZhiHuClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self) -> None:
        """Set the site URL, a fixed desktop Chrome user agent and the HTML/JSON extractor."""
        self.index_url = "https://www.zhihu.com"
        # self.user_agent = utils.get_user_agent()
        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
        self._extractor = ZhihuExtractor()
        self.cdp_manager = None

    async def start(self) -> None:
        """
        Start the crawler: launch the browser, make sure the session is
        logged in, then dispatch on ``config.CRAWLER_TYPE``.
        """
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
                ip_proxy_info
            )

        async with async_playwright() as playwright:
            # Choose the launch mode from configuration.
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    self.user_agent,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器")
                # Launch a browser context.
                # NOTE(review): standard mode passes no proxy to the browser
                # (only the HTTP client gets one) -- confirm this is intended.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium, None, self.user_agent, headless=config.HEADLESS
                )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            # NOTE(review): nesting was reconstructed from a flattened listing;
            # confirm the init script is meant to be installed in both modes.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")

            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")

            # Create a client to interact with the zhihu website.
            self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
            if not await self.zhihu_client.pong():
                login_obj = ZhiHuLogin(
                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # input your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.zhihu_client.update_cookies(
                    browser_context=self.browser_context
                )

            # The search API only accepts cookies obtained after the search
            # page has been opened; the home page alone is not enough.
            utils.logger.info(
                "[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
            )
            await self.context_page.goto(
                f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
            )
            await asyncio.sleep(5)
            await self.zhihu_client.update_cookies(browser_context=self.browser_context)

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                await self.get_creators_and_notes()
            else:
                pass

            utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")

    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
        utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
        zhihu_limit_count = 20  # zhihu limit page fixed value
        # At least one full page is always crawled.
        if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[ZhihuCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
            while (
                page - start_page + 1
            ) * zhihu_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[ZhihuCrawler.search] Skip page {page}")
                    page += 1
                    continue

                try:
                    utils.logger.info(
                        f"[ZhihuCrawler.search] search zhihu keyword: {keyword}, page: {page}"
                    )
                    content_list: List[ZhihuContent] = (
                        await self.zhihu_client.get_note_by_keyword(
                            keyword=keyword,
                            page=page,
                        )
                    )
                    utils.logger.info(
                        f"[ZhihuCrawler.search] Search contents :{content_list}"
                    )
                    if not content_list:
                        utils.logger.info("No more content!")
                        break

                    page += 1
                    for content in content_list:
                        await zhihu_store.update_zhihu_content(content)

                    await self.batch_get_content_comments(content_list)
                except DataFetchError:
                    # A fetch error aborts the whole search, including the
                    # remaining keywords.
                    utils.logger.error("[ZhihuCrawler.search] Search content error")
                    return

    async def batch_get_content_comments(self, content_list: List[ZhihuContent]):
        """
        Crawl the comments of every content item concurrently.

        Does nothing when ``config.ENABLE_GET_COMMENTS`` is falsy. Concurrency
        is bounded by one semaphore of size ``config.MAX_CONCURRENCY_NUM``
        shared by all tasks.

        Args:
            content_list: contents whose comments should be crawled
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info(
                "[ZhihuCrawler.batch_get_content_comments] Crawling comment mode is not enabled"
            )
            return

        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for content_item in content_list:
            task = asyncio.create_task(
                self.get_comments(content_item, semaphore), name=content_item.content_id
            )
            task_list.append(task)
        await asyncio.gather(*task_list)

    async def get_comments(
        self, content_item: ZhihuContent, semaphore: asyncio.Semaphore
    ):
        """
        Crawl all comments of one content item while holding ``semaphore``.

        Args:
            content_item: content whose comments should be crawled
            semaphore: concurrency limiter shared with sibling tasks
        """
        async with semaphore:
            utils.logger.info(
                f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}"
            )
            await self.zhihu_client.get_note_all_comments(
                content=content_item,
                crawl_interval=random.random(),
                callback=zhihu_store.batch_update_zhihu_note_comments,
            )

    async def get_creators_and_notes(self) -> None:
        """
        Crawl every creator in ``config.ZHIHU_CREATOR_URL_LIST``: save the
        creator record, fetch all of their answers, then the comments of
        those answers.
        """
        utils.logger.info(
            "[ZhihuCrawler.get_creators_and_notes] Begin get zhihu creators"
        )
        for user_link in config.ZHIHU_CREATOR_URL_LIST:
            utils.logger.info(
                f"[ZhihuCrawler.get_creators_and_notes] Begin get creator {user_link}"
            )
            # Tolerate a trailing slash, which would otherwise give an empty token.
            user_url_token = user_link.rstrip("/").split("/")[-1]
            # get creator detail info from web html content
            creator_info: ZhihuCreator = await self.zhihu_client.get_creator_info(
                url_token=user_url_token
            )
            if not creator_info:
                utils.logger.info(
                    f"[ZhihuCrawler.get_creators_and_notes] Creator {user_url_token} not found"
                )
                continue

            utils.logger.info(
                f"[ZhihuCrawler.get_creators_and_notes] Creator info: {creator_info}"
            )
            await zhihu_store.save_creator(creator=creator_info)

            # Only answers are crawled by default; to also crawl articles or
            # videos, call self.zhihu_client.get_all_articles_by_creator /
            # get_all_videos_by_creator with the same arguments.

            # Get all answer information of the creator
            all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
                creator=creator_info,
                crawl_interval=random.random(),
                callback=zhihu_store.batch_update_zhihu_contents,
            )

            # Get all comments of the creator's contents
            await self.batch_get_content_comments(all_content_list)

    async def get_note_detail(
        self, full_note_url: str, semaphore: asyncio.Semaphore
    ) -> Optional[ZhihuContent]:
        """
        Fetch the detail of one content URL while holding ``semaphore``.

        The content type is decided by ``judge_zhihu_url`` and the ids are
        taken from the URL path segments.

        Args:
            full_note_url: content URL without query string
            semaphore: concurrency limiter shared with sibling tasks

        Returns:
            The extracted content, or None when the URL type is not one of
            answer / article / video.
        """
        async with semaphore:
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
            )
            # judge note type
            note_type: str = judge_zhihu_url(full_note_url)
            if note_type == constant.ANSWER_NAME:
                question_id = full_note_url.split("/")[-3]
                answer_id = full_note_url.split("/")[-1]
                utils.logger.info(
                    f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
                )
                return await self.zhihu_client.get_answer_info(question_id, answer_id)

            elif note_type == constant.ARTICLE_NAME:
                article_id = full_note_url.split("/")[-1]
                utils.logger.info(
                    f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
                )
                return await self.zhihu_client.get_article_info(article_id)

            elif note_type == constant.VIDEO_NAME:
                video_id = full_note_url.split("/")[-1]
                utils.logger.info(
                    f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
                )
                return await self.zhihu_client.get_video_info(video_id)

    async def get_specified_notes(self):
        """
        Get the information and comments of the contents listed in
        ``config.ZHIHU_SPECIFIED_ID_LIST``.
        """
        # One semaphore shared by all tasks; creating one per task would
        # leave the concurrency limit unenforced.
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        get_note_detail_task_list = []
        for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
            # remove query params
            full_note_url = full_note_url.split("?")[0]
            crawler_task = self.get_note_detail(
                full_note_url=full_note_url,
                semaphore=semaphore,
            )
            get_note_detail_task_list.append(crawler_task)

        need_get_comment_notes: List[ZhihuContent] = []
        note_details = await asyncio.gather(*get_note_detail_task_list)
        for index, note_detail in enumerate(note_details):
            if not note_detail:
                utils.logger.info(
                    f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
                )
                continue

            note_detail = cast(ZhihuContent, note_detail)  # only for type check
            need_get_comment_notes.append(note_detail)
            await zhihu_store.update_zhihu_content(note_detail)

        await self.batch_get_content_comments(need_get_comment_notes)

    async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient:
        """
        Create the zhihu API client from the browser context's current cookies.

        Args:
            httpx_proxy: proxy passed to the client as ``proxy``, or None

        Returns:
            A ``ZhiHuClient`` bound to ``self.context_page``.
        """
        utils.logger.info(
            "[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ..."
        )
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        zhihu_client_obj = ZhiHuClient(
            proxy=httpx_proxy,
            headers={
                "accept": "*/*",
                "accept-language": "zh-CN,zh;q=0.9",
                "cookie": cookie_str,
                "priority": "u=1, i",
                "referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
                "user-agent": self.user_agent,
                "x-api-version": "3.0.91",
                "x-app-za": "OS=Web",
                "x-requested-with": "fetch",
                "x-zse-93": "101_3_3.0",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return zhihu_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch a browser and create a browser context.

        With ``config.SAVE_LOGIN_STATE`` a persistent context rooted under
        ``<cwd>/browser_data`` is used so the login survives restarts;
        otherwise a fresh browser and context are created.

        Args:
            chromium: Playwright browser type used to launch
            playwright_proxy: proxy settings passed to Playwright, or None
            user_agent: user agent for the context
            headless: whether to run without a visible window

        Returns:
            The created browser context.
        """
        utils.logger.info(
            "[ZhihuCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
            )
            return browser_context
        else:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}, user_agent=user_agent
            )
            return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch the browser in CDP mode, falling back to ``launch_browser``
        (standard mode) when anything in the CDP path raises.

        Args:
            playwright: the active Playwright instance
            playwright_proxy: proxy settings passed through, or None
            user_agent: user agent for the context
            headless: whether to run without a visible window

        Returns:
            The created browser context.
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )

            # Log the browser information.
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[ZhihuCrawler] CDP浏览器信息: {browser_info}")

            return browser_context

        except Exception as e:
            utils.logger.error(f"[ZhihuCrawler] CDP模式启动失败,回退到标准模式: {e}")
            # Fall back to standard mode.
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
            )

    async def close(self):
        """Close the browser context (via the CDP manager when one is in use)."""
        # CDP mode needs its own cleanup path.
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,23 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
    """Raised when fetching data from the remote API fails."""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
|
||||
|
||||
class ForbiddenError(RequestError):
    """Raised when the server answers with a forbidden response."""
|
||||
@@ -0,0 +1,47 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from typing import NamedTuple
|
||||
|
||||
from constant import zhihu as zhihu_constant
|
||||
|
||||
|
||||
class SearchTime(Enum):
    """Time-window filter for search results."""

    # no time restriction
    DEFAULT = ""
    # within one day
    ONE_DAY = "a_day"
    # within one week
    ONE_WEEK = "a_week"
    # within one month
    ONE_MONTH = "a_month"
    # within three months
    THREE_MONTH = "three_months"
    # within half a year
    HALF_YEAR = "half_a_year"
    # within one year
    ONE_YEAR = "a_year"
|
||||
|
||||
|
||||
class SearchType(Enum):
    """Content-type filter for search results."""

    # no type restriction
    DEFAULT = ""
    # answers only
    ANSWER = zhihu_constant.ANSWER_NAME
    # articles only
    ARTICLE = zhihu_constant.ARTICLE_NAME
    # videos only
    VIDEO = zhihu_constant.VIDEO_NAME
|
||||
|
||||
|
||||
class SearchSort(Enum):
    """Ordering of search results."""

    # default (comprehensive) ordering
    DEFAULT = ""
    # most upvoted first
    UPVOTED_COUNT = "upvoted_count"
    # most recently published first
    CREATE_TIME = "created_time"
|
||||
@@ -0,0 +1,467 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import execjs
|
||||
from parsel import Selector
|
||||
|
||||
from constant import zhihu as zhihu_constant
|
||||
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from tools import utils
|
||||
from tools.crawler_util import extract_text_from_html
|
||||
|
||||
ZHIHU_SGIN_JS = None
|
||||
|
||||
|
||||
def sign(url: str, cookies: str) -> Dict:
    """
    Run the zhihu request-signing algorithm implemented in ``libs/zhihu.js``.

    The JS source is compiled with ``execjs`` on first use and cached in the
    module-level ``ZHIHU_SGIN_JS``; later calls reuse the compiled context.

    Args:
        url: request url with query string
        cookies: request cookies with d_c0 key

    Returns:
        The result of calling the script's ``get_sign`` function.
    """
    global ZHIHU_SGIN_JS
    if not ZHIHU_SGIN_JS:
        # utf-8-sig transparently drops a BOM if the script has one.
        with open("libs/zhihu.js", mode="r", encoding="utf-8-sig") as script_file:
            js_source = script_file.read()
        ZHIHU_SGIN_JS = execjs.compile(js_source)

    signature = ZHIHU_SGIN_JS.call("get_sign", url, cookies)
    return signature
|
||||
|
||||
|
||||
class ZhihuExtractor:
|
||||
def __init__(self):
    """Create the extractor; it keeps no state."""
    return None
|
||||
|
||||
def extract_contents_from_search(self, json_data: Dict) -> List[ZhihuContent]:
    """
    Extract zhihu contents from a search API response.

    Only entries whose ``type`` is ``search_result`` or ``zvideo`` are kept,
    and of those only the ones with a truthy ``object`` payload; the payloads
    are then converted by ``self._extract_content_list``.

    Args:
        json_data: zhihu search response; may be empty or None

    Returns:
        The converted contents, or an empty list when there is no data.
    """
    if not json_data:
        return []

    # ``data`` may be present but null, so ``.get("data", [])`` is not enough.
    search_result: List[Dict] = json_data.get("data") or []
    wanted_types = ("search_result", "zvideo")
    payloads = [
        item.get("object")
        for item in search_result
        if item.get("type") in wanted_types and item.get("object")
    ]
    return self._extract_content_list(payloads)
|
||||
|
||||
|
||||
def _extract_content_list(self, content_list: List[Dict]) -> List[ZhihuContent]:
    """
    Convert raw content dicts into ``ZhihuContent`` objects.

    Each dict is routed by its ``type`` field to the answer, article or
    zvideo extractor; dicts of any other type are dropped.

    Args:
        content_list: raw content dicts; may be empty or None

    Returns:
        The converted contents in input order.
    """
    converted: List[ZhihuContent] = []
    if not content_list:
        return converted

    for raw in content_list:
        if raw.get("type") == zhihu_constant.ANSWER_NAME:
            extractor = self._extract_answer_content
        elif raw.get("type") == zhihu_constant.ARTICLE_NAME:
            extractor = self._extract_article_content
        elif raw.get("type") == zhihu_constant.VIDEO_NAME:
            extractor = self._extract_zvideo_content
        else:
            # Unknown content type: skip it.
            continue
        converted.append(extractor(raw))
    return converted
|
||||
|
||||
def _extract_answer_content(self, answer: Dict) -> ZhihuContent:
    """
    Build a ``ZhihuContent`` from a raw answer dict.

    Args:
        answer: zhihu answer; its ``question`` entry must be a dict because
            the question id is read from it

    Returns:
        The populated content object, including author fields.
    """
    item = ZhihuContent()
    item.content_id = answer.get("id")
    item.content_type = answer.get("type")
    item.content_text = extract_text_from_html(answer.get("content", ""))
    item.question_id = answer.get("question").get("id")
    item.content_url = f"{zhihu_constant.ZHIHU_URL}/question/{item.question_id}/answer/{item.content_id}"
    item.title = extract_text_from_html(answer.get("title", ""))
    # Prefer the description; fall back to the excerpt when it is empty.
    summary_html = answer.get("description", "") or answer.get("excerpt", "")
    item.desc = extract_text_from_html(summary_html)
    item.created_time = answer.get("created_time")
    item.updated_time = answer.get("updated_time")
    item.voteup_count = answer.get("voteup_count", 0)
    item.comment_count = answer.get("comment_count", 0)

    # Copy the author fields onto the content record.
    author = self._extract_content_or_comment_author(answer.get("author"))
    item.user_id = author.user_id
    item.user_link = author.user_link
    item.user_nickname = author.user_nickname
    item.user_avatar = author.user_avatar
    item.user_url_token = author.url_token
    return item
|
||||
|
||||
def _extract_article_content(self, article: Dict) -> ZhihuContent:
    """
    Build a ZhihuContent from a raw zhihu article dict.

    Args:
        article: raw article dict

    Returns:
        ZhihuContent populated with the article fields and author info.
    """
    content = ZhihuContent()
    content.content_id = article.get("id")
    content.content_type = article.get("type")
    content.content_text = extract_text_from_html(article.get("content"))
    content.content_url = f"{zhihu_constant.ZHIHU_ZHUANLAN_URL}/p/{content.content_id}"
    content.title = extract_text_from_html(article.get("title"))
    content.desc = extract_text_from_html(article.get("excerpt"))
    # Timestamps are read from "created_time"/"updated_time" first, then
    # from "created"/"updated" when the former are missing or falsy.
    created = article.get("created_time", 0) or article.get("created", 0)
    updated = article.get("updated_time", 0) or article.get("updated", 0)
    content.created_time = created
    content.updated_time = updated
    content.voteup_count = article.get("voteup_count", 0)
    content.comment_count = article.get("comment_count", 0)

    # Copy the author fields onto the content record.
    author = self._extract_content_or_comment_author(article.get("author"))
    content.user_id = author.user_id
    content.user_link = author.user_link
    content.user_nickname = author.user_nickname
    content.user_avatar = author.user_avatar
    content.user_url_token = author.url_token
    return content
|
||||
|
||||
def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
    """
    Build a ZhihuContent from a raw zhihu zvideo dict.

    Two payload shapes are handled: items carrying a nested "video" dict
    (the creator-homepage video list API) and items without one.

    Args:
        zvideo: raw zvideo dict

    Returns:
        ZhihuContent populated with the video fields and author info.
    """
    res = ZhihuContent()

    # The id must be set before the URL below is built from it; previously
    # the URL was formatted while content_id still held its default value.
    res.content_id = zvideo.get("id")
    res.content_type = zvideo.get("type")

    if "video" in zvideo and isinstance(zvideo.get("video"), dict):
        # Payload came from the creator-homepage video list API.
        res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
        res.created_time = zvideo.get("published_at")
        res.updated_time = zvideo.get("updated_at")
    else:
        res.content_url = zvideo.get("video_url")
        res.created_time = zvideo.get("created_at")

    res.title = extract_text_from_html(zvideo.get("title"))
    res.desc = extract_text_from_html(zvideo.get("description"))
    res.voteup_count = zvideo.get("voteup_count")
    res.comment_count = zvideo.get("comment_count")

    # Copy the author fields onto the content record.
    author_info = self._extract_content_or_comment_author(zvideo.get("author"))
    res.user_id = author_info.user_id
    res.user_link = author_info.user_link
    res.user_nickname = author_info.user_nickname
    res.user_avatar = author_info.user_avatar
    res.user_url_token = author_info.url_token
    return res
|
||||
|
||||
@staticmethod
def _extract_content_or_comment_author(author: Dict) -> ZhihuCreator:
    """
    Build a ZhihuCreator from the author dict of a content item or comment.

    Extraction is best-effort: any exception is logged as a warning and
    the creator is returned with whatever fields were filled so far.

    Args:
        author: raw author dict; when it has no "id", the nested "member"
            entry is used instead

    Returns:
        ZhihuCreator (left at its defaults when author is empty).
    """
    creator = ZhihuCreator()
    try:
        if not author:
            return creator

        # Use the nested "member" entry when the top level has no "id".
        member = author if author.get("id") else author.get("member")

        creator.user_id = member.get("id")
        token = member.get("url_token")
        creator.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{token}"
        creator.user_nickname = member.get("name")
        creator.user_avatar = member.get("avatar_url")
        creator.url_token = token
    except Exception as e:
        utils.logger.warning(
            f"[ZhihuExtractor._extract_content_or_comment_author] User Maybe Blocked. {e}"
        )
    return creator
|
||||
|
||||
def extract_comments(self, page_content: ZhihuContent, comments: List[Dict]) -> List[ZhihuComment]:
    """
    Convert raw zhihu comments into ZhihuComment objects.

    Args:
        page_content: the content object the comments belong to
        comments: raw comment dicts; only entries whose "type" is
            "comment" are kept

    Returns:
        The extracted comments, in input order (empty list for empty input).
    """
    if not comments:
        return []

    return [
        self._extract_comment(page_content, raw)
        for raw in comments
        if raw.get("type") == "comment"
    ]
|
||||
|
||||
def _extract_comment(self, page_content: ZhihuContent, comment: Dict) -> ZhihuComment:
    """
    Build a ZhihuComment from a raw zhihu comment dict.

    Args:
        page_content: the content object the comment belongs to; its id
            and type are copied onto the comment
        comment: raw comment dict

    Returns:
        ZhihuComment populated with the comment fields and author info.
    """
    item = ZhihuComment()
    item.comment_id = str(comment.get("id", ""))
    item.parent_comment_id = comment.get("reply_comment_id")
    item.content = extract_text_from_html(comment.get("content"))
    item.publish_time = comment.get("created_time")
    tags = comment.get("comment_tag", [])
    item.ip_location = self._extract_comment_ip_location(tags)
    item.sub_comment_count = comment.get("child_comment_count")
    # Missing or falsy counters are stored as 0.
    item.like_count = comment.get("like_count") or 0
    item.dislike_count = comment.get("dislike_count") or 0
    item.content_id = page_content.content_id
    item.content_type = page_content.content_type

    # Copy the author fields onto the comment record.
    author = self._extract_content_or_comment_author(comment.get("author"))
    item.user_id = author.user_id
    item.user_link = author.user_link
    item.user_nickname = author.user_nickname
    item.user_avatar = author.user_avatar
    return item
|
||||
|
||||
@staticmethod
|
||||
def _extract_comment_ip_location(comment_tags: List[Dict]) -> str:
|
||||
"""
|
||||
extract comment ip location
|
||||
Args:
|
||||
comment_tags:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not comment_tags:
|
||||
return ""
|
||||
|
||||
for ct in comment_tags:
|
||||
if ct.get("type") == "ip_info":
|
||||
return ct.get("text")
|
||||
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def extract_offset(paging_info: Dict) -> str:
|
||||
"""
|
||||
extract offset
|
||||
Args:
|
||||
paging_info:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# https://www.zhihu.com/api/v4/comment_v5/zvideos/1424368906836807681/root_comment?limit=10&offset=456770961_10125996085_0&order_by=score
|
||||
next_url = paging_info.get("next")
|
||||
if not next_url:
|
||||
return ""
|
||||
|
||||
parsed_url = urlparse(next_url)
|
||||
query_params = parse_qs(parsed_url.query)
|
||||
offset = query_params.get('offset', [""])[0]
|
||||
return offset
|
||||
|
||||
@staticmethod
|
||||
def _foramt_gender_text(gender: int) -> str:
|
||||
"""
|
||||
format gender text
|
||||
Args:
|
||||
gender:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if gender == 1:
|
||||
return "男"
|
||||
elif gender == 0:
|
||||
return "女"
|
||||
else:
|
||||
return "未知"
|
||||
|
||||
|
||||
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]:
    """
    Extract a creator profile from the HTML of a zhihu creator page.

    The profile is read from the JSON in the page's
    <script id="js-initialData"> element, under
    initialState.entities.users[user_url_token].

    Args:
        user_url_token: zhihu creator url token
        html_content: zhihu creator page HTML

    Returns:
        ZhihuCreator, or None when the HTML, the script payload, the users
        map or the entry for this token is missing or empty.
    """
    if not html_content:
        return None

    script_text = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
    raw_state = script_text.strip()
    if not raw_state:
        return None

    state: Dict = json.loads(raw_state)
    users: Dict = state.get("initialState", {}).get("entities", {}).get("users", {})
    profile: Dict = users.get(user_url_token) if users else None
    if not profile:
        return None

    creator = ZhihuCreator()
    creator.user_id = profile.get("id")
    creator.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{user_url_token}"
    creator.user_nickname = profile.get("name")
    creator.user_avatar = profile.get("avatarUrl")
    # Use the requested token when the profile has no "urlToken".
    creator.url_token = profile.get("urlToken") or user_url_token
    creator.gender = self._foramt_gender_text(profile.get("gender"))
    creator.ip_location = profile.get("ipInfo")
    creator.follows = profile.get("followingCount")
    creator.fans = profile.get("followerCount")
    creator.anwser_count = profile.get("answerCount")
    creator.video_count = profile.get("zvideoCount")
    creator.question_count = profile.get("questionCount")
    creator.article_count = profile.get("articlesCount")
    creator.column_count = profile.get("columnsCount")
    creator.get_voteup_count = profile.get("voteupCount")
    return creator
|
||||
|
||||
|
||||
def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[ZhihuContent]:
    """
    Convert raw content items from a creator's homepage into ZhihuContent objects.

    Args:
        anwser_list: raw content dicts from a creator listing

    Returns:
        The extracted contents (empty list for empty input).
    """
    return self._extract_content_list(anwser_list) if anwser_list else []
|
||||
|
||||
|
||||
|
||||
|
||||
def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract an answer from the HTML of a zhihu answer page.

    Reads the JSON in <script id="js-initialData"> and uses the first
    entry of initialState.entities.answers.

    Args:
        html_content: answer page HTML

    Returns:
        ZhihuContent, or None when the script payload or the answers map
        is missing or empty.
    """
    raw_state: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
    if not raw_state:
        return None

    state: Dict = json.loads(raw_state)
    answers: Dict = state.get("initialState", {}).get("entities", {}).get("answers", {})
    if not answers:
        return None

    # The first entry in the answers map is taken as the page's answer.
    first_answer = next(iter(answers.values()))
    return self._extract_answer_content(first_answer)
|
||||
|
||||
def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract an article from the HTML of a zhihu article page.

    Reads the JSON in <script id="js-initialData"> and uses the first
    entry of initialState.entities.articles.

    Args:
        html_content: article page HTML

    Returns:
        ZhihuContent, or None when the script payload or the articles map
        is missing or empty.
    """
    raw_state: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
    if not raw_state:
        return None

    state: Dict = json.loads(raw_state)
    articles: Dict = state.get("initialState", {}).get("entities", {}).get("articles", {})
    if not articles:
        return None

    # The first entry in the articles map is taken as the page's article.
    first_article = next(iter(articles.values()))
    return self._extract_article_content(first_article)
|
||||
|
||||
def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract a zvideo from the HTML of a zhihu video page.

    Reads the JSON in <script id="js-initialData"> and uses the first
    entry of initialState.entities.zvideos. When that entry's "author" is
    a string, it is replaced in place by the matching entry of
    initialState.entities.users before extraction.

    Args:
        html_content: video page HTML

    Returns:
        ZhihuContent, or None when the script payload, the zvideos map or
        its first entry is missing or empty.
    """
    raw_state: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
    if not raw_state:
        return None

    state: Dict = json.loads(raw_state)
    entities: Dict = state.get("initialState", {}).get("entities", {})
    zvideos: Dict = entities.get("zvideos", {})
    users: Dict = state.get("initialState", {}).get("entities", {}).get("users", {})
    if not zvideos:
        return None

    video: Dict = next(iter(zvideos.values()))
    if not video:
        return None

    # A string author is a key into the users map; swap in the user dict.
    author_ref = video.get("author")
    if isinstance(author_ref, str):
        video["author"] = users.get(author_ref)

    return self._extract_zvideo_content(video)
|
||||
|
||||
|
||||
def judge_zhihu_url(note_detail_url: str) -> str:
    """
    Classify a zhihu detail URL by the path marker it contains.

    Args:
        note_detail_url: detail page URL, for example
            https://www.zhihu.com/question/123456789/answer/123456789  (answer)
            https://www.zhihu.com/p/123456789  (article)
            https://www.zhihu.com/zvideo/123456789  (zvideo)

    Returns:
        The matching zhihu_constant type name, or "" when the URL contains
        none of "/answer/", "/p/" or "/zvideo/".
    """
    # Markers are tested in this order; the first one found decides.
    if "/answer/" in note_detail_url:
        return zhihu_constant.ANSWER_NAME
    if "/p/" in note_detail_url:
        return zhihu_constant.ARTICLE_NAME
    if "/zvideo/" in note_detail_url:
        return zhihu_constant.VIDEO_NAME
    return ""
|
||||
@@ -0,0 +1,115 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class ZhiHuLogin(AbstractLogin):
    """Log a Playwright browser context into zhihu.

    QR-code and cookie login are implemented; phone login is declared but
    not implemented yet.
    """

    # Polling budget used by check_login_state: attempts x interval is the
    # number of seconds the user has to scan the QR code.
    LOGIN_CHECK_MAX_ATTEMPTS = 600
    LOGIN_CHECK_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: "qrcode", "phone" or "cookie"; also written to
                config.LOGIN_TYPE, which begin() reads
            browser_context: Playwright browser context to log in
            context_page: page on which the login QR code is looked up
            login_phone: phone number for phone login (currently unused)
            cookie_str: cookie string used by cookie login
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(
        stop=stop_after_attempt(LOGIN_CHECK_MAX_ATTEMPTS),
        wait=wait_fixed(LOGIN_CHECK_INTERVAL_SECONDS),
        retry=retry_if_result(lambda value: value is False),
    )
    async def check_login_state(self) -> bool:
        """
        Check whether the browser context is logged in.

        The call is retried while it returns False, up to
        LOGIN_CHECK_MAX_ATTEMPTS times with LOGIN_CHECK_INTERVAL_SECONDS
        between attempts.

        Returns:
            True when the "z_c0" cookie is present and non-empty, else False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return bool(cookie_dict.get("z_c0"))

    async def begin(self):
        """
        Start the zhihu login flow selected by config.LOGIN_TYPE.

        Raises:
            ValueError: when config.LOGIN_TYPE is not a supported login type.
        """
        utils.logger.info("[ZhiHu.begin] Begin login zhihu ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[ZhiHu.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    async def login_by_mobile(self):
        """Login zhihu by mobile (not implemented; currently a no-op)."""
        # TODO: implement login by mobile

    async def login_by_qrcode(self):
        """
        Log in by QR code and wait until the login cookie appears.

        Exits the process when the QR code cannot be found on the page or
        the login is not confirmed within the polling budget.
        """
        utils.logger.info("[ZhiHu.login_by_qrcode] Begin login zhihu by qrcode ...")
        qrcode_img_selector = "canvas.Qrcode-qrcode"
        # find login qrcode
        base64_qrcode_img = await utils.find_qrcode_img_from_canvas(
            self.context_page,
            canvas_selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[ZhiHu.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # Show the QR code in an executor so the event loop is not blocked
        # (fix for issue #12); the returned future is intentionally not awaited.
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # Report the real polling budget rather than a hard-coded figure.
        wait_login_seconds = self.LOGIN_CHECK_MAX_ATTEMPTS * self.LOGIN_CHECK_INTERVAL_SECONDS
        utils.logger.info(
            f"[ZhiHu.login_by_qrcode] waiting for scan code login, remaining time is {wait_login_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[ZhiHu.login_by_qrcode] Login zhihu failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(
            f"[ZhiHu.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """Log in by adding every cookie from self.cookie_str to the browser context for ".zhihu.com"."""
        utils.logger.info("[ZhiHu.login_by_cookies] Begin login zhihu by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".zhihu.com",
                'path': "/"
            }])
|
||||
Reference in New Issue
Block a user