Uploading the AI Crawler System: MindSpider

This commit is contained in:
戒酒的李白
2025-08-27 13:49:07 +08:00
parent 822bad557f
commit 587e709e82
174 changed files with 34562 additions and 25 deletions
@@ -0,0 +1,12 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from .core import DouYinCrawler
@@ -0,0 +1,326 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import copy
import json
import urllib.parse
from typing import Any, Callable, Dict, Union, Optional
import httpx
from playwright.async_api import BrowserContext
from base.base_crawler import AbstractApiClient
from tools import utils
from var import request_keyword_var
from .exception import *
from .field import *
from .help import *
class DouYinClient(AbstractApiClient):
def __init__(
self,
timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
proxy=None,
*,
headers: Dict,
playwright_page: Optional[Page],
cookie_dict: Dict,
):
self.proxy = proxy
self.timeout = timeout
self.headers = headers
self._host = "https://www.douyin.com"
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
async def __process_req_params(
self,
uri: str,
params: Optional[Dict] = None,
headers: Optional[Dict] = None,
request_method="GET",
):
if not params:
return
headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
common_params = {
"device_platform": "webapp",
"aid": "6383",
"channel": "channel_pc_web",
"version_code": "190600",
"version_name": "19.6.0",
"update_version_code": "170400",
"pc_client_type": "1",
"cookie_enabled": "true",
"browser_language": "zh-CN",
"browser_platform": "MacIntel",
"browser_name": "Chrome",
"browser_version": "125.0.0.0",
"browser_online": "true",
"engine_name": "Blink",
"os_name": "Mac OS",
"os_version": "10.15.7",
"cpu_core_num": "8",
"device_memory": "8",
"engine_version": "109.0",
"platform": "PC",
"screen_width": "2560",
"screen_height": "1440",
'effective_type': '4g',
"round_trip_time": "50",
"webid": get_web_id(),
"msToken": local_storage.get("xmst"),
}
params.update(common_params)
query_string = urllib.parse.urlencode(params)
# 20240927 a-bogus更新(JS版本)
post_data = {}
if request_method == "POST":
post_data = params
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs):
async with httpx.AsyncClient(proxy=self.proxy) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
try:
if response.text == "" or response.text == "blocked":
utils.logger.error(f"request params incrr, response.text: {response.text}")
raise Exception("account blocked")
return response.json()
except Exception as e:
raise DataFetchError(f"{e}, {response.text}")
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
"""
GET请求
"""
await self.__process_req_params(uri, params, headers)
headers = headers or self.headers
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
await self.__process_req_params(uri, data, headers)
headers = headers or self.headers
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
async def pong(self, browser_context: BrowserContext) -> bool:
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
if local_storage.get("HasUserLogin", "") == "1":
return True
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
return cookie_dict.get("LOGIN_STATUS") == "1"
async def update_cookies(self, browser_context: BrowserContext):
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict
async def search_info_by_keyword(
self,
keyword: str,
offset: int = 0,
search_channel: SearchChannelType = SearchChannelType.GENERAL,
sort_type: SearchSortType = SearchSortType.GENERAL,
publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
search_id: str = "",
):
"""
DouYin Web Search API
:param keyword:
:param offset:
:param search_channel:
:param sort_type:
:param publish_time: ·
:param search_id: ·
:return:
"""
query_params = {
'search_channel': search_channel.value,
'enable_history': '1',
'keyword': keyword,
'search_source': 'tab_search',
'query_correct_type': '1',
'is_filter_search': '0',
'from_group_id': '7378810571505847586',
'offset': offset,
'count': '15',
'need_filter_settings': '1',
'list_type': 'multi',
'search_id': search_id,
}
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)})
query_params["is_filter_search"] = 1
query_params["search_source"] = "tab_search"
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
headers = copy.copy(self.headers)
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
async def get_video_by_id(self, aweme_id: str) -> Any:
"""
DouYin Video Detail API
:param aweme_id:
:return:
"""
params = {"aweme_id": aweme_id}
headers = copy.copy(self.headers)
del headers["Origin"]
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
return res.get("aweme_detail", {})
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
"""get note comments
"""
uri = "/aweme/v1/web/comment/list/"
params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
keywords = request_keyword_var.get()
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
headers = copy.copy(self.headers)
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get(uri, params)
async def get_sub_comments(self, aweme_id: str, comment_id: str, cursor: int = 0):
"""
获取子评论
"""
uri = "/aweme/v1/web/comment/list/reply/"
params = {
'comment_id': comment_id,
"cursor": cursor,
"count": 20,
"item_type": 0,
"item_id": aweme_id,
}
keywords = request_keyword_var.get()
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
headers = copy.copy(self.headers)
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get(uri, params)
async def get_aweme_all_comments(
self,
aweme_id: str,
crawl_interval: float = 1.0,
is_fetch_sub_comments=False,
callback: Optional[Callable] = None,
max_count: int = 10,
):
"""
获取帖子的所有评论,包括子评论
:param aweme_id: 帖子ID
:param crawl_interval: 抓取间隔
:param is_fetch_sub_comments: 是否抓取子评论
:param callback: 回调函数,用于处理抓取到的评论
:param max_count: 一次帖子爬取的最大评论数量
:return: 评论列表
"""
result = []
comments_has_more = 1
comments_cursor = 0
while comments_has_more and len(result) < max_count:
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
comments_has_more = comments_res.get("has_more", 0)
comments_cursor = comments_res.get("cursor", 0)
comments = comments_res.get("comments", [])
if not comments:
continue
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
result.extend(comments)
if callback: # 如果有回调函数,就执行回调函数
await callback(aweme_id, comments)
await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments:
continue
# 获取二级评论
for comment in comments:
reply_comment_total = comment.get("reply_comment_total")
if reply_comment_total > 0:
comment_id = comment.get("cid")
sub_comments_has_more = 1
sub_comments_cursor = 0
while sub_comments_has_more:
sub_comments_res = await self.get_sub_comments(aweme_id, comment_id, sub_comments_cursor)
sub_comments_has_more = sub_comments_res.get("has_more", 0)
sub_comments_cursor = sub_comments_res.get("cursor", 0)
sub_comments = sub_comments_res.get("comments", [])
if not sub_comments:
continue
result.extend(sub_comments)
if callback: # 如果有回调函数,就执行回调函数
await callback(aweme_id, sub_comments)
await asyncio.sleep(crawl_interval)
return result
async def get_user_info(self, sec_user_id: str):
uri = "/aweme/v1/web/user/profile/other/"
params = {
"sec_user_id": sec_user_id,
"publish_video_strategy_type": 2,
"personal_center_strategy": 1,
}
return await self.get(uri, params)
async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict:
uri = "/aweme/v1/web/aweme/post/"
params = {
"sec_user_id": sec_user_id,
"count": 18,
"max_cursor": max_cursor,
"locate_query": "false",
"publish_video_strategy_type": 2,
'verifyFp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU',
'fp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU'
}
return await self.get(uri, params)
async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
posts_has_more = 1
max_cursor = ""
result = []
while posts_has_more == 1:
aweme_post_res = await self.get_user_aweme_posts(sec_user_id, max_cursor)
posts_has_more = aweme_post_res.get("has_more", 0)
max_cursor = aweme_post_res.get("max_cursor")
aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
if callback:
await callback(aweme_list)
result.extend(aweme_list)
return result
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
async with httpx.AsyncClient(proxy=self.proxy) as client:
try:
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
response.raise_for_status()
if not response.reason_phrase == "OK":
utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
return None
else:
return response.content
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
return None
@@ -0,0 +1,393 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import os
import random
from asyncio import Task
from typing import Any, Dict, List, Optional, Tuple
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import douyin as douyin_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var
from .client import DouYinClient
from .exception import DataFetchError
from .field import PublishTimeType
from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler):
context_page: Page
dy_client: DouYinClient
browser_context: BrowserContext
cdp_manager: Optional[CDPBrowserManager]
def __init__(self) -> None:
self.index_url = "https://www.douyin.com"
self.cdp_manager = None
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
async with async_playwright() as playwright:
# 根据配置选择启动模式
if config.ENABLE_CDP_MODE:
utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
self.browser_context = await self.launch_browser_with_cdp(
playwright,
playwright_proxy_format,
None,
headless=config.CDP_HEADLESS,
)
else:
utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(
chromium,
playwright_proxy_format,
user_agent=None,
headless=config.HEADLESS,
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
if not await self.dy_client.pong(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=config.LOGIN_TYPE,
login_phone="", # you phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES,
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_awemes()
elif config.CRAWLER_TYPE == "creator":
# Get the information and comments of the specified creator
await self.get_creators_and_videos()
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
async def search(self) -> None:
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
page = 0
dy_search_id = ""
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1
continue
try:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(
keyword=keyword,
offset=page * dy_limit_count - dy_limit_count,
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
break
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
break
page += 1
if "data" not in posts_res:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
break
dy_search_id = posts_res.get("extra", {}).get("logid", "")
for post_item in posts_res.get("data"):
try:
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
except TypeError:
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
"""Get the information and comments of the specified post"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
aweme_details = await asyncio.gather(*task_list)
for aweme_detail in aweme_details:
if aweme_detail is not None:
await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
await self.get_aweme_media(aweme_item=aweme_detail)
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
"""Get note detail"""
async with semaphore:
try:
return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
"""
Batch get note comments
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return
task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
task_list.append(task)
if len(task_list) > 0:
await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
async with semaphore:
try:
# 将关键词列表传递给 get_aweme_all_comments 方法
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
crawl_interval=random.random(),
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
async def get_creators_and_videos(self) -> None:
"""
Get the information and videos of the specified creator
"""
utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
for user_id in config.DY_CREATOR_ID_LIST:
creator_info: Dict = await self.dy_client.get_user_info(user_id)
if creator_info:
await douyin_store.save_creator(user_id, creator=creator_info)
# Get all video information of the creator
all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)
video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
await self.batch_get_note_comments(video_ids)
async def fetch_creator_video_detail(self, video_list: List[Dict]):
"""
Concurrently obtain the specified post list and save the data
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]
note_details = await asyncio.gather(*task_list)
for aweme_item in note_details:
if aweme_item is not None:
await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
await self.get_aweme_media(aweme_item=aweme_item)
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
douyin_client = DouYinClient(
proxy=httpx_proxy,
headers={
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8",
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return douyin_client
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent,
) # type: ignore
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
return browser_context
async def launch_browser_with_cdp(
self,
playwright: Playwright,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True,
) -> BrowserContext:
"""
使用CDP模式启动浏览器
"""
try:
self.cdp_manager = CDPBrowserManager()
browser_context = await self.cdp_manager.launch_and_connect(
playwright=playwright,
playwright_proxy=playwright_proxy,
user_agent=user_agent,
headless=headless,
)
# 添加反检测脚本
await self.cdp_manager.add_stealth_script()
# 显示浏览器信息
browser_info = await self.cdp_manager.get_browser_info()
utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")
return browser_context
except Exception as e:
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
# 回退到标准模式
chromium = playwright.chromium
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
async def close(self) -> None:
"""Close browser context"""
# 如果使用CDP模式,需要特殊处理
if self.cdp_manager:
await self.cdp_manager.cleanup()
self.cdp_manager = None
else:
await self.browser_context.close()
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
async def get_aweme_media(self, aweme_item: Dict):
"""
获取抖音媒体,自动判断媒体类型是短视频还是帖子图片并下载
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
return
# 笔记 urls 列表,若为短视频类型则返回为空列表
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
# TODO: 抖音并没采用音视频分离的策略,故音频可从原视频中分离,暂不提取
if note_download_url:
await self.get_aweme_images(aweme_item)
else:
await self.get_aweme_video(aweme_item)
async def get_aweme_images(self, aweme_item: Dict):
"""
get aweme images. please use get_aweme_media
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# 笔记 urls 列表,若为短视频类型则返回为空列表
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
if not note_download_url:
return
picNum = 0
for url in note_download_url:
if not url:
continue
content = await self.dy_client.get_aweme_media(url)
await asyncio.sleep(random.random())
if content is None:
continue
extension_file_name = f"{picNum:>03d}.jpeg"
picNum += 1
await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)
async def get_aweme_video(self, aweme_item: Dict):
"""
get aweme videos. please use get_aweme_media
Args:
aweme_item (Dict): 抖音作品详情
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
if not video_download_url:
return
content = await self.dy_client.get_aweme_media(video_download_url)
await asyncio.sleep(random.random())
if content is None:
return
extension_file_name = f"video.mp4"
await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)
@@ -0,0 +1,20 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from httpx import RequestError
class DataFetchError(RequestError):
"""something error when fetch"""
class IPBlockError(RequestError):
"""fetch so fast that the server block us ip"""
@@ -0,0 +1,34 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from enum import Enum
class SearchChannelType(Enum):
"""search channel type"""
GENERAL = "aweme_general" # 综合
VIDEO = "aweme_video_web" # 视频
USER = "aweme_user_web" # 用户
LIVE = "aweme_live" # 直播
class SearchSortType(Enum):
"""search sort type"""
GENERAL = 0 # 综合排序
MOST_LIKE = 1 # 最多点赞
LATEST = 2 # 最新发布
class PublishTimeType(Enum):
"""publish time type"""
UNLIMITED = 0 # 不限
ONE_DAY = 1 # 一天内
ONE_WEEK = 7 # 一周内
SIX_MONTH = 180 # 半年内
@@ -0,0 +1,85 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Name : 程序员阿江-Relakkes
# @Time : 2024/6/10 02:24
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
import random
import execjs
from playwright.async_api import Page
douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').read())
def get_web_id():
"""
生成随机的webid
Returns:
"""
def e(t):
if t is not None:
return str(t ^ (int(16 * random.random()) >> (t // 4)))
else:
return ''.join(
[str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
)
web_id = ''.join(
e(int(x)) if x in '018' else x for x in e(None)
)
return web_id.replace('-', '')[:19]
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
"""
获取 a_bogus 参数, 目前不支持post请求类型的签名
"""
return get_a_bogus_from_js(url, params, user_agent)
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
"""
通过js获取 a_bogus 参数
Args:
url:
params:
user_agent:
Returns:
"""
sign_js_name = "sign_datail"
if "/reply" in url:
sign_js_name = "sign_reply"
return douyin_sign_obj.call(sign_js_name, params, user_agent)
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
"""
通过playright获取 a_bogus 参数
playwright版本已失效
Returns:
"""
if not post_data:
post_data = ""
a_bogus = await page.evaluate(
"([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
[params, post_data, user_agent])
return a_bogus
@@ -0,0 +1,265 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import functools
import sys
from typing import Optional
from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from cache.cache_factory import CacheFactory
from tools import utils
class DouYinLogin(AbstractLogin):
def __init__(self,
login_type: str,
browser_context: BrowserContext, # type: ignore
context_page: Page, # type: ignore
login_phone: Optional[str] = "",
cookie_str: Optional[str] = ""
):
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
self.scan_qrcode_time = 60
self.cookie_str = cookie_str
async def begin(self):
"""
Start login douyin website
滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录
"""
# popup login dialog
await self.popup_login_dialog()
# select login type
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
await asyncio.sleep(6)
current_page_title = await self.context_page.title()
if "验证码中间页" in current_page_title:
await self.check_page_display_slider(move_step=3, slider_level="hard")
# check login state
utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
sys.exit()
# wait for redirect
wait_redirect_seconds = 5
utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self):
"""Check if the current login status is successful and return True otherwise return False"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
for page in self.browser_context.pages:
try:
local_storage = await page.evaluate("() => window.localStorage")
if local_storage.get("HasUserLogin", "") == "1":
return True
except Exception as e:
# utils.logger.warn(f"[DouYinLogin] check_login_state waring: {e}")
await asyncio.sleep(0.1)
if cookie_dict.get("LOGIN_STATUS") == "1":
return True
return False
async def popup_login_dialog(self):
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
dialog_selector = "xpath=//div[@id='login-panel-new']"
try:
# check dialog box is auto popup and wait for 10 seconds
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
except Exception as e:
utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
await login_button_ele.click()
await asyncio.sleep(0.5)
async def login_by_qrcode(self):
utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
qrcode_img_selector = "xpath=//div[@id='animate_qrcode_container']//img"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
sys.exit()
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
await asyncio.sleep(2)
async def login_by_mobile(self):
utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
await mobile_tap_ele.click()
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
mobile_input_ele = self.context_page.locator("xpath=//input[@placeholder='手机号']")
await mobile_input_ele.fill(self.login_phone)
await asyncio.sleep(0.5)
send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
await send_sms_code_btn.click()
# 检查是否有滑动验证码
await self.check_page_display_slider(move_step=10, slider_level="easy")
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0:
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = cache_client.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
await sms_code_input_ele.fill(value=sms_code_value.decode())
await asyncio.sleep(0.5)
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
await submit_btn_ele.click() # 点击登录
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
break
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
"""
检查页面是否出现滑动验证码
:return:
"""
# 等待滑动验证码的出现
back_selector = "#captcha-verify-image"
try:
await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
except PlaywrightTimeoutError: # 没有滑动验证码,直接返回
return
gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
max_slider_try_times = 20
slider_verify_success = False
while not slider_verify_success:
if max_slider_try_times <= 0:
utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
sys.exit()
try:
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
await asyncio.sleep(1)
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
page_content = await self.context_page.content()
if "操作过慢" in page_content or "提示重新操作" in page_content:
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
continue
# 滑动成功后,等待滑块消失
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
slider_verify_success = True
except Exception as e:
utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
await asyncio.sleep(1)
max_slider_try_times -= 1
utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
continue
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
"""
Move the slider to the right to complete the verification
:param back_selector: 滑动验证码背景图片的选择器
:param gap_selector: 滑动验证码的滑块选择器
:param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
:param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块
:return:
"""
# get slider background image
slider_back_elements = await self.context_page.wait_for_selector(
selector=back_selector,
timeout=1000 * 10, # wait 10 seconds
)
slide_back = str(await slider_back_elements.get_property("src")) # type: ignore
# get slider gap image
gap_elements = await self.context_page.wait_for_selector(
selector=gap_selector,
timeout=1000 * 10, # wait 10 seconds
)
gap_src = str(await gap_elements.get_property("src")) # type: ignore
# 识别滑块位置
slide_app = utils.Slide(gap=gap_src, bg=slide_back)
distance = slide_app.discern()
# 获取移动轨迹
tracks = utils.get_tracks(distance, slider_level)
new_1 = tracks[-1] - (sum(tracks) - distance)
tracks.pop()
tracks.append(new_1)
# 根据轨迹拖拽滑块到指定位置
element = await self.context_page.query_selector(gap_selector)
bounding_box = await element.bounding_box() # type: ignore
await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore
bounding_box["y"] + bounding_box["height"] / 2) # type: ignore
# 这里获取到x坐标中心点位置
x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore
# 模拟滑动操作
await element.hover() # type: ignore
await self.context_page.mouse.down()
for track in tracks:
# 循环鼠标按照轨迹移动
# steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
await self.context_page.mouse.move(x + track, 0, steps=move_step)
x += track
await self.context_page.mouse.up()
async def login_by_cookies(self):
utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".douyin.com",
'path': "/"
}])