Uploading the AI Crawler System: MindSpider
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .core import DouYinCrawler
|
||||
@@ -0,0 +1,326 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import urllib.parse
|
||||
from typing import Any, Callable, Dict, Union, Optional
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from var import request_keyword_var
|
||||
|
||||
from .exception import *
|
||||
from .field import *
|
||||
from .help import *
|
||||
|
||||
|
||||
class DouYinClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict,
|
||||
playwright_page: Optional[Page],
|
||||
cookie_dict: Dict,
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://www.douyin.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def __process_req_params(
|
||||
self,
|
||||
uri: str,
|
||||
params: Optional[Dict] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
request_method="GET",
|
||||
):
|
||||
|
||||
if not params:
|
||||
return
|
||||
headers = headers or self.headers
|
||||
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
|
||||
common_params = {
|
||||
"device_platform": "webapp",
|
||||
"aid": "6383",
|
||||
"channel": "channel_pc_web",
|
||||
"version_code": "190600",
|
||||
"version_name": "19.6.0",
|
||||
"update_version_code": "170400",
|
||||
"pc_client_type": "1",
|
||||
"cookie_enabled": "true",
|
||||
"browser_language": "zh-CN",
|
||||
"browser_platform": "MacIntel",
|
||||
"browser_name": "Chrome",
|
||||
"browser_version": "125.0.0.0",
|
||||
"browser_online": "true",
|
||||
"engine_name": "Blink",
|
||||
"os_name": "Mac OS",
|
||||
"os_version": "10.15.7",
|
||||
"cpu_core_num": "8",
|
||||
"device_memory": "8",
|
||||
"engine_version": "109.0",
|
||||
"platform": "PC",
|
||||
"screen_width": "2560",
|
||||
"screen_height": "1440",
|
||||
'effective_type': '4g',
|
||||
"round_trip_time": "50",
|
||||
"webid": get_web_id(),
|
||||
"msToken": local_storage.get("xmst"),
|
||||
}
|
||||
params.update(common_params)
|
||||
query_string = urllib.parse.urlencode(params)
|
||||
|
||||
# 20240927 a-bogus更新(JS版本)
|
||||
post_data = {}
|
||||
if request_method == "POST":
|
||||
post_data = params
|
||||
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
try:
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
raise DataFetchError(f"{e}, {response.text}")
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
"""
|
||||
GET请求
|
||||
"""
|
||||
await self.__process_req_params(uri, params, headers)
|
||||
headers = headers or self.headers
|
||||
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
|
||||
|
||||
async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
|
||||
await self.__process_req_params(uri, data, headers)
|
||||
headers = headers or self.headers
|
||||
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
|
||||
|
||||
async def pong(self, browser_context: BrowserContext) -> bool:
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
if local_storage.get("HasUserLogin", "") == "1":
|
||||
return True
|
||||
|
||||
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
return cookie_dict.get("LOGIN_STATUS") == "1"
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def search_info_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
offset: int = 0,
|
||||
search_channel: SearchChannelType = SearchChannelType.GENERAL,
|
||||
sort_type: SearchSortType = SearchSortType.GENERAL,
|
||||
publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
|
||||
search_id: str = "",
|
||||
):
|
||||
"""
|
||||
DouYin Web Search API
|
||||
:param keyword:
|
||||
:param offset:
|
||||
:param search_channel:
|
||||
:param sort_type:
|
||||
:param publish_time: ·
|
||||
:param search_id: ·
|
||||
:return:
|
||||
"""
|
||||
query_params = {
|
||||
'search_channel': search_channel.value,
|
||||
'enable_history': '1',
|
||||
'keyword': keyword,
|
||||
'search_source': 'tab_search',
|
||||
'query_correct_type': '1',
|
||||
'is_filter_search': '0',
|
||||
'from_group_id': '7378810571505847586',
|
||||
'offset': offset,
|
||||
'count': '15',
|
||||
'need_filter_settings': '1',
|
||||
'list_type': 'multi',
|
||||
'search_id': search_id,
|
||||
}
|
||||
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
|
||||
query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)})
|
||||
query_params["is_filter_search"] = 1
|
||||
query_params["search_source"] = "tab_search"
|
||||
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
|
||||
|
||||
async def get_video_by_id(self, aweme_id: str) -> Any:
|
||||
"""
|
||||
DouYin Video Detail API
|
||||
:param aweme_id:
|
||||
:return:
|
||||
"""
|
||||
params = {"aweme_id": aweme_id}
|
||||
headers = copy.copy(self.headers)
|
||||
del headers["Origin"]
|
||||
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||
return res.get("aweme_detail", {})
|
||||
|
||||
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
|
||||
"""get note comments
|
||||
|
||||
"""
|
||||
uri = "/aweme/v1/web/comment/list/"
|
||||
params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
|
||||
keywords = request_keyword_var.get()
|
||||
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_sub_comments(self, aweme_id: str, comment_id: str, cursor: int = 0):
|
||||
"""
|
||||
获取子评论
|
||||
"""
|
||||
uri = "/aweme/v1/web/comment/list/reply/"
|
||||
params = {
|
||||
'comment_id': comment_id,
|
||||
"cursor": cursor,
|
||||
"count": 20,
|
||||
"item_type": 0,
|
||||
"item_id": aweme_id,
|
||||
}
|
||||
keywords = request_keyword_var.get()
|
||||
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_aweme_all_comments(
|
||||
self,
|
||||
aweme_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
is_fetch_sub_comments=False,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
):
|
||||
"""
|
||||
获取帖子的所有评论,包括子评论
|
||||
:param aweme_id: 帖子ID
|
||||
:param crawl_interval: 抓取间隔
|
||||
:param is_fetch_sub_comments: 是否抓取子评论
|
||||
:param callback: 回调函数,用于处理抓取到的评论
|
||||
:param max_count: 一次帖子爬取的最大评论数量
|
||||
:return: 评论列表
|
||||
"""
|
||||
result = []
|
||||
comments_has_more = 1
|
||||
comments_cursor = 0
|
||||
while comments_has_more and len(result) < max_count:
|
||||
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
||||
comments_has_more = comments_res.get("has_more", 0)
|
||||
comments_cursor = comments_res.get("cursor", 0)
|
||||
comments = comments_res.get("comments", [])
|
||||
if not comments:
|
||||
continue
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
result.extend(comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(aweme_id, comments)
|
||||
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
continue
|
||||
# 获取二级评论
|
||||
for comment in comments:
|
||||
reply_comment_total = comment.get("reply_comment_total")
|
||||
|
||||
if reply_comment_total > 0:
|
||||
comment_id = comment.get("cid")
|
||||
sub_comments_has_more = 1
|
||||
sub_comments_cursor = 0
|
||||
|
||||
while sub_comments_has_more:
|
||||
sub_comments_res = await self.get_sub_comments(aweme_id, comment_id, sub_comments_cursor)
|
||||
sub_comments_has_more = sub_comments_res.get("has_more", 0)
|
||||
sub_comments_cursor = sub_comments_res.get("cursor", 0)
|
||||
sub_comments = sub_comments_res.get("comments", [])
|
||||
|
||||
if not sub_comments:
|
||||
continue
|
||||
result.extend(sub_comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(aweme_id, sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return result
|
||||
|
||||
async def get_user_info(self, sec_user_id: str):
|
||||
uri = "/aweme/v1/web/user/profile/other/"
|
||||
params = {
|
||||
"sec_user_id": sec_user_id,
|
||||
"publish_video_strategy_type": 2,
|
||||
"personal_center_strategy": 1,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict:
|
||||
uri = "/aweme/v1/web/aweme/post/"
|
||||
params = {
|
||||
"sec_user_id": sec_user_id,
|
||||
"count": 18,
|
||||
"max_cursor": max_cursor,
|
||||
"locate_query": "false",
|
||||
"publish_video_strategy_type": 2,
|
||||
'verifyFp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU',
|
||||
'fp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU'
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
|
||||
posts_has_more = 1
|
||||
max_cursor = ""
|
||||
result = []
|
||||
while posts_has_more == 1:
|
||||
aweme_post_res = await self.get_user_aweme_posts(sec_user_id, max_cursor)
|
||||
posts_has_more = aweme_post_res.get("has_more", 0)
|
||||
max_cursor = aweme_post_res.get("max_cursor")
|
||||
aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
|
||||
utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
|
||||
if callback:
|
||||
await callback(aweme_list)
|
||||
result.extend(aweme_list)
|
||||
return result
|
||||
|
||||
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
try:
|
||||
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
|
||||
response.raise_for_status()
|
||||
if not response.reason_phrase == "OK":
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
|
||||
return None
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
return None
|
||||
@@ -0,0 +1,393 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import douyin as douyin_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import DouYinClient
|
||||
from .exception import DataFetchError
|
||||
from .field import PublishTimeType
|
||||
from .login import DouYinLogin
|
||||
|
||||
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
dy_client: DouYinClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
None,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
chromium,
|
||||
playwright_proxy_format,
|
||||
user_agent=None,
|
||||
headless=config.HEADLESS,
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
|
||||
if not await self.dy_client.pong(browser_context=self.browser_context):
|
||||
login_obj = DouYinLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # you phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_awemes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get the information and comments of the specified creator
|
||||
await self.get_creators_and_videos()
|
||||
|
||||
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
|
||||
dy_limit_count = 10 # douyin limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
|
||||
start_page = config.START_PAGE # start page number
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
page = 0
|
||||
dy_search_id = ""
|
||||
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
|
||||
page += 1
|
||||
continue
|
||||
try:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
offset=page * dy_limit_count - dy_limit_count,
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
break
|
||||
except DataFetchError:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
|
||||
break
|
||||
|
||||
page += 1
|
||||
if "data" not in posts_res:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
|
||||
break
|
||||
dy_search_id = posts_res.get("extra", {}).get("logid", "")
|
||||
for post_item in posts_res.get("data"):
|
||||
try:
|
||||
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
|
||||
except TypeError:
|
||||
continue
|
||||
aweme_list.append(aweme_info.get("aweme_id", ""))
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
|
||||
await self.get_aweme_media(aweme_item=aweme_info)
|
||||
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def get_specified_awemes(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
|
||||
aweme_details = await asyncio.gather(*task_list)
|
||||
for aweme_detail in aweme_details:
|
||||
if aweme_detail is not None:
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
|
||||
await self.get_aweme_media(aweme_item=aweme_detail)
|
||||
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
|
||||
"""Get note detail"""
|
||||
async with semaphore:
|
||||
try:
|
||||
return await self.dy_client.get_video_by_id(aweme_id)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
"""
|
||||
Batch get note comments
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
task_list: List[Task] = []
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
for aweme_id in aweme_list:
|
||||
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
|
||||
task_list.append(task)
|
||||
if len(task_list) > 0:
|
||||
await asyncio.wait(task_list)
|
||||
|
||||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
# 将关键词列表传递给 get_aweme_all_comments 方法
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
aweme_id=aweme_id,
|
||||
crawl_interval=random.random(),
|
||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||
callback=douyin_store.batch_update_dy_aweme_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
|
||||
|
||||
async def get_creators_and_videos(self) -> None:
|
||||
"""
|
||||
Get the information and videos of the specified creator
|
||||
"""
|
||||
utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
|
||||
for user_id in config.DY_CREATOR_ID_LIST:
|
||||
creator_info: Dict = await self.dy_client.get_user_info(user_id)
|
||||
if creator_info:
|
||||
await douyin_store.save_creator(user_id, creator=creator_info)
|
||||
|
||||
# Get all video information of the creator
|
||||
all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)
|
||||
|
||||
video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
|
||||
await self.batch_get_note_comments(video_ids)
|
||||
|
||||
async def fetch_creator_video_detail(self, video_list: List[Dict]):
|
||||
"""
|
||||
Concurrently obtain the specified post list and save the data
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]
|
||||
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for aweme_item in note_details:
|
||||
if aweme_item is not None:
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
|
||||
await self.get_aweme_media(aweme_item=aweme_item)
|
||||
|
||||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
|
||||
"""Create douyin client"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||
douyin_client = DouYinClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
|
||||
"Cookie": cookie_str,
|
||||
"Host": "www.douyin.com",
|
||||
"Origin": "https://www.douyin.com/",
|
||||
"Referer": "https://www.douyin.com/",
|
||||
"Content-Type": "application/json;charset=UTF-8",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return douyin_client
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
) # type: ignore
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 添加反检测脚本
|
||||
await self.cdp_manager.add_stealth_script()
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
|
||||
|
||||
async def get_aweme_media(self, aweme_item: Dict):
|
||||
"""
|
||||
获取抖音媒体,自动判断媒体类型是短视频还是帖子图片并下载
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
|
||||
return
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
# TODO: 抖音并没采用音视频分离的策略,故音频可从原视频中分离,暂不提取
|
||||
if note_download_url:
|
||||
await self.get_aweme_images(aweme_item)
|
||||
else:
|
||||
await self.get_aweme_video(aweme_item)
|
||||
|
||||
async def get_aweme_images(self, aweme_item: Dict):
|
||||
"""
|
||||
get aweme images. please use get_aweme_media
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
|
||||
if not note_download_url:
|
||||
return
|
||||
picNum = 0
|
||||
for url in note_download_url:
|
||||
if not url:
|
||||
continue
|
||||
content = await self.dy_client.get_aweme_media(url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
continue
|
||||
extension_file_name = f"{picNum:>03d}.jpeg"
|
||||
picNum += 1
|
||||
await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)
|
||||
|
||||
async def get_aweme_video(self, aweme_item: Dict):
|
||||
"""
|
||||
get aweme videos. please use get_aweme_media
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
|
||||
if not video_download_url:
|
||||
return
|
||||
content = await self.dy_client.get_aweme_media(video_download_url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
return
|
||||
extension_file_name = f"video.mp4"
|
||||
await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,34 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchChannelType(Enum):
|
||||
"""search channel type"""
|
||||
GENERAL = "aweme_general" # 综合
|
||||
VIDEO = "aweme_video_web" # 视频
|
||||
USER = "aweme_user_web" # 用户
|
||||
LIVE = "aweme_live" # 直播
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
GENERAL = 0 # 综合排序
|
||||
MOST_LIKE = 1 # 最多点赞
|
||||
LATEST = 2 # 最新发布
|
||||
|
||||
class PublishTimeType(Enum):
|
||||
"""publish time type"""
|
||||
UNLIMITED = 0 # 不限
|
||||
ONE_DAY = 1 # 一天内
|
||||
ONE_WEEK = 7 # 一周内
|
||||
SIX_MONTH = 180 # 半年内
|
||||
@@ -0,0 +1,85 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
|
||||
import random
|
||||
|
||||
import execjs
|
||||
from playwright.async_api import Page
|
||||
|
||||
douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').read())
|
||||
|
||||
def get_web_id():
|
||||
"""
|
||||
生成随机的webid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
||||
def e(t):
|
||||
if t is not None:
|
||||
return str(t ^ (int(16 * random.random()) >> (t // 4)))
|
||||
else:
|
||||
return ''.join(
|
||||
[str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
|
||||
)
|
||||
|
||||
web_id = ''.join(
|
||||
e(int(x)) if x in '018' else x for x in e(None)
|
||||
)
|
||||
return web_id.replace('-', '')[:19]
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
|
||||
"""
|
||||
获取 a_bogus 参数, 目前不支持post请求类型的签名
|
||||
"""
|
||||
return get_a_bogus_from_js(url, params, user_agent)
|
||||
|
||||
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
"""
|
||||
通过js获取 a_bogus 参数
|
||||
Args:
|
||||
url:
|
||||
params:
|
||||
user_agent:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
sign_js_name = "sign_datail"
|
||||
if "/reply" in url:
|
||||
sign_js_name = "sign_reply"
|
||||
return douyin_sign_obj.call(sign_js_name, params, user_agent)
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
|
||||
"""
|
||||
通过playright获取 a_bogus 参数
|
||||
playwright版本已失效
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not post_data:
|
||||
post_data = ""
|
||||
a_bogus = await page.evaluate(
|
||||
"([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
|
||||
[params, post_data, user_agent])
|
||||
|
||||
return a_bogus
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from cache.cache_factory import CacheFactory
|
||||
from tools import utils
|
||||
|
||||
|
||||
class DouYinLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext, # type: ignore
|
||||
context_page: Page, # type: ignore
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: Optional[str] = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.scan_qrcode_time = 60
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
|
||||
"""
|
||||
Start login douyin website
|
||||
滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录
|
||||
"""
|
||||
|
||||
# popup login dialog
|
||||
await self.popup_login_dialog()
|
||||
|
||||
# select login type
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
|
||||
await asyncio.sleep(6)
|
||||
current_page_title = await self.context_page.title()
|
||||
if "验证码中间页" in current_page_title:
|
||||
await self.check_page_display_slider(move_step=3, slider_level="hard")
|
||||
|
||||
# check login state
|
||||
utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
except RetryError:
|
||||
utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
# wait for redirect
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self):
|
||||
"""Check if the current login status is successful and return True otherwise return False"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
|
||||
for page in self.browser_context.pages:
|
||||
try:
|
||||
local_storage = await page.evaluate("() => window.localStorage")
|
||||
if local_storage.get("HasUserLogin", "") == "1":
|
||||
return True
|
||||
except Exception as e:
|
||||
# utils.logger.warn(f"[DouYinLogin] check_login_state waring: {e}")
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
if cookie_dict.get("LOGIN_STATUS") == "1":
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def popup_login_dialog(self):
|
||||
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
|
||||
dialog_selector = "xpath=//div[@id='login-panel-new']"
|
||||
try:
|
||||
# check dialog box is auto popup and wait for 10 seconds
|
||||
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
|
||||
utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
|
||||
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
|
||||
await login_button_ele.click()
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
|
||||
qrcode_img_selector = "xpath=//div[@id='animate_qrcode_container']//img"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
await asyncio.sleep(2)
|
||||
|
||||
async def login_by_mobile(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
|
||||
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
|
||||
await mobile_tap_ele.click()
|
||||
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
|
||||
mobile_input_ele = self.context_page.locator("xpath=//input[@placeholder='手机号']")
|
||||
await mobile_input_ele.fill(self.login_phone)
|
||||
await asyncio.sleep(0.5)
|
||||
send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
|
||||
await send_sms_code_btn.click()
|
||||
|
||||
# 检查是否有滑动验证码
|
||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"dy_{self.login_phone}"
|
||||
sms_code_value = cache_client.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||
await asyncio.sleep(0.5)
|
||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
break
|
||||
|
||||
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
|
||||
"""
|
||||
检查页面是否出现滑动验证码
|
||||
:return:
|
||||
"""
|
||||
# 等待滑动验证码的出现
|
||||
back_selector = "#captcha-verify-image"
|
||||
try:
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
|
||||
except PlaywrightTimeoutError: # 没有滑动验证码,直接返回
|
||||
return
|
||||
|
||||
gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
|
||||
max_slider_try_times = 20
|
||||
slider_verify_success = False
|
||||
while not slider_verify_success:
|
||||
if max_slider_try_times <= 0:
|
||||
utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
|
||||
sys.exit()
|
||||
try:
|
||||
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
|
||||
page_content = await self.context_page.content()
|
||||
if "操作过慢" in page_content or "提示重新操作" in page_content:
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
|
||||
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
|
||||
continue
|
||||
|
||||
# 滑动成功后,等待滑块消失
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
|
||||
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
|
||||
slider_verify_success = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
|
||||
await asyncio.sleep(1)
|
||||
max_slider_try_times -= 1
|
||||
utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
|
||||
continue
|
||||
|
||||
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
|
||||
"""
|
||||
Move the slider to the right to complete the verification
|
||||
:param back_selector: 滑动验证码背景图片的选择器
|
||||
:param gap_selector: 滑动验证码的滑块选择器
|
||||
:param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
:param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块
|
||||
:return:
|
||||
"""
|
||||
|
||||
# get slider background image
|
||||
slider_back_elements = await self.context_page.wait_for_selector(
|
||||
selector=back_selector,
|
||||
timeout=1000 * 10, # wait 10 seconds
|
||||
)
|
||||
slide_back = str(await slider_back_elements.get_property("src")) # type: ignore
|
||||
|
||||
# get slider gap image
|
||||
gap_elements = await self.context_page.wait_for_selector(
|
||||
selector=gap_selector,
|
||||
timeout=1000 * 10, # wait 10 seconds
|
||||
)
|
||||
gap_src = str(await gap_elements.get_property("src")) # type: ignore
|
||||
|
||||
# 识别滑块位置
|
||||
slide_app = utils.Slide(gap=gap_src, bg=slide_back)
|
||||
distance = slide_app.discern()
|
||||
|
||||
# 获取移动轨迹
|
||||
tracks = utils.get_tracks(distance, slider_level)
|
||||
new_1 = tracks[-1] - (sum(tracks) - distance)
|
||||
tracks.pop()
|
||||
tracks.append(new_1)
|
||||
|
||||
# 根据轨迹拖拽滑块到指定位置
|
||||
element = await self.context_page.query_selector(gap_selector)
|
||||
bounding_box = await element.bounding_box() # type: ignore
|
||||
|
||||
await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore
|
||||
bounding_box["y"] + bounding_box["height"] / 2) # type: ignore
|
||||
# 这里获取到x坐标中心点位置
|
||||
x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore
|
||||
# 模拟滑动操作
|
||||
await element.hover() # type: ignore
|
||||
await self.context_page.mouse.down()
|
||||
|
||||
for track in tracks:
|
||||
# 循环鼠标按照轨迹移动
|
||||
# steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
await self.context_page.mouse.move(x + track, 0, steps=move_step)
|
||||
x += track
|
||||
await self.context_page.mouse.up()
|
||||
|
||||
async def login_by_cookies(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".douyin.com",
|
||||
'path': "/"
|
||||
}])
|
||||
Reference in New Issue
Block a user