Uploading the AI Crawler System: MindSpider

This commit is contained in:
戒酒的李白
2025-08-27 13:49:07 +08:00
parent 822bad557f
commit 587e709e82
174 changed files with 34562 additions and 25 deletions
@@ -0,0 +1,13 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
from .core import KuaishouCrawler
@@ -0,0 +1,313 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
import config
from base.base_crawler import AbstractApiClient
from tools import utils
from .exception import DataFetchError
from .graphql import KuaiShouGraphQL
class KuaiShouClient(AbstractApiClient):
    """Async client for the Kuaishou web GraphQL endpoint.

    Each API method builds a GraphQL payload (operation name, variables and the
    query text served by ``KuaiShouGraphQL``) and POSTs it to ``self._host``.
    """

    def __init__(
        self,
        timeout=10,
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
    ):
        """Store the connection settings and load the GraphQL query texts.

        :param timeout: per-request timeout handed to httpx
        :param proxy: proxy handed to ``httpx.AsyncClient`` (``None`` for none)
        :param headers: headers sent with every request (includes the cookie)
        :param playwright_page: browser page associated with this client
        :param cookie_dict: current cookies as a name -> value mapping
        """
        self.proxy = proxy
        self.timeout = timeout
        self.headers = headers
        self._host = "https://www.kuaishou.com/graphql"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
        self.graphql = KuaiShouGraphQL()

    async def request(self, method, url, **kwargs) -> Any:
        """Send one HTTP request and return the ``data`` member of the reply.

        :raises DataFetchError: when the reply carries a non-empty ``errors``
        :return: the ``data`` payload, or an empty dict when it is absent/null
        """
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        data: Dict = response.json()
        errors = data.get("errors")
        if errors:
            raise DataFetchError(errors)
        # A JSON null "data" member is normalised to {} so callers can chain
        # .get() on the result.
        return data.get("data") or {}

    async def get(self, uri: str, params=None) -> Dict:
        """Issue a GET to ``self._host + uri``, appending ``params`` as a query string."""
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        return await self.request(
            method="GET", url=f"{self._host}{final_uri}", headers=self.headers
        )

    async def post(self, uri: str, data: dict) -> Dict:
        """Issue a POST to ``self._host + uri`` with ``data`` serialised as compact JSON."""
        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        # The body is already serialised, so it goes through ``content=``;
        # httpx deprecates passing raw text via ``data=``.
        return await self.request(
            method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers
        )

    async def pong(self) -> bool:
        """Probe the login state by querying ``visionProfileUserList``.

        :return: True when the reply reports ``result == 1``; False otherwise,
            including when the request raises.
        """
        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
        ping_flag = False
        try:
            post_data = {
                "operationName": "visionProfileUserList",
                "variables": {
                    "ftype": 1,
                },
                "query": self.graphql.get("vision_profile_user_list"),
            }
            res = await self.post("", post_data)
            user_list = res.get("visionProfileUserList") or {}
            if user_list.get("result") == 1:
                ping_flag = True
        except Exception as e:
            utils.logger.error(
                f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..."
            )
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """Refresh the Cookie header and ``cookie_dict`` from the browser context."""
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def search_info_by_keyword(
        self, keyword: str, pcursor: str, search_session_id: str = ""
    ):
        """Run the ``visionSearchPhoto`` search query.

        :param keyword: search keyword
        :param pcursor: page cursor
        :param search_session_id: search session id
        :return: the ``data`` payload of the reply
        """
        post_data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": pcursor,
                "page": "search",
                "searchSessionId": search_session_id,
            },
            "query": self.graphql.get("search_query"),
        }
        return await self.post("", post_data)

    async def get_video_info(self, photo_id: str) -> Dict:
        """Run the ``visionVideoDetail`` query for one photo id.

        :param photo_id: id of the video to look up
        :return: the ``data`` payload of the reply
        """
        post_data = {
            "operationName": "visionVideoDetail",
            "variables": {"photoId": photo_id, "page": "search"},
            "query": self.graphql.get("video_detail"),
        }
        return await self.post("", post_data)

    async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
        """Fetch one page of root comments (``commentListQuery``).

        :param photo_id: id of the video whose comments are fetched
        :param pcursor: cursor returned by the previous page, "" for the first
        :return: the ``data`` payload of the reply
        """
        post_data = {
            "operationName": "commentListQuery",
            "variables": {"photoId": photo_id, "pcursor": pcursor},
            "query": self.graphql.get("comment_list"),
        }
        return await self.post("", post_data)

    async def get_video_sub_comments(
        self, photo_id: str, rootCommentId: str, pcursor: str = ""
    ) -> Dict:
        """Fetch one page of replies to a root comment (``visionSubCommentList``).

        :param photo_id: id of the video the comment belongs to
        :param rootCommentId: id of the root comment whose replies are fetched
        :param pcursor: cursor returned by the previous page, "" for the first
        :return: the ``data`` payload of the reply
        """
        post_data = {
            "operationName": "visionSubCommentList",
            "variables": {
                "photoId": photo_id,
                "pcursor": pcursor,
                "rootCommentId": rootCommentId,
            },
            "query": self.graphql.get("vision_sub_comment_list"),
        }
        return await self.post("", post_data)

    async def get_creator_profile(self, userId: str) -> Dict:
        """Run the ``visionProfile`` query for one user id."""
        post_data = {
            "operationName": "visionProfile",
            "variables": {"userId": userId},
            "query": self.graphql.get("vision_profile"),
        }
        return await self.post("", post_data)

    async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
        """Fetch one page of a creator's videos (``visionProfilePhotoList``)."""
        post_data = {
            "operationName": "visionProfilePhotoList",
            "variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
            "query": self.graphql.get("vision_profile_photo_list"),
        }
        return await self.post("", post_data)

    async def get_video_all_comments(
        self,
        photo_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
    ):
        """Page through a video's comments, including sub-comments.

        :param photo_id: id of the video
        :param crawl_interval: seconds to sleep after each page
        :param callback: awaited as ``callback(photo_id, comments)`` per page
        :param max_count: stop paging once this many comments were collected
        :return: list of the collected root comments and sub-comments
        """
        result = []
        pcursor = ""
        while pcursor != "no_more" and len(result) < max_count:
            comments_res = await self.get_video_comments(photo_id, pcursor)
            comment_page = comments_res.get("visionCommentList") or {}
            # A reply without a cursor ends the paging. Falling back to ""
            # would request the first page again and never terminate.
            pcursor = comment_page.get("pcursor") or "no_more"
            comments = comment_page.get("rootComments") or []
            if len(result) + len(comments) > max_count:
                comments = comments[: max_count - len(result)]
            if callback:  # hand the page to the callback when one is supplied
                await callback(photo_id, comments)
            result.extend(comments)
            await asyncio.sleep(crawl_interval)
            sub_comments = await self.get_comments_all_sub_comments(
                comments, photo_id, crawl_interval, callback
            )
            result.extend(sub_comments)
        return result

    async def get_comments_all_sub_comments(
        self,
        comments: List[Dict],
        photo_id,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """Fetch every sub-comment page for the given root comments.

        Args:
            comments: root comments to expand
            photo_id: id of the video
            crawl_interval: seconds to sleep after each page
            callback: awaited as ``callback(photo_id, sub_comments)`` per page

        Returns:
            The sub-comments gathered from the paged requests; an empty list
            when ``config.ENABLE_GET_SUB_COMMENTS`` is off.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            utils.logger.info(
                "[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
            )
            return []

        result = []
        for comment in comments:
            inline_sub_comments = comment.get("subComments")
            if inline_sub_comments and callback:
                await callback(photo_id, inline_sub_comments)

            if comment.get("subCommentsPcursor") == "no_more":
                continue

            root_comment_id = comment.get("commentId")
            # NOTE(review): paging restarts from the empty cursor rather than
            # from subCommentsPcursor, so the inline sub-comments above may be
            # delivered to the callback twice -- confirm against the API.
            sub_comment_pcursor = ""
            while sub_comment_pcursor != "no_more":
                comments_res = await self.get_video_sub_comments(
                    photo_id, root_comment_id, sub_comment_pcursor
                )
                sub_comment_page = comments_res.get("visionSubCommentList") or {}
                sub_comment_pcursor = sub_comment_page.get("pcursor") or "no_more"
                page_sub_comments = sub_comment_page.get("subComments") or []
                if callback:
                    await callback(photo_id, page_sub_comments)
                await asyncio.sleep(crawl_interval)
                result.extend(page_sub_comments)
        return result

    async def get_creator_info(self, user_id: str) -> Dict:
        """Return the ``userProfile`` object of a creator's home page.

        eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
        """
        profile_res = await self.get_creator_profile(user_id)
        # The reply nests the profile under the operation name
        # (data.visionProfile.userProfile), like every other query here.
        # Fall back to the top level in case a caller passes an unwrapped dict.
        vision_profile = profile_res.get("visionProfile") or profile_res
        return vision_profile.get("userProfile")

    async def get_all_videos_by_creator(
        self,
        user_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """Page through all videos published by a creator.

        Args:
            user_id: id of the creator
            crawl_interval: seconds to sleep after each page
            callback: awaited as ``callback(videos)`` after each page

        Returns:
            The feed entries collected from every page.
        """
        result = []
        pcursor = ""
        while pcursor != "no_more":
            videos_res = await self.get_video_by_creater(user_id, pcursor)
            if not videos_res:
                utils.logger.error(
                    "[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
                )
                break

            photo_list_page = videos_res.get("visionProfilePhotoList") or {}
            # A missing cursor ends the paging; "" would restart at page one.
            pcursor = photo_list_page.get("pcursor") or "no_more"
            videos = photo_list_page.get("feeds") or []
            utils.logger.info(
                f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
            )
            if callback:
                await callback(videos)
            await asyncio.sleep(crawl_interval)
            result.extend(videos)
        return result
@@ -0,0 +1,396 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import os
import random
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import (
BrowserContext,
BrowserType,
Page,
Playwright,
async_playwright,
)
import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import kuaishou as kuaishou_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import comment_tasks_var, crawler_type_var, source_keyword_var
from .client import KuaiShouClient
from .exception import DataFetchError
from .login import KuaishouLogin
class KuaishouCrawler(AbstractCrawler):
    """Crawler for kuaishou.com driven by a Playwright browser and a GraphQL client.

    ``start`` launches the browser, logs in when needed and then runs the
    mode selected by ``config.CRAWLER_TYPE`` (search / detail / creator).
    """

    context_page: Page
    ks_client: KuaiShouClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()
        self.cdp_manager = None

    async def start(self):
        """Launch the browser, ensure a logged-in client and run the crawl."""
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
                ip_proxy_info
            )

        async with async_playwright() as playwright:
            # Pick the launch mode from the configuration.
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    self.user_agent,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器")
                # Launch a browser context.
                # NOTE(review): the standard mode launches without the proxy
                # (None) while the CDP mode receives playwright_proxy_format --
                # confirm whether this difference is intended.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium, None, self.user_agent, headless=config.HEADLESS
                )

            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(f"{self.index_url}?isHome=1")

            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
            if not await self.ks_client.pong():
                login_obj = KuaishouLogin(
                    login_type=config.LOGIN_TYPE,
                    # The proxy string used to be passed here by mistake; phone
                    # login is not implemented, so the default "" is used.
                    login_phone="",
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for videos and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their videos and comments
                await self.get_creators_and_videos()

            utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")

    async def search(self):
        """Search every configured keyword page by page and store the results.

        Videos of each page are stored, then their comments are fetched.
        Paging for a keyword stops at the configured maximum or as soon as a
        page returns no usable data.
        """
        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
        ks_limit_count = 20  # kuaishou limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            search_session_id = ""
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[KuaishouCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
            while (
                page - start_page + 1
            ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
                    page += 1
                    continue
                utils.logger.info(
                    f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}"
                )
                video_id_list: List[str] = []
                videos_res = await self.ks_client.search_info_by_keyword(
                    keyword=keyword,
                    pcursor=str(page),
                    search_session_id=search_session_id,
                )
                if not videos_res:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data"
                    )
                    # `continue` here would retry the same page forever because
                    # `page` is not advanced; stop paging this keyword instead.
                    break

                vision_search_photo: Dict = videos_res.get("visionSearchPhoto") or {}
                if vision_search_photo.get("result") != 1:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data "
                    )
                    # Same reasoning as above: do not spin on a failing page.
                    break

                search_session_id = vision_search_photo.get("searchSessionId", "")
                for video_detail in vision_search_photo.get("feeds") or []:
                    video_id_list.append(video_detail.get("photo", {}).get("id"))
                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)

                # batch fetch video comments
                page += 1
                await self.batch_get_video_comments(video_id_list)

    async def get_specified_videos(self):
        """Get the information and comments of the specified post"""
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(video_id=video_id, semaphore=semaphore)
            for video_id in config.KS_SPECIFIED_ID_LIST
        ]
        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)
        await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)

    async def get_video_info_task(
        self, video_id: str, semaphore: asyncio.Semaphore
    ) -> Optional[Dict]:
        """Fetch one video's detail under the semaphore.

        :return: the ``visionVideoDetail`` object, or None when the request
            raises ``DataFetchError`` or ``KeyError``.
        """
        async with semaphore:
            try:
                result = await self.ks_client.get_video_info(video_id)
                utils.logger.info(
                    f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
                )
                return result.get("visionVideoDetail")
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}"
                )
                return None
            except KeyError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}"
                )
                return None

    async def batch_get_video_comments(self, video_id_list: List[str]):
        """Fetch the comments of several videos concurrently.

        Does nothing when ``config.ENABLE_GET_COMMENTS`` is off.

        :param video_id_list: ids of the videos whose comments are fetched
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info(
                "[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled"
            )
            return

        utils.logger.info(
            f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}"
        )
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for video_id in video_id_list:
            task = asyncio.create_task(
                self.get_comments(video_id, semaphore), name=video_id
            )
            task_list.append(task)

        # NOTE(review): the list is published after the tasks were created;
        # confirm that get_comments actually observes it through the
        # context variable.
        comment_tasks_var.set(task_list)
        await asyncio.gather(*task_list)

    async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
        """Fetch and store all comments of one video under the semaphore.

        ``DataFetchError`` is logged. Any other exception is treated as a
        possible block: the tasks found in ``comment_tasks_var`` are
        cancelled, the coroutine pauses, and cookies are refreshed.

        :param video_id: id of the video
        :param semaphore: limits the number of concurrent fetches
        """
        async with semaphore:
            try:
                utils.logger.info(
                    f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                )
                await self.ks_client.get_video_all_comments(
                    photo_id=video_id,
                    crawl_interval=random.random(),
                    callback=kuaishou_store.batch_update_ks_video_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
                )
            except Exception as e:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}"
                )
                # time.sleep is used on purpose (instead of asyncio.sleep) to
                # block the main coroutine after cancelling the running
                # comment tasks. Kuaishou may be blocking our requests, so we
                # pause and then refresh the cookies.
                current_running_tasks = comment_tasks_var.get()
                for task in current_running_tasks:
                    task.cancel()
                time.sleep(20)
                await self.context_page.goto(f"{self.index_url}?isHome=1")
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

    async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
        """Build a ``KuaiShouClient`` from the browser context's current cookies."""
        utils.logger.info(
            "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
        )
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        ks_client_obj = KuaiShouClient(
            proxy=httpx_proxy,
            headers={
                "User-Agent": self.user_agent,
                "Cookie": cookie_str,
                "Origin": self.index_url,
                "Referer": self.index_url,
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return ks_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch Chromium and return a browser context.

        With ``config.SAVE_LOGIN_STATE`` a persistent context stored under
        ``browser_data/`` is used; otherwise a fresh context is created.
        """
        utils.logger.info(
            "[KuaishouCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
            )
            return browser_context

        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        browser_context = await browser.new_context(
            viewport={"width": 1920, "height": 1080}, user_agent=user_agent
        )
        return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch the browser in CDP mode, falling back to the standard mode.

        Any exception raised while launching or connecting is logged and the
        standard ``launch_browser`` path is used instead.
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )

            # Log the browser information.
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}")
            return browser_context
        except Exception as e:
            utils.logger.error(
                f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}"
            )
            # Fall back to the standard mode.
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
            )

    async def get_creators_and_videos(self) -> None:
        """Get creator's videos and retrieve their comment information."""
        utils.logger.info(
            "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
        )
        for user_id in config.KS_CREATOR_ID_LIST:
            # get creator detail info from web html content
            creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
            if creator_info:
                await kuaishou_store.save_creator(user_id, creator=creator_info)

            # Get all video information of the creator
            all_video_list = await self.ks_client.get_all_videos_by_creator(
                user_id=user_id,
                crawl_interval=random.random(),
                callback=self.fetch_creator_video_detail,
            )
            video_ids = [
                video_item.get("photo", {}).get("id") for video_item in all_video_list
            ]
            await self.batch_get_video_comments(video_ids)

    async def fetch_creator_video_detail(self, video_list: List[Dict]):
        """Concurrently fetch the detail of every listed video and store it."""
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore)
            for post_item in video_list
        ]
        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)

    async def close(self):
        """Close browser context"""
        # CDP mode is cleaned up through its manager instead of the context.
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
@@ -0,0 +1,20 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from httpx import RequestError
class DataFetchError(RequestError):
    """Raised when fetching data fails, e.g. the reply reports errors."""
class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
@@ -0,0 +1,12 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
@@ -0,0 +1,33 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 快手的数据传输是基于GraphQL实现的
# 这个类负责获取一些GraphQL的schema
from typing import Dict
class KuaiShouGraphQL:
    """Loads the ``.graphql`` documents used to talk to Kuaishou.

    Query texts are read from ``graphql_dir`` at construction time and kept in
    the class-level ``graphql_queries`` mapping, keyed by file name without
    its extension.
    """

    # Shared by all instances: query name -> query text.
    graphql_queries: Dict[str, str] = {}

    def __init__(self):
        # Relative path, resolved against the current working directory.
        self.graphql_dir = "media_platform/kuaishou/graphql/"
        self.load_graphql_queries()

    def load_graphql_queries(self):
        """Read every known ``.graphql`` file into ``graphql_queries``."""
        graphql_files = [
            "search_query.graphql",
            "video_detail.graphql",
            "comment_list.graphql",
            "vision_profile.graphql",
            "vision_profile_photo_list.graphql",
            "vision_profile_user_list.graphql",
            "vision_sub_comment_list.graphql",
        ]
        for file_name in graphql_files:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(self.graphql_dir + file_name, mode="r", encoding="utf-8") as f:
                query_name = file_name.split(".")[0]
                self.graphql_queries[query_name] = f.read()

    def get(self, query_name: str) -> str:
        """Return the query text for ``query_name``.

        Unknown names yield the literal string ``"Query not found"``.
        """
        return self.graphql_queries.get(query_name, "Query not found")
@@ -0,0 +1,39 @@
# One page of root comments for a photo; each root comment carries its inline
# sub-comments and a cursor (subCommentsPcursor) for further sub-comment pages.
query commentListQuery($photoId: String, $pcursor: String) {
  visionCommentList(photoId: $photoId, pcursor: $pcursor) {
    commentCount
    pcursor
    rootComments {
      commentId
      authorId
      authorName
      content
      headurl
      timestamp
      likedCount
      realLikedCount
      liked
      status
      authorLiked
      subCommentCount
      subCommentsPcursor
      subComments {
        commentId
        authorId
        authorName
        content
        headurl
        timestamp
        likedCount
        realLikedCount
        liked
        status
        authorLiked
        replyToUserName
        replyTo
        __typename
      }
      __typename
    }
    __typename
  }
}
@@ -0,0 +1,111 @@
# Photo fields selected on PhotoEntity; spread into feedContent.photo.
fragment photoContent on PhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
}
# Same field selection as photoContent, declared on recoPhotoEntity.
fragment recoPhotoFragment on recoPhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
}
# One feed entry: author, photo (via both photo fragments), status and tags.
fragment feedContent on Feed {
  type
  author {
    id
    name
    headerUrl
    following
    headerUrls {
      url
      __typename
    }
    __typename
  }
  photo {
    ...photoContent
    ...recoPhotoFragment
    __typename
  }
  canAddComment
  llsid
  status
  currentPcursor
  tags {
    type
    name
    __typename
  }
  __typename
}
# Keyword search: returns a page of feeds plus the session id and cursor.
query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
    result
    llsid
    webPageArea
    feeds {
      ...feedContent
      __typename
    }
    searchSessionId
    pcursor
    aladdinBanner {
      imgUrl
      link
      __typename
    }
    __typename
  }
}
@@ -0,0 +1,80 @@
# Detail of a single video: author, photo (with its manifest representations),
# tags and comment limits.
query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {
  visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {
    status
    type
    author {
      id
      name
      following
      headerUrl
      __typename
    }
    photo {
      id
      duration
      caption
      likeCount
      realLikeCount
      coverUrl
      photoUrl
      liked
      timestamp
      expTag
      llsid
      viewCount
      videoRatio
      stereoType
      musicBlocked
      manifest {
        mediaType
        businessType
        version
        adaptationSet {
          id
          duration
          representation {
            id
            defaultSelect
            backupUrl
            codecs
            url
            height
            width
            avgBitrate
            maxBitrate
            m3u8Slice
            qualityType
            qualityLabel
            frameRate
            featureP2sp
            hidden
            disableAdaptive
            __typename
          }
          __typename
        }
        __typename
      }
      manifestH265
      photoH265Url
      coronaCropManifest
      coronaCropManifestH265
      croppedPhotoH265Url
      croppedPhotoUrl
      videoResource
      __typename
    }
    tags {
      type
      name
      __typename
    }
    commentLimit {
      canAddComment
      __typename
    }
    llsid
    danmakuSwitch
    __typename
  }
}
@@ -0,0 +1,27 @@
# Profile of one user: counters (ownerCount) and basic profile fields, both
# nested under userProfile.
query visionProfile($userId: String) {
  visionProfile(userId: $userId) {
    result
    hostName
    userProfile {
      ownerCount {
        fan
        photo
        follow
        photo_public
        __typename
      }
      profile {
        gender
        user_name
        user_id
        headurl
        user_text
        user_profile_bg_url
        __typename
      }
      isFollowing
      __typename
    }
    __typename
  }
}
@@ -0,0 +1,110 @@
# Photo fields selected on PhotoEntity for the profile photo list; includes the
# riskTagContent / riskTagUrl fields.
fragment photoContent on PhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}
# Same field selection as the profile photoContent fragment, declared on
# recoPhotoEntity.
fragment recoPhotoFragment on recoPhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}
# One feed entry of a profile photo list: author, photo (via both photo
# fragments), status and tags.
fragment feedContent on Feed {
  type
  author {
    id
    name
    headerUrl
    following
    headerUrls {
      url
      __typename
    }
    __typename
  }
  photo {
    ...photoContent
    ...recoPhotoFragment
    __typename
  }
  canAddComment
  llsid
  status
  currentPcursor
  tags {
    type
    name
    __typename
  }
  __typename
}
# One page of a user's published videos, with the cursor for the next page.
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
    result
    llsid
    webPageArea
    feeds {
      ...feedContent
      __typename
    }
    hostName
    pcursor
    __typename
  }
}
@@ -0,0 +1,16 @@
# User list query; the Python client sends it with ftype = 1 and checks
# `result` to probe the login state.
query visionProfileUserList($pcursor: String, $ftype: Int) {
  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
    result
    fols {
      user_name
      headurl
      user_text
      isFollowing
      user_id
      __typename
    }
    hostName
    pcursor
    __typename
  }
}
@@ -0,0 +1,22 @@
# One page of replies under a root comment. Declared as a mutation although it
# only selects data.
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
  visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
    pcursor
    subComments {
      commentId
      authorId
      authorName
      content
      headurl
      timestamp
      likedCount
      realLikedCount
      liked
      status
      authorLiked
      replyToUserName
      replyTo
      __typename
    }
    __typename
  }
}
@@ -0,0 +1,113 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import functools
import sys
from typing import Optional
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from tools import utils
class KuaishouLogin(AbstractLogin):
    """Logs a Playwright browser context into kuaishou.com.

    Supports QR-code login and cookie injection; phone login is a stub.
    """

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """Store the login parameters.

        Note that ``login_type`` is written to the global ``config.LOGIN_TYPE``.

        :param login_type: "qrcode", "phone" or "cookie"
        :param browser_context: browser context that receives the login state
        :param context_page: page used to drive the login UI
        :param login_phone: phone number (unused while phone login is a stub)
        :param cookie_str: cookie string used by the cookie login
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """Start the kuaishou login flow selected by ``config.LOGIN_TYPE``.

        :raises ValueError: when the login type is not qrcode, phone or cookie
        """
        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """Report whether the browser context holds a ``passToken`` cookie.

        The retry decorator calls this again while it returns False: up to
        600 attempts, one second apart. When the attempts are exhausted a
        ``RetryError`` is raised.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # The retry predicate tests ``value is False``, so return a real bool.
        return bool(cookie_dict.get("passToken"))

    async def login_by_qrcode(self):
        """Log in by QR code and keep the login state in the browser context.

        Exits the process when the QR code cannot be found or the scan does
        not complete before the retries of ``check_login_state`` run out.
        """
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")

        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
        await login_button_ele.click()

        # find login qrcode
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode (in the default executor, so the loop keeps running)
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # The wait is bounded by the 600 one-second retries of
        # check_login_state; the message used to claim 20s.
        utils.logger.info("[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented."""
        pass

    async def login_by_cookies(self):
        """Inject the cookies from ``self.cookie_str`` into the browser context."""
        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".kuaishou.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # One call for the whole batch; skipped when there is nothing to add,
        # matching the per-cookie loop it replaces.
        if cookies:
            await self.browser_context.add_cookies(cookies)