1. 同步MediaCrawler为最新版本
2. 修复数据库 NOT NULL 错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
@@ -189,10 +189,11 @@ class BilibiliClient(AbstractApiClient):
|
||||
if not aid or not cid or aid <= 0 or cid <= 0:
|
||||
raise ValueError("aid 和 cid 必须存在")
|
||||
uri = "/x/player/wbi/playurl"
|
||||
qn_value = getattr(config, "BILI_QN", 80)
|
||||
params = {
|
||||
"avid": aid,
|
||||
"cid": cid,
|
||||
"qn": 80,
|
||||
"qn": qn_value,
|
||||
"fourk": 1,
|
||||
"fnval": 1,
|
||||
"platform": "pc",
|
||||
@@ -201,15 +202,17 @@ class BilibiliClient(AbstractApiClient):
|
||||
return await self.get(uri, params, enable_params_sign=True)
|
||||
|
||||
async def get_video_media(self, url: str) -> Union[bytes, None]:
    """
    Download the media behind ``url`` and return the raw response body.

    The request is sent through ``self.proxy`` with ``self.headers`` and
    ``self.timeout``; redirects are followed.

    Args:
        url: Media URL to fetch.

    Returns:
        The response body as bytes when the final status code is 2xx,
        otherwise ``None`` (the failure is logged, not raised).
    """
    # Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206)
    async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
            response.raise_for_status()
            if 200 <= response.status_code < 300:
                return response.content
            # Defensive guard: raise_for_status() is expected to have rejected
            # non-success statuses already.
            utils.logger.error(
                f"[BilibiliClient.get_video_media] Unexpected status {response.status_code} for {url}"
            )
            return None
        except httpx.HTTPError as exc:
            # Covers failures raised while sending the request as well as the
            # status error raised by raise_for_status().
            try:
                failed_url = exc.request.url
            except RuntimeError:
                # httpx raises RuntimeError when no request is attached to the
                # exception; fall back to the URL we were asked to fetch.
                failed_url = url
            # Keep the original exception class name in the log to ease debugging.
            utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {failed_url} - {exc}")
            return None
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from datetime import datetime, timedelta
|
||||
@@ -41,6 +41,7 @@ from var import crawler_type_var, source_keyword_var
|
||||
from .client import BilibiliClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchOrderType
|
||||
from .help import parse_video_info_from_url, parse_creator_info_from_url
|
||||
from .login import BilibiliLogin
|
||||
|
||||
|
||||
@@ -77,8 +78,9 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
@@ -103,8 +105,14 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
if config.CREATOR_MODE:
|
||||
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||
await self.get_creator_videos(int(creator_id))
|
||||
for creator_url in config.BILI_CREATOR_ID_LIST:
|
||||
try:
|
||||
creator_info = parse_creator_info_from_url(creator_url)
|
||||
utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
|
||||
await self.get_creator_videos(int(creator_info.creator_id))
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
else:
|
||||
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
|
||||
else:
|
||||
@@ -208,6 +216,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
await bilibili_store.update_up_info(video_item)
|
||||
await self.get_bilibili_video(video_item, semaphore)
|
||||
page += 1
|
||||
|
||||
# Sleep after page navigation
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
|
||||
|
||||
await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
async def search_by_keywords_in_time_range(self, daily_limit: bool):
|
||||
@@ -284,6 +297,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
await self.get_bilibili_video(video_item, semaphore)
|
||||
|
||||
page += 1
|
||||
|
||||
# Sleep after page navigation
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
|
||||
|
||||
await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
except Exception as e:
|
||||
@@ -318,10 +336,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
|
||||
await asyncio.sleep(random.uniform(0.5, 1.5))
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
|
||||
await self.bili_client.get_video_all_comments(
|
||||
video_id=video_id,
|
||||
crawl_interval=random.random(),
|
||||
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
|
||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||
callback=bilibili_store.batch_update_bilibili_video_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
@@ -347,14 +366,27 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
await self.get_specified_videos(video_bvids_list)
|
||||
if int(result["page"]["count"]) <= pn * ps:
|
||||
break
|
||||
await asyncio.sleep(random.random())
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
|
||||
pn += 1
|
||||
|
||||
async def get_specified_videos(self, bvids_list: List[str]):
|
||||
async def get_specified_videos(self, video_url_list: List[str]):
|
||||
"""
|
||||
get specified videos info
|
||||
get specified videos info from URLs or BV IDs
|
||||
:param video_url_list: List of video URLs or BV IDs
|
||||
:return:
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...")
|
||||
bvids_list = []
|
||||
for video_url in video_url_list:
|
||||
try:
|
||||
video_info = parse_video_info_from_url(video_url)
|
||||
bvids_list.append(video_info.video_id)
|
||||
utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}")
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}")
|
||||
continue
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
@@ -381,6 +413,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
async with semaphore:
|
||||
try:
|
||||
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
|
||||
|
||||
# Sleep after fetching video details
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
|
||||
|
||||
return result
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
|
||||
@@ -544,24 +581,37 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
return
|
||||
|
||||
content = await self.bili_client.get_video_media(video_url)
|
||||
await asyncio.sleep(random.random())
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
|
||||
if content is None:
|
||||
return
|
||||
extension_file_name = f"video.mp4"
|
||||
await bilibili_store.store_video(aid, content, extension_file_name)
|
||||
|
||||
async def get_all_creator_details(self, creator_id_list: List[int]):
|
||||
async def get_all_creator_details(self, creator_url_list: List[str]):
|
||||
"""
|
||||
creator_id_list: get details for creator from creator_id_list
|
||||
creator_url_list: get details for creator from creator URL list
|
||||
"""
|
||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
|
||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
|
||||
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators")
|
||||
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...")
|
||||
|
||||
creator_id_list = []
|
||||
for creator_url in creator_url_list:
|
||||
try:
|
||||
creator_info = parse_creator_info_from_url(creator_url)
|
||||
creator_id_list.append(int(creator_info.creator_id))
|
||||
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
|
||||
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
try:
|
||||
for creator_id in creator_id_list:
|
||||
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
|
||||
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id))
|
||||
task_list.append(task)
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
||||
@@ -600,7 +650,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
|
||||
await self.bili_client.get_creator_all_fans(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
|
||||
callback=bilibili_store.batch_update_bilibili_creator_fans,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
@@ -623,7 +673,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
|
||||
await self.bili_client.get_creator_all_followings(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
|
||||
callback=bilibili_store.batch_update_bilibili_creator_followings,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
@@ -646,7 +696,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
|
||||
await self.bili_client.get_creator_all_dynamics(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
|
||||
callback=bilibili_store.batch_update_bilibili_creator_dynamics,
|
||||
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
@@ -9,15 +9,17 @@
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 23:26
|
||||
# @Desc : bilibili 请求参数签名
|
||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
import re
|
||||
import urllib.parse
|
||||
from hashlib import md5
|
||||
from typing import Dict
|
||||
|
||||
from model.m_bilibili import VideoUrlInfo, CreatorUrlInfo
|
||||
from tools import utils
|
||||
|
||||
|
||||
@@ -66,16 +68,71 @@ class BilibiliSign:
|
||||
return req_data
|
||||
|
||||
|
||||
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Parse the Bilibili video ID (BV number) out of a video URL.

    Args:
        url: A Bilibili video link or a bare BV number, for example:
            - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
            - https://www.bilibili.com/video/BV1d54y1g7db
            - BV1d54y1g7db (a bare BV number)
            Leading and trailing whitespace is ignored.

    Returns:
        VideoUrlInfo: object constructed with ``video_id`` set to the BV number.

    Raises:
        ValueError: if no BV number can be extracted from ``url``.
    """
    candidate = url.strip()

    # Bare BV number: keep only the leading alphanumeric run so that trailing
    # junk such as "?p=2" or "/" does not end up inside the video ID.
    bare_match = re.match(r'BV[a-zA-Z0-9]+', candidate)
    if bare_match:
        return VideoUrlInfo(video_id=bare_match.group(0))

    # Only the /video/BV... form is recognised. Legacy /video/av... links are
    # NOT parsed here and fall through to the ValueError below.
    bv_pattern = r'/video/(BV[a-zA-Z0-9]+)'
    match = re.search(bv_pattern, candidate)
    if match:
        return VideoUrlInfo(video_id=match.group(1))

    raise ValueError(f"无法从URL中解析出视频ID: {url}")
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse the creator ID (UID) out of a Bilibili creator-space URL.

    Args:
        url: A Bilibili creator-space link or a bare numeric UID, for example:
            - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
            - https://space.bilibili.com/20813884
            - 434377496 (a bare UID)
            Leading and trailing whitespace is ignored.

    Returns:
        CreatorUrlInfo: object constructed with ``creator_id`` set to the UID
        string.

    Raises:
        ValueError: if no UID can be extracted from ``url``.
    """
    candidate = url.strip()

    # Bare numeric UID. The isascii() guard is needed because str.isdigit()
    # also accepts characters such as "²", which int() later rejects.
    if candidate.isascii() and candidate.isdigit():
        return CreatorUrlInfo(creator_id=candidate)

    # Extract the digits that follow "space.bilibili.com/".
    uid_pattern = r'space\.bilibili\.com/(\d+)'
    match = re.search(uid_pattern, candidate)
    if match:
        return CreatorUrlInfo(creator_id=match.group(1))

    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: sign a sample request, then exercise the URL parsers.
    _img_key = "7cd084941338484aae1ad9425b84077c"
    _sub_key = "4932caff0ff746eab6f01bf08b70ac45"
    _search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"

    # Parse the query string with the standard library instead of splitting by
    # hand; keep_blank_values=True preserves empty parameters such as "_extra=".
    _req_data = dict(urllib.parse.parse_qsl(_search_url, keep_blank_values=True))
    print("pre req_data", _req_data)

    # The parsed dict above is only printed; signing uses a minimal payload.
    _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
    print(_req_data)

    # Video URL parsing check
    video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
    video_url2 = "BV1d54y1g7db"
    print("视频URL解析测试:")
    print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
    print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")

    # Creator URL parsing check
    creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
    creator_url2 = "20813884"
    print("\n创作者URL解析测试:")
    print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
    print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
|
||||
|
||||
Reference in New Issue
Block a user