1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
@@ -189,10 +189,11 @@ class BilibiliClient(AbstractApiClient):
if not aid or not cid or aid <= 0 or cid <= 0:
raise ValueError("aid 和 cid 必须存在")
uri = "/x/player/wbi/playurl"
qn_value = getattr(config, "BILI_QN", 80)
params = {
"avid": aid,
"cid": cid,
"qn": 80,
"qn": qn_value,
"fourk": 1,
"fnval": 1,
"platform": "pc",
@@ -201,15 +202,17 @@ class BilibiliClient(AbstractApiClient):
return await self.get(uri, params, enable_params_sign=True)
async def get_video_media(self, url: str) -> Union[bytes, None]:
async with httpx.AsyncClient(proxy=self.proxy) as client:
# Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206)
async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True) as client:
try:
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
response.raise_for_status()
if not response.reason_phrase == "OK":
utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
return None
else:
if 200 <= response.status_code < 300:
return response.content
utils.logger.error(
f"[BilibiliClient.get_video_media] Unexpected status {response.status_code} for {url}"
)
return None
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
return None
@@ -15,7 +15,7 @@
import asyncio
import os
import random
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime, timedelta
@@ -41,6 +41,7 @@ from var import crawler_type_var, source_keyword_var
from .client import BilibiliClient
from .exception import DataFetchError
from .field import SearchOrderType
from .help import parse_video_info_from_url, parse_creator_info_from_url
from .login import BilibiliLogin
@@ -77,8 +78,9 @@ class BilibiliCrawler(AbstractCrawler):
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -103,8 +105,14 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
if config.CREATOR_MODE:
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
for creator_url in config.BILI_CREATOR_ID_LIST:
try:
creator_info = parse_creator_info_from_url(creator_url)
utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
await self.get_creator_videos(int(creator_info.creator_id))
except ValueError as e:
utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}")
continue
else:
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
@@ -208,6 +216,11 @@ class BilibiliCrawler(AbstractCrawler):
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
# Sleep after page navigation
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
await self.batch_get_video_comments(video_id_list)
async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +297,11 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_bilibili_video(video_item, semaphore)
page += 1
# Sleep after page navigation
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
await self.batch_get_video_comments(video_id_list)
except Exception as e:
@@ -318,10 +336,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
await asyncio.sleep(random.uniform(0.5, 1.5))
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
await self.bili_client.get_video_all_comments(
video_id=video_id,
crawl_interval=random.random(),
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=bilibili_store.batch_update_bilibili_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,14 +366,27 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_specified_videos(video_bvids_list)
if int(result["page"]["count"]) <= pn * ps:
break
await asyncio.sleep(random.random())
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
pn += 1
async def get_specified_videos(self, bvids_list: List[str]):
async def get_specified_videos(self, video_url_list: List[str]):
"""
get specified videos info
get specified videos info from URLs or BV IDs
:param video_url_list: List of video URLs or BV IDs
:return:
"""
utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...")
bvids_list = []
for video_url in video_url_list:
try:
video_info = parse_video_info_from_url(video_url)
bvids_list.append(video_info.video_id)
utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}")
except ValueError as e:
utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}")
continue
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
video_details = await asyncio.gather(*task_list)
@@ -381,6 +413,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
# Sleep after fetching video details
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
return result
except DataFetchError as ex:
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,24 +581,37 @@ class BilibiliCrawler(AbstractCrawler):
return
content = await self.bili_client.get_video_media(video_url)
await asyncio.sleep(random.random())
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
if content is None:
return
extension_file_name = f"video.mp4"
await bilibili_store.store_video(aid, content, extension_file_name)
async def get_all_creator_details(self, creator_id_list: List[int]):
async def get_all_creator_details(self, creator_url_list: List[str]):
"""
creator_id_list: get details for creator from creator_id_list
creator_url_list: get details for creator from creator URL list
"""
utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators")
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...")
creator_id_list = []
for creator_url in creator_url_list:
try:
creator_info = parse_creator_info_from_url(creator_url)
creator_id_list.append(int(creator_info.creator_id))
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
except ValueError as e:
utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}")
continue
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
try:
for creator_id in creator_id_list:
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id))
task_list.append(task)
except Exception as e:
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
@@ -600,7 +650,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
await self.bili_client.get_creator_all_fans(
creator_info=creator_info,
crawl_interval=random.random(),
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_fans,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
@@ -623,7 +673,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
await self.bili_client.get_creator_all_followings(
creator_info=creator_info,
crawl_interval=random.random(),
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_followings,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
@@ -646,7 +696,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info,
crawl_interval=random.random(),
crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_dynamics,
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
)
@@ -9,15 +9,17 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 23:26
# @Desc : bilibili 请求参数签名
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
import re
import urllib.parse
from hashlib import md5
from typing import Dict
from model.m_bilibili import VideoUrlInfo, CreatorUrlInfo
from tools import utils
@@ -66,16 +68,71 @@ class BilibiliSign:
return req_data
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Extract the Bilibili video ID (BV number) from a video URL or a bare BV number.

    Args:
        url: A Bilibili video link or a bare BV number, for example
            - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
            - https://www.bilibili.com/video/BV1d54y1g7db
            - BV1d54y1g7db
            Leading and trailing whitespace is ignored.

    Returns:
        VideoUrlInfo: object whose ``video_id`` is the extracted BV number.

    Raises:
        ValueError: if no BV number can be found in the input.
    """
    candidate = url.strip()

    # Bare BV number: keep only the leading alphanumeric run so that anything
    # trailing it (e.g. "?spm_id_from=..." or "/") does not leak into the ID.
    bare_match = re.match(r'BV[a-zA-Z0-9]+', candidate)
    if bare_match:
        return VideoUrlInfo(video_id=bare_match.group(0))

    # Full URL: only the /video/BV... form is recognised.
    # Legacy /video/av... links are NOT handled here and end in ValueError.
    url_match = re.search(r'/video/(BV[a-zA-Z0-9]+)', candidate)
    if url_match:
        return VideoUrlInfo(video_id=url_match.group(1))

    raise ValueError(f"无法从URL中解析出视频ID: {url}")
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Extract the creator ID (UID) from a Bilibili space URL or a bare UID.

    Args:
        url: A Bilibili creator space link or a bare numeric UID, for example
            - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
            - https://space.bilibili.com/20813884
            - 434377496
            Leading and trailing whitespace is ignored. A UID given as an int
            is also accepted and converted to its string form.

    Returns:
        CreatorUrlInfo: object whose ``creator_id`` is the extracted UID string.

    Raises:
        ValueError: if no UID can be found in the input.
    """
    # str() keeps plain-int UIDs working instead of failing with an
    # AttributeError that callers (which only catch ValueError) would not handle.
    candidate = str(url).strip()

    # Already a bare numeric UID: use it as-is.
    if candidate.isdigit():
        return CreatorUrlInfo(creator_id=candidate)

    # Otherwise look for the space.bilibili.com/<digits> form anywhere in the text.
    uid_match = re.search(r'space\.bilibili\.com/(\d+)', candidate)
    if uid_match:
        return CreatorUrlInfo(creator_id=uid_match.group(1))

    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
if __name__ == '__main__':
    # Manual smoke test: build a request dict from a sample query string,
    # sign a small payload, then exercise the two URL parsers.
    _img_key = "7cd084941338484aae1ad9425b84077c"
    _sub_key = "4932caff0ff746eab6f01bf08b70ac45"
    _search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"

    # Split "k=v&k=v" into a dict; the first two "="-separated fields are key and value.
    _req_data = {
        fields[0]: fields[1]
        for fields in (pair.split("=") for pair in _search_url.split("&"))
    }
    print("pre req_data", _req_data)

    _signer = BilibiliSign(img_key=_img_key, sub_key=_sub_key)
    _req_data = _signer.sign(req_data={"aid": 170001})
    print(_req_data)

    # Video URL parsing check
    video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
    video_url2 = "BV1d54y1g7db"
    print("视频URL解析测试:")
    for _label, _sample in (("URL1", video_url1), ("URL2", video_url2)):
        print(f"{_label}: {_sample} -> {parse_video_info_from_url(_sample)}")

    # Creator URL parsing check
    creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
    creator_url2 = "20813884"
    print("\n创作者URL解析测试:")
    for _label, _sample in (("URL1", creator_url1), ("URL2", creator_url2)):
        print(f"{_label}: {_sample} -> {parse_creator_info_from_url(_sample)}")