1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
This commit is contained in:
@@ -10,5 +10,4 @@
|
||||
|
||||
|
||||
from .base_config import *
|
||||
from .db_config import *
|
||||
from .tieba_config import *
|
||||
from .db_config import *
|
||||
@@ -9,11 +9,12 @@
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 基础配置
|
||||
PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
|
||||
KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪" # 关键词搜索配置,以英文逗号分隔
|
||||
PLATFORM = "bili" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
|
||||
KEYWORDS = "电影鬼灭之刃,亲属想侵吞3姐妹亡父赔偿款,网警斩断侵害未成年人网络黑色产业链,2007年后出生的人不能在马尔代夫吸烟,沈月,是公主也是自己的骑士,以军虐囚视频,唐朝诡事录,广州地铁回应APP乘车码频繁弹窗广告,全红婵的减肥计划精确到克" # 关键词搜索配置,以英文逗号分隔
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||
COOKIES = ""
|
||||
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||
|
||||
# 是否开启 IP 代理
|
||||
ENABLE_IP_PROXY = False
|
||||
|
||||
@@ -36,7 +37,7 @@ SAVE_LOGIN_STATE = True
|
||||
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
|
||||
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
|
||||
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
|
||||
ENABLE_CDP_MODE = False
|
||||
ENABLE_CDP_MODE = True
|
||||
|
||||
# CDP调试端口,用于与浏览器通信
|
||||
# 如果端口被占用,系统会自动尝试下一个可用端口
|
||||
@@ -59,8 +60,8 @@ BROWSER_LAUNCH_TIMEOUT = 30
|
||||
# 设置为False可以保持浏览器运行,便于调试
|
||||
AUTO_CLOSE_BROWSER = True
|
||||
|
||||
# 数据保存类型选项配置,支持四种类型:csv、db、json、sqlite, 最好保存到DB,有排重的功能。
|
||||
SAVE_DATA_OPTION = "db" # csv or db or json or sqlite
|
||||
# 数据保存类型选项配置,支持五种类型:csv、db、json、sqlite、postgresql, 最好保存到DB,有排重的功能。
|
||||
SAVE_DATA_OPTION = "postgresql" # csv or db or json or sqlite or postgresql
|
||||
|
||||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
@@ -69,7 +70,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
START_PAGE = 1
|
||||
|
||||
# 爬取视频/帖子的数量控制
|
||||
CRAWLER_MAX_NOTES_COUNT = 10
|
||||
CRAWLER_MAX_NOTES_COUNT = 5
|
||||
|
||||
# 并发爬虫数量控制
|
||||
MAX_CONCURRENCY_NUM = 1
|
||||
|
||||
@@ -13,16 +13,23 @@
|
||||
# 每天爬取视频/帖子的数量控制
|
||||
MAX_NOTES_PER_DAY = 1
|
||||
|
||||
# 指定B站视频ID列表
|
||||
# 指定B站视频URL列表 (支持完整URL或BV号)
|
||||
# 示例:
|
||||
# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
|
||||
# - BV号: "BV1d54y1g7db"
|
||||
BILI_SPECIFIED_ID_LIST = [
|
||||
"BV1d54y1g7db",
|
||||
"https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
|
||||
"BV1Sz4y1U77N",
|
||||
"BV14Q4y1n7jz",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定B站用户ID列表
|
||||
# 指定B站创作者URL列表 (支持完整URL或UID)
|
||||
# 示例:
|
||||
# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
|
||||
# - UID: "20813884"
|
||||
BILI_CREATOR_ID_LIST = [
|
||||
"https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
|
||||
"20813884",
|
||||
# ........................
|
||||
]
|
||||
@@ -34,6 +41,11 @@ END_DAY = "2024-01-01"
|
||||
# 搜索模式
|
||||
BILI_SEARCH_MODE = "normal"
|
||||
|
||||
# 视频清晰度(qn)配置,常见取值:
|
||||
# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p高码率, 116=1080p60, 120=4K
|
||||
# 注意:更高清晰度需要账号/视频本身支持
|
||||
BILI_QN = 80
|
||||
|
||||
# 是否爬取用户信息
|
||||
CREATOR_MODE = True
|
||||
|
||||
|
||||
@@ -12,11 +12,19 @@
|
||||
import os
|
||||
|
||||
# mysql config - 使用MindSpider的数据库配置
|
||||
MYSQL_DB_PWD = "mneDccc7sHHANtFk"
|
||||
MYSQL_DB_USER = "root"
|
||||
MYSQL_DB_HOST = "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com"
|
||||
MYSQL_DB_PORT = 3306
|
||||
MYSQL_DB_NAME = "mindspider"
|
||||
# MySQL connection settings.
# Every value can be overridden with an environment variable of the same
# name, so credentials do not have to be committed to the repository. The
# fallbacks are the previous hard-coded local-development values, so an
# environment that sets none of these variables behaves exactly as before.
MYSQL_DB_PWD = os.getenv("MYSQL_DB_PWD", "bettafish")
MYSQL_DB_USER = os.getenv("MYSQL_DB_USER", "bettafish")
MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST", "127.0.0.1")
# os.getenv always returns a string; convert so the port stays an int,
# which is the type this setting had when it was a literal.
MYSQL_DB_PORT = int(os.getenv("MYSQL_DB_PORT", "5444"))
MYSQL_DB_NAME = os.getenv("MYSQL_DB_NAME", "bettafish")

# Connection parameters bundled into one mapping.
mysql_db_config = {
    "user": MYSQL_DB_USER,
    "password": MYSQL_DB_PWD,
    "host": MYSQL_DB_HOST,
    "port": MYSQL_DB_PORT,
    "db_name": MYSQL_DB_NAME,
}
|
||||
|
||||
|
||||
# redis config
|
||||
@@ -30,4 +38,24 @@ CACHE_TYPE_REDIS = "redis"
|
||||
CACHE_TYPE_MEMORY = "memory"
|
||||
|
||||
# sqlite config
|
||||
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
|
||||
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
|
||||
|
||||
sqlite_db_config = {
|
||||
"db_path": SQLITE_DB_PATH
|
||||
}
|
||||
|
||||
# PostgreSQL connection settings.
# Each value is read from the environment variable of the same name and
# falls back to the local-development default given here.
POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "bettafish")
POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "bettafish")
POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "127.0.0.1")
# os.getenv always returns a string; convert so the port is an int whether
# it comes from the environment or from the default. A non-numeric value
# raises ValueError at import time instead of surfacing later at connect.
POSTGRESQL_DB_PORT = int(os.getenv("POSTGRESQL_DB_PORT", "5444"))
POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "bettafish")

# Connection parameters bundled into one mapping.
postgresql_db_config = {
    "user": POSTGRESQL_DB_USER,
    "password": POSTGRESQL_DB_PWD,
    "host": POSTGRESQL_DB_HOST,
    "port": POSTGRESQL_DB_PORT,
    "db_name": POSTGRESQL_DB_NAME,
}
|
||||
|
||||
|
||||
@@ -11,15 +11,27 @@
|
||||
# 抖音平台配置
|
||||
PUBLISH_TIME_TYPE = 0
|
||||
|
||||
# 指定DY视频ID列表
|
||||
# 指定DY视频URL列表 (支持多种格式)
|
||||
# 支持格式:
|
||||
# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
|
||||
# 2. 带modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
|
||||
# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
|
||||
# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
|
||||
# 5. 纯视频ID: "7280854932641664319"
|
||||
DY_SPECIFIED_ID_LIST = [
|
||||
"7280854932641664319",
|
||||
"7202432992642387233",
|
||||
"https://www.douyin.com/video/7525538910311632128",
|
||||
"https://v.douyin.com/drIPtQ_WPWY/",
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525538910311632128",
|
||||
"7202432992642387233",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定DY用户ID列表
|
||||
# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
|
||||
# 支持格式:
|
||||
# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
|
||||
# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
|
||||
DY_CREATOR_ID_LIST = [
|
||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
|
||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
|
||||
# ........................
|
||||
]
|
||||
|
||||
@@ -10,11 +10,22 @@
|
||||
|
||||
# 快手平台配置
|
||||
|
||||
# 指定快手视频ID列表
|
||||
KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
|
||||
# 指定快手视频URL列表 (支持完整URL或纯ID)
|
||||
# 支持格式:
|
||||
# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
|
||||
# 2. 纯视频ID: "3xf8enb8dbj6uig"
|
||||
KS_SPECIFIED_ID_LIST = [
|
||||
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
|
||||
"3xf8enb8dbj6uig",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定快手用户ID列表
|
||||
# 指定快手创作者URL列表 (支持完整URL或纯ID)
|
||||
# 支持格式:
|
||||
# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
|
||||
# 2. 纯user_id: "3x4sm73aye7jq7i"
|
||||
KS_CREATOR_ID_LIST = [
|
||||
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
|
||||
"3x4sm73aye7jq7i",
|
||||
# ........................
|
||||
]
|
||||
|
||||
@@ -17,12 +17,16 @@ SORT_TYPE = "popularity_descending"
|
||||
|
||||
# 指定笔记URL列表, 必须要携带xsec_token参数
|
||||
XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
"https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定用户ID列表
|
||||
# 指定创作者URL列表 (支持完整URL或纯ID)
|
||||
# 支持格式:
|
||||
# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
|
||||
# 2. 纯user_id: "63e36c9a000000002703502b"
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
"63e36c9a000000002703502b",
|
||||
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
|
||||
"63e36c9a000000002703502b",
|
||||
# ........................
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user