1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loggru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
@@ -15,7 +15,7 @@ from typing import Optional
import cmd_arg
import config
import db
from database import db
from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
@@ -50,20 +52,40 @@ class CrawlerFactory:
crawler: Optional[AbstractCrawler] = None
# persist-1<persist1@126.com>
# 原因:增加 --init_db 功能,用于数据库初始化。
# 副作用:无
# 回滚策略:还原此文件。
async def main():
# Init crawler
global crawler
# parse cmd
await cmd_arg.parse_cmd()
args = await cmd_arg.parse_cmd()
# init db
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
await db.init_db()
if args.init_db:
await db.init_db(args.init_db)
print(f"Database {args.init_db} initialized successfully.")
return # Exit the main function cleanly
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
# Generate wordcloud after crawling is complete
# Only for JSON save mode
if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
try:
file_writer = AsyncFileWriter(
platform=config.PLATFORM,
crawler_type=crawler_type_var.get()
)
await file_writer.generate_wordcloud_from_comments()
except Exception as e:
print(f"Error generating wordcloud: {e}")
def cleanup():
if crawler: