更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -11,42 +11,54 @@
import asyncio
import sys
from typing import Optional
from typing import Dict, Optional, Type
import importlib
import cmd_arg
import config
from database import db
from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
from media_platform.kuaishou import KuaishouCrawler
from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
    """Create platform crawlers, importing each implementation on first use.

    Crawler classes are referenced by dotted path instead of being imported
    eagerly, so a missing optional dependency of one platform only fails
    when that platform is actually requested.
    """

    # Platform key -> dotted "package.module.ClassName" path of its crawler.
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    # Crawler classes that have already been resolved, keyed by platform.
    _cache: Dict[str, Type["AbstractCrawler"]] = {}

    @staticmethod
    def create_crawler(platform: str) -> "AbstractCrawler":
        """Return a new crawler instance for ``platform``.

        Args:
            platform: Key of ``_CRAWLER_PATHS`` (e.g. ``"xhs"``, ``"dy"``).

        Returns:
            A freshly constructed instance of the platform's crawler class.

        Raises:
            ValueError: If ``platform`` is not a known key.
            ModuleNotFoundError: If the crawler module, or one of the modules
                it imports, is not installed. The message carries an install
                hint and the original ``name``/``path`` are preserved.
        """
        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )

        if platform not in CrawlerFactory._cache:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
            except ModuleNotFoundError as exc:
                hint = (
                    "Please install optional dependency 'xhshow' (pip install xhshow) "
                    "or disable the xhs platform."
                    if platform == "xhs" and exc.name == "xhshow"
                    else f"Missing dependency while importing {module_name}"
                )
                # Forward name/path so callers can still inspect which module
                # was missing after the message has been enriched.
                raise ModuleNotFoundError(
                    f"{exc}: {hint}", name=exc.name, path=exc.path
                ) from exc
            # Cache only after a successful lookup, so a failed import is
            # retried on the next call instead of being remembered.
            CrawlerFactory._cache[platform] = getattr(module, class_name)

        return CrawlerFactory._cache[platform]()
# Module-level handle to the active crawler; main() declares it `global`.
# NOTE(review): the assignment itself is outside this view — confirm where it is set.
crawler: Optional[AbstractCrawler] = None
@@ -59,6 +71,12 @@ crawler: Optional[AbstractCrawler] = None
async def main():
# Init crawler
global crawler
# 导入工具模块以初始化日志
from tools import utils
utils.logger.info("=" * 60)
utils.logger.info("MediaCrawler 启动")
utils.logger.info("=" * 60)
# parse cmd
args = await cmd_arg.parse_cmd()