更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -11,42 +11,54 @@
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Dict, Optional, Type
|
||||
|
||||
import importlib
|
||||
|
||||
import cmd_arg
|
||||
import config
|
||||
from database import db
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from media_platform.bilibili import BilibiliCrawler
|
||||
from media_platform.douyin import DouYinCrawler
|
||||
from media_platform.kuaishou import KuaishouCrawler
|
||||
from media_platform.tieba import TieBaCrawler
|
||||
from media_platform.weibo import WeiboCrawler
|
||||
from media_platform.xhs import XiaoHongShuCrawler
|
||||
from media_platform.zhihu import ZhihuCrawler
|
||||
from tools.async_file_writer import AsyncFileWriter
|
||||
from var import crawler_type_var
|
||||
|
||||
|
||||
class CrawlerFactory:
    """Build crawler instances from a platform key, importing lazily.

    Each platform's crawler module is imported only the first time that
    platform is requested, so a missing dependency of one platform does
    not prevent the others from being used.
    """

    # Platform key -> dotted "package.module.ClassName" path of its crawler.
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    # Crawler classes that have already been imported, keyed by platform.
    # The annotation is a string so it is not evaluated at definition time.
    _cache: "Dict[str, Type[AbstractCrawler]]" = {}

    @staticmethod
    def create_crawler(platform: str) -> "AbstractCrawler":
        """Return a new crawler instance for ``platform``.

        Args:
            platform: One of the keys of ``_CRAWLER_PATHS``
                (e.g. ``"xhs"``, ``"dy"``, ``"bili"``).

        Returns:
            A freshly constructed instance of the platform's crawler class.

        Raises:
            ValueError: If ``platform`` is not a known platform key.
            ModuleNotFoundError: If importing the crawler module fails
                because a module is missing. The message carries an
                install hint, and ``name`` / ``path`` of the original
                error are preserved on the re-raised exception.
        """
        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )

        crawler_class = CrawlerFactory._cache.get(platform)
        if crawler_class is None:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
                crawler_class = getattr(module, class_name)
            except ModuleNotFoundError as exc:
                if platform == "xhs" and exc.name == "xhshow":
                    hint = (
                        "Please install optional dependency 'xhshow' (pip install xhshow) "
                        "or disable the xhs platform."
                    )
                else:
                    hint = f"Missing dependency while importing {module_name}"
                # Keep ``name``/``path`` so callers can still inspect which
                # module was missing after the message has been extended.
                raise ModuleNotFoundError(
                    f"{exc}: {hint}", name=exc.name, path=exc.path
                ) from exc
            # Cache only after a successful import so failures are retried.
            CrawlerFactory._cache[platform] = crawler_class

        return crawler_class()
|
||||
|
||||
|
||||
crawler: Optional[AbstractCrawler] = None
|
||||
@@ -59,6 +71,12 @@ crawler: Optional[AbstractCrawler] = None
|
||||
async def main():
|
||||
# Init crawler
|
||||
global crawler
|
||||
|
||||
# 导入工具模块以初始化日志
|
||||
from tools import utils
|
||||
utils.logger.info("=" * 60)
|
||||
utils.logger.info("MediaCrawler 启动")
|
||||
utils.logger.info("=" * 60)
|
||||
|
||||
# parse cmd
|
||||
args = await cmd_arg.parse_cmd()
|
||||
|
||||
Reference in New Issue
Block a user