# bettafish-company/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py

# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


import asyncio
import sys
from typing import Dict, Optional, Type

import importlib

import cmd_arg
import config
from database import db
from base.base_crawler import AbstractCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var


class CrawlerFactory:
    """Create crawler instances by platform key, importing them on demand.

    The crawler module for a platform is imported the first time that
    platform is requested; the resolved class is then kept in ``_cache`` so
    later calls only instantiate it.
    """

    # Platform key -> dotted "module.ClassName" path of the crawler class.
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    # Crawler classes that have already been imported, keyed by platform.
    _cache: Dict[str, Type[AbstractCrawler]] = {}

    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
        """Return a new crawler instance for ``platform``.

        Args:
            platform: A key of ``_CRAWLER_PATHS`` (for example ``"xhs"``).

        Returns:
            A fresh instance of the crawler class registered for the platform.

        Raises:
            ValueError: If ``platform`` has no entry in ``_CRAWLER_PATHS``.
            ModuleNotFoundError: If importing the crawler module fails on a
                missing module; the message is extended with a hint and the
                original exception is chained as the cause.
        """
        dotted_path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if not dotted_path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )

        registry = CrawlerFactory._cache
        if platform in registry:
            return registry[platform]()

        # Everything before the last dot is the module, the rest the class.
        module_name, _, class_name = dotted_path.rpartition(".")
        try:
            crawler_cls = getattr(importlib.import_module(module_name), class_name)
        except ModuleNotFoundError as exc:
            if platform == "xhs" and exc.name == "xhshow":
                hint = (
                    "Please install optional dependency 'xhshow' (pip install xhshow) "
                    "or disable the xhs platform."
                )
            else:
                hint = f"Missing dependency while importing {module_name}"
            raise ModuleNotFoundError(f"{exc}: {hint}") from exc

        registry[platform] = crawler_cls
        return crawler_cls()


# Module-level handle to the current crawler; main() rebinds it via `global`.
crawler: Optional[AbstractCrawler] = None


# persist-1<persist1@126.com>
# Reason: add the --init_db option, used to initialize the database.
# Side effects: none.
# Rollback strategy: revert this file.
async def main():
    """Run one crawl session, or initialize the database and return.

    The command line is parsed first. When ``init_db`` is set on the parsed
    arguments, only ``db.init_db`` is awaited and the function returns.
    Otherwise a crawler for ``config.PLATFORM`` is created and started, and,
    if the save mode is ``"json"`` and word clouds are enabled, a word cloud
    is generated from the comments afterwards.

    The crawler is closed in a ``finally`` block. Failures while generating
    the word cloud or closing the crawler are printed rather than raised.
    """
    global crawler

    # Import the tools module here so that logging gets initialized.
    from tools import utils

    banner = "=" * 60
    utils.logger.info(banner)
    utils.logger.info("MediaCrawler 启动")
    utils.logger.info(banner)

    # Parse command-line arguments.
    cli_args = await cmd_arg.parse_cmd()

    # Database initialization short-circuits the crawl entirely.
    if cli_args.init_db:
        await db.init_db(cli_args.init_db)
        print(f"Database {cli_args.init_db} initialized successfully.")
        return

    crawler = None
    try:
        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
        await crawler.start()

        # The word cloud is produced after crawling, and only in JSON save mode.
        wordcloud_wanted = (
            config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD
        )
        if wordcloud_wanted:
            try:
                writer = AsyncFileWriter(
                    platform=config.PLATFORM,
                    crawler_type=crawler_type_var.get(),
                )
                await writer.generate_wordcloud_from_comments()
            except Exception as err:
                print(f"Error generating wordcloud: {err}")
    finally:
        # Make sure the browser is closed once the crawler is done.
        if crawler:
            try:
                await crawler.close()
                print("[MediaCrawler] 浏览器已关闭")
            except Exception as err:
                print(f"[MediaCrawler] 关闭浏览器时出错: {err}")


def cleanup():
    """Close the database connection when a database save mode is configured.

    ``crawler.close()`` is already awaited in the ``finally`` block of
    ``main()``, so only the database is handled here. Errors raised while
    closing are printed rather than propagated.
    """
    database_modes = ("db", "sqlite", "postgresql")
    if config.SAVE_DATA_OPTION not in database_modes:
        return

    try:
        asyncio.run(db.close())
    except Exception as err:
        print(f"[MediaCrawler] 关闭数据库连接时出错: {err}")


if __name__ == "__main__":
    # Use asyncio.run() rather than asyncio.get_event_loop().run_until_complete():
    # get_event_loop() relies on implicit event-loop creation, which is
    # deprecated and raises RuntimeError on Python 3.14+ when no loop is set.
    # asyncio.run() creates a fresh loop, runs main() to completion and closes
    # the loop afterwards.
    try:
        asyncio.run(main())
    finally:
        # cleanup() runs even when main() raises or is interrupted.
        cleanup()