# Disclaimer: this code is provided for learning and research purposes only.
# Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Use must comply with the target platform's terms of service and
#    robots.txt rules.
# 3. It must not be used for large-scale crawling or to disrupt the
#    platform's operation.
# 4. Request frequency should be kept reasonable to avoid placing an
#    unnecessary burden on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the detailed license terms.
# Using this code means you agree to the principles above and to all terms
# in the LICENSE.

import asyncio
import importlib
import sys
from typing import Dict, Optional, Type

import cmd_arg
import config
from database import db
from base.base_crawler import AbstractCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var


class CrawlerFactory:
    """Create crawler instances by platform key, importing them lazily."""

    # Platform key -> dotted "module.ClassName" path of the crawler class.
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }

    # Crawler classes that have already been imported, keyed by platform.
    _cache: Dict[str, Type[AbstractCrawler]] = {}

    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
        """Return a new crawler instance for *platform*.

        The crawler module is imported on first use and the resolved class
        is cached, so later calls for the same platform skip the import.

        Args:
            platform: One of the keys of ``_CRAWLER_PATHS``.

        Returns:
            A freshly constructed crawler (the class is called with no
            arguments).

        Raises:
            ValueError: If *platform* is not a known key.
            ModuleNotFoundError: If importing the crawler module fails
                because a module is missing. The message carries a hint and
                the original ``name``/``path`` attributes are preserved.
        """
        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )

        crawler_class = CrawlerFactory._cache.get(platform)
        if crawler_class is None:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
                crawler_class = getattr(module, class_name)
            except ModuleNotFoundError as exc:
                if platform == "xhs" and exc.name == "xhshow":
                    hint = (
                        "Please install optional dependency 'xhshow' (pip install xhshow) "
                        "or disable the xhs platform."
                    )
                else:
                    hint = f"Missing dependency while importing {module_name}"
                # Forward name/path so callers can still inspect which
                # module was missing on the re-raised exception.
                raise ModuleNotFoundError(
                    f"{exc}: {hint}", name=exc.name, path=exc.path
                ) from exc
            CrawlerFactory._cache[platform] = crawler_class

        return crawler_class()


# Module-level handle to the crawler created by main(); None until then.
crawler: Optional[AbstractCrawler] = None


# persist-1
# Reason: add the --init_db option, used to initialize the database.
# Side effects: none.
# Rollback strategy: revert this file.
async def main():
    """Parse the command line, then either initialize the DB or run a crawl.

    When ``args.init_db`` is set, the database is initialized and the
    function returns without crawling. Otherwise a crawler for
    ``config.PLATFORM`` is created and started; if the save mode is
    ``"json"`` and ``config.ENABLE_GET_WORDCLOUD`` is truthy, a word cloud is
    generated from the comments afterwards. The crawler is closed in a
    ``finally`` block whether or not the crawl succeeded.
    """
    # Init crawler
    global crawler

    # Import the tools module here so that logging gets initialized.
    from tools import utils

    utils.logger.info("=" * 60)
    utils.logger.info("MediaCrawler 启动")
    utils.logger.info("=" * 60)

    # parse cmd
    args = await cmd_arg.parse_cmd()

    # init db
    if args.init_db:
        await db.init_db(args.init_db)
        print(f"Database {args.init_db} initialized successfully.")
        return  # Exit the main function cleanly

    crawler = None
    try:
        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
        await crawler.start()

        # Generate wordcloud after crawling is complete.
        # Only for JSON save mode.
        if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
            try:
                file_writer = AsyncFileWriter(
                    platform=config.PLATFORM,
                    crawler_type=crawler_type_var.get(),
                )
                await file_writer.generate_wordcloud_from_comments()
            except Exception as e:
                # Best effort: a word-cloud failure must not fail the crawl.
                print(f"Error generating wordcloud: {e}")
    finally:
        # Make sure the browser is closed once the crawler is finished.
        if crawler:
            try:
                await crawler.close()
                print("[MediaCrawler] 浏览器已关闭")
            except Exception as e:
                print(f"[MediaCrawler] 关闭浏览器时出错: {e}")


def cleanup():
    """Close the database connection when a database save mode is active.

    ``crawler.close()`` is already awaited in the ``finally`` block of
    ``main()``, so only the database is handled here. Errors while closing
    are printed rather than raised.
    """
    if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
        try:
            asyncio.run(db.close())
        except Exception as e:
            print(f"[MediaCrawler] 关闭数据库连接时出错: {e}")


if __name__ == "__main__":
    try:
        # asyncio.run() replaces get_event_loop().run_until_complete():
        # get_event_loop() without a current loop is deprecated and raises
        # RuntimeError on newer Python versions.
        asyncio.run(main())
    finally:
        cleanup()