# File: bettafish-company/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py
# 131 lines · 4.4 KiB · Python

# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio
import sys
from typing import Dict, Optional, Type
import importlib
import cmd_arg
import config
from database import db
from base.base_crawler import AbstractCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
    """Factory that lazily imports and instantiates platform crawlers.

    Crawler classes are looked up by platform key in ``_CRAWLER_PATHS``,
    imported with ``importlib`` on first use and memoised in ``_cache`` so
    each platform module is imported at most once through this factory.
    """

    # Platform key -> "<dotted module path>.<crawler class name>".
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    # Crawler classes that have already been imported, keyed by platform.
    _cache: Dict[str, Type["AbstractCrawler"]] = {}

    @staticmethod
    def create_crawler(platform: str) -> "AbstractCrawler":
        """Return a new crawler instance for ``platform``.

        Args:
            platform: One of the keys of ``_CRAWLER_PATHS``.

        Returns:
            A freshly constructed instance of the platform's crawler class
            (the class is cached, the instance is not).

        Raises:
            ValueError: If ``platform`` has no entry in ``_CRAWLER_PATHS``.
            ModuleNotFoundError: If importing the crawler module fails
                because a module is missing. The message carries a hint and
                the ``name``/``path`` attributes of the original error are
                preserved; the original error is chained as ``__cause__``.
        """
        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )

        if platform not in CrawlerFactory._cache:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
            except ModuleNotFoundError as exc:
                if platform == "xhs" and exc.name == "xhshow":
                    hint = (
                        "Please install optional dependency 'xhshow' (pip install xhshow) "
                        "or disable the xhs platform."
                    )
                else:
                    hint = f"Missing dependency while importing {module_name}"
                # Forward name/path so callers that inspect the exception can
                # still tell which module was missing.
                raise ModuleNotFoundError(
                    f"{exc}: {hint}", name=exc.name, path=exc.path
                ) from exc
            # A missing class surfaces as AttributeError, as before.
            CrawlerFactory._cache[platform] = getattr(module, class_name)

        return CrawlerFactory._cache[platform]()
# Module-level handle to the crawler instance; main() rebinds it via
# ``global crawler``.
crawler: Optional[AbstractCrawler] = None
# persist-1<persist1@126.com>
# Reason: add the --init_db feature, used to initialise the database.
# Side effects: none.
# Rollback strategy: revert this file.
async def main():
    """Parse the command line, then either initialise the database or crawl.

    When the parsed arguments carry a truthy ``init_db`` value, the database
    is initialised and the coroutine returns without crawling. Otherwise a
    crawler for ``config.PLATFORM`` is created and started, an optional word
    cloud is generated, and the crawler is closed in all cases.
    """
    global crawler

    # Import the tools module to initialise logging.
    from tools import utils

    banner = "=" * 60
    utils.logger.info(banner)
    utils.logger.info("MediaCrawler 启动")
    utils.logger.info(banner)

    cli_args = await cmd_arg.parse_cmd()

    # Database initialisation mode: set up the schema and stop here.
    db_target = cli_args.init_db
    if db_target:
        await db.init_db(db_target)
        print(f"Database {db_target} initialized successfully.")
        return

    crawler = None
    try:
        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
        await crawler.start()
        await _generate_wordcloud_if_enabled()
    finally:
        # Make sure the browser is closed once crawling has finished.
        if crawler:
            try:
                await crawler.close()
                print("[MediaCrawler] 浏览器已关闭")
            except Exception as close_err:
                print(f"[MediaCrawler] 关闭浏览器时出错: {close_err}")


async def _generate_wordcloud_if_enabled():
    """Build the comment word cloud after a crawl; JSON save mode only."""
    wordcloud_wanted = (
        config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD
    )
    if not wordcloud_wanted:
        return
    try:
        writer = AsyncFileWriter(
            platform=config.PLATFORM,
            crawler_type=crawler_type_var.get(),
        )
        await writer.generate_wordcloud_from_comments()
    except Exception as wordcloud_err:
        # Best effort: a word-cloud failure is reported but not re-raised.
        print(f"Error generating wordcloud: {wordcloud_err}")
def cleanup():
    """Close the database connection when a database save mode is active.

    ``crawler.close()`` is already awaited in the ``finally`` block of
    ``main()``, so only the database is handled here. Any error raised
    while closing is printed rather than propagated.
    """
    if config.SAVE_DATA_OPTION not in ("db", "sqlite", "postgresql"):
        return
    try:
        asyncio.run(db.close())
    except Exception as close_err:
        print(f"[MediaCrawler] 关闭数据库连接时出错: {close_err}")
if __name__ == "__main__":
    # asyncio.get_event_loop() with no current loop is deprecated and raises
    # RuntimeError on newer Python versions, so create and install the loop
    # explicitly. The loop is intentionally left open, as before, because
    # cleanup() starts its own loop through asyncio.run().
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        # Always release the database connection, even if main() raised.
        cleanup()