131 lines
4.4 KiB
Python
131 lines
4.4 KiB
Python
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
|
# 1. 不得用于任何商业用途。
|
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
|
# 5. 不得用于任何非法或不当的用途。
|
|
#
|
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
|
|
|
|
|
import asyncio
|
|
import sys
|
|
from typing import Dict, Optional, Type
|
|
|
|
import importlib
|
|
|
|
import cmd_arg
|
|
import config
|
|
from database import db
|
|
from base.base_crawler import AbstractCrawler
|
|
from tools.async_file_writer import AsyncFileWriter
|
|
from var import crawler_type_var
|
|
|
|
|
|
class CrawlerFactory:
    """Create crawler instances for the registered media platforms.

    Crawler classes are imported lazily, the first time a platform is
    requested, and the resolved class is cached so that later calls for the
    same platform skip the import.
    """

    # Platform key -> dotted "package.module.ClassName" path of its crawler.
    _CRAWLER_PATHS = {
        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
        "dy": "media_platform.douyin.DouYinCrawler",
        "ks": "media_platform.kuaishou.KuaishouCrawler",
        "bili": "media_platform.bilibili.BilibiliCrawler",
        "wb": "media_platform.weibo.WeiboCrawler",
        "tieba": "media_platform.tieba.TieBaCrawler",
        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
    # Platform key -> crawler class already resolved by create_crawler().
    _cache: Dict[str, Type["AbstractCrawler"]] = {}

    @staticmethod
    def create_crawler(platform: str) -> "AbstractCrawler":
        """Return a new crawler instance for *platform*.

        Args:
            platform: A key of ``_CRAWLER_PATHS`` (for example ``"xhs"``).

        Returns:
            A fresh instance of the crawler class registered for *platform*,
            constructed without arguments.

        Raises:
            ValueError: If *platform* is not a registered key.
            ModuleNotFoundError: If a module is missing while importing the
                crawler. The original error is chained, and its ``name`` and
                ``path`` attributes are carried over to the new exception.
        """
        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
        if path is None:
            # Build the list from the registry so the message cannot go stale
            # when platforms are added or removed.
            supported = ", ".join(CrawlerFactory._CRAWLER_PATHS)
            raise ValueError(
                f"Invalid Media Platform {platform!r}. "
                f"Currently supported: {supported}"
            )

        if platform not in CrawlerFactory._cache:
            module_name, class_name = path.rsplit(".", 1)
            try:
                module = importlib.import_module(module_name)
                crawler_class = getattr(module, class_name)
            except ModuleNotFoundError as exc:
                if platform == "xhs" and exc.name == "xhshow":
                    hint = (
                        "Please install optional dependency 'xhshow' (pip install xhshow) "
                        "or disable the xhs platform."
                    )
                else:
                    hint = f"Missing dependency while importing {module_name}"
                # Keep name/path so callers can still tell which module was
                # missing without parsing the message.
                raise ModuleNotFoundError(
                    f"{exc}: {hint}", name=exc.name, path=exc.path
                ) from exc
            # Only cache after a fully successful import + attribute lookup.
            CrawlerFactory._cache[platform] = crawler_class

        return CrawlerFactory._cache[platform]()
|
|
|
|
|
|
# Module-level reference to the crawler instance; main() rebinds it through
# `global crawler` when it creates a crawler.
crawler: Optional[AbstractCrawler] = None
|
|
|
|
|
|
# persist-1<persist1@126.com>
# Reason: add the --init_db option, used to initialise the database.
# Side effects: none.
# Rollback strategy: revert this file.
async def main():
    """Run one crawl (or one database initialisation) from start to finish.

    Steps, in order:
      1. Log a start-up banner.
      2. Parse the command line with ``cmd_arg.parse_cmd()``.
      3. If ``args.init_db`` is set, initialise that database and return
         without crawling.
      4. Otherwise create the crawler for ``config.PLATFORM`` and start it.
      5. When saving as JSON with word clouds enabled, generate the word
         cloud; a failure here is printed and does not abort the run.
      6. Always try to close the crawler, even if an earlier step raised.
    """
    global crawler

    # Imported here rather than at module top; per the original author's
    # note, importing the tools module is what initialises logging.
    from tools import utils

    banner_rule = "=" * 60
    utils.logger.info(banner_rule)
    utils.logger.info("MediaCrawler 启动")
    utils.logger.info(banner_rule)

    args = await cmd_arg.parse_cmd()

    # Database initialisation is a standalone mode: do it and stop.
    if args.init_db:
        await db.init_db(args.init_db)
        print(f"Database {args.init_db} initialized successfully.")
        return

    crawler = None
    try:
        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
        await crawler.start()

        # The word cloud is built after crawling, and only in JSON save mode.
        wordcloud_requested = (
            config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD
        )
        if wordcloud_requested:
            try:
                writer = AsyncFileWriter(
                    platform=config.PLATFORM,
                    crawler_type=crawler_type_var.get(),
                )
                await writer.generate_wordcloud_from_comments()
            except Exception as wordcloud_error:
                # Best effort: report the problem and carry on.
                print(f"Error generating wordcloud: {wordcloud_error}")
    finally:
        # Make sure the browser is closed once the crawler has finished.
        if crawler:
            try:
                await crawler.close()
                print(f"[MediaCrawler] 浏览器已关闭")
            except Exception as close_error:
                print(f"[MediaCrawler] 关闭浏览器时出错: {close_error}")
|
|
|
|
|
|
def cleanup():
    """Close the database connection if the save mode uses a database.

    Note: ``crawler.close()`` is already called in the ``finally`` block of
    ``main()``, so only database shutdown is handled here. Any error raised
    while closing is printed instead of propagated.
    """
    database_save_modes = ("db", "sqlite", "postgresql")
    if config.SAVE_DATA_OPTION not in database_save_modes:
        return

    try:
        asyncio.run(db.close())
    except Exception as close_error:
        print(f"[MediaCrawler] 关闭数据库连接时出错: {close_error}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Create and install the event loop explicitly. Calling
    # asyncio.get_event_loop() with no current loop is deprecated and raises
    # RuntimeError on Python 3.14; this is the same setup it used to perform
    # implicitly, so behaviour on older versions is unchanged.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        # Runs even when main() raises or is interrupted.
        cleanup()
|