class DatabaseManager:
    """Store and query daily news rows and daily topic analyses in MySQL.

    The connection is opened eagerly in ``__init__`` from the settings in the
    project ``config`` module, with ``autocommit=True`` and ``DictCursor`` so
    rows come back as dictionaries.  The instance is a context manager;
    leaving the ``with`` block closes the connection.
    """

    def __init__(self):
        """Create the manager and connect to the database immediately."""
        self.connection = None
        self.connect()

    def connect(self):
        """Open the PyMySQL connection described by ``config``.

        Raises:
            Exception: whatever ``pymysql.connect`` raised; it is reported on
                stdout and then re-raised unchanged.
        """
        try:
            self.connection = pymysql.connect(
                host=config.DB_HOST,
                port=config.DB_PORT,
                user=config.DB_USER,
                password=config.DB_PASSWORD,
                database=config.DB_NAME,
                charset=config.DB_CHARSET,
                autocommit=True,
                cursorclass=DictCursor
            )
            print(f"成功连接到数据库: {config.DB_NAME}")
        except Exception as e:
            print(f"数据库连接失败: {e}")
            raise

    def close(self):
        """Close the connection if one is open.

        The handle is cleared before closing so a second call (for example an
        explicit ``close()`` followed by ``__exit__``) is a no-op instead of
        closing an already-closed connection.
        """
        connection, self.connection = self.connection, None
        if connection is not None:
            connection.close()
            print("数据库连接已关闭")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    @staticmethod
    def _decode_keywords(raw) -> List[str]:
        """Decode the JSON text stored in ``daily_topics.keywords``.

        Returns an empty list when the value is missing or is not valid JSON,
        so a single bad row does not make a whole query fail.
        """
        if raw is None or raw == '':
            return []
        if isinstance(raw, (list, tuple)):
            # Already decoded; pass it through as a list.
            return list(raw)
        try:
            return json.loads(raw)
        except (TypeError, ValueError) as e:
            print(f"解析关键词JSON失败: {e}")
            return []

    # ==================== News rows ====================

    def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int:
        """Replace all news rows of one day with ``news_data``.

        The delete of the existing rows and the inserts of the new ones run in
        one explicit transaction, so a failure part-way through rolls back and
        leaves the previous rows of that day in place.  A single item that
        fails to insert is reported and skipped.

        Args:
            news_data: news items; the keys read are ``source``, ``id``,
                ``rank``, ``title`` and ``url``.
            crawl_date: day the rows belong to; defaults to today.

        Returns:
            Number of rows inserted, or 0 when the whole operation failed.
        """
        if crawl_date is None:
            crawl_date = date.today()

        current_timestamp = int(datetime.now().timestamp())
        delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
        insert_query = """
            INSERT INTO daily_news (
                news_id, source_platform, title, url, crawl_date,
                rank_position, add_ts
            ) VALUES (%s, %s, %s, %s, %s, %s, %s)
        """

        try:
            # The connection runs in autocommit mode, so open an explicit
            # transaction to make "delete then insert" atomic.
            self.connection.begin()
            saved_count = 0

            with self.connection.cursor() as cursor:
                # Overwrite mode: drop whatever is stored for this day first.
                deleted_count = cursor.execute(delete_query, (crawl_date,))
                if deleted_count > 0:
                    print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")

                for news_item in news_data:
                    try:
                        source = news_item.get('source', 'unknown')
                        # NOTE(review): NewsCollector._process_news_item already
                        # builds ids as "<source>_<id>", so ids stored here end
                        # up double-prefixed.  Kept unchanged so new rows stay
                        # consistent with rows that are already stored.
                        item_id = news_item.get('id', news_item.get('rank', 0))
                        news_id = f"{source}_{item_id}"

                        cursor.execute(insert_query, (
                            news_id,
                            source,
                            news_item.get('title', ''),
                            news_item.get('url', ''),
                            crawl_date,
                            news_item.get('rank', None),
                            current_timestamp
                        ))
                        saved_count += 1
                    except Exception as e:
                        print(f"保存单条新闻失败: {e}")
                        continue

            self.connection.commit()
            print(f"成功保存 {saved_count} 条新闻记录")
            return saved_count

        except Exception as e:
            print(f"保存新闻数据失败: {e}")
            try:
                self.connection.rollback()
            except Exception as rollback_error:
                # The connection may be gone entirely; nothing left to undo.
                print(f"回滚失败: {rollback_error}")
            return 0

    def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
        """Return the news rows of one day ordered by ``rank_position``.

        Args:
            crawl_date: day to read; defaults to today.

        Database errors are not caught here and propagate to the caller.
        """
        if crawl_date is None:
            crawl_date = date.today()

        query = """
            SELECT * FROM daily_news
            WHERE crawl_date = %s
            ORDER BY rank_position ASC
        """

        with self.connection.cursor() as cursor:
            cursor.execute(query, (crawl_date,))
            return cursor.fetchall()

    # ==================== Topic rows ====================

    def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool:
        """Insert or update the topic analysis of one day.

        Args:
            keywords: topic keywords; stored as a JSON array.
            summary: analysis summary text.
            extract_date: day the analysis belongs to; defaults to today.

        Returns:
            True when the row was written, False when any error occurred.
        """
        if extract_date is None:
            extract_date = date.today()

        current_timestamp = int(datetime.now().timestamp())

        try:
            keywords_json = json.dumps(keywords, ensure_ascii=False)

            with self.connection.cursor() as cursor:
                # One row per day: update it when present, insert otherwise.
                check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
                cursor.execute(check_query, (extract_date,))
                existing = cursor.fetchone()

                if existing:
                    update_query = """
                        UPDATE daily_topics
                        SET keywords = %s, summary = %s, add_ts = %s
                        WHERE extract_date = %s
                    """
                    cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
                    print(f"更新了 {extract_date} 的话题分析")
                else:
                    insert_query = """
                        INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
                        VALUES (%s, %s, %s, %s)
                    """
                    cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
                    print(f"保存了 {extract_date} 的话题分析")

            return True

        except Exception as e:
            print(f"保存话题分析失败: {e}")
            return False

    def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
        """Return the topic analysis of one day with ``keywords`` decoded.

        Args:
            extract_date: day to read; defaults to today.

        Returns:
            The row as a dict, or None when there is no row or the query failed.
        """
        if extract_date is None:
            extract_date = date.today()

        try:
            with self.connection.cursor() as cursor:
                query = "SELECT * FROM daily_topics WHERE extract_date = %s"
                cursor.execute(query, (extract_date,))
                result = cursor.fetchone()

            if not result:
                return None

            result['keywords'] = self._decode_keywords(result['keywords'])
            return result

        except Exception as e:
            print(f"获取话题分析失败: {e}")
            return None

    def get_recent_topics(self, days: int = 7) -> List[Dict]:
        """Return the topic analyses of the last ``days`` days, newest first.

        Returns:
            Rows with ``keywords`` decoded, or an empty list on failure.
        """
        try:
            with self.connection.cursor() as cursor:
                query = """
                    SELECT * FROM daily_topics
                    WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                    ORDER BY extract_date DESC
                """
                cursor.execute(query, (days,))
                results = cursor.fetchall()

            for result in results:
                result['keywords'] = self._decode_keywords(result['keywords'])

            return results

        except Exception as e:
            print(f"获取最近话题分析失败: {e}")
            return []

    # ==================== Statistics ====================

    def get_summary_stats(self, days: int = 7) -> Dict:
        """Return per-day news counts and topic rows for the last ``days`` days.

        Returns:
            ``{'news_stats': [...], 'topics_stats': [...]}``; both lists are
            empty when the queries fail.
        """
        try:
            with self.connection.cursor() as cursor:
                news_query = """
                    SELECT
                        crawl_date,
                        COUNT(*) as news_count,
                        COUNT(DISTINCT source_platform) as platforms_count
                    FROM daily_news
                    WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                    GROUP BY crawl_date
                    ORDER BY crawl_date DESC
                """
                cursor.execute(news_query, (days,))
                news_stats = cursor.fetchall()

                topics_query = """
                    SELECT
                        extract_date,
                        keywords,
                        CHAR_LENGTH(summary) as summary_length
                    FROM daily_topics
                    WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                    ORDER BY extract_date DESC
                """
                cursor.execute(topics_query, (days,))
                topics_stats = cursor.fetchall()

            return {
                'news_stats': news_stats,
                'topics_stats': topics_stats
            }

        except Exception as e:
            print(f"获取统计摘要失败: {e}")
            return {'news_stats': [], 'topics_stats': []}
class NewsCollector:
    """Fetch hot-list news from the news API and store it via DatabaseManager."""

    def __init__(self):
        """Open the database manager and record the supported source ids."""
        self.db_manager = DatabaseManager()
        self.supported_sources = list(SOURCE_NAMES.keys())

    def close(self):
        """Close the underlying database manager."""
        if self.db_manager:
            self.db_manager.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self.close()

    # ==================== News API ====================

    async def fetch_news(self, source: str) -> dict:
        """Fetch the latest items of one source.

        Never raises: every failure is turned into a result dict.

        Returns:
            A dict with ``source``, ``status`` and ``timestamp``; on success
            ``status`` is ``"success"`` and ``data`` holds the decoded JSON,
            otherwise ``status`` is ``"timeout"``, ``"http_error"`` or
            ``"error"`` and ``error`` holds a message.
        """
        url = f"{BASE_URL}/api/s?id={source}&latest"
        headers = {"Accept": "application/json"}

        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(url, headers=headers)
                response.raise_for_status()

                data = json.loads(response.text)
                return {
                    "source": source,
                    "status": "success",
                    "data": data,
                    "timestamp": datetime.now().isoformat()
                }
        except httpx.TimeoutException:
            return {
                "source": source,
                "status": "timeout",
                "error": "请求超时",
                "timestamp": datetime.now().isoformat()
            }
        except httpx.HTTPStatusError as e:
            return {
                "source": source,
                "status": "http_error",
                "error": f"HTTP错误: {e.response.status_code}",
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            return {
                "source": source,
                "status": "error",
                "error": f"未知错误: {str(e)}",
                "timestamp": datetime.now().isoformat()
            }

    async def get_popular_news(self, sources: List[str] = None) -> List[dict]:
        """Fetch every source in turn and return the list of result dicts.

        Args:
            sources: source ids; None means every key of ``SOURCE_NAMES``.
        """
        if sources is None:
            sources = list(SOURCE_NAMES.keys())

        print(f"正在获取 {len(sources)} 个新闻源的最新内容...")
        print("=" * 80)

        results = []
        for source in sources:
            source_name = SOURCE_NAMES.get(source, source)
            print(f"正在获取 {source_name} 的新闻...")
            result = await self.fetch_news(source)
            results.append(result)

            if result["status"] == "success":
                data = result["data"]
                # The payload is external input: it may be null or a non-dict.
                items = data.get('items') if isinstance(data, dict) else None
                if isinstance(items, list):
                    print(f"✓ {source_name}: 获取成功,共 {len(items)} 条新闻")
                else:
                    print(f"✓ {source_name}: 获取成功")
            else:
                print(f"✗ {source_name}: {result.get('error', '获取失败')}")

            # Pause between requests to avoid hitting the API too quickly.
            await asyncio.sleep(0.5)

        return results

    # ==================== Processing and storage ====================

    async def collect_and_save_news(self, sources: Optional[List[str]] = None) -> Dict:
        """Collect today's hot news and store it, replacing today's rows.

        Args:
            sources: source ids; None means every key of ``SOURCE_NAMES``.

        Returns:
            The processed-result dict (see ``_process_news_results``) plus
            ``saved_count`` when something was stored; on an unexpected error
            a dict with ``success`` False, ``error``, an empty ``news_list``
            and ``total_news`` 0.
        """
        print("开始收集每日热点新闻...")
        print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        if sources is None:
            sources = list(SOURCE_NAMES.keys())

        print(f"将从 {len(sources)} 个新闻源收集数据:")
        for source in sources:
            source_name = SOURCE_NAMES.get(source, source)
            print(f" - {source_name}")

        try:
            results = await self.get_popular_news(sources)
            processed_data = self._process_news_results(results)

            # Only touch the database when there is something to store, so an
            # empty fetch does not wipe the rows already saved for today.
            if processed_data['news_list']:
                saved_count = self.db_manager.save_daily_news(
                    processed_data['news_list'],
                    date.today()
                )
                processed_data['saved_count'] = saved_count

            self._print_collection_summary(processed_data)

            return processed_data

        except Exception as e:
            print(f"收集新闻失败: {e}")
            return {
                'success': False,
                'error': str(e),
                'news_list': [],
                'total_news': 0
            }

    def _process_news_results(self, results: List[Dict]) -> Dict:
        """Flatten per-source fetch results into one normalized news list.

        Sources whose status is not ``"success"`` are skipped, as are
        successful sources whose payload is not a dict carrying an ``items``
        list.  ``total_news`` counts the raw items of every source that
        carried one.
        """
        news_list = []
        successful_sources = 0
        total_news = 0

        for result in results:
            if result['status'] != 'success':
                continue

            successful_sources += 1
            source = result['source']
            data = result['data']

            # Guard against null / non-dict payloads instead of letting a
            # TypeError abort the whole collection.
            items = data.get('items') if isinstance(data, dict) else None
            if not isinstance(items, list):
                continue

            total_news += len(items)

            for rank, item in enumerate(items, 1):
                processed_news = self._process_news_item(item, source, rank)
                if processed_news:
                    news_list.append(processed_news)

        return {
            'success': True,
            'news_list': news_list,
            'successful_sources': successful_sources,
            'total_sources': len(results),
            'total_news': total_news,
            'collection_time': datetime.now().isoformat()
        }

    def _process_news_item(self, item: Dict, source: str, rank: int) -> Optional[Dict]:
        """Normalize one raw item into ``{id, title, url, source, rank}``.

        Dict items contribute their ``title``, ``url`` and ``id``; a missing
        or null title becomes the placeholder title and a missing id becomes
        ``rank_<rank>``.  Any other item is treated as a bare title, cut to
        100 characters.

        Returns:
            The normalized dict, or None if processing raised.
        """
        try:
            if isinstance(item, dict):
                raw_title = item.get('title')
                # A title may be present but null or non-string in the payload.
                title = '无标题' if raw_title is None else str(raw_title).strip()
                url = item.get('url') or ''

                item_id = item.get('id', f'rank_{rank}')
                news_id = f"{source}_{item_id}"

                return {
                    'id': news_id,
                    'title': title,
                    'url': url,
                    'source': source,
                    'rank': rank
                }

            # Non-dict items (e.g. plain strings) are used as the title.
            return {
                'id': f"{source}_rank_{rank}",
                'title': str(item)[:100],
                'url': '',
                'source': source,
                'rank': rank
            }

        except Exception as e:
            print(f"处理新闻项失败: {e}")
            return None

    def _print_collection_summary(self, data: Dict):
        """Print the source/news counters of a processed-result dict."""
        print("\n" + "=" * 50)
        print("新闻收集摘要")
        print("=" * 50)

        print(f"总新闻源: {data['total_sources']}")
        print(f"成功源数: {data['successful_sources']}")
        print(f"总新闻数: {data['total_news']}")

        if 'saved_count' in data:
            print(f"已保存数: {data['saved_count']}")

        print("=" * 50)

    def get_today_news(self) -> List[Dict]:
        """Return today's stored news rows, or an empty list on failure."""
        try:
            return self.db_manager.get_daily_news(date.today())
        except Exception as e:
            print(f"获取今日新闻失败: {e}")
            return []
class BroadTopicExtraction:
    """Daily workflow: collect news, extract topics, store the analysis."""

    def __init__(self):
        """Create the collector, the extractor and the database manager.

        If a later component fails to construct, the ones already created are
        closed before the error is re-raised, so no connection is leaked.
        """
        self.news_collector = None
        self.topic_extractor = None
        self.db_manager = None

        try:
            self.news_collector = NewsCollector()
            self.topic_extractor = TopicExtractor()
            self.db_manager = DatabaseManager()
        except Exception:
            self.close()
            raise

        print("BroadTopicExtraction 初始化完成")

    def close(self):
        """Close the collector and the database manager.

        Both handles are cleared first, so calling ``close()`` again is a
        no-op, and the database manager is closed even when closing the
        collector raises.
        """
        news_collector, self.news_collector = self.news_collector, None
        db_manager, self.db_manager = self.db_manager, None

        try:
            if news_collector is not None:
                news_collector.close()
        finally:
            if db_manager is not None:
                db_manager.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self.close()

    async def run_daily_extraction(self,
                                   news_sources: Optional[List[str]] = None,
                                   max_keywords: int = 100) -> Dict:
        """Run the three-step daily extraction and report what happened.

        Steps: collect and store news, extract keywords plus a summary, store
        the topic analysis.  Exceptions are caught and reported in the result.

        Args:
            news_sources: source ids; None means all supported sources.
            max_keywords: upper bound on the number of keywords.

        Returns:
            A dict with ``success``, ``extraction_date``, ``start_time``,
            ``end_time``, ``error`` and one sub-dict per step
            (``news_collection``, ``topic_extraction``, ``database_save``).
            ``success`` stays True when only the final database save failed;
            that outcome is reported in ``database_save['success']``.
        """
        # Read the date once so a run that crosses midnight reports and stores
        # under a single date.
        today = date.today()

        print("\n" + "=" * 80)
        print("MindSpider AI爬虫 - 每日话题提取")
        print("=" * 80)
        print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"目标日期: {today}")

        if news_sources:
            print(f"指定平台: {len(news_sources)} 个")
            for source in news_sources:
                source_name = SOURCE_NAMES.get(source, source)
                print(f" - {source_name}")
        else:
            print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")

        print(f"关键词数: 最多 {max_keywords} 个")

        extraction_result = {
            'success': False,
            'extraction_date': today.isoformat(),
            'start_time': datetime.now().isoformat(),
            'news_collection': {},
            'topic_extraction': {},
            'database_save': {},
            'error': None
        }

        try:
            # Step 1: collect news.
            print("\n【步骤1】收集热点新闻...")
            news_result = await self.news_collector.collect_and_save_news(
                sources=news_sources
            )

            extraction_result['news_collection'] = {
                'success': news_result['success'],
                'total_news': news_result.get('total_news', 0),
                'successful_sources': news_result.get('successful_sources', 0),
                'total_sources': news_result.get('total_sources', 0)
            }

            if not news_result['success'] or not news_result['news_list']:
                raise Exception("新闻收集失败或没有获取到新闻")

            # Step 2: extract keywords and build the summary.
            print("\n【步骤2】提取关键词和生成总结...")
            keywords, summary = self.topic_extractor.extract_keywords_and_summary(
                news_result['news_list'],
                max_keywords=max_keywords
            )

            extraction_result['topic_extraction'] = {
                'success': len(keywords) > 0,
                'keywords_count': len(keywords),
                'keywords': keywords,
                'summary': summary
            }

            if not keywords:
                print("警告: 没有提取到有效关键词")

            # Step 3: store the analysis.
            print("\n【步骤3】保存分析结果到数据库...")
            save_success = self.db_manager.save_daily_topics(
                keywords, summary, today
            )

            extraction_result['database_save'] = {
                'success': save_success
            }

            extraction_result['success'] = True
            extraction_result['end_time'] = datetime.now().isoformat()

            print("\n" + "=" * 80)
            print("每日话题提取流程完成!")
            print("=" * 80)

            return extraction_result

        except Exception as e:
            print(f"\n话题提取流程失败: {e}")
            extraction_result['error'] = str(e)
            extraction_result['end_time'] = datetime.now().isoformat()
            return extraction_result

    def print_extraction_results(self, extraction_result: Dict):
        """Print a report for a result dict from ``run_daily_extraction``."""
        print("\n" + "=" * 80)
        print("话题提取结果报告")
        print("=" * 80)

        if not extraction_result['success']:
            print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
            return

        # News collection.
        news_data = extraction_result.get('news_collection', {})
        print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
        print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")

        # Topic extraction.
        topic_data = extraction_result.get('topic_extraction', {})
        keywords = topic_data.get('keywords', [])
        summary = topic_data.get('summary', '')

        print(f"\n🔑 提取关键词: {len(keywords)} 个")
        if keywords:
            # Five keywords per output line.
            for i in range(0, len(keywords), 5):
                keyword_group = keywords[i:i+5]
                print(f" {', '.join(keyword_group)}")

        print("\n📝 新闻总结:")
        print(f" {summary}")

        # Database save.
        db_data = extraction_result.get('database_save', {})
        if db_data.get('success'):
            print("\n💾 数据库保存: 成功")
        else:
            print("\n💾 数据库保存: 失败")

        print("\n" + "=" * 80)

    def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
        """Return the stored keywords of one day, filtered for searching.

        Args:
            extract_date: day to read; defaults to today.

        Returns:
            Search keywords, or an empty list when there is no analysis for
            that day or an error occurred.
        """
        try:
            topics_data = self.db_manager.get_daily_topics(extract_date)

            if not topics_data:
                print(f"没有找到 {extract_date or date.today()} 的话题数据")
                return []

            keywords = topics_data['keywords']
            search_keywords = self.topic_extractor.get_search_keywords(keywords)

            print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
            return search_keywords

        except Exception as e:
            print(f"获取爬取关键词失败: {e}")
            return []

    def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
        """Return the stored analysis of one day, or None on failure."""
        try:
            return self.db_manager.get_daily_topics(target_date)
        except Exception as e:
            print(f"获取每日分析失败: {e}")
            return None

    def get_recent_analysis(self, days: int = 7) -> List[Dict]:
        """Return the analyses of the last ``days`` days, or [] on failure."""
        try:
            return self.db_manager.get_recent_topics(days)
        except Exception as e:
            print(f"获取最近分析失败: {e}")
            return []
class TopicExtractor:
    """Extract topic keywords and a summary from a news list via DeepSeek."""

    # Words ignored by the simple fallback keyword extractor.
    _STOP_WORDS = frozenset([
        '的', '了', '在', '和', '与', '或', '但', '是', '有', '被', '将', '已', '正在'
    ])

    def __init__(self):
        """Create the OpenAI-compatible client pointed at the DeepSeek API."""
        self.client = OpenAI(
            api_key=config.DEEPSEEK_API_KEY,
            base_url="https://api.deepseek.com"
        )
        self.model = "deepseek-chat"

    def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
        """Extract keywords and write a summary for ``news_list``.

        When the API call or the parsing of its reply raises, a simple local
        fallback (title splitting plus a generic summary) is returned instead.

        Args:
            news_list: news dicts; ``title`` and ``source_platform`` /
                ``source`` are read.
            max_keywords: upper bound on the number of keywords returned.

        Returns:
            ``(keywords, summary)``.
        """
        if not news_list:
            return [], "今日暂无热点新闻"

        news_text = self._build_news_summary(news_list)
        prompt = self._build_analysis_prompt(news_text, max_keywords)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "你是一个专业的新闻分析师,擅长从热点新闻中提取关键词和撰写分析总结。"},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1500,
                temperature=0.3
            )

            result_text = response.choices[0].message.content
            keywords, summary = self._parse_analysis_result(result_text)

            print(f"成功提取 {len(keywords)} 个关键词并生成新闻总结")
            return keywords[:max_keywords], summary

        except Exception as e:
            print(f"话题提取失败: {e}")
            fallback_keywords = self._extract_simple_keywords(news_list)
            fallback_summary = f"今日共收集到 {len(news_list)} 条热点新闻,涵盖多个平台的热门话题。"
            return fallback_keywords[:max_keywords], fallback_summary

    def _build_news_summary(self, news_list: List[Dict]) -> str:
        """Render the news list as numbered ``N. 【source】title`` lines."""
        news_items = []

        for i, news in enumerate(news_list, 1):
            # A title may be present but null; treat it like a missing one.
            title = news.get('title') or '无标题'
            source = news.get('source_platform', news.get('source', '未知'))

            # Drop hashtag / mention markers from the title.
            title = re.sub(r'[#@]', '', str(title)).strip()

            news_items.append(f"{i}. 【{source}】{title}")

        return "\n".join(news_items)

    def _build_analysis_prompt(self, news_text: str, max_keywords: int) -> str:
        """Build the user prompt asking for keywords and a summary as JSON."""
        news_count = len(news_text.split('\n'))

        prompt = f"""
请分析以下{news_count}条今日热点新闻,完成两个任务:

新闻列表:
{news_text}

任务1:提取关键词(最多{max_keywords}个)
- 提取能代表今日热点话题的关键词
- 关键词应该适合用于社交媒体平台搜索
- 优先选择热度高、讨论量大的话题
- 避免过于宽泛或过于具体的词汇

任务2:撰写新闻分析总结(150-300字)
- 简要概括今日热点新闻的主要内容
- 指出当前社会关注的重点话题方向
- 分析这些热点反映的社会现象或趋势
- 语言简洁明了,客观中性

请严格按照以下JSON格式输出:
```json
{{
    "keywords": ["关键词1", "关键词2", "关键词3"],
    "summary": "今日新闻分析总结内容..."
}}
```

请直接输出JSON格式的结果,不要包含其他文字说明。
"""
        return prompt

    @staticmethod
    def _clean_keywords(candidates) -> List[str]:
        """Strip, drop entries shorter than two characters, de-duplicate in order."""
        seen = set()
        cleaned = []
        for candidate in candidates:
            keyword = str(candidate).strip()
            if len(keyword) > 1 and keyword not in seen:
                seen.add(keyword)
                cleaned.append(keyword)
        return cleaned

    def _parse_analysis_result(self, result_text: str) -> Tuple[List[str], str]:
        """Parse the model reply into ``(keywords, summary)``.

        The reply is expected to be a JSON object, optionally wrapped in a
        Markdown code fence (with or without a ``json`` tag).  Text that is
        not valid JSON is handed to ``_manual_parse_result``; any other
        failure yields an empty keyword list and a fixed failure message.
        """
        try:
            fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', result_text, re.DOTALL)
            json_text = fenced.group(1) if fenced else result_text.strip()

            data = json.loads(json_text)

            keywords = data.get('keywords', [])
            summary = data.get('summary', '')

            # The reply is model output: tolerate wrongly typed fields.
            if not isinstance(keywords, (list, tuple)):
                keywords = []
            clean_keywords = self._clean_keywords(keywords)

            summary = summary.strip() if isinstance(summary, str) else ''
            if len(summary) < 10:
                summary = "今日热点新闻涵盖多个领域,反映了当前社会的多元化关注点。"

            return clean_keywords, summary

        except json.JSONDecodeError as e:
            print(f"解析JSON失败: {e}")
            print(f"原始返回: {result_text}")
            return self._manual_parse_result(result_text)

        except Exception as e:
            print(f"处理分析结果失败: {e}")
            return [], "分析结果处理失败,请稍后重试。"

    def _manual_parse_result(self, text: str) -> Tuple[List[str], str]:
        """Line-based fallback parser for replies that are not valid JSON.

        A line mentioning the keyword label contributes keywords: quoted
        strings when present, otherwise the comma-separated text after the
        label.  A line mentioning the summary label contributes the text
        after its first colon.  A long line that looks like summary prose is
        used when no summary has been found yet.

        The result is not truncated here; ``extract_keywords_and_summary``
        applies the ``max_keywords`` limit.
        """
        print("尝试手动解析结果...")

        keywords = []
        summary = ""

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()

            if '关键词' in line or 'keywords' in lowered:
                # Quoted strings (straight or curly quotes), minus the label.
                quoted = [
                    match for match in re.findall(r'["“”](.*?)["“”]', line)
                    if match.strip().lower() not in ('keywords', '关键词')
                ]
                if quoted:
                    keywords.extend(quoted)
                else:
                    # Remove everything up to the label, then split the rest.
                    remainder = re.sub(r'^.*?(?:关键词|keywords)', '', line,
                                       count=1, flags=re.IGNORECASE)
                    for part in re.split(r'[,,、]', remainder):
                        clean_part = part.strip(' \t::[]"\'')
                        if len(clean_part) > 1:
                            keywords.append(clean_part)

            elif '总结' in line or '分析' in line or 'summary' in lowered:
                # Split on the FIRST colon so colons inside the summary
                # do not truncate it.
                pieces = re.split(r'[::]', line, maxsplit=1)
                if len(pieces) == 2:
                    candidate = pieces[1].strip().strip('",“”').strip()
                    if candidate:
                        summary = candidate

            elif len(line) > 50 and ('今日' in line or '热点' in line or '新闻' in line):
                if not summary:
                    summary = line

        clean_keywords = self._clean_keywords(keywords)

        if not summary:
            summary = "今日热点新闻内容丰富,涵盖了社会各个层面的关注点。"

        return clean_keywords, summary

    def _extract_simple_keywords(self, news_list: List[Dict]) -> List[str]:
        """Fallback extractor: whitespace-split titles, at most 10 words."""
        keywords = []

        for news in news_list:
            # Guard against null titles; this runs inside an error handler.
            title = str(news.get('title') or '')

            # Replace tag / bracket characters with spaces, then split.
            title_clean = re.sub(r'[#@【】\[\]()()]', ' ', title)

            for word in title_clean.split():
                word = word.strip()
                if (len(word) > 1 and
                        word not in self._STOP_WORDS and
                        word not in keywords):
                    keywords.append(word)

        return keywords[:10]

    def get_search_keywords(self, keywords: List[str], limit: int = 10) -> List[str]:
        """Filter keywords down to at most ``limit`` search terms.

        Kept: 2 to 19 characters long, not purely digits, not purely ASCII
        letters, first occurrence only.

        Args:
            keywords: candidate keywords.
            limit: maximum number of keywords returned.
        """
        search_keywords = []

        for keyword in keywords:
            keyword = str(keyword).strip()

            if (1 < len(keyword) < 20 and
                    keyword not in search_keywords and
                    not keyword.isdigit() and
                    not re.match(r'^[a-zA-Z]+$', keyword)):
                search_keywords.append(keyword)

        return search_keywords[:limit]
b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitattributes new file mode 100644 index 0000000..3ab78cc --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitattributes @@ -0,0 +1,3 @@ +*.js linguist-language=python +*.css linguist-language=python +*.html linguist-language=python diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore new file mode 100644 index 0000000..c9a9ac8 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore @@ -0,0 +1,176 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
class AsyncMysqlDB:
    """
    Small async CRUD helper on top of an ``aiomysql`` connection pool.

    Every method acquires one connection from the pool for the duration of a
    single statement. Table and column names are spliced into the SQL text
    (identifiers cannot be bound as parameters), so they must come from
    trusted code; the values this class handles itself are always bound.
    """

    def __init__(self, pool: aiomysql.Pool) -> None:
        """
        :param pool: pool from which every statement acquires its connection
        """
        self.__pool = pool

    async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
        """
        Run a query and return all rows.

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders, in order
        :return: list of rows as dicts; an empty list when nothing is returned
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchall()
                return data or []

    async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
        """
        Run a query and return only the first row.

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders, in order
        :return: the first row as a dict, or None when there is no row
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, args)
                data = await cur.fetchone()
                return data

    async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
        """
        Insert one record into a table.

        :param table_name: target table (trusted identifier)
        :param item: mapping of column name -> value for the new row
        :return: ``lastrowid`` reported by the cursor after the insert
        """
        values = list(item.values())
        # Column names are back-quoted; the values themselves are bound by the driver.
        fieldstr = ','.join(f'`{field}`' for field in item)
        valstr = ','.join(['%s'] * len(item))
        sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
        async with self.__pool.acquire() as conn:
            async with conn.cursor(aiomysql.DictCursor) as cur:
                await cur.execute(sql, values)
                return cur.lastrowid

    async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
                           value_where: Union[str, int, float]) -> int:
        """
        Update the rows of a table that match a single equality condition.

        :param table_name: target table (trusted identifier)
        :param updates: mapping of column name -> new value
        :param field_where: column name used in the WHERE clause (trusted identifier)
        :param value_where: value the WHERE column must equal
        :return: value returned by ``cursor.execute`` (the affected-row count)
        """
        assignments = ','.join('`%s`=%%s' % column for column in updates)
        # Bind the WHERE value as a parameter instead of formatting it into the
        # statement: a value containing '"' or '%' used to corrupt the SQL (or
        # inject into it) because it was interpolated between double quotes.
        sql = 'UPDATE %s SET %s WHERE %s=%%s' % (table_name, assignments, field_where)
        params = [*updates.values(), value_where]
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, params)
                return rows

    async def execute(self, sql: str, *args: Union[str, int]) -> int:
        """
        Execute a write-style statement (insert/update/delete, ...).

        :param sql: SQL text using ``%s`` placeholders
        :param args: values bound to the placeholders, in order
        :return: value returned by ``cursor.execute`` (the affected-row count)
        """
        async with self.__pool.acquire() as conn:
            async with conn.cursor() as cur:
                rows = await cur.execute(sql, args)
                return rows
class AsyncSqliteDB:
    """
    Async CRUD helper for a SQLite database file.

    Each call opens its own ``aiosqlite`` connection to the configured path,
    runs one statement (committing after writes) and closes the connection.
    """

    def __init__(self, db_path: str) -> None:
        """
        :param db_path: path of the SQLite database file
        """
        self.__db_path = db_path

    async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
        """
        Run a query and return all rows.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders, in order
        :return: list of rows converted to dicts; empty list when no rows
        """
        async with aiosqlite.connect(self.__db_path) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(sql, args) as cur:
                fetched = await cur.fetchall()
                if not fetched:
                    return []
                return [dict(record) for record in fetched]

    async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
        """
        Run a query and return only the first row.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders, in order
        :return: first row as a dict, or None when there is no row
        """
        async with aiosqlite.connect(self.__db_path) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(sql, args) as cur:
                record = await cur.fetchone()
                if not record:
                    return None
                return dict(record)

    async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
        """
        Insert one record into a table and commit.

        :param table_name: target table
        :param item: mapping of column name -> value for the new row
        :return: ``lastrowid`` of the cursor that ran the insert
        """
        columns = ','.join(item)
        placeholders = ','.join('?' for _ in item)
        params = list(item.values())
        statement = f"INSERT INTO {table_name} ({columns}) VALUES({placeholders})"
        async with aiosqlite.connect(self.__db_path) as db:
            async with db.execute(statement, params) as cur:
                await db.commit()
                return cur.lastrowid

    async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
                           value_where: Union[str, int, float]) -> int:
        """
        Update the rows matching one equality condition and commit.

        :param table_name: target table
        :param updates: mapping of column name -> new value
        :param field_where: column name used in the WHERE clause
        :param value_where: value the WHERE column must equal
        :return: ``rowcount`` of the cursor that ran the update
        """
        assignments = ','.join(f'{column}=?' for column in updates)
        # SET values first, the WHERE value last, matching placeholder order.
        params = [*updates.values(), value_where]
        statement = f'UPDATE {table_name} SET {assignments} WHERE {field_where}=?'
        async with aiosqlite.connect(self.__db_path) as db:
            async with db.execute(statement, params) as cur:
                await db.commit()
                return cur.rowcount

    async def execute(self, sql: str, *args: Union[str, int]) -> int:
        """
        Execute a write-style statement and commit.

        :param sql: SQL text using ``?`` placeholders
        :param args: values bound to the placeholders, in order
        :return: ``rowcount`` of the cursor that ran the statement
        """
        async with aiosqlite.connect(self.__db_path) as db:
            async with db.execute(sql, args) as cur:
                await db.commit()
                return cur.rowcount

    async def executescript(self, sql_script: str) -> None:
        """
        Run a multi-statement SQL script (used to create the table schema) and commit.

        :param sql_script: text of the SQL script
        :return: None
        """
        async with aiosqlite.connect(self.__db_path) as db:
            await db.executescript(sql_script)
            await db.commit()
class AbstractCrawler(ABC):
    """Abstract crawler: subclasses must implement start, search and launch_browser."""

    @abstractmethod
    async def start(self):
        """
        start crawler
        """
        pass

    @abstractmethod
    async def search(self):
        """
        search
        """
        pass

    @abstractmethod
    async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
        """
        launch browser
        :param chromium: chromium browser
        :param playwright_proxy: playwright proxy
        :param user_agent: user agent
        :param headless: headless mode
        :return: browser context
        """
        pass

    async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
        """
        Launch the browser in CDP mode (optional to override).
        :param playwright: playwright instance
        :param playwright_proxy: playwright proxy configuration
        :param user_agent: user agent
        :param headless: headless mode
        :return: browser context
        """
        # Default implementation: fall back to the standard launch_browser path.
        return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)


class AbstractLogin(ABC):
    """Abstract login flow: subclasses must implement begin and the three login_by_* methods."""

    @abstractmethod
    async def begin(self):
        pass

    @abstractmethod
    async def login_by_qrcode(self):
        pass

    @abstractmethod
    async def login_by_mobile(self):
        pass

    @abstractmethod
    async def login_by_cookies(self):
        pass


class AbstractStore(ABC):
    """Abstract storage backend for content, comment and creator records."""

    @abstractmethod
    async def store_content(self, content_item: Dict):
        pass

    @abstractmethod
    async def store_comment(self, comment_item: Dict):
        pass

    # TODO support all platform
    # NOTE(review): the earlier comment here said "only xhs is supported, so
    # @abstractmethod is commented", but the decorator below is active, so every
    # concrete store must implement store_creator. Confirm which is intended.
    @abstractmethod
    async def store_creator(self, creator: Dict):
        pass


class AbstractStoreImage(ABC):
    """Optional image-storage hook; store_image is a no-op unless overridden."""
    # TODO: support all platform
    # only weibo is supported
    # @abstractmethod
    async def store_image(self, image_content_item: Dict):
        pass


class AbstractStoreVideo(ABC):
    """Optional video-storage hook; store_video is a no-op unless overridden."""
    # TODO: support all platform
    # only weibo is supported
    # @abstractmethod
    async def store_video(self, video_content_item: Dict):
        pass


class AbstractApiClient(ABC):
    """Abstract API client: subclasses must implement request and update_cookies."""

    @abstractmethod
    async def request(self, method, url, **kwargs):
        pass

    @abstractmethod
    async def update_cookies(self, browser_context: BrowserContext):
        pass
class AbstractCache(ABC):
    """Abstract key/value cache interface with per-entry expiry."""

    @abstractmethod
    def get(self, key: str) -> Optional[Any]:
        """
        Fetch the value stored under a key.
        Abstract method; subclasses must implement it.
        :param key: cache key
        :return: the cached value (Optional, so None is a possible result)
        """
        raise NotImplementedError

    @abstractmethod
    def set(self, key: str, value: Any, expire_time: int) -> None:
        """
        Store a value under a key.
        Abstract method; subclasses must implement it.
        :param key: cache key
        :param value: value to cache
        :param expire_time: expiry for the entry (the implementations in this
            package treat it as a time-to-live in seconds)
        :return:
        """
        raise NotImplementedError

    @abstractmethod
    def keys(self, pattern: str) -> List[str]:
        """
        Return every key that matches the pattern.
        :param pattern: match pattern
        :return: list of matching keys
        """
        raise NotImplementedError
class CacheFactory:
    """
    Factory that builds cache objects by type name.
    """

    @staticmethod
    def create_cache(cache_type: str, *args, **kwargs):
        """
        Build the cache implementation selected by ``cache_type``.

        The implementation modules are imported lazily, so only the backend
        that is actually requested gets loaded.

        :param cache_type: 'memory' or 'redis'
        :param args: positional arguments forwarded to the in-memory cache
        :param kwargs: keyword arguments forwarded to the in-memory cache
        :return: the cache instance
        :raises ValueError: for any other ``cache_type``
        """
        if cache_type == 'redis':
            from .redis_cache import RedisCache
            # The Redis cache takes no constructor arguments.
            return RedisCache()

        if cache_type != 'memory':
            raise ValueError(f'Unknown cache type: {cache_type}')

        from .local_cache import ExpiringLocalCache
        return ExpiringLocalCache(*args, **kwargs)
class ExpiringLocalCache(AbstractCache):
    """
    In-process cache whose entries carry an absolute expiry timestamp.

    Entries are dropped lazily by ``get`` and periodically by a cleanup
    coroutine that is scheduled on the event loop at construction time.
    """

    def __init__(self, cron_interval: int = 10):
        """
        Initialise the local cache.
        :param cron_interval: seconds between two runs of the periodic cleanup
        :return:
        """
        self._cron_interval = cron_interval
        # key -> (value, absolute expiry timestamp from time.time())
        self._cache_container: Dict[str, Tuple[Any, float]] = {}
        self._cron_task: Optional[asyncio.Task] = None
        # Start the periodic cleanup task.
        self._schedule_clear()

    def __del__(self):
        """
        Cancel the cleanup task when the cache is finalised.
        :return:
        """
        # getattr: __del__ also runs when __init__ failed before the attribute was set.
        task = getattr(self, '_cron_task', None)
        if task is not None:
            task.cancel()

    def get(self, key: str) -> Optional[Any]:
        """
        Fetch the value stored under a key.
        :param key: cache key
        :return: the value, or None when the key is missing, expired, or maps to None
        """
        value, expire_time = self._cache_container.get(key, (None, 0))
        if value is None:
            return None

        # Drop the entry and report a miss once it has expired.
        if expire_time < time.time():
            del self._cache_container[key]
            return None

        return value

    def set(self, key: str, value: Any, expire_time: int) -> None:
        """
        Store a value under a key.
        :param key: cache key
        :param value: value to cache
        :param expire_time: time-to-live in seconds, counted from now
        :return:
        """
        self._cache_container[key] = (value, time.time() + expire_time)

    def keys(self, pattern: str) -> List[str]:
        """
        Return every key that matches the pattern.
        :param pattern: '*' returns all keys; otherwise every '*' is stripped
            and the remainder is used as a substring match
        :return: list of matching keys
        """
        if pattern == '*':
            return list(self._cache_container.keys())

        # Wildcards are not really supported locally: '*' is simply removed.
        if '*' in pattern:
            pattern = pattern.replace('*', '')

        return [key for key in self._cache_container.keys() if pattern in key]

    def _schedule_clear(self):
        """
        Schedule the periodic cleanup coroutine on the current event loop,
        creating and installing a new loop when none is available.
        :return:
        """

        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        self._cron_task = loop.create_task(self._start_clear_cron())

    def _clear(self):
        """
        Remove every entry whose expiry timestamp has passed.
        :return:
        """
        now = time.time()
        # Collect the expired keys first: deleting while iterating .items()
        # raises "RuntimeError: dictionary changed size during iteration".
        expired_keys = [
            key
            for key, (_value, expire_time) in self._cache_container.items()
            if expire_time < now
        ]
        for key in expired_keys:
            self._cache_container.pop(key, None)

    async def _start_clear_cron(self):
        """
        Run the cleanup forever, sleeping ``cron_interval`` seconds between passes.
        :return:
        """
        while True:
            self._clear()
            await asyncio.sleep(self._cron_interval)
Any, expire_time: int) -> None: + """ + 将键的值设置到缓存中, 并且序列化 + :param key: + :param value: + :param expire_time: + :return: + """ + self._redis_client.set(key, pickle.dumps(value), ex=expire_time) + + def keys(self, pattern: str) -> List[str]: + """ + 获取所有符合pattern的key + """ + return [key.decode() for key in self._redis_client.keys(pattern)] + + +if __name__ == '__main__': + redis_cache = RedisCache() + # basic usage + redis_cache.set("name", "程序员阿江-Relakkes", 1) + print(redis_cache.get("name")) # Relakkes + print(redis_cache.keys("*")) # ['name'] + time.sleep(2) + print(redis_cache.get("name")) # None + + # special python type usage + # list + redis_cache.set("list", [1, 2, 3], 10) + _value = redis_cache.get("list") + print(_value, f"value type:{type(_value)}") # [1, 2, 3] diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/__init__.py new file mode 100644 index 0000000..bfdbd9a --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/__init__.py @@ -0,0 +1,12 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +from .arg import * diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py new file mode 100644 index 0000000..12643ee --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py @@ -0,0 +1,55 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
async def parse_cmd():
    """
    Parse the command-line options and copy them onto the ``config`` module.

    Every option defaults to the current value of the matching ``config``
    attribute, so a flag that is not given leaves that setting unchanged.
    The function is declared ``async`` but awaits nothing, and it returns
    None: callers read the results from ``config``.
    """
    # Read the command-line arguments.
    parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
    parser.add_argument('--platform', type=str,
                        help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
                        choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str,
                        help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str,
                        help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='Number of start page / 起始页码', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
    parser.add_argument('--get_comment', type=str2bool,
                        help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
    # The original literal opened with four quotes, which left a stray
    # apostrophe at the start of this help text.
    parser.add_argument('--get_sub_comment', type=str2bool,
                        help='''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
    parser.add_argument('--save_data_option', type=str,
                        help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
                        choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
    parser.add_argument('--cookies', type=str,
                        help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)

    args = parser.parse_args()

    # Override the module-level configuration with the parsed values.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
    config.ENABLE_GET_COMMENTS = args.get_comment
    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
    config.SAVE_DATA_OPTION = args.save_data_option
    config.COOKIES = args.cookies
# Base configuration
PLATFORM = "xhs"  # platform: xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪"  # search keywords, separated by ASCII commas
LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = "search"  # crawl type: search (keyword search) | detail (post detail) | creator (creator home-page data)
# Whether to enable the IP proxy
ENABLE_IP_PROXY = False

# Size of the proxy IP pool
IP_PROXY_POOL_COUNT = 2

# Name of the proxy IP provider
IP_PROXY_PROVIDER_NAME = "kuaidaili"  # kuaidaili | wandouhttp

# True: do not open a browser window (headless browser)
# False: open a browser window
# If Xiaohongshu QR-code login keeps failing, open the browser and pass the slider captcha manually.
# If Douyin keeps reporting failure, open the browser and check whether a phone-number verification
# appeared after the QR-code login; if so, complete it manually and retry.
HEADLESS = True

# Whether to persist the login state
SAVE_LOGIN_STATE = True

# ==================== CDP (Chrome DevTools Protocol) configuration ====================
# Whether to enable CDP mode - crawl with the user's existing Chrome/Edge browser for better anti-detection
# When enabled, the user's Chrome/Edge browser is detected and started automatically and controlled over CDP
# This uses a real browser environment (the user's extensions, cookies and settings), which greatly lowers the risk of detection
ENABLE_CDP_MODE = False

# CDP debugging port used to talk to the browser
# If the port is in use, the next available port is tried automatically
CDP_DEBUG_PORT = 9222

# Custom browser path (optional)
# If empty, the Chrome/Edge install path is detected automatically
# Windows example: "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# macOS example: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CUSTOM_BROWSER_PATH = ""

# Whether to run headless in CDP mode
# Note: even when True, some anti-detection features may work poorly in headless mode
CDP_HEADLESS = False

# Browser launch timeout (seconds)
BROWSER_LAUNCH_TIMEOUT = 30

# Whether to close the browser automatically when the program ends
# Set to False to keep the browser running for debugging
AUTO_CLOSE_BROWSER = True

# Where to save data; four types are supported: csv, db, json, sqlite. Saving to DB is recommended because it de-duplicates.
SAVE_DATA_OPTION = "db"  # csv or db or json or sqlite

# Directory pattern for the cached browser user data
USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

# First page to crawl; starts from page 1 by default
START_PAGE = 1

# Limit on the number of videos/posts to crawl
CRAWLER_MAX_NOTES_COUNT = 10

# Limit on the number of concurrent crawlers
MAX_CONCURRENCY_NUM = 1

# Whether to crawl media (images or video resources); off by default
# (the misspelt name "MEIDAS" is kept as-is because it is the identifier other code refers to)
ENABLE_GET_MEIDAS = False

# Whether to crawl comments; on by default
ENABLE_GET_COMMENTS = True

# Limit on first-level comments crawled per video/post
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 20

# Whether to crawl second-level comments; off by default
# Projects on an older version that use the db must add the table columns, see schema/tables.sql line 287
ENABLE_GET_SUB_COMMENTS = False

# Word cloud
# Whether to generate a word-cloud image from comments
ENABLE_GET_WORDCLOUD = False
# Custom words and their groups
# Rule: xx:yy, where xx is the custom word to add and yy is the group it is assigned to.
CUSTOM_WORDS = {
    "零几": "年份",  # treat "零几" as a single token
    "高频词": "专业术语",  # example custom word
}

# Path of the stop-words file
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"

# Path of the Chinese font file
FONT_PATH = "./docs/STZHONGS.TTF"

# Crawl interval
CRAWLER_MAX_SLEEP_SEC = 2

# Pull every platform-specific setting into this module's namespace.
from .bilibili_config import *
from .xhs_config import *
from .dy_config import *
from .ks_config import *
from .weibo_config import *
from .tieba_config import *
from .zhihu_config import *
# mysql config - uses the MindSpider database settings.
# SECURITY(review): the fallback values below are real-looking credentials and a
# real-looking host committed to source control. They are kept only so existing
# deployments keep working; rotate the password and supply these settings via
# environment variables, then drop the hard-coded fallbacks.
MYSQL_DB_PWD = os.getenv("MYSQL_DB_PWD", "mneDccc7sHHANtFk")
MYSQL_DB_USER = os.getenv("MYSQL_DB_USER", "root")
MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST", "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com")
# os.getenv returns a str when the variable is set; normalise to int so the
# type does not depend on where the value came from.
MYSQL_DB_PORT = int(os.getenv("MYSQL_DB_PORT", 3306))
MYSQL_DB_NAME = os.getenv("MYSQL_DB_NAME", "mindspider")


# redis config
REDIS_DB_HOST = "127.0.0.1"  # your redis host
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456")  # your redis password
REDIS_DB_PORT = int(os.getenv("REDIS_DB_PORT", 6379))  # your redis port
REDIS_DB_NUM = int(os.getenv("REDIS_DB_NUM", 0))  # your redis db num

# cache type
CACHE_TYPE_REDIS = "redis"
CACHE_TYPE_MEMORY = "memory"

# sqlite config: <parent of this file's directory>/schema/sqlite_tables.db
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
# Douyin platform configuration
# NOTE(review): presumably a publish-time filter for search results, with 0
# meaning "no restriction"; confirm against the Douyin client code.
PUBLISH_TIME_TYPE = 0

# Douyin video IDs to crawl
DY_SPECIFIED_ID_LIST = [
    "7280854932641664319",
    "7202432992642387233",
    # ........................
]

# Douyin user IDs to crawl
DY_CREATOR_ID_LIST = [
    "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
    # ........................
]
# Weibo platform configuration

# Search type; the enum values are defined in media_platform/weibo/field.py
WEIBO_SEARCH_TYPE = "popular"

# Weibo post IDs to crawl
WEIBO_SPECIFIED_ID_LIST = [
    "4982041758140155",
    # ........................
]

# Weibo user IDs to crawl
WEIBO_CREATOR_ID_LIST = [
    "5533390220",
    # ........................
]
+] diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/zhihu_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/zhihu_config.py new file mode 100644 index 0000000..6038470 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/zhihu_config.py @@ -0,0 +1,25 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# 知乎平台配置 + +# 指定知乎用户URL列表 +ZHIHU_CREATOR_URL_LIST = [ + "https://www.zhihu.com/people/yd1234567", + # ........................ +] + +# 指定知乎ID列表 +ZHIHU_SPECIFIED_ID_LIST = [ + "https://www.zhihu.com/question/826896610/answer/4885821440", # 回答 + "https://zhuanlan.zhihu.com/p/673461588", # 文章 + "https://www.zhihu.com/zvideo/1539542068422144000", # 视频 +] diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/__init__.py new file mode 100644 index 0000000..e907b1d --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/__init__.py @@ -0,0 +1,12 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/baidu_tieba.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/baidu_tieba.py new file mode 100644 index 0000000..f0fa221 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/baidu_tieba.py @@ -0,0 +1,14 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- + +TIEBA_URL = 'https://tieba.baidu.com' \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/zhihu.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/zhihu.py new file mode 100644 index 0000000..2a52667 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/constant/zhihu.py @@ -0,0 +1,19 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +ZHIHU_URL = "https://www.zhihu.com" +ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com" + +ANSWER_NAME = "answer" +ARTICLE_NAME = "article" +VIDEO_NAME = "zvideo" + diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py new file mode 100644 index 0000000..eb9c4ce --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py @@ -0,0 +1,209 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
# Must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the detailed license terms.
# By using this code you agree to the principles above and to all LICENSE terms.


# -*- coding: utf-8 -*-
# @Author  : relakkes@gmail.com
# @Time    : 2024/4/6 14:54
# @Desc    : mediacrawler db management
import asyncio
import os
from typing import Dict, Optional
from urllib.parse import urlparse

import aiofiles
import aiomysql

import config
from async_db import AsyncMysqlDB
from async_sqlite_db import AsyncSqliteDB
from tools import utils
from var import db_conn_pool_var, media_crawler_db_var


async def init_mediacrawler_db():
    """Create the MySQL connection pool and publish it through the context variables.

    The pool is built from the ``config.MYSQL_DB_*`` settings with autocommit
    enabled.  The raw pool is stored in ``db_conn_pool_var`` and the
    ``AsyncMysqlDB`` wrapper around it in ``media_crawler_db_var``.
    """
    pool = await aiomysql.create_pool(
        host=config.MYSQL_DB_HOST,
        port=config.MYSQL_DB_PORT,
        user=config.MYSQL_DB_USER,
        password=config.MYSQL_DB_PWD,
        db=config.MYSQL_DB_NAME,
        autocommit=True,
    )
    async_db_obj = AsyncMysqlDB(pool)

    # Expose both the pool and the CRUD wrapper to the rest of the program.
    db_conn_pool_var.set(pool)
    media_crawler_db_var.set(async_db_obj)


async def init_sqlite_db():
    """Create the SQLite wrapper for ``config.SQLITE_DB_PATH`` and publish it.

    The ``AsyncSqliteDB`` object is stored in ``media_crawler_db_var``.
    """
    async_db_obj = AsyncSqliteDB(config.SQLITE_DB_PATH)
    media_crawler_db_var.set(async_db_obj)


async def init_db():
    """Initialise the database backend selected by ``config.SAVE_DATA_OPTION``.

    ``"sqlite"`` selects SQLite; every other value selects MySQL.
    """
    utils.logger.info("[init_db] start init mediacrawler db connect object")
    if config.SAVE_DATA_OPTION == "sqlite":
        await init_sqlite_db()
        utils.logger.info("[init_db] end init sqlite db connect object")
    else:
        await init_mediacrawler_db()
        utils.logger.info("[init_db] end init mysql db connect object")


async def _close_mysql_pool():
    """Close the MySQL pool stored in ``db_conn_pool_var``, if there is one."""
    try:
        db_pool: Optional[aiomysql.Pool] = db_conn_pool_var.get()
    except LookupError:
        # NOTE(review): assumes db_conn_pool_var is a contextvars.ContextVar,
        # whose get() raises LookupError when nothing was set -- confirm in var.py.
        db_pool = None
    if db_pool is not None:
        db_pool.close()
        # close() only marks the pool as closing; wait_closed() is what actually
        # waits for the connections to be released.
        await db_pool.wait_closed()
        utils.logger.info("[close] mysql db pool closed")


async def close():
    """Close the database connection of the configured backend.

    For SQLite nothing is closed here (only a log line is written); for MySQL
    the connection pool is closed and awaited until fully shut down.
    """
    utils.logger.info("[close] close mediacrawler db connection")
    if config.SAVE_DATA_OPTION == "sqlite":
        utils.logger.info("[close] sqlite db connection will be closed automatically")
    else:
        await _close_mysql_pool()


async def init_table_schema(db_type: Optional[str] = None):
    """Create the database tables from the bundled schema scripts.

    WARNING: destructive.  For SQLite the existing database file is removed
    (or renamed to a timestamped backup if removal fails) before the schema is
    applied.  Only run this when the tables need to be (re)created.

    Args:
        db_type: ``"sqlite"`` or ``"mysql"``.  When ``None`` the value of
            ``config.SAVE_DATA_OPTION`` is used.

    Raises:
        ValueError: if ``db_type`` is neither ``"sqlite"`` nor ``"mysql"``.
        OSError: if the existing SQLite file can be neither removed nor renamed.
    """
    if db_type is None:
        db_type = config.SAVE_DATA_OPTION

    # NOTE: the schema paths below are relative to the current working directory.
    if db_type == "sqlite":
        utils.logger.info("[init_table_schema] begin init sqlite table schema ...")

        # Start from a clean file: drop (or move aside) any existing database.
        if os.path.exists(config.SQLITE_DB_PATH):
            try:
                os.remove(config.SQLITE_DB_PATH)
                utils.logger.info(f"[init_table_schema] removed existing sqlite db file: {config.SQLITE_DB_PATH}")
            except Exception as e:
                utils.logger.warning(f"[init_table_schema] failed to remove existing sqlite db file: {e}")
                # Removal failed (e.g. file locked): fall back to renaming it.
                try:
                    backup_path = f"{config.SQLITE_DB_PATH}.backup_{utils.get_current_timestamp()}"
                    os.rename(config.SQLITE_DB_PATH, backup_path)
                    utils.logger.info(f"[init_table_schema] renamed existing sqlite db file to: {backup_path}")
                except Exception as rename_e:
                    utils.logger.error(f"[init_table_schema] failed to rename existing sqlite db file: {rename_e}")
                    raise

        await init_sqlite_db()
        async_db_obj: AsyncSqliteDB = media_crawler_db_var.get()
        async with aiofiles.open("schema/sqlite_tables.sql", mode="r", encoding="utf-8") as f:
            schema_sql = await f.read()
            await async_db_obj.executescript(schema_sql)
            utils.logger.info("[init_table_schema] sqlite table schema init successful")
    elif db_type == "mysql":
        utils.logger.info("[init_table_schema] begin init mysql table schema ...")
        await init_mediacrawler_db()
        try:
            async_db_obj: AsyncMysqlDB = media_crawler_db_var.get()
            async with aiofiles.open("schema/tables.sql", mode="r", encoding="utf-8") as f:
                schema_sql = await f.read()
                await async_db_obj.execute(schema_sql)
                utils.logger.info("[init_table_schema] mysql table schema init successful")
        finally:
            # Close the pool we just opened regardless of config.SAVE_DATA_OPTION
            # (close() would skip it when the config says "sqlite") and even when
            # executing the schema fails.
            await _close_mysql_pool()
    else:
        utils.logger.error(f"[init_table_schema] 不支持的数据库类型: {db_type}")
        raise ValueError(f"不支持的数据库类型: {db_type},支持的类型: sqlite, mysql")


def show_database_options():
    """Print the menu of database types supported by the init tool."""
    print("\n=== MediaCrawler 数据库初始化工具 ===")
    print("支持的数据库类型:")
    print("1. sqlite - SQLite 数据库 (轻量级,无需额外配置)")
    print("2. mysql  - MySQL 数据库 (需要配置数据库连接信息)")
    print("3. config - 使用配置文件中的设置")
    print("4. exit   - 退出程序")
    print("="*50)


def get_user_choice():
    """Prompt until the user enters a valid option.

    Returns:
        str: one of ``"sqlite"``, ``"mysql"``, ``"config"`` or ``"exit"``
        (input is stripped and lower-cased).
    """
    valid_choices = ('sqlite', 'mysql', 'config', 'exit')
    while True:
        choice = input("请输入数据库类型 (sqlite/mysql/config/exit): ").strip().lower()
        if choice in valid_choices:
            return choice
        print("❌ 无效的选择,请输入: sqlite, mysql, config 或 exit")


async def main():
    """Interactive entry point: ask for a database type and initialise its schema.

    Ctrl-C and any initialisation error are reported to the user instead of
    propagating; errors are additionally written to the logger.
    """
    try:
        show_database_options()

        # get_user_choice() already loops until the input is valid, so a single
        # call is enough here.
        choice = get_user_choice()

        if choice == 'exit':
            print("👋 程序已退出")
        elif choice == 'config':
            print(f"📋 使用配置文件中的设置: {config.SAVE_DATA_OPTION}")
            await init_table_schema()
            print("✅ 数据库表结构初始化完成!")
        else:
            print(f"🚀 开始初始化 {choice.upper()} 数据库...")
            await init_table_schema(choice)
            print("✅ 数据库表结构初始化完成!")

    except KeyboardInterrupt:
        print("\n\n⚠️  用户中断操作")
    except Exception as e:
        print(f"\n❌ 初始化失败: {str(e)}")
        utils.logger.error(f"[main] 数据库初始化失败: {str(e)}")


if __name__ == '__main__':
    # asyncio.run creates and closes the event loop; get_event_loop() without a
    # running loop is deprecated.
    asyncio.run(main())
// copy from https://github.com/ShilongLee/Crawler/tree/main/lib/js thanks for ShilongLee

/**
 * RC4 stream cipher (symmetric: applying it twice with the same key restores the input).
 * @param {string} plaintext input; each UTF-16 code unit is XORed with one keystream byte
 * @param {string} key key; its char codes seed the key schedule
 * @returns {string} the ciphertext as a string
 */
function rc4_encrypt(plaintext, key) {
    var s = [];
    for (var i = 0; i < 256; i++) {
        s[i] = i;
    }
    // Key-scheduling: permute the state with the key.
    var j = 0;
    var temp;
    for (i = 0; i < 256; i++) {
        j = (j + s[i] + key.charCodeAt(i % key.length)) % 256;
        temp = s[i];
        s[i] = s[j];
        s[j] = temp;
    }

    // Keystream generation and XOR.
    i = 0;
    j = 0;
    var cipher = [];
    for (var k = 0; k < plaintext.length; k++) {
        i = (i + 1) % 256;
        j = (j + s[i]) % 256;
        temp = s[i];
        s[i] = s[j];
        s[j] = temp;
        var t = (s[i] + s[j]) % 256;
        cipher.push(String.fromCharCode(s[t] ^ plaintext.charCodeAt(k)));
    }
    return cipher.join('');
}

/** Rotate the 32-bit value `e` left by `r` bits (r is taken modulo 32); result is unsigned. */
function le(e, r) {
    return (e << (r %= 32) | e >>> 32 - r) >>> 0
}

/** SM3 round constant Tj for round `e` (0..63). Logs an error and returns undefined otherwise. */
function de(e) {
    return 0 <= e && e < 16 ? 2043430169 : 16 <= e && e < 64 ? 2055708042 : void console['error']("invalid j for constant Tj")
}

/** SM3 boolean function FFj: XOR for rounds 0..15, majority for rounds 16..63. */
function pe(e, r, t, n) {
    return 0 <= e && e < 16 ? (r ^ t ^ n) >>> 0 : 16 <= e && e < 64 ? (r & t | r & n | t & n) >>> 0 : (console['error']('invalid j for bool function FF'),
        0)
}

/** SM3 boolean function GGj: XOR for rounds 0..15, choose for rounds 16..63. */
function he(e, r, t, n) {
    return 0 <= e && e < 16 ? (r ^ t ^ n) >>> 0 : 16 <= e && e < 64 ? (r & t | ~r & n) >>> 0 : (console['error']('invalid j for bool function GG'),
        0)
}

/**
 * Left-pad `str` with `pad` until it is at least `length` characters long.
 * (sum() relied on this helper for hex output, but it was never defined in this file.)
 */
function se(str, length, pad) {
    str = String(str);
    while (str.length < length) {
        str = pad + str;
    }
    return str;
}

/** SM3#reset: restore the initial register values and clear the pending chunk and byte count. */
function reset() {
    this.reg[0] = 1937774191;
    this.reg[1] = 1226093241;
    this.reg[2] = 388252375;
    this.reg[3] = 3666478592;
    this.reg[4] = 2842636476;
    this.reg[5] = 372324522;
    this.reg[6] = 3817729613;
    this.reg[7] = 2969243214;
    this["chunk"] = [];
    this["size"] = 0;
}

/**
 * SM3#write: feed data into the hash.
 * @param {string|number[]} e a string (hashed as its UTF-8 bytes) or an array of byte values
 */
function write(e) {
    var a = e;
    if (typeof e === "string") {
        // encodeURIComponent yields the UTF-8 bytes as %XX escapes; turn every
        // escape back into a single char so that each char code is one byte.
        var n = encodeURIComponent(e)['replace'](/%([0-9A-F]{2})/g, function (match, hex) {
            return String['fromCharCode']("0x" + hex)
        });
        a = new Array(n['length']);
        for (var k = 0; k < n['length']; k++) {
            a[k] = n.charCodeAt(k);
        }
    }
    this.size += a.length;
    var f = 64 - this['chunk']['length'];
    if (a['length'] < f) {
        // Not enough data to complete a 64-byte block: just buffer it.
        this['chunk'] = this['chunk'].concat(a);
    } else {
        // Complete the pending block, then consume the input 64 bytes at a time;
        // whatever is left (< 64 bytes) stays buffered in this.chunk.
        this['chunk'] = this['chunk'].concat(a.slice(0, f));
        while (this['chunk'].length >= 64) {
            this['_compress'](this['chunk']);
            this['chunk'] = f < a['length'] ? a['slice'](f, Math['min'](f + 64, a['length'])) : [];
            f += 64;
        }
    }
}

/**
 * SM3#sum: finish the hash and reset the instance.
 * @param {string|number[]} [e] if truthy, the instance is reset and `e` is hashed on its own
 * @param {string} [t] pass 'hex' for a 64-char lowercase hex string; otherwise a 32-byte array
 * @returns {string|number[]} the digest
 */
function sum(e, t) {
    if (e) {
        this['reset']();
        this['write'](e);
    }
    this['_fill']();
    var f;
    for (f = 0; f < this.chunk['length']; f += 64) {
        this._compress(this['chunk']['slice'](f, f + 64));
    }
    var i = null;
    if (t == 'hex') {
        i = "";
        for (f = 0; f < 8; f++) {
            i += se(this['reg'][f]['toString'](16), 8, "0");
        }
    } else {
        // Serialise the eight 32-bit registers big-endian.
        i = new Array(32);
        for (f = 0; f < 8; f++) {
            var c = this.reg[f];
            i[4 * f + 3] = (255 & c) >>> 0;
            c >>>= 8;
            i[4 * f + 2] = (255 & c) >>> 0;
            c >>>= 8;
            i[4 * f + 1] = (255 & c) >>> 0;
            c >>>= 8;
            i[4 * f] = (255 & c) >>> 0;
        }
    }
    this['reset']();
    return i;
}

/**
 * SM3#_compress: run the compression function over one 64-byte block.
 * @param {number[]} t at least 64 byte values; only the first 64 are used
 */
function _compress(t) {
    // Fixed: the original compared the array itself with 64 (`t < 64`), which
    // never detects a short block.
    if (t.length < 64) {
        console.error("compress error: not enough data");
        return;
    }

    // Message expansion: w[0..67] is W, w[68..131] is W'.
    var w = new Array(132);
    var n;
    for (n = 0; n < 16; n++) {
        w[n] = t[4 * n] << 24;
        w[n] |= t[4 * n + 1] << 16;
        w[n] |= t[4 * n + 2] << 8;
        w[n] |= t[4 * n + 3];
        w[n] >>>= 0;
    }
    for (n = 16; n < 68; n++) {
        var a = w[n - 16] ^ w[n - 9] ^ le(w[n - 3], 15);
        a = a ^ le(a, 15) ^ le(a, 23);
        w[n] = (a ^ le(w[n - 13], 7) ^ w[n - 6]) >>> 0;
    }
    for (n = 0; n < 64; n++) {
        w[n + 68] = (w[n] ^ w[n + 4]) >>> 0;
    }

    // 64 rounds over a working copy of the registers (A..H = i[0..7]).
    var i = this['reg'].slice(0);
    for (var c = 0; c < 64; c++) {
        var o = le(i[0], 12) + i[4] + le(de(c), c);
        o = le((4294967295 & o) >>> 0, 7);               // SS1
        var s = (o ^ le(i[0], 12)) >>> 0;                 // SS2
        var u = pe(c, i[0], i[1], i[2]);
        u = (4294967295 & (u + i[3] + s + w[c + 68])) >>> 0;   // TT1
        var b = he(c, i[4], i[5], i[6]);
        b = (4294967295 & (b + i[7] + o + w[c])) >>> 0;        // TT2
        i[3] = i[2];
        i[2] = le(i[1], 9);
        i[1] = i[0];
        i[0] = u;
        i[7] = i[6];
        i[6] = le(i[5], 19);
        i[5] = i[4];
        i[4] = (b ^ le(b, 9) ^ le(b, 17)) >>> 0;
    }
    for (var l = 0; l < 8; l++) {
        this['reg'][l] = (this['reg'][l] ^ i[l]) >>> 0;
    }
}

/** SM3#_fill: append the padding (0x80, zeros, 64-bit big-endian bit length) to the pending chunk. */
function _fill() {
    var a = 8 * this['size'];
    var f = this['chunk']['push'](128) % 64;
    // If fewer than 8 bytes remain in this block the length goes into an extra block.
    if (64 - f < 8) {
        f -= 64;
    }
    for (; f < 56; f++) {
        this.chunk['push'](0);
    }
    var i;
    // High 32 bits of the bit length.
    for (i = 0; i < 4; i++) {
        var c = Math['floor'](a / 4294967296);
        this['chunk'].push(c >>> 8 * (3 - i) & 255);
    }
    // Low 32 bits of the bit length.
    for (i = 0; i < 4; i++) {
        this['chunk']['push'](a >>> 8 * (3 - i) & 255);
    }
}

/** SM3 hash object; use write()/sum(), or sum(data) for a one-shot digest. */
function SM3() {
    this.reg = [];
    this.chunk = [];
    this.size = 0;
    this.reset()
}
SM3.prototype.reset = reset;
SM3.prototype.write = write;
SM3.prototype.sum = sum;
SM3.prototype._compress = _compress;
SM3.prototype._fill = _fill;

/**
 * Base64-style encoding of `long_str` (3 chars -> 4 symbols) with a selectable alphabet.
 * No padding is appended; code units missing from a trailing partial group count as 0.
 * @param {string} long_str input whose char codes are treated as bytes
 * @param {string} num alphabet key: "s0" (standard base64) .. "s4"; must be given for non-empty input
 * @returns {string} the encoded string
 */
function result_encrypt(long_str, num = null) {
    let s_obj = {
        "s0": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
        "s1": "Dkdpgh4ZKsQB80/Mfvw36XI1R25+WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
        "s2": "Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
        "s3": "ckdp1h4ZKsUB80/Mfvw36XIgR25+WQAlEi7NLboqYTOPuzmFjJnryx9HVGDaStCe",
        "s4": "Dkdpgh2ZmsQB80/MfvV36XI1R45-WUAlEixNLwoqYTOPuzKFjJnry79HbGcaStCe"
    }
    let alphabet = s_obj[num];
    // Mask and shift selecting each of the four 6-bit groups of a 24-bit value.
    let masks = [16515072, 258048, 4032, 63];
    let shifts = [18, 12, 6, 0];

    let result = "";
    let lound = 0;
    let long_int = get_long_int(lound, long_str);
    for (let i = 0; i < long_str.length / 3 * 4; i++) {
        if (Math.floor(i / 4) !== lound) {
            lound += 1;
            long_int = get_long_int(lound, long_str);
        }
        let key = i % 4;
        // Declared locally; the original leaked `temp_int` as an implicit global.
        let temp_int = (long_int & masks[key]) >> shifts[key];
        result += alphabet.charAt(temp_int);
    }
    return result;
}

/** Pack the three char codes of group number `round` of `long_str` into one 24-bit integer. */
function get_long_int(round, long_str) {
    round = round * 3;
    return (long_str.charCodeAt(round) << 16) | (long_str.charCodeAt(round + 1) << 8) | (long_str.charCodeAt(round + 2));
}

/** Interleave the bits of `random` (low two bytes) with the two `option` bytes into four values. */
function gener_random(random, option) {
    return [
        (random & 255 & 170) | option[0] & 85, // 163
        (random & 255 & 85) | option[0] & 170, //87
        (random >> 8 & 255 & 170) | option[1] & 85, //37
        (random >> 8 & 255 & 85) | option[1] & 170, //41
    ]
}

//////////////////////////////////////////////
/**
 * Build the RC4-encrypted "bb" payload of the signature.
 * @param {string} url_search_params the request query string
 * @param {string} user_agent the browser user agent
 * @param {string} window_env_str the window environment fingerprint string
 * @param {string} [suffix="cus"] suffix appended to the query string before hashing
 * @param {number[]} [Arguments=[0, 1, 14]] three numeric arguments mixed into the payload
 * @returns {string} the encrypted payload
 */
function generate_rc4_bb_str(url_search_params, user_agent, window_env_str, suffix = "cus", Arguments = [0, 1, 14]) {
    let sm3 = new SM3()
    let start_time = Date.now()
    /**
     * Three hashing steps:
     * 1: url_search_params + suffix, SM3 applied twice
     * 2: the suffix, SM3 applied twice
     * 3: the processed user agent, SM3 applied once
     */
    // url_search_params + suffix, SM3 applied twice
    let url_search_params_list = sm3.sum(sm3.sum(url_search_params + suffix))
    // suffix, SM3 applied twice
    let cus = sm3.sum(sm3.sum(suffix))
    // user agent: RC4 -> custom base64 ("s3") -> SM3
    let ua = sm3.sum(result_encrypt(rc4_encrypt(user_agent, String.fromCharCode.apply(null, [0.00390625, 1, Arguments[2]])), "s3"))
    //
    let end_time = Date.now()
    // b
    let b = {
        8: 3, // fixed
        10: end_time, // end time of the three hashing steps
        15: {
            "aid": 6383,
            "pageId": 6241,
            "boe": false,
            "ddrt": 7,
            "paths": {
                "include": [
                    {},
                    {},
                    {},
                    {},
                    {},
                    {},
                    {}
                ],
                "exclude": []
            },
            "track": {
                "mode": 0,
                "delay": 300,
                "paths": []
            },
            "dump": true,
            "rpU": ""
        },
        16: start_time, // start time of the three hashing steps
        18: 44, // fixed
        19: [1, 0, 1, 5],
    }

    // start time of the three hashing steps
    b[20] = (b[16] >> 24) & 255
    b[21] = (b[16] >> 16) & 255
    b[22] = (b[16] >> 8) & 255
    b[23] = b[16] & 255
    b[24] = (b[16] / 256 / 256 / 256 / 256) >> 0
    b[25] = (b[16] / 256 / 256 / 256 / 256 / 256) >> 0

    // parameter Arguments [0, 1, 14, ...]
    // let Arguments = [0, 1, 14]
    b[26] = (Arguments[0] >> 24) & 255
    b[27] = (Arguments[0] >> 16) & 255
    b[28] = (Arguments[0] >> 8) & 255
    b[29] = Arguments[0] & 255

    b[30] = (Arguments[1] / 256) & 255
    b[31] = (Arguments[1] % 256) & 255
    b[32] = (Arguments[1] >> 24) & 255
    b[33] = (Arguments[1] >> 16) & 255

    b[34] = (Arguments[2] >> 24) & 255
    b[35] = (Arguments[2] >> 16) & 255
    b[36] = (Arguments[2] >> 8) & 255
    b[37] = Arguments[2] & 255

    // (url_search_params + "cus"), SM3 applied twice; example value:
    /**let url_search_params_list = [
        91, 186,  35,  86, 143, 253,   6,  76,
        34,  21, 167, 148,   7,  42, 192, 219,
        188,  20, 182,  85, 213,  74, 213, 147,
        37, 155,  93, 139,  85, 118, 228, 213
    ]*/
    b[38] = url_search_params_list[21]
    b[39] = url_search_params_list[22]

    // ("cus"), SM3 applied twice; example value:
    /**
     * let cus = [
     136, 101, 114, 147,  58,  77, 207, 201,
     215, 162, 154,  93, 248,  13, 142, 160,
     105,  73, 215, 241,  83,  58,  51,  43,
     255,  38, 168, 141, 216, 194,  35, 236
     ]*/
    b[40] = cus[21]
    b[41] = cus[22]

    // processed user agent; example value:
    /**
     * let ua = [
     129, 190,  70, 186,  86, 196, 199,  53,
     99,  38,  29, 209, 243,  17, 157,  69,
     147, 104,  53,  23, 114, 126,  66, 228,
     135,  30, 168, 185, 109, 156, 251,  88
     ]*/
    b[42] = ua[23]
    b[43] = ua[24]

    // end time of the three hashing steps
    b[44] = (b[10] >> 24) & 255
    b[45] = (b[10] >> 16) & 255
    b[46] = (b[10] >> 8) & 255
    b[47] = b[10] & 255
    b[48] = b[8]
    b[49] = (b[10] / 256 / 256 / 256 / 256) >> 0
    b[50] = (b[10] / 256 / 256 / 256 / 256 / 256) >> 0


    // object configuration entries
    b[51] = b[15]['pageId']
    b[52] = (b[15]['pageId'] >> 24) & 255
    b[53] = (b[15]['pageId'] >> 16) & 255
    b[54] = (b[15]['pageId'] >> 8) & 255
    b[55] = b[15]['pageId'] & 255

    b[56] = b[15]['aid']
    b[57] = b[15]['aid'] & 255
    b[58] = (b[15]['aid'] >> 8) & 255
    b[59] = (b[15]['aid'] >> 16) & 255
    b[60] = (b[15]['aid'] >> 24) & 255

    // An environment check happens here in the original code.
    // code index: 2496, index value: 17 (index 64 is the key condition)
    // The char codes of '1536|747|1536|834|0|30|0|0|1536|834|1536|864|1525|747|24|24|Win32'
    // give a 65-element array.
    let window_env_list = [];
    for (let index = 0; index < window_env_str.length; index++) {
        window_env_list.push(window_env_str.charCodeAt(index))
    }
    b[64] = window_env_list.length
    b[65] = b[64] & 255
    b[66] = (b[64] >> 8) & 255

    b[69] = [].length
    b[70] = b[69] & 255
    b[71] = (b[69] >> 8) & 255

    // XOR checksum over the payload fields.
    b[72] = b[18] ^ b[20] ^ b[26] ^ b[30] ^ b[38] ^ b[40] ^ b[42] ^ b[21] ^ b[27] ^ b[31] ^ b[35] ^ b[39] ^ b[41] ^ b[43] ^ b[22] ^
        b[28] ^ b[32] ^ b[36] ^ b[23] ^ b[29] ^ b[33] ^ b[37] ^ b[44] ^ b[45] ^ b[46] ^ b[47] ^ b[48] ^ b[49] ^ b[50] ^ b[24] ^
        b[25] ^ b[52] ^ b[53] ^ b[54] ^ b[55] ^ b[57] ^ b[58] ^ b[59] ^ b[60] ^ b[65] ^ b[66] ^ b[70] ^ b[71]
    let bb = [
        b[18], b[20], b[52], b[26], b[30], b[34], b[58], b[38], b[40], b[53], b[42], b[21], b[27], b[54], b[55], b[31],
        b[35], b[57], b[39], b[41], b[43], b[22], b[28], b[32], b[60], b[36], b[23], b[29], b[33], b[37], b[44], b[45],
        b[59], b[46], b[47], b[48], b[49], b[50], b[24], b[25], b[65], b[66], b[70], b[71]
    ]
    bb = bb.concat(window_env_list).concat(b[72])
    return rc4_encrypt(String.fromCharCode.apply(null, bb), String.fromCharCode.apply(null, [121]));
}

/** Build the 12-character random prefix of the signature. */
function generate_random_str() {
    let random_str_list = []
    random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [3, 45]))
    random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [1, 0]))
    random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [1, 5]))
    return String.fromCharCode.apply(null, random_str_list)
}

/**
 * Compute the signature for a request.
 * @param {string} url_search_params the query string, e.g. "device_platform=webapp&aid=6383&channel=channel_pc_web&..."
 * @param {string} user_agent e.g. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
 * @param {number[]} args three numeric arguments forwarded to generate_rc4_bb_str
 *        (renamed from `arguments`, which is a SyntaxError in strict mode / ES modules)
 * @returns {string} the signature, always ending in "="
 */
function sign(url_search_params, user_agent, args) {
    let result_str = generate_random_str() + generate_rc4_bb_str(
        url_search_params,
        user_agent,
        "1536|747|1536|834|0|30|0|0|1536|834|1536|864|1525|747|24|24|Win32",
        "cus",
        args
    );
    return result_encrypt(result_str, "s4") + "=";
}

/** Signature for detail requests (arguments [0, 1, 14]). */
function sign_datail(params, userAgent) {
    return sign(params, userAgent, [0, 1, 14])
}

/** Signature for reply/comment requests (arguments [0, 1, 8]). */
function sign_reply(params, userAgent) {
    return sign(params, userAgent, [0, 1, 8])
}
+ * Generated by: https://github.com/berstend/puppeteer-extra/tree/master/packages/extract-stealth-evasions + * Generated on: Mon, 05 Jun 2023 06:17:57 GMT + * License: MIT + */ +(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. 
Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:'utils => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, \'chrome\')`\n Object.defineProperty(window, \'chrome\', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We\'ll extend that later\n })\n }\n\n // That means we\'re running headful and don\'t need to mock anything\n if (\'app\' in window.chrome) {\n return // Nothing to do here\n }\n\n const makeError = {\n ErrorInInvocation: fn => {\n const err = new TypeError(`Error in invocation of app.${fn}()`)\n return utils.stripErrorWithAnchor(\n err,\n `at ${fn} (eval at `\n )\n }\n }\n\n // There\'s a some static data in that property which doesn\'t seem to change,\n // we should periodically check for updates: `JSON.stringify(window.app, null, 2)`\n const STATIC_DATA = JSON.parse(\n `\n{\n "isInstalled": false,\n "InstallState": {\n "DISABLED": 
"disabled",\n "INSTALLED": "installed",\n "NOT_INSTALLED": "not_installed"\n },\n "RunningState": {\n "CANNOT_RUN": "cannot_run",\n "READY_TO_RUN": "ready_to_run",\n "RUNNING": "running"\n }\n}\n `.trim()\n )\n\n window.chrome.app = {\n ...STATIC_DATA,\n\n get isInstalled() {\n return false\n },\n\n getDetails: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`getDetails`)\n }\n return null\n },\n getIsInstalled: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`getIsInstalled`)\n }\n return false\n },\n runningState: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`runningState`)\n }\n return \'cannot_run\'\n }\n }\n utils.patchToStringNested(window.chrome.app)\n }',_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the 
Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"utils => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n if ('csi' in window.chrome) {\n return // Nothing to do here\n }\n\n // Check 
that the Navigation Timing API v1 is available, we need that\n if (!window.performance || !window.performance.timing) {\n return\n }\n\n const { timing } = window.performance\n\n window.chrome.csi = function() {\n return {\n onloadT: timing.domContentLoadedEventEnd,\n startE: timing.navigationStart,\n pageT: Date.now() - timing.navigationStart,\n tran: 15 // Transition type or something\n }\n }\n utils.patchToString(window.chrome.csi)\n }",_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found 
for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n if ('loadTimes' in window.chrome) {\n return // Nothing to do here\n 
}\n\n // Check that the Navigation Timing API v1 + v2 is available, we need that\n if (\n !window.performance ||\n !window.performance.timing ||\n !window.PerformancePaintTiming\n ) {\n return\n }\n\n const { performance } = window\n\n // Some stuff is not available on about:blank as it requires a navigation to occur,\n // let's harden the code to not fail then:\n const ntEntryFallback = {\n nextHopProtocol: 'h2',\n type: 'other'\n }\n\n // The API exposes some funky info regarding the connection\n const protocolInfo = {\n get connectionInfo() {\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ntEntry.nextHopProtocol\n },\n get npnNegotiatedProtocol() {\n // NPN is deprecated in favor of ALPN, but this implementation returns the\n // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)\n ? ntEntry.nextHopProtocol\n : 'unknown'\n },\n get navigationType() {\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ntEntry.type\n },\n get wasAlternateProtocolAvailable() {\n // The Alternate-Protocol header is deprecated in favor of Alt-Svc\n // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this\n // should always return false.\n return false\n },\n get wasFetchedViaSpdy() {\n // SPDY is deprecated in favor of HTTP/2, but this implementation returns\n // true for HTTP/2 or HTTP2+QUIC/39 as well.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)\n },\n get wasNpnNegotiated() {\n // NPN is deprecated in favor of ALPN, but this implementation returns true\n // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 
'hq'].includes(ntEntry.nextHopProtocol)\n }\n }\n\n const { timing } = window.performance\n\n // Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3\n function toFixed(num, fixed) {\n var re = new RegExp('^-?\\\\d+(?:.\\\\d{0,' + (fixed || -1) + '})?')\n return num.toString().match(re)[0]\n }\n\n const timingInfo = {\n get firstPaintAfterLoadTime() {\n // This was never actually implemented and always returns 0.\n return 0\n },\n get requestTime() {\n return timing.navigationStart / 1000\n },\n get startLoadTime() {\n return timing.navigationStart / 1000\n },\n get commitLoadTime() {\n return timing.responseStart / 1000\n },\n get finishDocumentLoadTime() {\n return timing.domContentLoadedEventEnd / 1000\n },\n get finishLoadTime() {\n return timing.loadEventEnd / 1000\n },\n get firstPaintTime() {\n const fpEntry = performance.getEntriesByType('paint')[0] || {\n startTime: timing.loadEventEnd / 1000 // Fallback if no navigation occured (`about:blank`)\n }\n return toFixed(\n (fpEntry.startTime + performance.timeOrigin) / 1000,\n 3\n )\n }\n }\n\n window.chrome.loadTimes = function() {\n return {\n ...protocolInfo,\n ...timingInfo\n }\n }\n utils.patchToString(window.chrome.loadTimes)\n }",_args:[{opts:{}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they 
throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts, STATIC_DATA }) => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n const existsAlready = 'runtime' in window.chrome\n // 
`chrome.runtime` is only exposed on secure origins\n const isNotSecure = !window.location.protocol.startsWith('https')\n if (existsAlready || (isNotSecure && !opts.runOnInsecureOrigins)) {\n return // Nothing to do here\n }\n\n window.chrome.runtime = {\n // There's a bunch of static data in that property which doesn't seem to change,\n // we should periodically check for updates: `JSON.stringify(window.chrome.runtime, null, 2)`\n ...STATIC_DATA,\n // `chrome.runtime.id` is extension related and returns undefined in Chrome\n get id() {\n return undefined\n },\n // These two require more sophisticated mocks\n connect: null,\n sendMessage: null\n }\n\n const makeCustomRuntimeErrors = (preamble, method, extensionId) => ({\n NoMatchingSignature: new TypeError(\n preamble + `No matching signature.`\n ),\n MustSpecifyExtensionID: new TypeError(\n preamble +\n `${method} called from a webpage must specify an Extension ID (string) for its first argument.`\n ),\n InvalidExtensionID: new TypeError(\n preamble + `Invalid extension id: '${extensionId}'`\n )\n })\n\n // Valid Extension IDs are 32 characters in length and use the letter `a` to `p`:\n // https://source.chromium.org/chromium/chromium/src/+/master:components/crx_file/id_util.cc;drc=14a055ccb17e8c8d5d437fe080faba4c6f07beac;l=90\n const isValidExtensionID = str =>\n str.length === 32 && str.toLowerCase().match(/^[a-p]+$/)\n\n /** Mock `chrome.runtime.sendMessage` */\n const sendMessageHandler = {\n apply: function(target, ctx, args) {\n const [extensionId, options, responseCallback] = args || []\n\n // Define custom errors\n const errorPreamble = `Error in invocation of runtime.sendMessage(optional string extensionId, any message, optional object options, optional function responseCallback): `\n const Errors = makeCustomRuntimeErrors(\n errorPreamble,\n `chrome.runtime.sendMessage()`,\n extensionId\n )\n\n // Check if the call signature looks ok\n const noArguments = args.length === 0\n const tooManyArguments = 
args.length > 4\n const incorrectOptions = options && typeof options !== 'object'\n const incorrectResponseCallback =\n responseCallback && typeof responseCallback !== 'function'\n if (\n noArguments ||\n tooManyArguments ||\n incorrectOptions ||\n incorrectResponseCallback\n ) {\n throw Errors.NoMatchingSignature\n }\n\n // At least 2 arguments are required before we even validate the extension ID\n if (args.length < 2) {\n throw Errors.MustSpecifyExtensionID\n }\n\n // Now let's make sure we got a string as extension ID\n if (typeof extensionId !== 'string') {\n throw Errors.NoMatchingSignature\n }\n\n if (!isValidExtensionID(extensionId)) {\n throw Errors.InvalidExtensionID\n }\n\n return undefined // Normal behavior\n }\n }\n utils.mockWithProxy(\n window.chrome.runtime,\n 'sendMessage',\n function sendMessage() {},\n sendMessageHandler\n )\n\n /**\n * Mock `chrome.runtime.connect`\n *\n * @see https://developer.chrome.com/apps/runtime#method-connect\n */\n const connectHandler = {\n apply: function(target, ctx, args) {\n const [extensionId, connectInfo] = args || []\n\n // Define custom errors\n const errorPreamble = `Error in invocation of runtime.connect(optional string extensionId, optional object connectInfo): `\n const Errors = makeCustomRuntimeErrors(\n errorPreamble,\n `chrome.runtime.connect()`,\n extensionId\n )\n\n // Behavior differs a bit from sendMessage:\n const noArguments = args.length === 0\n const emptyStringArgument = args.length === 1 && extensionId === ''\n if (noArguments || emptyStringArgument) {\n throw Errors.MustSpecifyExtensionID\n }\n\n const tooManyArguments = args.length > 2\n const incorrectConnectInfoType =\n connectInfo && typeof connectInfo !== 'object'\n\n if (tooManyArguments || incorrectConnectInfoType) {\n throw Errors.NoMatchingSignature\n }\n\n const extensionIdIsString = typeof extensionId === 'string'\n if (extensionIdIsString && extensionId === '') {\n throw Errors.MustSpecifyExtensionID\n }\n if (extensionIdIsString 
&& !isValidExtensionID(extensionId)) {\n throw Errors.InvalidExtensionID\n }\n\n // There's another edge-case here: extensionId is optional so we might find a connectInfo object as first param, which we need to validate\n const validateConnectInfo = ci => {\n // More than a first param connectInfo as been provided\n if (args.length > 1) {\n throw Errors.NoMatchingSignature\n }\n // An empty connectInfo has been provided\n if (Object.keys(ci).length === 0) {\n throw Errors.MustSpecifyExtensionID\n }\n // Loop over all connectInfo props an check them\n Object.entries(ci).forEach(([k, v]) => {\n const isExpected = ['name', 'includeTlsChannelId'].includes(k)\n if (!isExpected) {\n throw new TypeError(\n errorPreamble + `Unexpected property: '${k}'.`\n )\n }\n const MismatchError = (propName, expected, found) =>\n TypeError(\n errorPreamble +\n `Error at property '${propName}': Invalid type: expected ${expected}, found ${found}.`\n )\n if (k === 'name' && typeof v !== 'string') {\n throw MismatchError(k, 'string', typeof v)\n }\n if (k === 'includeTlsChannelId' && typeof v !== 'boolean') {\n throw MismatchError(k, 'boolean', typeof v)\n }\n })\n }\n if (typeof extensionId === 'object') {\n validateConnectInfo(extensionId)\n throw Errors.MustSpecifyExtensionID\n }\n\n // Unfortunately even when the connect fails Chrome will return an object with methods we need to mock as well\n return utils.patchToStringNested(makeConnectResponse())\n }\n }\n utils.mockWithProxy(\n window.chrome.runtime,\n 'connect',\n function connect() {},\n connectHandler\n )\n\n function makeConnectResponse() {\n const onSomething = () => ({\n addListener: function addListener() {},\n dispatch: function dispatch() {},\n hasListener: function hasListener() {},\n hasListeners: function hasListeners() {\n return false\n },\n removeListener: function removeListener() {}\n })\n\n const response = {\n name: '',\n sender: undefined,\n disconnect: function disconnect() {},\n onDisconnect: onSomething(),\n 
onMessage: onSomething(),\n postMessage: function postMessage() {\n if (!arguments.length) {\n throw new TypeError(`Insufficient number of arguments.`)\n }\n throw new Error(`Attempting to use a disconnected port object`)\n }\n }\n return response\n }\n }",_args:[{opts:{runOnInsecureOrigins:!1},STATIC_DATA:{OnInstalledReason:{CHROME_UPDATE:"chrome_update",INSTALL:"install",SHARED_MODULE_UPDATE:"shared_module_update",UPDATE:"update"},OnRestartRequiredReason:{APP_UPDATE:"app_update",OS_UPDATE:"os_update",PERIODIC:"periodic"},PlatformArch:{ARM:"arm",ARM64:"arm64",MIPS:"mips",MIPS64:"mips64",X86_32:"x86-32",X86_64:"x86-64"},PlatformNaclArch:{ARM:"arm",MIPS:"mips",MIPS64:"mips64",X86_32:"x86-32",X86_64:"x86-64"},PlatformOs:{ANDROID:"android",CROS:"cros",LINUX:"linux",MAC:"mac",OPENBSD:"openbsd",WIN:"win"},RequestUpdateCheckStatus:{NO_UPDATE:"no_update",THROTTLED:"throttled",UPDATE_AVAILABLE:"update_available"}}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if 
(!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"utils => {\n /**\n * Input might look funky, we need to normalize it so e.g. 
whitespace isn't an issue for our spoofing.\n *\n * @example\n * video/webm; codecs=\"vp8, vorbis\"\n * video/mp4; codecs=\"avc1.42E01E\"\n * audio/x-m4a;\n * audio/ogg; codecs=\"vorbis\"\n * @param {String} arg\n */\n const parseInput = arg => {\n const [mime, codecStr] = arg.trim().split(';')\n let codecs = []\n if (codecStr && codecStr.includes('codecs=\"')) {\n codecs = codecStr\n .trim()\n .replace(`codecs=\"`, '')\n .replace(`\"`, '')\n .trim()\n .split(',')\n .filter(x => !!x)\n .map(x => x.trim())\n }\n return {\n mime,\n codecStr,\n codecs\n }\n }\n\n const canPlayType = {\n // Intercept certain requests\n apply: function(target, ctx, args) {\n if (!args || !args.length) {\n return target.apply(ctx, args)\n }\n const { mime, codecs } = parseInput(args[0])\n // This specific mp4 codec is missing in Chromium\n if (mime === 'video/mp4') {\n if (codecs.includes('avc1.42E01E')) {\n return 'probably'\n }\n }\n // This mimetype is only supported if no codecs are specified\n if (mime === 'audio/x-m4a' && !codecs.length) {\n return 'maybe'\n }\n\n // This mimetype is only supported if no codecs are specified\n if (mime === 'audio/aac' && !codecs.length) {\n return 'probably'\n }\n // Everything else as usual\n return target.apply(ctx, args)\n }\n }\n\n /* global HTMLMediaElement */\n utils.replaceWithProxy(\n HTMLMediaElement.prototype,\n 'canPlayType',\n canPlayType\n )\n }",_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n 
}\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. 
[as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. 
value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, 
handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj 
= { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n utils.replaceGetterWithProxy(\n Object.getPrototypeOf(navigator),\n 'hardwareConcurrency',\n utils.makeHandler().getterValue(opts.hardwareConcurrency)\n )\n 
}",_args:[{opts:{hardwareConcurrency:4}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. 
[as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n const languages = opts.languages.length\n ? 
opts.languages\n : ['en-US', 'en']\n utils.replaceGetterWithProxy(\n Object.getPrototypeOf(navigator),\n 'languages',\n utils.makeHandler().getterValue(Object.freeze([...languages]))\n )\n }",_args:[{opts:{languages:[]}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. 
Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n const isSecure = document.location.protocol.startsWith('https')\n\n // In headful on secure origins the permission should be \"default\", not \"denied\"\n if (isSecure) {\n utils.replaceGetterWithProxy(Notification, 'permission', {\n apply() {\n return 'default'\n }\n })\n }\n\n // Another weird behavior:\n // On insecure origins in headful the state is \"denied\",\n // whereas in headless it's \"prompt\"\n if (!isSecure) {\n const handler = {\n apply(target, ctx, args) {\n const param = (args || [])[0]\n\n const isNotifications =\n param && param.name && param.name === 'notifications'\n if (!isNotifications) {\n return utils.cache.Reflect.apply(...arguments)\n }\n\n return Promise.resolve(\n Object.setPrototypeOf(\n {\n state: 'denied',\n onchange: null\n },\n PermissionStatus.prototype\n )\n )\n }\n }\n // Note: Don't use `Object.getPrototypeOf` here\n utils.replaceWithProxy(Permissions.prototype, 'query', 
handler)\n }\n }",_args:[{}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. 
[as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { fns, data }) => {\n fns = utils.materializeFns(fns)\n\n // That means we're running headful\n const hasPlugins = 'plugins' in navigator && navigator.plugins.length\n if (hasPlugins) {\n return // nothing to do here\n }\n\n const mimeTypes = fns.generateMimeTypeArray(utils, fns)(data.mimeTypes)\n const plugins = fns.generatePluginArray(utils, fns)(data.plugins)\n\n // Plugin and MimeType cross-reference each other, let's do that now\n // Note: We're looping through `data.plugins` here, not the generated `plugins`\n for (const pluginData of data.plugins) {\n pluginData.__mimeTypes.forEach((type, index) => {\n plugins[pluginData.name][index] = mimeTypes[type]\n\n Object.defineProperty(plugins[pluginData.name], type, {\n value: mimeTypes[type],\n writable: false,\n enumerable: false, // Not enumerable\n configurable: true\n })\n Object.defineProperty(mimeTypes[type], 'enabledPlugin', {\n value:\n type === 'application/x-pnacl'\n 
? mimeTypes['application/x-nacl'].enabledPlugin // these reference the same plugin, so we need to re-use the Proxy in order to avoid leaks\n : new Proxy(plugins[pluginData.name], {}), // Prevent circular references\n writable: false,\n enumerable: false, // Important: `JSON.stringify(navigator.plugins)`\n configurable: true\n })\n })\n }\n\n const patchNavigator = (name, value) =>\n utils.replaceProperty(Object.getPrototypeOf(navigator), name, {\n get() {\n return value\n }\n })\n\n patchNavigator('mimeTypes', mimeTypes)\n patchNavigator('plugins', plugins)\n\n // All done\n }",_args:[{fns:{generateMimeTypeArray:"(utils, fns) => mimeTypesData => {\n return fns.generateMagicArray(utils, fns)(\n mimeTypesData,\n MimeTypeArray.prototype,\n MimeType.prototype,\n 'type'\n )\n}",generatePluginArray:"(utils, fns) => pluginsData => {\n return fns.generateMagicArray(utils, fns)(\n pluginsData,\n PluginArray.prototype,\n Plugin.prototype,\n 'name'\n )\n}",generateMagicArray:"(utils, fns) =>\n function(\n dataArray = [],\n proto = MimeTypeArray.prototype,\n itemProto = MimeType.prototype,\n itemMainProp = 'type'\n ) {\n // Quick helper to set props with the same descriptors vanilla is using\n const defineProp = (obj, prop, value) =>\n Object.defineProperty(obj, prop, {\n value,\n writable: false,\n enumerable: false, // Important for mimeTypes & plugins: `JSON.stringify(navigator.mimeTypes)`\n configurable: true\n })\n\n // Loop over our fake data and construct items\n const makeItem = data => {\n const item = {}\n for (const prop of Object.keys(data)) {\n if (prop.startsWith('__')) {\n continue\n }\n defineProp(item, prop, data[prop])\n }\n return patchItem(item, data)\n }\n\n const patchItem = (item, data) => {\n let descriptor = Object.getOwnPropertyDescriptors(item)\n\n // Special case: Plugins have a magic length property which is not enumerable\n // e.g. 
`navigator.plugins[i].length` should always be the length of the assigned mimeTypes\n if (itemProto === Plugin.prototype) {\n descriptor = {\n ...descriptor,\n length: {\n value: data.__mimeTypes.length,\n writable: false,\n enumerable: false,\n configurable: true // Important to be able to use the ownKeys trap in a Proxy to strip `length`\n }\n }\n }\n\n // We need to spoof a specific `MimeType` or `Plugin` object\n const obj = Object.create(itemProto, descriptor)\n\n // Virtually all property keys are not enumerable in vanilla\n const blacklist = [...Object.keys(data), 'length', 'enabledPlugin']\n return new Proxy(obj, {\n ownKeys(target) {\n return Reflect.ownKeys(target).filter(k => !blacklist.includes(k))\n },\n getOwnPropertyDescriptor(target, prop) {\n if (blacklist.includes(prop)) {\n return undefined\n }\n return Reflect.getOwnPropertyDescriptor(target, prop)\n }\n })\n }\n\n const magicArray = []\n\n // Loop through our fake data and use that to create convincing entities\n dataArray.forEach(data => {\n magicArray.push(makeItem(data))\n })\n\n // Add direct property access based on types (e.g. 
`obj['application/pdf']`) afterwards\n magicArray.forEach(entry => {\n defineProp(magicArray, entry[itemMainProp], entry)\n })\n\n // This is the best way to fake the type to make sure this is false: `Array.isArray(navigator.mimeTypes)`\n const magicArrayObj = Object.create(proto, {\n ...Object.getOwnPropertyDescriptors(magicArray),\n\n // There's one ugly quirk we unfortunately need to take care of:\n // The `MimeTypeArray` prototype has an enumerable `length` property,\n // but headful Chrome will still skip it when running `Object.getOwnPropertyNames(navigator.mimeTypes)`.\n // To strip it we need to make it first `configurable` and can then overlay a Proxy with an `ownKeys` trap.\n length: {\n value: magicArray.length,\n writable: false,\n enumerable: false,\n configurable: true // Important to be able to use the ownKeys trap in a Proxy to strip `length`\n }\n })\n\n // Generate our functional function mocks :-)\n const functionMocks = fns.generateFunctionMocks(utils)(\n proto,\n itemMainProp,\n magicArray\n )\n\n // We need to overlay our custom object with a JS Proxy\n const magicArrayObjProxy = new Proxy(magicArrayObj, {\n get(target, key = '') {\n // Redirect function calls to our custom proxied versions mocking the vanilla behavior\n if (key === 'item') {\n return functionMocks.item\n }\n if (key === 'namedItem') {\n return functionMocks.namedItem\n }\n if (proto === PluginArray.prototype && key === 'refresh') {\n return functionMocks.refresh\n }\n // Everything else can pass through as normal\n return utils.cache.Reflect.get(...arguments)\n },\n ownKeys(target) {\n // There are a couple of quirks where the original property demonstrates \"magical\" behavior that makes no sense\n // This can be witnessed when calling `Object.getOwnPropertyNames(navigator.mimeTypes)` and the absense of `length`\n // My guess is that it has to do with the recent change of not allowing data enumeration and this being implemented weirdly\n // For that reason we just completely 
fake the available property names based on our data to match what regular Chrome is doing\n // Specific issues when not patching this: `length` property is available, direct `types` props (e.g. `obj['application/pdf']`) are missing\n const keys = []\n const typeProps = magicArray.map(mt => mt[itemMainProp])\n typeProps.forEach((_, i) => keys.push(`${i}`))\n typeProps.forEach(propName => keys.push(propName))\n return keys\n },\n getOwnPropertyDescriptor(target, prop) {\n if (prop === 'length') {\n return undefined\n }\n return Reflect.getOwnPropertyDescriptor(target, prop)\n }\n })\n\n return magicArrayObjProxy\n }",generateFunctionMocks:"utils => (\n proto,\n itemMainProp,\n dataArray\n) => ({\n /** Returns the MimeType object with the specified index. */\n item: utils.createProxy(proto.item, {\n apply(target, ctx, args) {\n if (!args.length) {\n throw new TypeError(\n `Failed to execute 'item' on '${\n proto[Symbol.toStringTag]\n }': 1 argument required, but only 0 present.`\n )\n }\n // Special behavior alert:\n // - Vanilla tries to cast strings to Numbers (only integers!) and use them as property index lookup\n // - If anything else than an integer (including as string) is provided it will return the first entry\n const isInteger = args[0] && Number.isInteger(Number(args[0])) // Cast potential string to number first, then check for integer\n // Note: Vanilla never returns `undefined`\n return (isInteger ? dataArray[Number(args[0])] : dataArray[0]) || null\n }\n }),\n /** Returns the MimeType object with the specified name. */\n namedItem: utils.createProxy(proto.namedItem, {\n apply(target, ctx, args) {\n if (!args.length) {\n throw new TypeError(\n `Failed to execute 'namedItem' on '${\n proto[Symbol.toStringTag]\n }': 1 argument required, but only 0 present.`\n )\n }\n return dataArray.find(mt => mt[itemMainProp] === args[0]) || null // Not `undefined`!\n }\n }),\n /** Does nothing and shall return nothing */\n refresh: proto.refresh\n ? 
utils.createProxy(proto.refresh, {\n apply(target, ctx, args) {\n return undefined\n }\n })\n : undefined\n})"},data:{mimeTypes:[{type:"application/pdf",suffixes:"pdf",description:"",__pluginName:"Chrome PDF Viewer"},{type:"application/x-google-chrome-pdf",suffixes:"pdf",description:"Portable Document Format",__pluginName:"Chrome PDF Plugin"},{type:"application/x-nacl",suffixes:"",description:"Native Client Executable",__pluginName:"Native Client"},{type:"application/x-pnacl",suffixes:"",description:"Portable Native Client Executable",__pluginName:"Native Client"}],plugins:[{name:"Chrome PDF Plugin",filename:"internal-pdf-viewer",description:"Portable Document Format",__mimeTypes:["application/x-google-chrome-pdf"]},{name:"Chrome PDF Viewer",filename:"mhjfbmdgcfjbbpaeojofohoefgiehjai",description:"",__mimeTypes:["application/pdf"]},{name:"Native Client",filename:"internal-nacl-plugin",description:"",__mimeTypes:["application/x-nacl","application/x-pnacl"]}]}}]}),!1===navigator.webdriver||void 0===navigator.webdriver||delete Object.getPrototypeOf(navigator).webdriver,(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n 
return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n const getParameterProxyHandler = {\n apply: function(target, ctx, args) {\n const param = (args || [])[0]\n const result = utils.cache.Reflect.apply(target, ctx, args)\n // UNMASKED_VENDOR_WEBGL\n if (param === 37445) {\n return opts.vendor || 'Intel Inc.' 
// default in headless: Google Inc.\n }\n // UNMASKED_RENDERER_WEBGL\n if (param === 37446) {\n return opts.renderer || 'Intel Iris OpenGL Engine' // default in headless: Google SwiftShader\n }\n return result\n }\n }\n\n // There's more than one WebGL rendering context\n // https://developer.mozilla.org/en-US/docs/Web/API/WebGL2RenderingContext#Browser_compatibility\n // To find out the original values here: Object.getOwnPropertyDescriptors(WebGLRenderingContext.prototype.getParameter)\n const addProxy = (obj, propName) => {\n utils.replaceWithProxy(obj, propName, getParameterProxyHandler)\n }\n // For whatever weird reason loops don't play nice with Object.defineProperty, here's the next best thing:\n addProxy(WebGLRenderingContext.prototype, 'getParameter')\n addProxy(WebGL2RenderingContext.prototype, 'getParameter')\n }",_args:[{}]}),(()=>{try{if(window.outerWidth&&window.outerHeight)return;const n=85;window.outerWidth=window.innerWidth,window.outerHeight=window.innerHeight+n}catch(n){}})(),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // 
Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n try {\n // Adds a contentWindow proxy to the provided iframe element\n const addContentWindowProxy = iframe => {\n const contentWindowProxy = {\n get(target, key) {\n // Now to the interesting part:\n // We actually make this thing behave like a regular iframe window,\n // by intercepting calls to e.g. `.self` and redirect it to the correct thing. 
:)\n // That makes it possible for these assertions to be correct:\n // iframe.contentWindow.self === window.top // must be false\n if (key === 'self') {\n return this\n }\n // iframe.contentWindow.frameElement === iframe // must be true\n if (key === 'frameElement') {\n return iframe\n }\n // Intercept iframe.contentWindow[0] to hide the property 0 added by the proxy.\n if (key === '0') {\n return undefined\n }\n return Reflect.get(target, key)\n }\n }\n\n if (!iframe.contentWindow) {\n const proxy = new Proxy(window, contentWindowProxy)\n Object.defineProperty(iframe, 'contentWindow', {\n get() {\n return proxy\n },\n set(newValue) {\n return newValue // contentWindow is immutable\n },\n enumerable: true,\n configurable: false\n })\n }\n }\n\n // Handles iframe element creation, augments `srcdoc` property so we can intercept further\n const handleIframeCreation = (target, thisArg, args) => {\n const iframe = target.apply(thisArg, args)\n\n // We need to keep the originals around\n const _iframe = iframe\n const _srcdoc = _iframe.srcdoc\n\n // Add hook for the srcdoc property\n // We need to be very surgical here to not break other iframes by accident\n Object.defineProperty(iframe, 'srcdoc', {\n configurable: true, // Important, so we can reset this later\n get: function() {\n return _srcdoc\n },\n set: function(newValue) {\n addContentWindowProxy(this)\n // Reset property, the hook is only needed once\n Object.defineProperty(iframe, 'srcdoc', {\n configurable: false,\n writable: false,\n value: _srcdoc\n })\n _iframe.srcdoc = newValue\n }\n })\n return iframe\n }\n\n // Adds a hook to intercept iframe creation events\n const addIframeCreationSniffer = () => {\n /* global document */\n const createElementHandler = {\n // Make toString() native\n get(target, key) {\n return Reflect.get(target, key)\n },\n apply: function(target, thisArg, args) {\n const isIframe =\n args && args.length && `${args[0]}`.toLowerCase() === 'iframe'\n if (!isIframe) {\n // Everything 
as usual\n return target.apply(thisArg, args)\n } else {\n return handleIframeCreation(target, thisArg, args)\n }\n }\n }\n // All this just due to iframes with srcdoc bug\n utils.replaceWithProxy(\n document,\n 'createElement',\n createElementHandler\n )\n }\n\n // Let's go\n addIframeCreationSniffer()\n } catch (err) {\n // console.warn(err)\n }\n }",_args:[]}); \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/libs/zhihu.js b/MindSpider/DeepSentimentCrawling/MediaCrawler/libs/zhihu.js new file mode 100644 index 0000000..8048218 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/libs/zhihu.js @@ -0,0 +1,166 @@ +// copy from https://github.com/tiam-bloom/zhihuQuestionAnswer/blob/main/zhihuvmp.js thanks to tiam-bloom +// 仅供学习交流使用,严禁用于商业用途,也不要滥用,否则后果自负 +// modified by relakkes + +const crypto = require('crypto'); // 导入加密模块 + + +let init_str = "6fpLRqJO8M/c3jnYxFkUVC4ZIG12SiH=5v0mXDazWBTsuw7QetbKdoPyAl+hN9rgE"; +var h = { + zk: [1170614578, 1024848638, 1413669199, -343334464, -766094290, -1373058082, -143119608, -297228157, 1933479194, -971186181, -406453910, 460404854, -547427574, -1891326262, -1679095901, 2119585428, -2029270069, 2035090028, -1521520070, -5587175, -77751101, -2094365853, -1243052806, 1579901135, 1321810770, 456816404, -1391643889, -229302305, 330002838, -788960546, 363569021, -1947871109], + zb: [20, 223, 245, 7, 248, 2, 194, 209, 87, 6, 227, 253, 240, 128, 222, 91, 237, 9, 125, 157, 230, 93, 252, 205, 90, 79, 144, 199, 159, 197, 186, 167, 39, 37, 156, 198, 38, 42, 43, 168, 217, 153, 15, 103, 80, 189, 71, 191, 97, 84, 247, 95, 36, 69, 14, 35, 12, 171, 28, 114, 178, 148, 86, 182, 32, 83, 158, 109, 22, 255, 94, 238, 151, 85, 77, 124, 254, 18, 4, 26, 123, 176, 232, 193, 131, 172, 143, 142, 150, 30, 10, 146, 162, 62, 224, 218, 196, 229, 1, 192, 213, 27, 110, 56, 231, 180, 138, 107, 242, 187, 54, 120, 19, 44, 117, 228, 215, 203, 53, 239, 251, 127, 81, 11, 133, 96, 204, 132, 41, 115, 73, 55, 249, 147, 
/**
 * Read four consecutive entries of `e`, starting at index `t`, and pack their
 * low 8 bits big-endian into a single signed 32-bit integer
 * (e[t] becomes the most significant byte).
 *
 * @param {number[]} e  source array of byte values
 * @param {number} t    index of the first (most significant) byte
 * @returns {number}    the packed signed 32-bit word
 */
function B(e, t) {
    var word = 0;
    for (var k = 0; k < 4; k++) {
        // Shift the bytes gathered so far up one position and append the next one.
        word = (word << 8) | (e[t + k] & 255);
    }
    return word;
}
/**
 * Extract the value of the `d_c0` cookie from a Cookie header string.
 *
 * The cookie name is only matched at the very start of the string or directly
 * after a `;` separator (optionally followed by whitespace), so a cookie whose
 * name merely ends in `d_c0` (for example `old_c0`) is never mistaken for it.
 *
 * @param {string} cookies  cookie string such as "a=1; d_c0=abc; b=2"
 * @returns {string|null}   the raw d_c0 value, or null when the cookie is absent
 */
const extract_dc0_value_from_cookies = function (cookies) {
    const match = /(?:^|;)\s*d_c0=([^;]+)/.exec(cookies);
    return match && match[1];
}
const tc = "3_2.0aR_sn77yn6O92wOB8hPZnQr0EMYxc4f18wNBUgpTQ6nxERFZfTY0-4Lm-h3_tufIwJS8gcxTgJS_AuPZNcXCTwxI78YxEM20s4PGDwN8gGcYAupMWufIoLVqr4gxrRPOI0cY7HL8qun9g93mFukyigcmebS_FwOYPRP0E4rZUrN9DDom3hnynAUMnAVPF_PhaueTFH9fQL39OCCqYTxfb0rfi9wfPhSM6vxGDJo_rBHpQGNmBBLqPJHK2_w8C9eTVMO9Z9NOrMtfhGH_DgpM-BNM1DOxScLG3gg1Hre1FCXKQcXKkrSL1r9GWDXMk8wqBLNmbRH96BtOFqVZ7UYG3gC8D9cMS7Y9UrHLVCLZPJO8_CL_6GNCOg_zhJS8PbXmGTcBpgxfkieOPhNfthtf2gC_qD3YOce8nCwG2uwBOqeMoML9NBC1xb9yk6SuJhHLK7SM6LVfCve_3vLKlqcL6TxL_UosDvHLxrHmWgxBQ8Xs" + const params_join_str = [ta, url, dc0, tc].join("+") + const params_md5_value = crypto.createHash('md5').update(params_join_str).digest('hex') + + return { + "x-zst-81": tc, + "x-zse-96": get_zse_96(params_md5_value), + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py new file mode 100644 index 0000000..c074c7d --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py @@ -0,0 +1,80 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class CrawlerFactory:
    """Registry of crawler classes keyed by platform code, plus a builder for them."""

    # Platform code -> crawler class to instantiate for it.
    CRAWLERS = dict(
        xhs=XiaoHongShuCrawler,
        dy=DouYinCrawler,
        ks=KuaishouCrawler,
        bili=BilibiliCrawler,
        wb=WeiboCrawler,
        tieba=TieBaCrawler,
        zhihu=ZhihuCrawler,
    )

    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
        """
        Build a new crawler for the given platform code.

        Args:
            platform: one of the keys of ``CRAWLERS``.

        Returns:
            A freshly constructed instance of the registered crawler class.

        Raises:
            ValueError: if ``platform`` is not a registered platform code.
        """
        registry = CrawlerFactory.CRAWLERS
        if platform in registry:
            return registry[platform]()
        raise ValueError(
            "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
        )
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/2 18:36 +# @Desc : + +from .core import * \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py new file mode 100644 index 0000000..0abf872 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py @@ -0,0 +1,553 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/2 18:44 +# @Desc : bilibili 请求客户端 +import asyncio +import json +import random +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlencode + +import httpx +from playwright.async_api import BrowserContext, Page + +import config +from base.base_crawler import AbstractApiClient +from tools import utils + +from .exception import DataFetchError +from .field import CommentOrderType, SearchOrderType +from .help import BilibiliSign + + +class BilibiliClient(AbstractApiClient): + + def __init__( + self, + timeout=60, # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间 + proxy=None, + *, + headers: Dict[str, str], + playwright_page: Page, + cookie_dict: Dict[str, str], + ): + self.proxy = proxy + self.timeout = timeout + self.headers = headers + self._host = "https://api.bilibili.com" + self.playwright_page = playwright_page + self.cookie_dict = cookie_dict + + async def request(self, method, url, **kwargs) -> Any: + async with httpx.AsyncClient(proxy=self.proxy) as client: + response = await client.request(method, url, timeout=self.timeout, **kwargs) + try: + data: Dict = 
response.json() + except json.JSONDecodeError: + utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}") + raise DataFetchError(f"Failed to decode JSON, content: {response.text}") + if data.get("code") != 0: + raise DataFetchError(data.get("message", "unkonw error")) + else: + return data.get("data", {}) + + async def pre_request_data(self, req_data: Dict) -> Dict: + """ + 发送请求进行请求参数签名 + 需要从 localStorage 拿 wbi_img_urls 这参数,值如下: + https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png + :param req_data: + :return: + """ + if not req_data: + return {} + img_key, sub_key = await self.get_wbi_keys() + return BilibiliSign(img_key, sub_key).sign(req_data) + + async def get_wbi_keys(self) -> Tuple[str, str]: + """ + 获取最新的 img_key 和 sub_key + :return: + """ + local_storage = await self.playwright_page.evaluate("() => window.localStorage") + wbi_img_urls = local_storage.get("wbi_img_urls", "") + if not wbi_img_urls: + img_url_from_storage = local_storage.get("wbi_img_url") + sub_url_from_storage = local_storage.get("wbi_sub_url") + if img_url_from_storage and sub_url_from_storage: + wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}" + if wbi_img_urls and "-" in wbi_img_urls: + img_url, sub_url = wbi_img_urls.split("-") + else: + resp = await self.request(method="GET", url=self._host + "/x/web-interface/nav") + img_url: str = resp['wbi_img']['img_url'] + sub_url: str = resp['wbi_img']['sub_url'] + img_key = img_url.rsplit('/', 1)[1].split('.')[0] + sub_key = sub_url.rsplit('/', 1)[1].split('.')[0] + return img_key, sub_key + + async def get(self, uri: str, params=None, enable_params_sign: bool = True) -> Dict: + final_uri = uri + if enable_params_sign: + params = await self.pre_request_data(params) + if isinstance(params, dict): + final_uri = (f"{uri}?" 
+ f"{urlencode(params)}") + return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers) + + async def post(self, uri: str, data: dict) -> Dict: + data = await self.pre_request_data(data) + json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) + return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers) + + async def pong(self) -> bool: + """get a note to check if login state is ok""" + utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...") + ping_flag = False + try: + check_login_uri = "/x/web-interface/nav" + response = await self.get(check_login_uri) + if response.get("isLogin"): + utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!") + ping_flag = True + except Exception as e: + utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...") + ping_flag = False + return ping_flag + + async def update_cookies(self, browser_context: BrowserContext): + cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + self.headers["Cookie"] = cookie_str + self.cookie_dict = cookie_dict + + async def search_video_by_keyword( + self, + keyword: str, + page: int = 1, + page_size: int = 20, + order: SearchOrderType = SearchOrderType.DEFAULT, + pubtime_begin_s: int = 0, + pubtime_end_s: int = 0, + ) -> Dict: + """ + KuaiShou web search api + :param keyword: 搜索关键词 + :param page: 分页参数具体第几页 + :param page_size: 每一页参数的数量 + :param order: 搜索结果排序,默认位综合排序 + :param pubtime_begin_s: 发布时间开始时间戳 + :param pubtime_end_s: 发布时间结束时间戳 + :return: + """ + uri = "/x/web-interface/wbi/search/type" + post_data = { + "search_type": "video", + "keyword": keyword, + "page": page, + "page_size": page_size, + "order": order.value, + "pubtime_begin_s": pubtime_begin_s, + "pubtime_end_s": pubtime_end_s + } + return await self.get(uri, post_data) + + async def get_video_info(self, aid: Union[int, None] = 
None, bvid: Union[str, None] = None) -> Dict: + """ + Bilibli web video detail api, aid 和 bvid任选一个参数 + :param aid: 稿件avid + :param bvid: 稿件bvid + :return: + """ + if not aid and not bvid: + raise ValueError("请提供 aid 或 bvid 中的至少一个参数") + + uri = "/x/web-interface/view/detail" + params = dict() + if aid: + params.update({"aid": aid}) + else: + params.update({"bvid": bvid}) + return await self.get(uri, params, enable_params_sign=False) + + async def get_video_play_url(self, aid: int, cid: int) -> Dict: + """ + Bilibli web video play url api + :param aid: 稿件avid + :param cid: cid + :return: + """ + if not aid or not cid or aid <= 0 or cid <= 0: + raise ValueError("aid 和 cid 必须存在") + uri = "/x/player/wbi/playurl" + params = { + "avid": aid, + "cid": cid, + "qn": 80, + "fourk": 1, + "fnval": 1, + "platform": "pc", + } + + return await self.get(uri, params, enable_params_sign=True) + + async def get_video_media(self, url: str) -> Union[bytes, None]: + async with httpx.AsyncClient(proxy=self.proxy) as client: + try: + response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) + response.raise_for_status() + if not response.reason_phrase == "OK": + utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx + utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试 + return None + + async def get_video_comments( + self, + video_id: str, + order_mode: CommentOrderType = CommentOrderType.DEFAULT, + next: int = 0, + ) -> Dict: + """get video comments + :param video_id: 视频 ID + :param order_mode: 排序方式 + :param next: 评论页选择 + :return: + """ + uri = "/x/v2/reply/wbi/main" + post_data = {"oid": video_id, "mode": order_mode.value, 
async def get_video_all_comments(
    self,
    video_id: str,
    crawl_interval: float = 1.0,
    is_fetch_sub_comments=False,
    callback: Optional[Callable] = None,
    max_count: int = 10,
) -> List[Dict]:
    """Page through a video's top-level comments, optionally with replies.

    Each page is fetched with up to three attempts (exponential back-off on
    ``DataFetchError``). Paging stops when the API cursor reports the end,
    when the cursor is missing or malformed, or once ``max_count`` top-level
    comments have been collected.

    :param video_id: id of the video whose comments are crawled
    :param crawl_interval: seconds to sleep after each page
    :param is_fetch_sub_comments: also crawl the replies of every kept comment
        that reports ``rcount > 0``
    :param callback: optional coroutine called as ``callback(video_id, page)``
        for each page of kept comments
    :param max_count: maximum number of top-level comments to collect
    :return: the collected top-level comments (at most ``max_count``)
    """
    result: List[Dict] = []
    is_end = False
    next_page = 0
    max_retries = 3
    while not is_end and len(result) < max_count:
        comments_res = None
        for attempt in range(max_retries):
            try:
                comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
                break  # Success
            except DataFetchError as e:
                if attempt < max_retries - 1:
                    delay = 5 * (2**attempt) + random.uniform(0, 1)
                    utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})")
                    await asyncio.sleep(delay)
                else:
                    utils.logger.error(f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}")
                    is_end = True
                    break
        if not comments_res:
            break

        cursor_info: Dict = comments_res.get("cursor")
        if not cursor_info:
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
            break

        # "replies" may be present but null; normalise to a list so that
        # len() and iteration below cannot fail.
        comment_list: List[Dict] = comments_res.get("replies") or []

        # Both cursor fields are required to continue paging.
        if "is_end" not in cursor_info or "next" not in cursor_info:
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
            is_end = True
        else:
            is_end = cursor_info.get("is_end")
            next_page = cursor_info.get("next")

        if not isinstance(is_end, bool):
            utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
            is_end = True

        # Trim to the remaining budget first, so replies are only crawled for
        # comments that are actually kept.
        remaining = max_count - len(result)
        if len(comment_list) > remaining:
            comment_list = comment_list[:remaining]

        if is_fetch_sub_comments:
            for comment in comment_list:
                if comment.get("rcount", 0) > 0:
                    await self.get_video_all_level_two_comments(
                        video_id,
                        comment["rpid"],
                        CommentOrderType.DEFAULT,
                        10,
                        crawl_interval,
                        callback,
                    )

        if callback:  # hand the kept page to the caller-supplied sink
            await callback(video_id, comment_list)
        await asyncio.sleep(crawl_interval)

        # Always count the page: previously the list was only extended when
        # sub-comments were disabled, so max_count never took effect (and an
        # empty list was returned) when they were enabled.
        result.extend(comment_list)
    return result
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
    """Fetch one page of a creator's uploaded videos.

    :param creator_id: creator id, sent as ``mid``
    :param pn: page number
    :param ps: page size
    :param order_mode: sort order; a ``SearchOrderType`` member (its
        ``.value`` is sent) or an already-raw order string
    :return: the response of ``self.get`` for ``/x/space/wbi/arc/search``
    """
    uri = "/x/space/wbi/arc/search"
    # Send the enum's wire value, as search_video_by_keyword does. Passing the
    # member itself would be stringified to "SearchOrderType.<NAME>".
    order = order_mode.value if isinstance(order_mode, SearchOrderType) else order_mode
    params = {
        "mid": creator_id,
        "pn": pn,
        "ps": ps,
        "order": order,
    }
    return await self.get(uri, params)
async def get_creator_all_dynamics(
    self,
    creator_info: Dict,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 20,
) -> List:
    """Collect a creator's dynamics (feed posts) page by page.

    :param creator_info: creator record; its ``"id"`` is used for the requests
        and the whole dict is forwarded to ``callback``
    :param crawl_interval: seconds to sleep after each page
    :param callback: optional coroutine called as
        ``callback(creator_info, page_items)`` for every page
    :param max_count: maximum number of dynamics to collect
    :return: the collected dynamics, at most ``max_count`` items
    """
    uid = creator_info["id"]
    collected = []
    cursor = ""
    more_pages = True
    while more_pages and len(collected) < max_count:
        page = await self.get_creator_dynamics(uid, cursor)
        items: List[Dict] = page["items"]
        more_pages = page["has_more"]
        cursor = page["offset"]  # continuation token for the next request

        room = max_count - len(collected)
        if len(items) > room:
            items = items[:room]  # keep only what still fits the budget

        if callback:
            await callback(creator_info, items)
        await asyncio.sleep(crawl_interval)
        collected.extend(items)
    return collected
def __init__(self):
    """Set the crawler's static defaults; the browser is created in ``start``."""
    # No CDP browser manager until CDP mode launches one.
    self.cdp_manager = None
    self.user_agent = utils.get_user_agent()
    # Landing page opened before any API call is made.
    self.index_url = "https://www.bilibili.com"
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器") + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url) + + # Create a client to interact with the xiaohongshu website. + self.bili_client = await self.create_bilibili_client(httpx_proxy_format) + if not await self.bili_client.pong(): + login_obj = BilibiliLogin( + login_type=config.LOGIN_TYPE, + login_phone="", # your phone number + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + await self.bili_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST) + elif config.CRAWLER_TYPE == "creator": + if config.CREATOR_MODE: + for creator_id in config.BILI_CREATOR_ID_LIST: + await self.get_creator_videos(int(creator_id)) + else: + await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST) + else: + pass + utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...") + + async def search(self): + """ + search bilibili video + """ + # Search for video and retrieve their comment information. 
@staticmethod
async def get_pubtime_datetime(
    start: str = config.START_DAY,
    end: str = config.END_DAY,
) -> Tuple[str, str]:
    """Convert an inclusive day range into publish-time epoch bounds.

    The lower bound is 00:00:00 of ``start`` and the upper bound is 23:59:59
    of ``end``, so a single-day search (``start == end``) covers that whole
    day. The dates are parsed as naive datetimes, so the resulting epoch
    values depend on the local timezone of the running process.

    :param start: first publish day, ``YYYY-MM-DD``
    :param end: last publish day, ``YYYY-MM-DD``
    :return: ``(pubtime_begin_s, pubtime_end_s)`` as decimal strings
    :raises ValueError: if ``start`` is later than ``end``
    """
    day_format = "%Y-%m-%d"
    window_open: datetime = datetime.strptime(start, day_format)
    last_day: datetime = datetime.strptime(end, day_format)
    if window_open > last_day:
        raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")

    # Same-day and multi-day ranges close the same way: the last second of
    # the final day (next midnight minus one second).
    window_close = last_day + timedelta(days=1) - timedelta(seconds=1)
    return str(int(window_open.timestamp())), str(int(window_close.timestamp()))
async def search_by_keywords_in_time_range(self, daily_limit: bool):
    """Search videos keyword by keyword, one calendar day at a time.

    For every keyword in ``config.KEYWORDS`` and every day between
    ``config.START_DAY`` and ``config.END_DAY``, result pages are fetched
    until the day has no more results, ``config.MAX_NOTES_PER_DAY`` videos
    were stored for that day, or ``config.CRAWLER_MAX_NOTES_COUNT`` videos
    were stored for the keyword. Comments are crawled after each page.

    :param daily_limit: only affects logging here: when true, hitting the
        per-keyword quota in the middle of a day is logged
    """
    utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}")
    page_size = 20
    start_page = config.START_PAGE  # read as in the original; not used in this mode

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
        stored_for_keyword = 0

        def keyword_quota_hit() -> bool:
            # The quota applies whether or not daily_limit is set.
            return stored_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT

        for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq="D"):
            if keyword_quota_hit():
                utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
                break

            day_text = day.strftime("%Y-%m-%d")
            pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day_text, end=day_text)
            page = 1
            stored_today = 0

            while True:
                if stored_today >= config.MAX_NOTES_PER_DAY:
                    utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
                    break
                if keyword_quota_hit():
                    if daily_limit:
                        utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
                    break

                try:
                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
                    video_id_list: List[str] = []
                    videos_res = await self.bili_client.search_video_by_keyword(
                        keyword=keyword,
                        page=page,
                        page_size=page_size,
                        order=SearchOrderType.DEFAULT,
                        pubtime_begin_s=pubtime_begin_s,
                        pubtime_end_s=pubtime_end_s,
                    )
                    video_list: List[Dict] = videos_res.get("result")
                    if not video_list:
                        utils.logger.info(f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
                        break

                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                    detail_tasks = [
                        self.get_video_info_task(aid=entry.get("aid"), bvid="", semaphore=semaphore)
                        for entry in video_list
                    ]
                    details = await asyncio.gather(*detail_tasks)

                    for detail in details:
                        if not detail:
                            continue  # detail lookup failed for this entry
                        if keyword_quota_hit() or stored_today >= config.MAX_NOTES_PER_DAY:
                            break
                        stored_today += 1
                        stored_for_keyword += 1
                        video_id_list.append(detail.get("View").get("aid"))
                        await bilibili_store.update_bilibili_video(detail)
                        await bilibili_store.update_up_info(detail)
                        await self.get_bilibili_video(detail, semaphore)

                    page += 1
                    await self.batch_get_video_comments(video_id_list)

                except Exception as e:
                    # Any failure ends the current day; the next day is still tried.
                    utils.logger.error(f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}")
                    break
async def get_creator_videos(self, creator_id: int):
    """Crawl every uploaded video of one creator, 30 per page.

    Each page's ``bvid`` values are handed to ``get_specified_videos``.
    Paging ends once the reported total count is covered.

    :param creator_id: id of the creator whose uploads are listed
    :return: None
    """
    page_size = 30
    page_no = 1
    while True:
        listing = await self.bili_client.get_creator_videos(creator_id, page_no, page_size)
        bvids = []
        for entry in listing["list"]["vlist"]:
            bvids.append(entry["bvid"])
        await self.get_specified_videos(bvids)

        total_videos = int(listing["page"]["count"])
        if total_videos <= page_no * page_size:
            return
        # Random pause of up to one second between listing pages.
        await asyncio.sleep(random.random())
        page_no += 1
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
    """Fetch one video's detail under the concurrency semaphore.

    :param aid: numeric video id (may be falsy when ``bvid`` is given)
    :param bvid: string video id (may be empty when ``aid`` is given)
    :param semaphore: limits how many detail requests run at once
    :return: the detail returned by the client, or ``None`` when the request
        raised ``DataFetchError`` or ``KeyError`` (both are logged)
    """
    async with semaphore:
        try:
            detail = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
        except DataFetchError as ex:
            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
        except KeyError as ex:
            utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
        else:
            return detail
    return None
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """Launch Chromium and return a 1920x1080 browser context.

    When ``config.SAVE_LOGIN_STATE`` is set, a persistent context backed by a
    per-platform directory under ``./browser_data`` is used so the login
    survives between runs; otherwise a fresh browser and context are created.

    :param chromium: Playwright browser type used for launching
    :param playwright_proxy: proxy settings for Playwright, or ``None``
    :param user_agent: user agent string applied to the context
    :param headless: whether to run without a visible window
    :return: the created browser context
    """
    utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
    window = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport=window, user_agent=user_agent)

    # feat issue #14: keep the profile on disk to avoid logging in every time
    profile_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=window,
        user_agent=user_agent,
    )
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
    """Download the largest available stream of a video and store it.

    Does nothing unless ``config.ENABLE_GET_MEIDAS`` is set. The play-url
    lookup uses the ``aid``/``cid`` found in ``video_item["View"]``. Among the
    returned ``durl`` entries the one with the greatest ``size`` is
    downloaded and stored as ``video.mp4``.

    :param video_item: video detail containing a ``"View"`` mapping
    :param semaphore: forwarded to ``get_video_play_url_task``
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
        return
    video_item_view: Dict = video_item.get("View")
    aid = video_item_view.get("aid")
    cid = video_item_view.get("cid")
    result = await self.get_video_play_url_task(aid, cid, semaphore)
    if result is None:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
        return

    # A response without "durl" (or with a null one) and entries without a
    # "size" are treated as "nothing to download" instead of raising.
    durl_list = result.get("durl") or []
    largest = max(durl_list, key=lambda durl: durl.get("size") or 0, default=None)
    video_url = largest.get("url") if largest else ""
    if not video_url:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
        return

    content = await self.bili_client.get_video_media(video_url)
    await asyncio.sleep(random.random())
    if content is None:
        return
    extension_file_name = "video.mp4"
    await bilibili_store.store_video(aid, content, extension_file_name)
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
    """Crawl one creator's profile, then their fans, followings and dynamics.

    :param creator_id: id of the creator
    :param semaphore: shared concurrency limiter; it is held here only for the
        profile request
    :return: None
    """
    async with semaphore:
        raw_profile: Dict = await self.bili_client.get_creator_info(creator_id)

    profile: Dict = dict(
        id=creator_id,
        name=raw_profile.get("name"),
        sign=raw_profile.get("sign"),
        avatar=raw_profile.get("face"),
    )

    # Each stage acquires the same semaphore itself, so it has to be free
    # here; the stages run one after another.
    for crawl_stage in (self.get_fans, self.get_followings, self.get_dynamics):
        await crawl_stage(profile, semaphore)
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """Crawl and store the dynamics of one creator.

    Failures are logged and swallowed, so one creator cannot abort the run.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: limits how many creator crawls run at once
    :return: None
    """
    uid = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {uid} dynamics ...")
            crawl_options = dict(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause below one second
                callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
            )
            await self.bili_client.get_creator_all_dynamics(**crawl_options)
        except DataFetchError as ex:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] get creator_id: {uid} dynamics error: {ex}")
        except Exception as e:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
class SearchOrderType(Enum):
    """Search result orderings; each value is the raw order string."""

    DEFAULT = ""  # comprehensive (default) ranking
    MOST_CLICK = "click"  # most clicked
    LAST_PUBLISH = "pubdate"  # most recently published
    MOST_DANMU = "dm"  # most bullet comments (danmu)
    MOST_MARK = "stow"  # most favourited
class BilibiliSign:
    """Computes the WBI request signature (``w_rid``) from an img_key / sub_key pair."""

    def __init__(self, img_key: str, sub_key: str):
        """
        :param img_key: first half of the mixin key
        :param sub_key: second half of the mixin key
        """
        self.img_key = img_key
        self.sub_key = sub_key
        # Fixed permutation applied to img_key + sub_key to derive the salt.
        self.map_table = [
            46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
            33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
            61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
            36, 20, 34, 44, 52
        ]

    def get_salt(self) -> str:
        """
        Return the salt: img_key + sub_key permuted through ``map_table``,
        truncated to 32 characters.

        :return: 32-character salt
        :raises IndexError: if img_key + sub_key is shorter than the table needs
        """
        mixin_key = self.img_key + self.sub_key
        return "".join(mixin_key[index] for index in self.map_table)[:32]

    def sign(self, req_data: Dict) -> Dict:
        """
        Sign request parameters.

        The current timestamp is added as ``wts``, keys are sorted, the
        characters ``!'()*`` are stripped from every value, and the md5 of the
        url-encoded query plus the salt is stored as ``w_rid``.

        The caller's dict is left untouched; a new dict is returned.

        :param req_data: request parameters to sign
        :return: new dict with the cleaned parameters plus ``wts`` and ``w_rid``
        """
        current_ts = utils.get_unix_timestamp()
        # Copy instead of update(): signing must not leak ``wts`` into the
        # dict owned by the caller.
        unsigned = {**req_data, "wts": current_ts}
        signed = {
            key: "".join(ch for ch in str(value) if ch not in "!'()*")
            for key, value in sorted(unsigned.items())
        }
        query = urllib.parse.urlencode(signed)
        signed["w_rid"] = md5((query + self.get_salt()).encode()).hexdigest()
        return signed
"__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654" + _req_data = dict() + for params in _search_url.split("&"): + kvalues = params.split("=") + key = kvalues[0] + value = kvalues[1] + _req_data[key] = value + print("pre req_data", _req_data) + _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001}) + print(_req_data) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/login.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/login.py new file mode 100644 index 0000000..ffefb63 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/login.py @@ -0,0 +1,118 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class BilibiliLogin(AbstractLogin):
    """Logs a Playwright browser context in to Bilibili via QR code or cookies."""

    # Polling policy of check_login_state: one attempt per interval, up to the
    # maximum number of attempts. Shared by the retry decorator and the log
    # message so the two cannot drift apart.
    _LOGIN_CHECK_MAX_ATTEMPTS = 600
    _LOGIN_CHECK_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        :param login_type: "qrcode", "phone" or "cookie"; written to config.LOGIN_TYPE
        :param browser_context: browser context whose cookies are read/written
        :param context_page: page used to drive the login UI
        :param login_phone: phone number (stored only)
        :param cookie_str: raw cookie string used by the cookie login
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """
        Start login bilibili using the method selected by config.LOGIN_TYPE.

        :raises ValueError: if the login type is not qrcode, phone or cookie
        """
        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
                "[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(
        stop=stop_after_attempt(_LOGIN_CHECK_MAX_ATTEMPTS),
        wait=wait_fixed(_LOGIN_CHECK_INTERVAL_SECONDS),
        retry=retry_if_result(lambda value: value is False),
    )
    async def check_login_state(self) -> bool:
        """
        Return True once the browser context holds a SESSDATA or DedeUserID cookie.

        The retry decorator re-runs the check while it returns False: up to
        600 attempts, 1 second apart. When the attempts are exhausted a
        RetryError is raised.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return bool(cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"))

    async def login_by_qrcode(self):
        """login bilibili website and keep webdriver login state"""
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")

        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//div[@class='right-entry__outside go-login-btn']//div"
        )
        await login_button_ele.click()
        await asyncio.sleep(1)
        # find login qrcode
        qrcode_img_selector = "//div[@class='login-scan-box']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode in a worker thread so the event loop keeps polling
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # Report the real polling budget instead of a hard-coded "20s".
        wait_seconds = self._LOGIN_CHECK_MAX_ATTEMPTS * self._LOGIN_CHECK_INTERVAL_SECONDS
        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is {wait_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(
            f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented; calling it does nothing."""
        pass

    async def login_by_cookies(self):
        """Inject every cookie from ``cookie_str`` into the browser context for .bilibili.com."""
        utils.logger.info("[BilibiliLogin.login_by_cookies] Begin login bilibili by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".bilibili.com",
                'path': "/"
            }])
def __init__(
    self,
    timeout=60,  # a longer timeout is needed when media crawling is enabled (Douyin short videos)
    proxy=None,
    *,
    headers: Dict,
    playwright_page: Optional[Page],
    cookie_dict: Dict,
):
    """
    Store the connection settings used by every request of this client.

    :param timeout: per-request timeout passed to httpx
    :param proxy: proxy passed to httpx.AsyncClient
    :param headers: default request headers
    :param playwright_page: page used to read localStorage and sign requests
    :param cookie_dict: cookies of the current session as a dict
    """
    self._host = "https://www.douyin.com"
    self.headers = headers
    self.cookie_dict = cookie_dict
    self.playwright_page = playwright_page
    self.timeout = timeout
    self.proxy = proxy
async def request(self, method, url, **kwargs):
    """
    Send one HTTP request and return the decoded JSON body.

    :param method: HTTP method name
    :param url: absolute request url
    :param kwargs: forwarded unchanged to ``httpx.AsyncClient.request``
    :return: parsed JSON payload
    :raises DataFetchError: if the body is empty, is the literal "blocked",
        or cannot be decoded as JSON
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)
        body = response.text
        if body == "" or body == "blocked":
            utils.logger.error(f"request params incrr, response.text: {body}")
            # Raise the domain error directly instead of throwing a bare
            # Exception only to catch and re-wrap it; the message is unchanged.
            raise DataFetchError(f"account blocked, {body}")
        try:
            return response.json()
        except Exception as e:
            # Keep the decoding failure attached as the explicit cause.
            raise DataFetchError(f"{e}, {body}") from e
async def pong(self, browser_context: BrowserContext) -> bool:
    """
    Report whether the current session looks logged in.

    localStorage is checked first (``HasUserLogin == "1"``); only when that
    fails are the cookies of ``browser_context`` inspected for
    ``LOGIN_STATUS == "1"``.

    :param browser_context: browser context whose cookies are read as fallback
    :return: True when either indicator says the user is logged in
    """
    storage = await self.playwright_page.evaluate("() => window.localStorage")
    logged_in_via_storage = storage.get("HasUserLogin", "") == "1"
    if not logged_in_via_storage:
        _, cookies = utils.convert_cookies(await browser_context.cookies())
        return cookies.get("LOGIN_STATUS") == "1"
    return True
def _comment_request_headers(self) -> Dict:
    """Return a copy of the default headers whose Referer is the search page of the current keyword."""
    keywords = request_keyword_var.get()
    referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
    headers = copy.copy(self.headers)
    headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
    return headers

async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
    """
    Fetch one page of first-level comments of a post.

    :param aweme_id: id of the post
    :param cursor: paging cursor, 0 for the first page
    :return: decoded API response
    """
    uri = "/aweme/v1/web/comment/list/"
    params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
    # The Referer-carrying headers used to be built and then dropped because
    # they were never handed to self.get(); pass them so they are actually sent.
    return await self.get(uri, params, headers=self._comment_request_headers())

async def get_sub_comments(self, aweme_id: str, comment_id: str, cursor: int = 0):
    """
    Fetch one page of replies (sub comments) of a comment.

    :param aweme_id: id of the post the comment belongs to
    :param comment_id: id of the parent comment
    :param cursor: paging cursor, 0 for the first page
    :return: decoded API response
    """
    uri = "/aweme/v1/web/comment/list/reply/"
    params = {
        'comment_id': comment_id,
        "cursor": cursor,
        "count": 20,
        "item_type": 0,
        "item_id": aweme_id,
    }
    # Same fix as get_aweme_comments: send the headers that carry the Referer.
    return await self.get(uri, params, headers=self._comment_request_headers())
async def get_user_info(self, sec_user_id: str):
    """
    Request the profile of the user identified by ``sec_user_id``.

    :param sec_user_id: id of the user to look up
    :return: whatever ``self.get`` returns for the profile endpoint
    """
    profile_params = dict(
        sec_user_id=sec_user_id,
        publish_video_strategy_type=2,
        personal_center_strategy=1,
    )
    return await self.get("/aweme/v1/web/user/profile/other/", profile_params)
async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
    """
    Page through all posts of a user and return them as one list.

    Paging continues while the response reports ``has_more == 1``; the
    ``max_cursor`` of each response is sent with the next request.

    :param sec_user_id: id of the user whose posts are listed
    :param callback: optional coroutine function awaited with each page's post list
    :return: all posts collected over every page
    """
    collected = []
    cursor = ""
    has_more = 1
    while has_more == 1:
        page = await self.get_user_aweme_posts(sec_user_id, cursor)
        has_more = page.get("has_more", 0)
        cursor = page.get("max_cursor")
        # A missing or empty "aweme_list" is treated as an empty page.
        videos = page.get("aweme_list") or []
        utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(videos)}")
        if callback:
            await callback(videos)
        collected.extend(videos)
    return collected
使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +import asyncio +import os +import random +from asyncio import Task +from typing import Any, Dict, List, Optional, Tuple + +from playwright.async_api import ( + BrowserContext, + BrowserType, + Page, + Playwright, + async_playwright, +) + +import config +from base.base_crawler import AbstractCrawler +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import douyin as douyin_store +from tools import utils +from tools.cdp_browser import CDPBrowserManager +from var import crawler_type_var, source_keyword_var + +from .client import DouYinClient +from .exception import DataFetchError +from .field import PublishTimeType +from .login import DouYinLogin + + +class DouYinCrawler(AbstractCrawler): + context_page: Page + dy_client: DouYinClient + browser_context: BrowserContext + cdp_manager: Optional[CDPBrowserManager] + + def __init__(self) -> None: + self.index_url = "https://www.douyin.com" + self.cdp_manager = None + + async def start(self) -> None: + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info) + + async with async_playwright() as playwright: + # 根据配置选择启动模式 + if config.ENABLE_CDP_MODE: + utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器") + self.browser_context = await self.launch_browser_with_cdp( + playwright, + playwright_proxy_format, + None, + headless=config.CDP_HEADLESS, + ) + else: + utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器") + # Launch a browser context. 
+ chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, + playwright_proxy_format, + user_agent=None, + headless=config.HEADLESS, + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url) + + self.dy_client = await self.create_douyin_client(httpx_proxy_format) + if not await self.dy_client.pong(browser_context=self.browser_context): + login_obj = DouYinLogin( + login_type=config.LOGIN_TYPE, + login_phone="", # you phone number + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + await self.dy_client.update_cookies(browser_context=self.browser_context) + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. 
async def search(self) -> None:
    """
    Search every configured keyword, store the posts found and crawl their comments.

    For each comma-separated keyword in config.KEYWORDS, result pages are
    requested from config.START_PAGE on until config.CRAWLER_MAX_NOTES_COUNT
    is covered, the API returns no data, or a request fails. Each post is
    stored, its media is downloaded, and finally the comments of all posts
    found for the keyword are crawled.
    """
    utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
    dy_limit_count = 10  # douyin limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
    start_page = config.START_PAGE  # start page number
    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
        aweme_list: List[str] = []
        page = 0
        dy_search_id = ""
        while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                page += 1
                continue
            try:
                utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                posts_res = await self.dy_client.search_info_by_keyword(
                    keyword=keyword,
                    offset=page * dy_limit_count - dy_limit_count,
                    publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                    search_id=dy_search_id,
                )
            except DataFetchError:
                utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                break

            # A response without a "data" key is reported as a probable
            # risk-control hit. This check has to run before the emptiness
            # check, otherwise a missing key is swallowed by the "is None"
            # branch and the diagnostic is never emitted.
            if "data" not in posts_res:
                utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
                break
            if posts_res.get("data") is None or posts_res.get("data") == []:
                utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                break

            page += 1
            dy_search_id = posts_res.get("extra", {}).get("logid", "")
            for post_item in posts_res.get("data"):
                try:
                    aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
                except (TypeError, IndexError, AttributeError):
                    # No usable post in this item: mix_items missing (TypeError),
                    # empty (IndexError), or aweme_mix_info present but null
                    # (AttributeError). Skip it instead of aborting the crawl.
                    continue
                aweme_list.append(aweme_info.get("aweme_id", ""))
                await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                await self.get_aweme_media(aweme_item=aweme_info)
        utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
        await self.batch_get_note_comments(aweme_list)
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
    """
    Crawl the comments of one post while holding a slot of ``semaphore``.

    A DataFetchError is logged and swallowed so other posts keep crawling.

    :param aweme_id: id of the post whose comments are fetched
    :param semaphore: concurrency limiter acquired for the whole crawl
    """
    async with semaphore:
        try:
            fetch_all_comments = self.dy_client.get_aweme_all_comments
            await fetch_all_comments(
                aweme_id=aweme_id,
                crawl_interval=random.random(),
                is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                callback=douyin_store.batch_update_dy_aweme_comments,
                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {fetch_err}")
        else:
            utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
    """
    Build a DouYinClient that shares the cookies and user agent of the open browser page.

    :param httpx_proxy: proxy handed to the client, or None
    :return: the configured client
    """
    cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())  # type: ignore
    user_agent = await self.context_page.evaluate("() => navigator.userAgent")
    default_headers = {
        "User-Agent": user_agent,
        "Cookie": cookie_str,
        "Host": "www.douyin.com",
        "Origin": "https://www.douyin.com/",
        "Referer": "https://www.douyin.com/",
        "Content-Type": "application/json;charset=UTF-8",
    }
    return DouYinClient(
        proxy=httpx_proxy,
        headers=default_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
async def get_aweme_media(self, aweme_item: Dict):
    """
    Download the media of a Douyin post, choosing between images and video.

    Does nothing except log when config.ENABLE_GET_MEIDAS is off. A post with
    a non-empty image list is handled by get_aweme_images, anything else by
    get_aweme_video.

    Args:
        aweme_item (Dict): post detail
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info("[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
        return
    # Only the image list decides the branch. The video url is not extracted
    # here: its result was never used, and get_aweme_video extracts it itself.
    if douyin_store._extract_note_image_list(aweme_item):
        await self.get_aweme_images(aweme_item)
    else:
        await self.get_aweme_video(aweme_item)
please use get_aweme_media + + Args: + aweme_item (Dict): 抖音作品详情 + """ + if not config.ENABLE_GET_MEIDAS: + return + aweme_id = aweme_item.get("aweme_id") + # 笔记 urls 列表,若为短视频类型则返回为空列表 + note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item) + + if not note_download_url: + return + picNum = 0 + for url in note_download_url: + if not url: + continue + content = await self.dy_client.get_aweme_media(url) + await asyncio.sleep(random.random()) + if content is None: + continue + extension_file_name = f"{picNum:>03d}.jpeg" + picNum += 1 + await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name) + + async def get_aweme_video(self, aweme_item: Dict): + """ + get aweme videos. please use get_aweme_media + + Args: + aweme_item (Dict): 抖音作品详情 + """ + if not config.ENABLE_GET_MEIDAS: + return + aweme_id = aweme_item.get("aweme_id") + + # 视频 url,永远存在,但为短视频类型时的文件其实是音频文件 + video_download_url: str = douyin_store._extract_video_download_url(aweme_item) + + if not video_download_url: + return + content = await self.dy_client.get_aweme_media(video_download_url) + await asyncio.sleep(random.random()) + if content is None: + return + extension_file_name = f"video.mp4" + await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/exception.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/exception.py new file mode 100644 index 0000000..361e521 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/exception.py @@ -0,0 +1,20 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
# 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


from httpx import RequestError


class DataFetchError(RequestError):
    """Error signalling that fetching data went wrong.

    Subclasses httpx's ``RequestError`` and adds no behaviour of its own.
    """


class IPBlockError(RequestError):
    """Error signalling that requests were so fast the server blocked our IP.

    Subclasses httpx's ``RequestError`` and adds no behaviour of its own.
    """


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/field.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/field.py
# new file mode 100644
# index 0000000..e3175ab
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/field.py
# @@ -0,0 +1,34 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


from enum import Enum


class SearchChannelType(Enum):
    """Search channel type (which search tab a query targets)."""
    GENERAL = "aweme_general"  # general / combined results
    VIDEO = "aweme_video_web"  # videos
    USER = "aweme_user_web"  # users
    LIVE = "aweme_live"  # live streams


class SearchSortType(Enum):
    """Search sort type (ordering of search results)."""
    GENERAL = 0  # general (default) ordering
    MOST_LIKE = 1  # most liked first
    LATEST = 2  # most recently published first

class PublishTimeType(Enum):
    """Publish time type (publish-time window filter)."""
    UNLIMITED = 0  # no limit
    ONE_DAY = 1  # within one day
    ONE_WEEK = 7  # within one week
    SIX_MONTH = 180  # within half a year


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py
# new file mode 100644
# index 0000000..1ed3111
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py
# @@ -0,0 +1,85 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 
# 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
# @Author  : relakkes@gmail.com
# @Name    : 程序员阿江-Relakkes
# @Time    : 2024/6/10 02:24
# @Desc    : Obtain the a_bogus parameter. For study and exchange only; do not use
#            commercially. Contact the author for removal in case of infringement.

import random

import execjs
from playwright.async_api import Page

# Compile the signing script once at import time. The file is read inside a
# ``with`` block so the handle is closed deterministically instead of being
# left open until garbage collection.
with open('libs/douyin.js', encoding='utf-8-sig') as _sign_js_file:
    douyin_sign_obj = execjs.compile(_sign_js_file.read())


def get_web_id():
    """
    Generate a random web id.

    A UUID-like template of digits is built, every ``0``, ``1`` and ``8`` in it
    is replaced by a randomly perturbed number, the dashes are removed and the
    result is cut to 19 characters.

    Returns:
        A 19-character string of decimal digits.
    """

    def e(t):
        if t is not None:
            # XOR the digit with a random value in [0, 15], shifted right by
            # t // 4 (so ``8`` is perturbed by 0..3, ``0``/``1`` by 0..15).
            return str(t ^ (int(16 * random.random()) >> (t // 4)))
        else:
            # Template: "10000000-1000-4000-8000-100000000000"
            return ''.join(
                [str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
            )

    web_id = ''.join(
        e(int(x)) if x in '018' else x for x in e(None)
    )
    return web_id.replace('-', '')[:19]


async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
    """
    Get the a_bogus parameter; signing POST requests is currently not supported.

    Delegates to :func:`get_a_bogus_from_js`; ``post_data`` and ``page`` are
    accepted for interface compatibility but are not used.

    Args:
        url: Request URL, used to pick the signing routine.
        params: Query string to sign.
        post_data: Unused.
        user_agent: User-Agent string passed to the signing routine.
        page: Unused.

    Returns:
        Whatever the JavaScript signing function returns.
    """
    return get_a_bogus_from_js(url, params, user_agent)


def get_a_bogus_from_js(url: str, params: str, user_agent: str):
    """
    Get the a_bogus parameter by calling the compiled JavaScript.

    Args:
        url: Request URL; URLs containing ``/reply`` use the reply signer.
        params: Query string to sign.
        user_agent: User-Agent string passed to the signing function.

    Returns:
        The value returned by the selected JavaScript signing function.
    """
    # NOTE: "sign_datail" (sic) is the function name called in libs/douyin.js;
    # it is a runtime identifier and must not be "corrected" here.
    sign_js_name = "sign_datail"
    if "/reply" in url:
        sign_js_name = "sign_reply"
    return douyin_sign_obj.call(sign_js_name, params, user_agent)


async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
    """
    Get the a_bogus parameter through Playwright.

    The original author notes that the Playwright variant no longer works.

    Args:
        params: Query string to sign.
        post_data: Request body; a falsy value is replaced by an empty string.
        user_agent: User-Agent string passed to the in-page signer.
        page: Playwright page on which the in-page signer is evaluated.

    Returns:
        The value returned by the in-page signing call.
    """
    if not post_data:
        post_data = ""
    a_bogus = await page.evaluate(
        "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
        [params, post_data, user_agent])

    return a_bogus


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/login.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/login.py
# new file mode 100644
# index 0000000..f376267
# --- /dev/null
# +++ 
# b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/login.py
# @@ -0,0 +1,265 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


import asyncio
import functools
import sys
from typing import Optional

from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                      wait_fixed)

import config
from base.base_crawler import AbstractLogin
from cache.cache_factory import CacheFactory
from tools import utils


class DouYinLogin(AbstractLogin):
    """Drive the Douyin web login flow (QR code, phone + SMS code, or cookies) through Playwright."""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,  # type: ignore
                 context_page: Page,  # type: ignore
                 login_phone: Optional[str] = "",
                 cookie_str: Optional[str] = ""
                 ):
        """
        Store the login parameters.

        :param login_type: login method; ``begin`` accepts "qrcode", "phone" or "cookie"
        :param browser_context: Playwright browser context whose cookies/pages are inspected
        :param context_page: Playwright page on which the login UI is driven
        :param login_phone: phone number used by the phone login
        :param cookie_str: cookie string used by the cookie login
        """
        # NOTE: this overwrites the process-wide config value, not just instance state.
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.scan_qrcode_time = 60
        self.cookie_str = cookie_str

    async def begin(self):
        """
        Start login douyin website

        Original author's note: the slider verification on the intermediate
        captcha page is not very accurate; unless there is a special need, it is
        recommended not to enable Douyin login, or to log in with cookies.

        Exits the process (``sys.exit``) when the login state check exhausts its retries.
        :raises ValueError: when ``config.LOGIN_TYPE`` is not qrcode / phone / cookie
        """

        # popup login dialog
        await self.popup_login_dialog()

        # select login type
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

        # If the page was redirected to the slider-captcha page, the slider has to be solved again
        await asyncio.sleep(6)
        current_page_title = await self.context_page.title()
        if "验证码中间页" in current_page_title:
            await self.check_page_display_slider(move_step=3, slider_level="hard")

        # check login state
        utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
            sys.exit()

        # wait for redirect
        wait_redirect_seconds = 5
        utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    # Retried by tenacity while the result is False: up to 600 attempts, 1 second apart.
    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self):
        """Check if the current login status is successful and return True otherwise return False"""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)

        # First signal: any open page whose localStorage has HasUserLogin == "1".
        for page in self.browser_context.pages:
            try:
                local_storage = await page.evaluate("() => window.localStorage")
                if local_storage.get("HasUserLogin", "") == "1":
                    return True
            except Exception as e:
                # Best effort: a page that cannot be evaluated is skipped after a short pause.
                # utils.logger.warn(f"[DouYinLogin] check_login_state waring: {e}")
                await asyncio.sleep(0.1)

        # Second signal: the LOGIN_STATUS cookie.
        if cookie_dict.get("LOGIN_STATUS") == "1":
            return True

        return False

    async def popup_login_dialog(self):
        """If the login dialog box does not pop up automatically, we will manually click the login button"""
        dialog_selector = "xpath=//div[@id='login-panel-new']"
        try:
            # check dialog box is auto popup and wait for 10 seconds
            await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
        except Exception as e:
            utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
            utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
            login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
            await login_button_ele.click()
            await asyncio.sleep(0.5)

    async def login_by_qrcode(self):
        """Find the login QR code on the page and show it; exits the process when no QR code is found."""
        utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
        qrcode_img_selector = "xpath=//div[@id='animate_qrcode_container']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
            sys.exit()

        # Show the QR code in the default executor so the event loop is not blocked.
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
        await asyncio.sleep(2)

    async def login_by_mobile(self):
        """Log in with phone number + SMS code; the code is polled from the cache for up to 2 minutes."""
        utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
        mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
        await mobile_tap_ele.click()
        await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
        mobile_input_ele = self.context_page.locator("xpath=//input[@placeholder='手机号']")
        await mobile_input_ele.fill(self.login_phone)
        await asyncio.sleep(0.5)
        send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
        await send_sms_code_btn.click()

        # Check whether a slider captcha is shown
        await self.check_page_display_slider(move_step=10, slider_level="easy")
        # NOTE(review): the log message below says "redis" but the cache is created
        # with CACHE_TYPE_MEMORY — confirm which backend is intended.
        cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
        max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
        while max_get_sms_code_time > 0:
            utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
            await asyncio.sleep(1)
            sms_code_key = f"dy_{self.login_phone}"
            sms_code_value = cache_client.get(sms_code_key)
            if not sms_code_value:
                max_get_sms_code_time -= 1
                continue

            sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
            # presumably the cache returns bytes (hence .decode()) — verify against the cache implementation
            await sms_code_input_ele.fill(value=sms_code_value.decode())
            await asyncio.sleep(0.5)
            submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
            await submit_btn_ele.click()  # click "log in"
            # todo ... the correctness of the code should probably be checked too; the entered code may be wrong
            break

    async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
        """
        Check whether a slider captcha appears on the page and, if so, try to solve it.

        Returns immediately when no captcha shows up within 30 seconds. Exits the
        process (``sys.exit``) once the retry budget is used up.
        :param move_step: forwarded to ``move_slider``
        :param slider_level: forwarded to ``move_slider``
        :return:
        """
        # Wait for the slider captcha to appear
        back_selector = "#captcha-verify-image"
        try:
            await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
        except PlaywrightTimeoutError:  # no slider captcha, return directly
            return

        gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
        max_slider_try_times = 20
        slider_verify_success = False
        while not slider_verify_success:
            if max_slider_try_times <= 0:
                utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
                sys.exit()
            try:
                await self.move_slider(back_selector, gap_selector, move_step, slider_level)
                await asyncio.sleep(1)

                # If the slider moved too slowly or verification failed, the page reports
                # "operation too slow"; click the refresh button in that case.
                # NOTE(review): this retry path does not decrement max_slider_try_times.
                page_content = await self.context_page.content()
                if "操作过慢" in page_content or "提示重新操作" in page_content:
                    utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
                    await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
                    continue

                # After a successful slide, wait for the captcha to disappear
                await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
                # If it disappeared, verification succeeded and the loop ends; if not, the
                # line above raises, the exception is caught below and the slide is retried.
                utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
                slider_verify_success = True
            except Exception as e:
                utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
                await asyncio.sleep(1)
                max_slider_try_times -= 1
                utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
                continue

    async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
        """
        Move the slider to the right to complete the verification
        :param back_selector: selector of the slider captcha background image
        :param gap_selector: selector of the slider (gap) image
        :param move_step: passed as ``steps`` to each mouse move; per the original author's
            note it controls the speed of a single move (larger is slower)
        :param slider_level: slider difficulty, "easy" or "hard" — per the original note these
            correspond to the SMS-code slider and the intermediate captcha page slider
        :return:
        """

        # get slider background image
        slider_back_elements = await self.context_page.wait_for_selector(
            selector=back_selector,
            timeout=1000 * 10,  # wait 10 seconds
        )
        slide_back = str(await slider_back_elements.get_property("src"))  # type: ignore

        # get slider gap image
        gap_elements = await self.context_page.wait_for_selector(
            selector=gap_selector,
            timeout=1000 * 10,  # wait 10 seconds
        )
        gap_src = str(await gap_elements.get_property("src"))  # type: ignore

        # Recognise the slider target position
        slide_app = utils.Slide(gap=gap_src, bg=slide_back)
        distance = slide_app.discern()

        # Build the movement track, then correct the last step so the steps sum to `distance`
        tracks = utils.get_tracks(distance, slider_level)
        new_1 = tracks[-1] - (sum(tracks) - distance)
        tracks.pop()
        tracks.append(new_1)

        # Drag the slider along the track to the target position
        element = await self.context_page.query_selector(gap_selector)
        bounding_box = await element.bounding_box()  # type: ignore

        await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2,  # type: ignore
                                           bounding_box["y"] + bounding_box["height"] / 2)  # type: ignore
        # x coordinate of the slider's centre point
        x = bounding_box["x"] + bounding_box["width"] / 2  # type: ignore
        # Simulate the drag
        await element.hover()  # type: ignore
        await self.context_page.mouse.down()

        for track in tracks:
            # Move the mouse along the track step by step.
            # NOTE(review): the y coordinate passed here is 0, not the slider's centre — confirm intended.
            await self.context_page.mouse.move(x + track, 0, steps=move_step)
            x += track
        await self.context_page.mouse.up()

    async def login_by_cookies(self):
        """Add every cookie parsed from ``self.cookie_str`` to the browser context for ``.douyin.com``."""
        utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".douyin.com",
                'path': "/"
            }])


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/__init__.py
# new file mode 100644
# index 0000000..82c5121
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/__init__.py
# @@ -0,0 +1,13 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
from .core import KuaishouCrawler
# \ No newline at end of file
# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/client.py
# new file mode 100644
# index 0000000..11401ed
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/client.py
# @@ -0,0 +1,313 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 
# 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode

import httpx
from playwright.async_api import BrowserContext, Page

import config
from base.base_crawler import AbstractApiClient
from tools import utils

from .exception import DataFetchError
from .graphql import KuaiShouGraphQL


class KuaiShouClient(AbstractApiClient):
    """Client for the Kuaishou web GraphQL endpoint (``https://www.kuaishou.com/graphql``)."""

    def __init__(
        self,
        timeout=10,
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
    ):
        """
        :param timeout: per-request timeout handed to httpx
        :param proxy: proxy handed to ``httpx.AsyncClient``
        :param headers: HTTP headers sent with every request
        :param playwright_page: Playwright page kept alongside the client
        :param cookie_dict: cookies as a dict (refreshed by ``update_cookies``)
        """
        self.proxy = proxy
        self.timeout = timeout
        self.headers = headers
        self._host = "https://www.kuaishou.com/graphql"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
        self.graphql = KuaiShouGraphQL()

    async def request(self, method, url, **kwargs) -> Any:
        """
        Send one HTTP request and unwrap the GraphQL envelope.

        :return: the ``data`` member of the JSON response
        :raises DataFetchError: when the response carries a non-empty ``errors`` member
        """
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        data: Dict = response.json()
        if data.get("errors"):
            raise DataFetchError(data.get("errors", "unkonw error"))
        return data.get("data", {})

    async def get(self, uri: str, params=None) -> Dict:
        """GET ``uri`` (relative to the GraphQL host), appending ``params`` as a query string when it is a dict."""
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        return await self.request(
            method="GET", url=f"{self._host}{final_uri}", headers=self.headers
        )

    async def post(self, uri: str, data: dict) -> Dict:
        """POST ``data`` as compact JSON (non-ASCII kept as-is) to ``uri`` relative to the GraphQL host."""
        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        # A pre-serialised body is raw content, not form data: httpx expects it in
        # ``content=`` (passing a str through ``data=`` is deprecated).
        return await self.request(
            method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers
        )

    async def pong(self) -> bool:
        """get a note to check if login state is ok"""
        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
        ping_flag = False
        try:
            post_data = {
                "operationName": "visionProfileUserList",
                "variables": {
                    "ftype": 1,
                },
                "query": self.graphql.get("vision_profile_user_list"),
            }
            res = await self.post("", post_data)
            if res.get("visionProfileUserList", {}).get("result") == 1:
                ping_flag = True
        except Exception as e:
            # Any failure is reported as "not logged in" so the caller can log in again.
            utils.logger.error(
                f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..."
            )
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """Refresh the ``Cookie`` header and ``cookie_dict`` from the browser context's cookies."""
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def search_info_by_keyword(
        self, keyword: str, pcursor: str, search_session_id: str = ""
    ):
        """
        KuaiShou web search api
        :param keyword: search keyword
        :param pcursor: page cursor
        :param search_session_id: search session id
        :return:
        """
        post_data = {
            "operationName": "visionSearchPhoto",
            "variables": {
                "keyword": keyword,
                "pcursor": pcursor,
                "page": "search",
                "searchSessionId": search_session_id,
            },
            "query": self.graphql.get("search_query"),
        }
        return await self.post("", post_data)

    async def get_video_info(self, photo_id: str) -> Dict:
        """
        Kuaishou web video detail api
        :param photo_id:
        :return:
        """
        post_data = {
            "operationName": "visionVideoDetail",
            "variables": {"photoId": photo_id, "page": "search"},
            "query": self.graphql.get("video_detail"),
        }
        return await self.post("", post_data)

    async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
        """get video comments
        :param photo_id: photo id you want to fetch
        :param pcursor: cursor returned by the previous page, defaults to ""
        :return:
        """
        post_data = {
            "operationName": "commentListQuery",
            "variables": {"photoId": photo_id, "pcursor": pcursor},
            "query": self.graphql.get("comment_list"),
        }
        return await self.post("", post_data)

    async def get_video_sub_comments(
        self, photo_id: str, rootCommentId: str, pcursor: str = ""
    ) -> Dict:
        """get video sub comments
        :param photo_id: photo id you want to fetch
        :param rootCommentId: id of the first-level comment whose replies are fetched
        :param pcursor: cursor returned by the previous page, defaults to ""
        :return:
        """
        post_data = {
            "operationName": "visionSubCommentList",
            "variables": {
                "photoId": photo_id,
                "pcursor": pcursor,
                "rootCommentId": rootCommentId,
            },
            "query": self.graphql.get("vision_sub_comment_list"),
        }
        return await self.post("", post_data)

    async def get_creator_profile(self, userId: str) -> Dict:
        """Query the ``visionProfile`` operation for ``userId``."""
        post_data = {
            "operationName": "visionProfile",
            "variables": {"userId": userId},
            "query": self.graphql.get("vision_profile"),
        }
        return await self.post("", post_data)

    async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
        """Query one page of the ``visionProfilePhotoList`` operation for ``userId``."""
        post_data = {
            "operationName": "visionProfilePhotoList",
            "variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
            "query": self.graphql.get("vision_profile_photo_list"),
        }
        return await self.post("", post_data)

    async def get_video_all_comments(
        self,
        photo_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
    ):
        """
        get video all comments include sub comments
        :param photo_id: video id
        :param crawl_interval: delay in seconds after each page
        :param callback: awaited with ``(photo_id, comments)`` after each page
        :param max_count: stop paging once this many entries were collected; root
            comments of a page are truncated to fit, sub comments are not
        :return: list of collected root and sub comments
        """

        result = []
        pcursor = ""

        while pcursor != "no_more" and len(result) < max_count:
            comments_res = await self.get_video_comments(photo_id, pcursor)
            vision_comment_list = (comments_res or {}).get("visionCommentList") or {}
            # A missing or empty cursor means there is nothing left to page through.
            # Falling back to "" here would request the first page again and, with
            # an empty comment list, spin forever.
            pcursor = vision_comment_list.get("pcursor") or "no_more"
            comments = vision_comment_list.get("rootComments") or []
            if len(result) + len(comments) > max_count:
                comments = comments[: max_count - len(result)]
            if callback:  # run the callback when one was supplied
                await callback(photo_id, comments)
            result.extend(comments)
            await asyncio.sleep(crawl_interval)
            sub_comments = await self.get_comments_all_sub_comments(
                comments, photo_id, crawl_interval, callback
            )
            result.extend(sub_comments)
        return result

    async def get_comments_all_sub_comments(
        self,
        comments: List[Dict],
        photo_id,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """
        Fetch all second-level comments under the given first-level comments; paging
        continues until every first-level comment reports "no_more".

        Args:
            comments: list of first-level comments
            photo_id: video id
            crawl_interval: delay in seconds after each page
            callback: awaited with ``(photo_id, sub_comments)`` after each page
        Returns:
            The sub comments fetched page by page. Sub comments already embedded in
            a first-level comment are only passed to ``callback``, not returned.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            utils.logger.info(
                f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
            )
            return []

        result = []
        for comment in comments:
            embedded_sub_comments = comment.get("subComments")
            if embedded_sub_comments and callback:
                await callback(photo_id, embedded_sub_comments)

            if comment.get("subCommentsPcursor") == "no_more":
                continue

            root_comment_id = comment.get("commentId")
            # Paging deliberately restarts from the first page, as before.
            sub_comment_pcursor = ""

            while sub_comment_pcursor != "no_more":
                comments_res = await self.get_video_sub_comments(
                    photo_id, root_comment_id, sub_comment_pcursor
                )
                vision_sub_comment_list = (
                    (comments_res or {}).get("visionSubCommentList") or {}
                )
                # Treat a missing/empty cursor as the end so a degenerate response
                # cannot keep the loop on the first page forever.
                sub_comment_pcursor = (
                    vision_sub_comment_list.get("pcursor") or "no_more"
                )

                # Use a dedicated name: rebinding ``comments`` here would shadow
                # the list this method is iterating over.
                page_sub_comments = vision_sub_comment_list.get("subComments") or []
                if callback:
                    await callback(photo_id, page_sub_comments)
                await asyncio.sleep(crawl_interval)
                result.extend(page_sub_comments)
        return result

    async def get_creator_info(self, user_id: str) -> Dict:
        """
        eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
        Kuaishou user home page: return the ``userProfile`` member of the creator profile response.
        """

        vision_profile = await self.get_creator_profile(user_id)
        return vision_profile.get("userProfile")

    async def get_all_videos_by_creator(
        self,
        user_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """
        Fetch every post published by the given user, paging until "no_more".

        Args:
            user_id: user id
            crawl_interval: delay in seconds after each page
            callback: awaited with the page's videos after each page
        Returns:
            All videos collected across pages.
        """
        result = []
        pcursor = ""

        while pcursor != "no_more":
            videos_res = await self.get_video_by_creater(user_id, pcursor)
            if not videos_res:
                utils.logger.error(
                    f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
                )
                break

            vision_profile_photo_list = videos_res.get("visionProfilePhotoList") or {}
            # A missing/empty cursor ends paging instead of re-requesting page one forever.
            pcursor = vision_profile_photo_list.get("pcursor") or "no_more"

            videos = vision_profile_photo_list.get("feeds") or []
            utils.logger.info(
                f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
            )

            if callback:
                await callback(videos)
            await asyncio.sleep(crawl_interval)
            result.extend(videos)
        return result


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py
# new file mode 100644
# index 0000000..4ae1d63
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py
# @@ -0,0 +1,396 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +import asyncio +import os +import random +import time +from asyncio import Task +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import ( + BrowserContext, + BrowserType, + Page, + Playwright, + async_playwright, +) + +import config +from base.base_crawler import AbstractCrawler +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import kuaishou as kuaishou_store +from tools import utils +from tools.cdp_browser import CDPBrowserManager +from var import comment_tasks_var, crawler_type_var, source_keyword_var + +from .client import KuaiShouClient +from .exception import DataFetchError +from .login import KuaishouLogin + + +class KuaishouCrawler(AbstractCrawler): + context_page: Page + ks_client: KuaiShouClient + browser_context: BrowserContext + cdp_manager: Optional[CDPBrowserManager] + + def __init__(self): + self.index_url = "https://www.kuaishou.com" + self.user_agent = utils.get_user_agent() + self.cdp_manager = None + + async def start(self): + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool( + config.IP_PROXY_POOL_COUNT, enable_validate_ip=True + ) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( + ip_proxy_info + ) + + async with async_playwright() as playwright: + # 根据配置选择启动模式 + if config.ENABLE_CDP_MODE: + utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器") + self.browser_context = await self.launch_browser_with_cdp( + playwright, + playwright_proxy_format, + self.user_agent, + headless=config.CDP_HEADLESS, + ) + else: + utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器") + # Launch a browser context. 
+ chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, None, self.user_agent, headless=config.HEADLESS + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(f"{self.index_url}?isHome=1") + + # Create a client to interact with the kuaishou website. + self.ks_client = await self.create_ks_client(httpx_proxy_format) + if not await self.ks_client.pong(): + login_obj = KuaishouLogin( + login_type=config.LOGIN_TYPE, + login_phone=httpx_proxy_format, + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + await self.ks_client.update_cookies( + browser_context=self.browser_context + ) + + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for videos and retrieve their comment information. 
+ await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_videos() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their videos and comments + await self.get_creators_and_videos() + else: + pass + + utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...") + + async def search(self): + utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords") + ks_limit_count = 20 # kuaishou limit page fixed value + if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count + start_page = config.START_PAGE + for keyword in config.KEYWORDS.split(","): + search_session_id = "" + source_keyword_var.set(keyword) + utils.logger.info( + f"[KuaishouCrawler.search] Current search keyword: {keyword}" + ) + page = 1 + while ( + page - start_page + 1 + ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}") + page += 1 + continue + utils.logger.info( + f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}" + ) + video_id_list: List[str] = [] + videos_res = await self.ks_client.search_info_by_keyword( + keyword=keyword, + pcursor=str(page), + search_session_id=search_session_id, + ) + if not videos_res: + utils.logger.error( + f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data" + ) + continue + + vision_search_photo: Dict = videos_res.get("visionSearchPhoto") + if vision_search_photo.get("result") != 1: + utils.logger.error( + f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data " + ) + continue + search_session_id = vision_search_photo.get("searchSessionId", "") + for video_detail in vision_search_photo.get("feeds"): + video_id_list.append(video_detail.get("photo", {}).get("id")) + await 
kuaishou_store.update_kuaishou_video(video_item=video_detail) + + # batch fetch video comments + page += 1 + await self.batch_get_video_comments(video_id_list) + + async def get_specified_videos(self): + """Get the information and comments of the specified post""" + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_info_task(video_id=video_id, semaphore=semaphore) + for video_id in config.KS_SPECIFIED_ID_LIST + ] + video_details = await asyncio.gather(*task_list) + for video_detail in video_details: + if video_detail is not None: + await kuaishou_store.update_kuaishou_video(video_detail) + await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST) + + async def get_video_info_task( + self, video_id: str, semaphore: asyncio.Semaphore + ) -> Optional[Dict]: + """Get video detail task""" + async with semaphore: + try: + result = await self.ks_client.get_video_info(video_id) + utils.logger.info( + f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..." 
+ ) + return result.get("visionVideoDetail") + except DataFetchError as ex: + utils.logger.error( + f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}" + ) + return None + except KeyError as ex: + utils.logger.error( + f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}" + ) + return None + + async def batch_get_video_comments(self, video_id_list: List[str]): + """ + batch get video comments + :param video_id_list: + :return: + """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info( + f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled" + ) + return + + utils.logger.info( + f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}" + ) + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for video_id in video_id_list: + task = asyncio.create_task( + self.get_comments(video_id, semaphore), name=video_id + ) + task_list.append(task) + + comment_tasks_var.set(task_list) + await asyncio.gather(*task_list) + + async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore): + """ + get comment for video id + :param video_id: + :param semaphore: + :return: + """ + async with semaphore: + try: + utils.logger.info( + f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..." 
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
    """
    Build the Kuaishou API client from the current browser session.

    The cookies of ``self.browser_context`` are converted with
    ``utils.convert_cookies`` and sent both as the ``Cookie`` header and as
    ``cookie_dict``.

    :param httpx_proxy: proxy passed straight to ``KuaiShouClient`` (may be None)
    :return: the newly created ``KuaiShouClient``
    """
    utils.logger.info(
        "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
    )
    browser_cookies = await self.browser_context.cookies()
    cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)

    request_headers = {
        "User-Agent": self.user_agent,
        "Cookie": cookie_str,
        "Origin": self.index_url,
        "Referer": self.index_url,
        "Content-Type": "application/json;charset=UTF-8",
    }
    return KuaiShouClient(
        proxy=httpx_proxy,
        headers=request_headers,
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to the standard launcher.

    A ``CDPBrowserManager`` is created and stored on ``self.cdp_manager``.
    Any exception raised while connecting or while reading the browser info
    is logged, after which ``self.launch_browser`` is used instead.

    :return: the browser context from whichever launch path succeeded
    """
    try:
        self.cdp_manager = CDPBrowserManager()
        cdp_context = await self.cdp_manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the details of the browser we attached to.
        cdp_info = await self.cdp_manager.get_browser_info()
        utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {cdp_info}")

        return cdp_context

    except Exception as e:
        utils.logger.error(
            f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}"
        )
        # Fall back to the standard (non-CDP) launch mode.
        # NOTE(review): self.cdp_manager is not reset here, so it may remain
        # set after the fallback — confirm close() still does the right thing.
        return await self.launch_browser(
            playwright.chromium, playwright_proxy, user_agent, headless
        )
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """
    Fetch and store the details of a creator's videos concurrently.

    The id of each entry is read from ``item["photo"]["id"]``. Lookups are
    bounded by ``config.MAX_CONCURRENCY_NUM``; every detail that is not
    ``None`` is stored through ``kuaishou_store.update_kuaishou_video``.

    :param video_list: feed items as delivered by the creator video listing
    """
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending = []
    for item in video_list:
        photo_id = item.get("photo", {}).get("id")
        pending.append(self.get_video_info_task(photo_id, limiter))

    for detail in await asyncio.gather(*pending):
        if detail is None:
            continue
        await kuaishou_store.update_kuaishou_video(detail)
class DataFetchError(RequestError):
    """Raised when something goes wrong while fetching data."""


class IPBlockError(RequestError):
    """Raised when requests were sent so fast that the server blocked our IP."""
class KuaiShouGraphQL:
    """
    Loader for the GraphQL query documents used against Kuaishou.

    Kuaishou's data transfer is implemented on top of GraphQL; this class
    reads a fixed set of ``.graphql`` files from ``graphql_dir`` and exposes
    their text by query name (the file name without its extension).
    """

    # Query documents loaded by load_graphql_queries().
    _GRAPHQL_FILES = (
        "search_query.graphql",
        "video_detail.graphql",
        "comment_list.graphql",
        "vision_profile.graphql",
        "vision_profile_photo_list.graphql",
        "vision_profile_user_list.graphql",
        "vision_sub_comment_list.graphql",
    )

    # Query name -> query text. Class-level, so it is shared by all instances.
    graphql_queries: Dict[str, str] = {}

    def __init__(self):
        # Relative path: it is resolved against the current working directory.
        self.graphql_dir = "media_platform/kuaishou/graphql/"
        self.load_graphql_queries()

    def load_graphql_queries(self):
        """Read every known query file into ``graphql_queries``.

        Raises:
            OSError: if a query file cannot be opened.
        """
        for file in self._GRAPHQL_FILES:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(self.graphql_dir + file, mode="r", encoding="utf-8") as f:
                query_name = file.split(".")[0]
                self.graphql_queries[query_name] = f.read()

    def get(self, query_name: str) -> str:
        """Return the query text for ``query_name``.

        Unknown names do not raise; the literal string ``"Query not found"``
        is returned instead.
        """
        return self.graphql_queries.get(query_name, "Query not found")
b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/search_query.graphql new file mode 100644 index 0000000..cc3bd8f --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/search_query.graphql @@ -0,0 +1,111 @@ +fragment photoContent on PhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked +} + +fragment recoPhotoFragment on recoPhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked +} + +fragment feedContent on Feed { + type + author { + id + name + headerUrl + following + headerUrls { + url + __typename + } + __typename + } + photo { + ...photoContent + ...recoPhotoFragment + __typename + } + canAddComment + llsid + status + currentPcursor + tags { + type + name + __typename + } + __typename +} + +query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) { + visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) { + result + llsid + webPageArea + feeds { + ...feedContent + __typename + } + searchSessionId + pcursor + aladdinBanner { + imgUrl + link + __typename + } + __typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/video_detail.graphql 
b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/video_detail.graphql new file mode 100644 index 0000000..ffb5309 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/video_detail.graphql @@ -0,0 +1,80 @@ +query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) { + visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) { + status + type + author { + id + name + following + headerUrl + __typename + } + photo { + id + duration + caption + likeCount + realLikeCount + coverUrl + photoUrl + liked + timestamp + expTag + llsid + viewCount + videoRatio + stereoType + musicBlocked + manifest { + mediaType + businessType + version + adaptationSet { + id + duration + representation { + id + defaultSelect + backupUrl + codecs + url + height + width + avgBitrate + maxBitrate + m3u8Slice + qualityType + qualityLabel + frameRate + featureP2sp + hidden + disableAdaptive + __typename + } + __typename + } + __typename + } + manifestH265 + photoH265Url + coronaCropManifest + coronaCropManifestH265 + croppedPhotoH265Url + croppedPhotoUrl + videoResource + __typename + } + tags { + type + name + __typename + } + commentLimit { + canAddComment + __typename + } + llsid + danmakuSwitch + __typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile.graphql b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile.graphql new file mode 100644 index 0000000..5499600 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile.graphql @@ -0,0 +1,27 @@ +query visionProfile($userId: String) { + visionProfile(userId: $userId) { + result + hostName + userProfile { + ownerCount { + fan + photo + follow + photo_public + __typename + } + profile { + gender + user_name + user_id + headurl + user_text + 
user_profile_bg_url + __typename + } + isFollowing + __typename + } + __typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql new file mode 100644 index 0000000..328052e --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql @@ -0,0 +1,110 @@ +fragment photoContent on PhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment recoPhotoFragment on recoPhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment feedContent on Feed { + type + author { + id + name + headerUrl + following + headerUrls { + url + __typename + } + __typename + } + photo { + ...photoContent + ...recoPhotoFragment + __typename + } + canAddComment + llsid + status + currentPcursor + tags { + type + name + __typename + } + __typename +} + +query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) { + visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) { + result + llsid + webPageArea + feeds { + ...feedContent + __typename + } + hostName + pcursor + 
__typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_user_list.graphql b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_user_list.graphql new file mode 100644 index 0000000..148165a --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_profile_user_list.graphql @@ -0,0 +1,16 @@ +query visionProfileUserList($pcursor: String, $ftype: Int) { + visionProfileUserList(pcursor: $pcursor, ftype: $ftype) { + result + fols { + user_name + headurl + user_text + isFollowing + user_id + __typename + } + hostName + pcursor + __typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql new file mode 100644 index 0000000..31730fc --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql @@ -0,0 +1,22 @@ +mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) { + visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) { + pcursor + subComments { + commentId + authorId + authorName + content + headurl + timestamp + likedCount + realLikedCount + liked + status + authorLiked + replyToUserName + replyTo + __typename + } + __typename + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/login.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/login.py new file mode 100644 index 0000000..432cf96 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/login.py @@ -0,0 +1,113 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 
class KuaishouLogin(AbstractLogin):
    """Login flow for Kuaishou (QR code, phone or cookie based)."""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: "qrcode", "phone" or "cookie"; written to the global
                ``config.LOGIN_TYPE``, which ``begin`` reads back.
            browser_context: browser context whose cookies carry the login state.
            context_page: page used to drive the QR-code login.
            login_phone: phone number (stored only; phone login is not implemented).
            cookie_str: cookie string used by the cookie login.
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """Start the Kuaishou login using the method named by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: if the login type is not qrcode, phone or cookie.
        """
        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
        Report whether the browser context holds a ``passToken`` cookie.

        The retry decorator calls this again while it returns False: up to
        600 attempts, one second apart. When the attempts are exhausted a
        ``RetryError`` is raised.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # Must be exactly True/False: the retry predicate tests `value is False`.
        return bool(cookie_dict.get("passToken"))

    async def login_by_qrcode(self):
        """Log in by QR code and keep the login state in the browser context.

        Exits the process (``sys.exit``) when no QR code is found or when the
        login is not detected before ``check_login_state`` gives up.
        """
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")

        # Click the login button.
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
        await login_button_ele.click()

        # Locate the login QR code.
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # Show the QR code from an executor thread; the future is intentionally
        # not awaited so polling for the login can start right away.
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # check_login_state polls up to 600 times, one second apart.
        utils.logger.info("[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_mobile(self):
        """Phone login is not implemented; this is a no-op."""
        pass

    async def login_by_cookies(self):
        """Add every cookie parsed from ``self.cookie_str`` to the browser context
        for the ``.kuaishou.com`` domain."""
        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".kuaishou.com",
                'path': "/"
            }])
class BaiduTieBaClient(AbstractApiClient):
    """HTTP client for Baidu Tieba pages and JSON endpoints."""

    def __init__(
        self,
        timeout=10,
        ip_pool=None,
        default_ip_proxy=None,
    ):
        """
        Args:
            timeout: per-request timeout passed to httpx.
            ip_pool: optional proxy pool used by ``get`` once retries are exhausted.
            default_ip_proxy: proxy used when a request does not name one.
        """
        self.ip_pool: Optional[ProxyIpPool] = ip_pool
        self.timeout = timeout
        # NOTE(review): the standard HTTP request header is "Cookie"; this key
        # is "Cookies" and only ever holds "" in this class — confirm intent.
        self.headers = {
            "User-Agent": utils.get_user_agent(),
            "Cookies": "",
        }
        self._host = "https://tieba.baidu.com"
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
        """
        Shared httpx request wrapper with basic response validation.

        The call is retried up to 3 times, one second apart, whenever it raises.

        Args:
            method: HTTP method.
            url: full request URL.
            return_ori_content: return the raw response text instead of parsed JSON.
            proxy: proxy for this request; falls back to ``self.default_ip_proxy``.
            **kwargs: forwarded to ``httpx.AsyncClient.request`` (e.g. request body).

        Returns:
            The response text when ``return_ori_content`` is true, otherwise
            the decoded JSON body.

        Raises:
            Exception: on a non-200 status, or when the body is empty or "blocked".
        """
        actual_proxy = proxy if proxy else self.default_ip_proxy
        async with httpx.AsyncClient(proxy=actual_proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)

        if response.status_code != 200:
            utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
            utils.logger.error(f"Request failed, response: {response.text}")
            raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

        if response.text == "" or response.text == "blocked":
            utils.logger.error(f"request params incrr, response.text: {response.text}")
            raise Exception("account blocked")

        if return_ori_content:
            return response.text

        return response.json()

    async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
        """
        Send a GET request to ``self._host + uri``.

        When ``request`` exhausts its retries and an IP pool is configured, a
        fresh proxy is taken from the pool, the request is tried once more
        through it, and on success that proxy becomes the new default.

        Args:
            uri: request path.
            params: optional dict that is URL-encoded into the query string.
            return_ori_content: return the raw response text instead of parsed JSON.

        Raises:
            Exception: when the retries are exhausted and no IP pool is available.
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        try:
            return await self.request(method="GET", url=f"{self._host}{final_uri}", return_ori_content=return_ori_content, **kwargs)
        except RetryError as e:
            if self.ip_pool:
                proxie_model = await self.ip_pool.get_proxy()
                _, proxy = utils.format_proxy_info(proxie_model)
                res = await self.request(method="GET", url=f"{self._host}{final_uri}", return_ori_content=return_ori_content, proxy=proxy, **kwargs)
                self.default_ip_proxy = proxy
                return res

            utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
            # Chain the RetryError so the underlying failure stays in the traceback.
            raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") from e

    async def post(self, uri: str, data: dict, **kwargs) -> Dict:
        """
        Send a POST request with ``data`` serialized as compact JSON.

        Args:
            uri: request path.
            data: request body parameters.
        """
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)

    async def pong(self) -> bool:
        """
        Check whether the login state is still valid.

        Returns:
            True when ``/mo/q/sync`` answers with ``no == 0``; False otherwise,
            including when the request raises.
        """
        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
        try:
            uri = "/mo/q/sync"
            res: Dict = await self.get(uri)
            utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
            if res and res.get("no") == 0:
                ping_flag = True
            else:
                utils.logger.info("[BaiduTieBaClient.pong] user not login, will try to login again...")
                ping_flag = False
        except Exception as e:
            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """
        Cookie-update hook of the API client, normally called after a
        successful login. Currently a no-op.

        Args:
            browser_context: browser context object.
        """
        pass

    async def get_notes_by_keyword(
        self,
        keyword: str,
        page: int = 1,
        page_size: int = 10,
        sort: SearchSortType = SearchSortType.TIME_DESC,
        note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
    ) -> List[TiebaNote]:
        """
        Search Tieba notes by keyword.

        Args:
            keyword: search keyword.
            page: page number.
            page_size: results per page.
            sort: result ordering.
            note_type: note type (threads only, or threads mixed with replies).

        Returns:
            The notes extracted from the search result page.
        """
        uri = "/f/search/res"
        params = {
            "isnew": 1,
            "qw": keyword,
            "rn": page_size,
            "pn": page,
            "sm": sort.value,
            "only_thread": note_type.value,
        }
        page_content = await self.get(uri, params=params, return_ori_content=True)
        return self._page_extractor.extract_search_note_list(page_content)

    async def get_note_by_id(self, note_id: str) -> TiebaNote:
        """
        Fetch the detail of one note.

        Args:
            note_id: id of the note (thread).
        """
        uri = f"/p/{note_id}"
        page_content = await self.get(uri, return_ori_content=True)
        return self._page_extractor.extract_note_detail(page_content)

    async def get_note_all_comments(
        self,
        note_detail: TiebaNote,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
    ) -> List[TiebaComment]:
        """
        Fetch the first-level comments of a note, page by page.

        Paging stops at the note's last reply page, at the first page without
        comments, or once ``max_count`` comments were collected. Sub-comments
        of every fetched page are crawled too; they are delivered through
        ``callback`` only and are not part of the returned list.

        Args:
            note_detail: note detail object.
            crawl_interval: delay in seconds after each page.
            callback: awaited as ``callback(note_id, comments)`` for every page.
            max_count: maximum number of first-level comments to collect.

        Returns:
            The collected first-level comments.
        """
        uri = f"/p/{note_detail.note_id}"
        result: List[TiebaComment] = []
        current_page = 1
        while note_detail.total_replay_page >= current_page and len(result) < max_count:
            params = {
                "pn": current_page,
            }
            page_content = await self.get(uri, params=params, return_ori_content=True)
            comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
            if not comments:
                break
            # Trim the page so the total never exceeds max_count.
            if len(result) + len(comments) > max_count:
                comments = comments[:max_count - len(result)]
            if callback:
                await callback(note_detail.note_id, comments)
            result.extend(comments)
            # Fetch all sub-comments of this page's comments.
            await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
            await asyncio.sleep(crawl_interval)
            current_page += 1
        return result

    async def get_comments_all_sub_comments(
        self,
        comments: List[TiebaComment],
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[TiebaComment]:
        """
        Fetch all sub-comments of the given comments.

        Returns an empty list without any request when
        ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.

        Args:
            comments: parent comments.
            crawl_interval: delay in seconds after each page.
            callback: awaited as ``callback(note_id, sub_comments)`` for every page.

        Returns:
            All sub-comments that were fetched.
        """
        uri = "/p/comment"
        if not config.ENABLE_GET_SUB_COMMENTS:
            return []

        # Disabled check kept from the original author, who noted that fetching
        # all Tieba sub-comments requires a logged-in state:
        # if self.headers.get("Cookies") == "" or not self.pong():
        #     raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")

        all_sub_comments: List[TiebaComment] = []
        for parment_comment in comments:
            if parment_comment.sub_comment_count == 0:
                continue

            current_page = 1
            # Upper bound on pages, computed as 10 sub-comments per page; the
            # loop also stops at the first empty page.
            max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
            while max_sub_page_num >= current_page:
                params = {
                    "tid": parment_comment.note_id,  # note (thread) id
                    "pid": parment_comment.comment_id,  # parent comment id
                    "fid": parment_comment.tieba_id,  # tieba (forum) id
                    "pn": current_page  # page number
                }
                page_content = await self.get(uri, params=params, return_ori_content=True)
                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)

                if not sub_comments:
                    break
                if callback:
                    await callback(parment_comment.note_id, sub_comments)
                all_sub_comments.extend(sub_comments)
                await asyncio.sleep(crawl_interval)
                current_page += 1
        return all_sub_comments

    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
        """
        Fetch the note list of a tieba (forum) by its name.

        Args:
            tieba_name: name of the tieba.
            page_num: value sent as the ``pn`` query parameter.
        """
        # Let get() URL-encode the query so names containing '&', '#', spaces
        # or non-ASCII characters cannot corrupt the query string.
        params = {"kw": tieba_name, "pn": page_num}
        page_content = await self.get("/f", params=params, return_ori_content=True)
        return self._page_extractor.extract_tieba_note_list(page_content)

    async def get_creator_info_by_url(self, creator_url: str) -> str:
        """
        Fetch a creator's home page.

        Args:
            creator_url: full URL of the creator's home page.

        Returns:
            The raw page content.
        """
        page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
        return page_content

    async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
        """
        Fetch one page of a creator's threads from ``/home/get/getthread``.

        Args:
            user_name: creator's user name.
            page_number: page number.

        Returns:
            The decoded JSON response.
        """
        uri = "/home/get/getthread"
        params = {
            "un": user_name,
            "pn": page_number,
            "id": "utf-8",
            "_": utils.get_current_timestamp(),
        }
        return await self.get(uri, params=params)

    async def get_all_notes_by_creator_user_name(
        self,
        user_name: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_note_count: int = 0,
        creator_page_html_content: Optional[str] = None,
    ) -> List[TiebaNote]:
        """
        Fetch all notes of a creator by user name.

        Args:
            user_name: creator's user name.
            crawl_interval: delay in seconds after each API page.
            callback: awaitable called as ``callback(notes)`` for every batch.
            max_note_count: maximum number of notes to fetch through the API,
                0 meaning all. It is counted as 20 per requested page and does
                not include the threads taken from the home page.
            creator_page_html_content: HTML of the creator's home page.

        Returns:
            All note details that were fetched.
        """
        # Per the original author, Tieba shows the first threads directly on
        # the home page and they cannot be obtained through the API, so they
        # are handled separately from the HTML.
        result: List[TiebaNote] = []
        if creator_page_html_content:
            thread_id_list = self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content)
            utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}")
            note_detail_task = [self.get_note_by_id(thread_id) for thread_id in thread_id_list]
            notes = await asyncio.gather(*note_detail_task)
            if callback:
                await callback(notes)
            result.extend(notes)

        notes_has_more = 1
        page_number = 1
        page_per_count = 20
        total_get_count = 0
        while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
            notes_res = await self.get_notes_by_creator(user_name, page_number)
            notes_data = notes_res.get("data") if notes_res else None
            # Also stop when "data" is missing instead of failing with AttributeError.
            if not notes_res or notes_res.get("no") != 0 or not notes_data:
                utils.logger.error(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
                break
            notes_has_more = notes_data.get("has_more")
            notes = notes_data["thread_list"]
            utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")

            note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
            notes = await asyncio.gather(*note_detail_task)
            if callback:
                await callback(notes)
            await asyncio.sleep(crawl_interval)
            result.extend(notes)
            page_number += 1
            total_get_count += page_per_count
        return result
the crawler + Returns: + + """ + ip_proxy_pool, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + utils.logger.info( + "[BaiduTieBaCrawler.start] Begin create ip proxy pool ..." + ) + ip_proxy_pool = await create_ip_pool( + config.IP_PROXY_POOL_COUNT, enable_validate_ip=True + ) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + _, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info) + utils.logger.info( + f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}" + ) + + # Create a client to interact with the baidutieba website. + self.tieba_client = BaiduTieBaClient( + ip_pool=ip_proxy_pool, + default_ip_proxy=httpx_proxy_format, + ) + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. + await self.search() + await self.get_specified_tieba_notes() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their notes and comments + await self.get_creators_and_notes() + else: + pass + + utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...") + + async def search(self) -> None: + """ + Search for notes and retrieve their comment information. 
+ Returns: + + """ + utils.logger.info( + "[BaiduTieBaCrawler.search] Begin search baidu tieba keywords" + ) + tieba_limit_count = 10 # tieba limit page fixed value + if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count + start_page = config.START_PAGE + for keyword in config.KEYWORDS.split(","): + source_keyword_var.set(keyword) + utils.logger.info( + f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}" + ) + page = 1 + while ( + page - start_page + 1 + ) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}") + page += 1 + continue + try: + utils.logger.info( + f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}" + ) + notes_list: List[TiebaNote] = ( + await self.tieba_client.get_notes_by_keyword( + keyword=keyword, + page=page, + page_size=tieba_limit_count, + sort=SearchSortType.TIME_DESC, + note_type=SearchNoteType.FIXED_THREAD, + ) + ) + if not notes_list: + utils.logger.info( + f"[BaiduTieBaCrawler.search] Search note list is empty" + ) + break + utils.logger.info( + f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}" + ) + await self.get_specified_notes( + note_id_list=[note_detail.note_id for note_detail in notes_list] + ) + page += 1 + except Exception as ex: + utils.logger.error( + f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}" + ) + break + + async def get_specified_tieba_notes(self): + """ + Get the information and comments of the specified post by tieba name + Returns: + + """ + tieba_limit_count = 50 + if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count + for tieba_name in config.TIEBA_NAME_LIST: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}" + ) + page_number = 0 + while page_number 
<= config.CRAWLER_MAX_NOTES_COUNT: + note_list: List[TiebaNote] = ( + await self.tieba_client.get_notes_by_tieba_name( + tieba_name=tieba_name, page_num=page_number + ) + ) + if not note_list: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty" + ) + break + + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}" + ) + await self.get_specified_notes([note.note_id for note in note_list]) + page_number += tieba_limit_count + + async def get_specified_notes( + self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST + ): + """ + Get the information and comments of the specified post + Args: + note_id_list: + + Returns: + + """ + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) + for note_id in note_id_list + ] + note_details = await asyncio.gather(*task_list) + note_details_model: List[TiebaNote] = [] + for note_detail in note_details: + if note_detail is not None: + note_details_model.append(note_detail) + await tieba_store.update_tieba_note(note_detail) + await self.batch_get_note_comments(note_details_model) + + async def get_note_detail_async_task( + self, note_id: str, semaphore: asyncio.Semaphore + ) -> Optional[TiebaNote]: + """ + Get note detail + Args: + note_id: baidu tieba note id + semaphore: asyncio semaphore + + Returns: + + """ + async with semaphore: + try: + utils.logger.info( + f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}" + ) + note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id) + if not note_detail: + utils.logger.error( + f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}" + ) + return None + return note_detail + except Exception as ex: + utils.logger.error( + f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}" + ) + return None + 
except KeyError as ex: + utils.logger.error( + f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}" + ) + return None + + async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]): + """ + Batch get note comments + Args: + note_detail_list: + + Returns: + + """ + if not config.ENABLE_GET_COMMENTS: + return + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for note_detail in note_detail_list: + task = asyncio.create_task( + self.get_comments_async_task(note_detail, semaphore), + name=note_detail.note_id, + ) + task_list.append(task) + await asyncio.gather(*task_list) + + async def get_comments_async_task( + self, note_detail: TiebaNote, semaphore: asyncio.Semaphore + ): + """ + Get comments async task + Args: + note_detail: + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info( + f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}" + ) + await self.tieba_client.get_note_all_comments( + note_detail=note_detail, + crawl_interval=random.random(), + callback=tieba_store.batch_update_tieba_note_comments, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, + ) + + async def get_creators_and_notes(self) -> None: + """ + Get creator's information and their notes and comments + Returns: + + """ + utils.logger.info( + "[WeiboCrawler.get_creators_and_notes] Begin get weibo creators" + ) + for creator_url in config.TIEBA_CREATOR_URL_LIST: + creator_page_html_content = await self.tieba_client.get_creator_info_by_url( + creator_url=creator_url + ) + creator_info: TiebaCreator = self._page_extractor.extract_creator_info( + creator_page_html_content + ) + if creator_info: + utils.logger.info( + f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}" + ) + if not creator_info: + raise Exception("Get creator info error") + + await tieba_store.save_creator(user_info=creator_info) + + # Get all note information 
of the creator + all_notes_list = ( + await self.tieba_client.get_all_notes_by_creator_user_name( + user_name=creator_info.user_name, + crawl_interval=0, + callback=tieba_store.batch_update_tieba_notes, + max_note_count=config.CRAWLER_MAX_NOTES_COUNT, + creator_page_html_content=creator_page_html_content, + ) + ) + + await self.batch_get_note_comments(all_notes_list) + + else: + utils.logger.error( + f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}" + ) + + async def launch_browser( + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True, + ) -> BrowserContext: + """ + Launch browser and create browser + Args: + chromium: + playwright_proxy: + user_agent: + headless: + + Returns: + + """ + utils.logger.info( + "[BaiduTieBaCrawler.launch_browser] Begin create browser context ..." + ) + if config.SAVE_LOGIN_STATE: + # feat issue #14 + # we will save login state to avoid login every time + user_data_dir = os.path.join( + os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM + ) # type: ignore + browser_context = await chromium.launch_persistent_context( + user_data_dir=user_data_dir, + accept_downloads=True, + headless=headless, + proxy=playwright_proxy, # type: ignore + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent, + ) + return browser_context + else: + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore + browser_context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, user_agent=user_agent + ) + return browser_context + + async def launch_browser_with_cdp( + self, + playwright: Playwright, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True, + ) -> BrowserContext: + """ + 使用CDP模式启动浏览器 + """ + try: + self.cdp_manager = CDPBrowserManager() + browser_context = await self.cdp_manager.launch_and_connect( + playwright=playwright, + 
playwright_proxy=playwright_proxy, + user_agent=user_agent, + headless=headless, + ) + + # 显示浏览器信息 + browser_info = await self.cdp_manager.get_browser_info() + utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}") + + return browser_context + + except Exception as e: + utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}") + # 回退到标准模式 + chromium = playwright.chromium + return await self.launch_browser( + chromium, playwright_proxy, user_agent, headless + ) + + async def close(self): + """ + Close browser context + Returns: + + """ + # 如果使用CDP模式,需要特殊处理 + if self.cdp_manager: + await self.cdp_manager.cleanup() + self.cdp_manager = None + else: + await self.browser_context.close() + utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...") diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/field.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/field.py new file mode 100644 index 0000000..1f978fd --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/field.py @@ -0,0 +1,29 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +from enum import Enum + + +class SearchSortType(Enum): + """search sort type""" + # 按时间倒序 + TIME_DESC = "1" + # 按时间顺序 + TIME_ASC = "0" + # 按相关性顺序 + RELEVANCE_ORDER = "2" + + +class SearchNoteType(Enum): + # 只看主题贴 + MAIN_THREAD = "1" + # 混合模式(帖子+回复) + FIXED_THREAD = "0" diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/help.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/help.py new file mode 100644 index 0000000..539ed11 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/help.py @@ -0,0 +1,418 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 
使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +import html +import json +import re +from typing import Dict, List, Tuple +from urllib.parse import parse_qs, unquote + +from parsel import Selector + +from constant import baidu_tieba as const +from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote +from tools import utils + +GENDER_MALE = "sex_male" +GENDER_FEMALE = "sex_female" + + +class TieBaExtractor: + def __init__(self): + pass + + @staticmethod + def extract_search_note_list(page_content: str) -> List[TiebaNote]: + """ + 提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据 + Args: + page_content: 页面内容的HTML字符串 + + Returns: + 包含帖子信息的字典列表 + """ + xpath_selector = "//div[@class='s_post']" + post_list = Selector(text=page_content).xpath(xpath_selector) + result: List[TiebaNote] = [] + for post in post_list: + tieba_note = TiebaNote(note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(), + title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(), + desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(), + note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get( + default=''), + user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get( + default='').strip(), user_link=const.TIEBA_URL + post.xpath( + ".//a[starts-with(@href, '/home/main')]/@href").get(default=''), + tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(), + tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get( + default=''), + publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get( + default='').strip(), ) + result.append(tieba_note) + return result + + def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]: + """ + 提取贴吧帖子列表 + Args: 
+ page_content: + + Returns: + + """ + page_content = page_content.replace(' + + + + + + + + +
网球风云吧 关注:48,523贴子:5,418,043
+
+ + +
+ 贴子管理 +
    + +
+
+
+
+

【强烈恭喜】全红婵陈宇汐包揽跳水女子10米台巴黎奥运金银牌!

只看楼主收藏回复

+
+ +
+
+
+
+
中国队第22金!无悬念!



IP属地:福建来自Android客户端1楼2024-08-06 22:09回复
    +
    + +
    +
    全后卫冕成功,还是动作质量高,小炸也赢了


    IP属地:福建来自Android客户端2楼2024-08-06 22:10
    收起回复
      全后卫冕,太好了


      IP属地:江苏来自Android客户端3楼2024-08-06 22:10
      收起回复
        毫无悬念


        IP属地:上海来自Android客户端4楼2024-08-06 22:10
        收起回复
          皇后回宫


          IP属地:广西来自Android客户端5楼2024-08-06 22:10
          收起回复
            可惜了,既生婵何生汐


            IP属地:湖北来自Android客户端6楼2024-08-06 22:10
            收起回复
              全最后水花那么大 居然不是8分


              IP属地:中国澳门来自Android客户端7楼2024-08-06 22:10
              收起回复
                除了第三跳小炸一下,其余的都很棒了…


                IP属地:四川来自iPhone客户端8楼2024-08-06 22:10
                收起回复
                  陈完美发挥了还是打不过,没办法


                  IP属地:福建来自iPhone客户端9楼2024-08-06 22:10
                  收起回复


                    IP属地:广东来自Android客户端10楼2024-08-06 22:11
                    回复
                      恭喜全,陈也蛮惨的,好在是有女双金


                      IP属地:江苏来自iPhone客户端11楼2024-08-06 22:11
                      收起回复
                        陈芋汐简直就是跳水队版孙颖莎


                        IP属地:陕西来自Android客户端12楼2024-08-06 22:11
                        收起回复
                          恭喜全后卫冕 也恭喜汐贵妃银牌,汐贵妃挺遗憾的,不管怎么样还是恭喜两位


                          IP属地:广东来自iPhone客户端13楼2024-08-06 22:11
                          收起回复
                            强烈恭喜


                            IP属地:广西来自Android客户端14楼2024-08-06 22:11
                            回复
                              恭喜全后卫冕成功


                              IP属地:江苏来自iPhone客户端15楼2024-08-06 22:11
                              回复
                                恭喜全后卫冕


                                IP属地:上海来自Android客户端16楼2024-08-06 22:11
                                收起回复
                                  全后真的是后。。。确实今天有点紧,正常应该在440-450左右。。。


                                  IP属地:上海17楼2024-08-06 22:11
                                  收起回复
                                    这俩看谁能先熬过对方吧,恭喜


                                    IP属地:上海18楼2024-08-06 22:11
                                    收起回复
                                      全身体姿态确实更好看


                                      IP属地:广西来自Android客户端19楼2024-08-06 22:12
                                      收起回复
                                        质量好,分数没啥问题,主要是207不炸基本没悬念


                                        IP属地:上海来自Android客户端20楼2024-08-06 22:12
                                        收起回复
                                          恭喜


                                          IP属地:山东来自Android客户端21楼2024-08-06 22:12
                                          收起回复
                                            陈这个周期是不是压着全,吊打了,结果巴黎还是输了好难过哦


                                            IP属地:上海来自Android客户端22楼2024-08-06 22:12
                                            收起回复
                                              207没炸炸了6组动作也没想到


                                              IP属地:山东来自iPhone客户端23楼2024-08-06 22:12
                                              收起回复
                                                恭喜两位,都很棒


                                                IP属地:广东来自iPhone客户端24楼2024-08-06 22:12
                                                收起回复
                                                  汐贵妃有点惨。。相比预赛半决赛已经特别好了,今天机会很大的。。。


                                                  IP属地:上海25楼2024-08-06 22:12
                                                  收起回复
                                                    稳稳的幸福


                                                    IP属地:安徽来自iPhone客户端26楼2024-08-06 22:12
                                                    回复
                                                      心疼陈宇汐一秒


                                                      IP属地:安徽27楼2024-08-06 22:12
                                                      收起回复
                                                        全后居然因为赢而哭,真的长大不少,汐贵妃好无奈


                                                        IP属地:广东来自iPhone客户端28楼2024-08-06 22:12
                                                        收起回复
                                                          陈真的太遗憾了


                                                          IP属地:安徽来自Android客户端29楼2024-08-06 22:12
                                                          回复
                                                            汐贵妃最后神情有点落寞


                                                            IP属地:湖南来自iPhone客户端30楼2024-08-06 22:12
                                                            收起回复
                                                              +
                                                              + + + + + + + + + + + + + + + + +
                                                              + 广告 + +
                                                              + + + +
                                                              +
                                                              + +
                                                              \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_detail.html b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_detail.html new file mode 100644 index 0000000..e4ecae6 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_detail.html @@ -0,0 +1,839 @@ +对于一个父亲来说,这个女儿14岁就死了【以太比特吧】_百度贴吧 + + + + + + + + +
                                                              以太比特吧 关注:309,573贴子:5,368,434
                                                              +
                                                              + + +
                                                              + 贴子管理 +
                                                                + +
                                                              +
                                                              +
                                                              +
                                                              +

                                                              对于一个父亲来说,这个女儿14岁就死了

                                                              只看楼主收藏回复

                                                              +
                                                              + +
                                                              +
                                                              +
                                                              +
                                                              +
                                                              点击展开,查看完整图片


                                                              IP属地:广东来自Android客户端1楼2024-08-05 16:56回复
                                                                本来觉得就凭14岁的这点叛逆父亲不再理她觉得这个这个父亲是有点问题的,后来看到母亲也不理了,我就知道这女的肯定隐藏了很多自己干得垃圾事没说,她活该


                                                                IP属地:广东来自Android客户端2楼2024-08-05 17:07
                                                                收起回复
                                                                  • 铭寒号废了重练一个而已,只是她妈后来才明白这一点
                                                                  • youxi卡米糯小错一般都能包容,能这样多半是原则上大是大非
                                                                  • 你的隔壁王哥十四岁能把人逼到没有一点犹豫的跳楼,有多大的学习压力想过没?这种家庭内为了子女成才会不记一切代价,甚至是以折磨的方式,而之后的一切变故都是由于这次跳楼父亲不闻不问的态度,换作是你心灰意泠后只会做的比他更过分,亲情破裂会让最后一丝克制也一同丧失。
                                                                  • 快拉黑尔父回复 你的隔壁王哥 :闷油瓶的话还能理解一下,小太妹为了得到什么说跳就跳我是一点也不怀疑也不同情的。你现在同情小心以后糟老罪咯。真要对她不好也不至于长大了好多事想明白了反而一直想修复关系。
                                                                  • 你的隔壁王哥回复 快拉黑尔父 :十四岁第一次逃学,还在担心父母会不会打他,说明在此之前完全就是个乖乖女。初三才逃第一次学,如果是太妹初二就已经插着翅膀到处飞了,而且跳楼母亲没有任何心里准备,就说明在以往的形象里是不可能做出这事,说明从一开始就只是正常女学生。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :人变成太妹,性格一完全变成了很难理解吗?初中时代常有的事
                                                                    • 你的隔壁王哥回复 快拉黑尔父 :如果说是太妹,那么跳楼之前必然会有各种前车之鉴,换句话说为了得到某样需求常用跳楼作为威胁。这种头也不回没有任何犹豫的跳楼,显然不是为了得到什么,就是单纯的寻死,你觉得太妹会这么纯粹的寻死吗?太妹的心理承受能力可高多了,只有未经世事的小白心里破防了才会这么干脆。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :完全的一面之词,结果可以看到的是什么?14岁钱的好父亲当她死了。对她一直很好的母亲也断了联系。想修复关系的反而是她。告诉你一个众所周知的事,人发言,一定,一定会下意识的美化自己。这是下意识。然后你再看看这个故事。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :而你所说的这个想索求什么,全包含在了一句叛逆期懂得都懂这一句话里面隐藏了。这就是她下意识的掩盖的事了。
                                                                    • 你的隔壁王哥回复 快拉黑尔父 :你要分析心理啊,纯粹的寻死只会在心里破防的时候才会存在,你如果接触过混社会的太妹,你就会发现他们会以寻求刺激为炫耀的资本,在这种群体内心理承受能力高的离谱。要想让一个学生心里破防,只能让她的天塌了,脆弱的心里才会在极短时间内崩溃,只有长期压抑才会产生这种心理。
                                                                  • 我也说一句

                                                                    还有118条回复,点击查看

                                                                  这女的晚上不回家她爹去找她,被黄毛打进医院,也没来医院看过,最后和黄毛结婚也不来往。想起三套房想爆她爹金币,结果找不到求助平台。幸好她爹跑得快。


                                                                  IP属地:福建来自Android客户端4楼2024-08-05 17:38
                                                                  收起回复
                                                                    我知道,可怜之人,必有()


                                                                    IP属地:浙江来自Android客户端7楼2024-08-05 18:38
                                                                    收起回复
                                                                      太假了,混社会不良太妹,还考高中,选专业。当没有大专么


                                                                      IP属地:天津来自Android客户端8楼2024-08-05 18:43
                                                                      收起回复
                                                                        边倪m蓖


                                                                        IP属地:广东来自Android客户端9楼2024-08-05 18:52
                                                                        回复
                                                                          父亲问题很大,应该在14岁那年再生一个或者领养一个


                                                                          IP属地:河北来自Android客户端10楼2024-08-05 18:59
                                                                          收起回复
                                                                            她爸怎么忍住不创小号的


                                                                            IP属地:浙江来自Android客户端12楼2024-08-05 19:09
                                                                            收起回复
                                                                              站在作者的角度来看,肯定都是挑了对自己及其有利的东西来说了,然而


                                                                              IP属地:四川来自Android客户端13楼2024-08-05 19:11
                                                                              收起回复
                                                                                这个好像是之前新闻里的


                                                                                IP属地:江苏来自Android客户端17楼2024-08-05 19:31
                                                                                收起回复
                                                                                  叛逆期你懂的这6个字包含了不知道多少事父母没对他发火而是耐心劝导也不知道包含了多少,我不好说,而且14岁逃学混社会初三高一的学生这么弄基本也是烂了


                                                                                  IP属地:黑龙江来自Android客户端21楼2024-08-05 20:06
                                                                                  收起回复
                                                                                    我们群有个女的。。。他说他爹家暴。。。喝点酒打她跟他妈。。她还轻生过。。。慢慢的的了解了。。。。他爹好像没那么不堪。。。一个月4000多生活费给她。。。她上学都打出租车。。。他爹还不怎么喝酒。。。他有抑郁症他爹还带她去看病。。。。还学了中医给她食补。。但是他就记得他爹喝酒打她跟他妈,。。。。我就纳了闷了。。。。这两个版本的故事不大对。。。。女人嘴里没实话啊。。。。。她说她爹喝酒打他妈,他直接拿水果刀给他爹捅了。淌了好多血,所以他爹送她进精神病院 反正挺混乱的。。。挺漂亮的一个高中女孩,就喜欢酒吧喝酒。。蹦迪。。。说全班男的都给她表过白。。。但是就喜欢小混混。。。
                                                                                    我得出一个结论。这家伙真有病。。。。她爹绝对对她不错。。。。。也是贱高中家庭好,还喜欢混混很蹦迪。。。。高考才两百还是三百多让同学骂了一顿。。。。破防了在群里哭跑路了。。。


                                                                                    IP属地:山东来自Android客户端22楼2024-08-05 20:19
                                                                                    收起回复
                                                                                      女的独生,八成是结婚嫁了混混日子不如意,想着爆父母金币3套房,后来连母亲都躲着她足以说明一切


                                                                                      IP属地:广东来自Android客户端23楼2024-08-05 20:35
                                                                                      收起回复
                                                                                        活该,早点死别耽误别人


                                                                                        IP属地:江西来自Android客户端24楼2024-08-05 20:37
                                                                                        回复
                                                                                          对自己闭口不谈,不好评价


                                                                                          IP属地:安徽来自Android客户端25楼2024-08-05 22:54
                                                                                          回复
                                                                                            再叛逆也不至于寻死
                                                                                            硬要死那就满足你当你死了


                                                                                            IP属地:广西来自Android客户端26楼2024-08-05 22:57
                                                                                            回复
                                                                                              xxn的话一个标点符号都不能信


                                                                                              IP属地:广西来自Android客户端27楼2024-08-05 23:03
                                                                                              回复
                                                                                                故事太过于离谱,是没讲完还是编的


                                                                                                IP属地:湖北来自Android客户端28楼2024-08-05 23:05
                                                                                                收起回复
                                                                                                  她的母亲从前那么希望这个家和好,对女儿也很好,结果突然也躲着她


                                                                                                  IP属地:新疆来自Android客户端30楼2024-08-05 23:32
                                                                                                  回复
                                                                                                    一眼就是避重就轻,能说的都是最轻的了


                                                                                                    IP属地:广东来自iPhone客户端31楼2024-08-05 23:45
                                                                                                    收起回复
                                                                                                      网传的被隐瞒的另一部分故事,不保真


                                                                                                      IP属地:湖南来自Android客户端32楼2024-08-06 00:08
                                                                                                      收起回复
                                                                                                        一般人做不到的绝情,可疑


                                                                                                        IP属地:陕西来自Android客户端33楼2024-08-06 00:08
                                                                                                        收起回复
                                                                                                          快马加编


                                                                                                          IP属地:四川来自Android客户端35楼2024-08-06 00:13
                                                                                                          回复
                                                                                                            默认信xxn说的话已经很反映现在的环境了


                                                                                                            IP属地:上海来自iPhone客户端36楼2024-08-06 00:30
                                                                                                            回复
                                                                                                              這是最後一個教訓了
                                                                                                              父親給的最後一個教訓,停止了你的反叛期,永久有效


                                                                                                              IP属地:中国香港来自Android客户端37楼2024-08-06 00:39
                                                                                                              回复


                                                                                                                IP属地:河北来自Android客户端38楼2024-08-06 00:39
                                                                                                                回复
                                                                                                                  哇,是没头没尾的讲故事,甚至比聊天记录还干净,这下不得不信了


                                                                                                                  IP属地:湖北来自Android客户端39楼2024-08-06 00:40
                                                                                                                  收起回复


                                                                                                                    IP属地:湖北来自iPhone客户端40楼2024-08-06 00:46
                                                                                                                    收起回复
                                                                                                                      叛逆期,是我懂的那个吗?
                                                                                                                      就是咒他爸要死还找烂仔来对付他爸,给人当街一顿打自己跑路了那个吗?
                                                                                                                      要我说,父母都体现出最大的斯文和忍让了,换作素质低点的可能牙齿都给人干碎了。


                                                                                                                      IP属地:广西来自Android客户端41楼2024-08-06 00:52
                                                                                                                      收起回复
                                                                                                                        自己犯贱能怪谁呢


                                                                                                                        IP属地:浙江来自Android客户端42楼2024-08-06 00:55
                                                                                                                        回复
                                                                                                                          +
                                                                                                                          + + + + + + + + + + + + + + + + +
                                                                                                                          \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_sub_comments.html b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_sub_comments.html new file mode 100644 index 0000000..a8fe3eb --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/note_sub_comments.html @@ -0,0 +1,189 @@ +
                                                                                                                        • + + + + +
                                                                                                                          + heinzfrentzen + : + + + + +
                                                                                                                          + + + 2024-8-6 22:11 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 可爱的搬运工94 + :陈芋汐水花也不小 +
                                                                                                                          + + + 2024-8-6 22:12 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 国际体坛巨星青椒肉丝 + :你怀孕了吗 老是呕吐 +
                                                                                                                          + + + 2024-8-6 22:12 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 茗花少帅 + :你就只看水花,不看空中姿态吗 +
                                                                                                                          + + + 2024-8-6 22:12 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 东华武兰 + :经典只看水花 +
                                                                                                                          + + + 2024-8-6 22:12 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 上下班要注意 + :额,分数正常吧 +
                                                                                                                          + + + 2024-8-6 22:13 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 静看蚂蚁上树 + : + + 回复 国际体坛巨星青椒肉丝 + :吃酸黄瓜吃多了 + + + +
                                                                                                                          + + + 2024-8-6 22:14 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 不懂取啥名字😜 + : + + 请你去跟国际泳联投诉 + +
                                                                                                                          + + + 2024-8-6 22:15 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 💫泽赫拉💯 + :第五跳陈空中分腿了,空中姿态明显全红婵更好 +
                                                                                                                          + + + 2024-8-6 22:17 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + + +
                                                                                                                          + 嗯嗯哦哦啊啊🐶 + : + + 回复 美味蟹黄堡💞 + :你不会看起跳高度和空中姿态? + +
                                                                                                                          + + + 2024-8-6 22:17 + 回复 +
                                                                                                                          +
                                                                                                                          +
                                                                                                                        • +
                                                                                                                        • + + + 我也说一句 + +

                                                                                                                          + 1 + 2 + 下一页 + 尾页 +

                                                                                                                          +
                                                                                                                        • diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/search_keyword_notes.html b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/search_keyword_notes.html new file mode 100644 index 0000000..d15d8ce --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/search_keyword_notes.html @@ -0,0 +1,96 @@ +
                                                                                                                          +
                                                                                                                          武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠 +
                                                                                                                          + 全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。 + 项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、 +
                                                                                                                          + 贴吧:武汉交互空间作者:VR虚拟达人 + 2024-08-05 16:45
                                                                                                                          +
                                                                                                                          请各位急用玛尼的小心,骗子最多 +
                                                                                                                          + 这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收 +
                                                                                                                          + 贴吧:背包客作者:贴吧用户_GC64AUS + 2024-08-03 07:35
                                                                                                                          +
                                                                                                                          *2025泰国冷链制冷运输展*东南亚外贸出口 +
                                                                                                                          **2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察 + 展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司 + 人员跟团观展补贴!为您节省成本,寻找适合您的市场: + 本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市 +
                                                                                                                          + 贴吧:国际展会作者:zhaot_188 2024-07-19 15:44
                                                                                                                          +
                                                                                                                          京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 +
                                                                                                                          来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫 + 京湘楼,KING + HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品 +
                                                                                                                          + 贴吧:京湘楼作者:天神渡尘 2024-07-17 23:43
                                                                                                                          +
                                                                                                                          广州能争取到迪士尼与环球落户吗? +
                                                                                                                          + 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
                                                                                                                          + 贴吧:地理作者:SeaRoutes 2024-07-13 20:17
                                                                                                                          +
                                                                                                                          #城市GDP#广州应该全力去争取迪士尼和环球影城 +
                                                                                                                          + 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
                                                                                                                          + 贴吧:城市gdp作者:SeaRoutes 2024-07-13 20:14
                                                                                                                          +
                                                                                                                          云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设 +
                                                                                                                          + 7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省 +
                                                                                                                          + 贴吧:昆明作者: 2024-07-12 23:04
                                                                                                                          +
                                                                                                                          寻找弟弟,很久没跟家里联系 +
                                                                                                                          Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团
                                                                                                                          + 贴吧:东南亚作者:贴吧用户_GC2CtRa + 2024-07-11 07:53
                                                                                                                          +
                                                                                                                          拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧? +
                                                                                                                          拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立 + 跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。 +
                                                                                                                          + 贴吧:历史作者:yoursagain 2024-07-10 09:00
                                                                                                                          +
                                                                                                                          东南亚,园区【 工 价 低 】 +
                                                                                                                          + 贴吧:园区招商作者:QQ59052966 2024-06-30 12:09
                                                                                                                          +
                                                                                                                          \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/tieba_note_list.html b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/tieba_note_list.html new file mode 100644 index 0000000..abd423c --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/test_data/tieba_note_list.html @@ -0,0 +1,3627 @@ + + + + + + + + + + 盗墓笔记吧-百度贴吧--喜爱盗墓笔记的有爱稻米聚集地--盗墓笔记吧致力于为广大喜爱《盗墓笔记》的吧友服务,传递官方最新资讯,小说相关同人作品,鼓励吧友原创精品,解密分析、图片、文章等。 + + + + + + + + + + + + + + +
                                                                                                                          + + + + + + +
                                                                                                                          +
                                                                                                                          + +
                                                                                                                          +
                                                                                                                          + + + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + +
                                                                                                                          + + + + + +
                                                                                                                          +
                                                                                                                          + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/__init__.py new file mode 100644 index 0000000..0b71591 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/__init__.py @@ -0,0 +1,18 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/23 15:40 +# @Desc : +from .client import WeiboClient +from .core import WeiboCrawler +from .login import WeiboLogin diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/client.py new file mode 100644 index 0000000..08c82da --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/client.py @@ -0,0 +1,381 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/23 15:40 +# @Desc : 微博爬虫 API 请求 client + +import asyncio +import copy +import json +import re +from typing import Callable, Dict, List, Optional, Union +from urllib.parse import parse_qs, unquote, urlencode + +import httpx +from httpx import Response +from playwright.async_api import BrowserContext, Page + +import config +from tools import utils + +from .exception import DataFetchError +from .field import SearchType + + +class WeiboClient: + + def __init__( + self, + timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间 + proxy=None, + *, + headers: Dict[str, str], + playwright_page: Page, + cookie_dict: Dict[str, str], + ): + self.proxy = proxy + self.timeout = timeout + self.headers = headers + self._host = "https://m.weibo.cn" + self.playwright_page = playwright_page + self.cookie_dict = cookie_dict + self._image_agent_host = "https://i1.wp.com/" + + async def request(self, method, url, **kwargs) -> Union[Response, Dict]: + enable_return_response = kwargs.pop("return_response", False) + async with httpx.AsyncClient(proxy=self.proxy) as client: + response = await client.request(method, url, timeout=self.timeout, **kwargs) + + if enable_return_response: + return response + + data: Dict = response.json() + ok_code = data.get("ok") + if ok_code == 0: # response error + utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}") + raise DataFetchError(data.get("msg", "response error")) + elif ok_code != 1: # unknown error + utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}") + raise DataFetchError(data.get("msg", "unknown error")) + else: # response right + return data.get("data", {}) + + async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]: + final_uri = uri + if isinstance(params, dict): + final_uri = (f"{uri}?" 
+ f"{urlencode(params)}") + + if headers is None: + headers = self.headers + return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs) + + async def post(self, uri: str, data: dict) -> Dict: + json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) + return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers) + + async def pong(self) -> bool: + """get a note to check if login state is ok""" + utils.logger.info("[WeiboClient.pong] Begin pong weibo...") + ping_flag = False + try: + uri = "/api/config" + resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers) + if resp_data.get("login"): + ping_flag = True + else: + utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...") + except Exception as e: + utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...") + ping_flag = False + return ping_flag + + async def update_cookies(self, browser_context: BrowserContext): + cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + self.headers["Cookie"] = cookie_str + self.cookie_dict = cookie_dict + + async def get_note_by_keyword( + self, + keyword: str, + page: int = 1, + search_type: SearchType = SearchType.DEFAULT, + ) -> Dict: + """ + search note by keyword + :param keyword: 微博搜搜的关键词 + :param page: 分页参数 -当前页码 + :param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType + :return: + """ + uri = "/api/container/getIndex" + containerid = f"100103type={search_type.value}&q={keyword}" + params = { + "containerid": containerid, + "page_type": "searchall", + "page": page, + } + return await self.get(uri, params) + + async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict: + """get notes comments + :param mid_id: 微博ID + :param max_id: 分页参数ID + :param max_id_type: 分页参数ID类型 + :return: + """ + uri = "/comments/hotflow" + params 
= { + "id": mid_id, + "mid": mid_id, + "max_id_type": max_id_type, + } + if max_id > 0: + params.update({"max_id": max_id}) + referer_url = f"https://m.weibo.cn/detail/{mid_id}" + headers = copy.copy(self.headers) + headers["Referer"] = referer_url + + return await self.get(uri, params, headers=headers) + + async def get_note_all_comments( + self, + note_id: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + max_count: int = 10, + ): + """ + get note all comments include sub comments + :param note_id: + :param crawl_interval: + :param callback: + :param max_count: + :return: + """ + result = [] + is_end = False + max_id = -1 + max_id_type = 0 + while not is_end and len(result) < max_count: + comments_res = await self.get_note_comments(note_id, max_id, max_id_type) + max_id: int = comments_res.get("max_id") + max_id_type: int = comments_res.get("max_id_type") + comment_list: List[Dict] = comments_res.get("data", []) + is_end = max_id == 0 + if len(result) + len(comment_list) > max_count: + comment_list = comment_list[:max_count - len(result)] + if callback: # 如果有回调函数,就执行回调函数 + await callback(note_id, comment_list) + await asyncio.sleep(crawl_interval) + result.extend(comment_list) + sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback) + result.extend(sub_comment_result) + return result + + @staticmethod + async def get_comments_all_sub_comments( + note_id: str, + comment_list: List[Dict], + callback: Optional[Callable] = None, + ) -> List[Dict]: + """ + 获取评论的所有子评论 + Args: + note_id: + comment_list: + callback: + + Returns: + + """ + if not config.ENABLE_GET_SUB_COMMENTS: + utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled") + return [] + + res_sub_comments = [] + for comment in comment_list: + sub_comments = comment.get("comments") + if sub_comments and isinstance(sub_comments, list): + await callback(note_id, sub_comments) + 
res_sub_comments.extend(sub_comments) + return res_sub_comments + + async def get_note_info_by_id(self, note_id: str) -> Dict: + """ + 根据帖子ID获取详情 + :param note_id: + :return: + """ + url = f"{self._host}/detail/{note_id}" + async with httpx.AsyncClient(proxy=self.proxy) as client: + response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) + if response.status_code != 200: + raise DataFetchError(f"get weibo detail err: {response.text}") + match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL) + if match: + render_data_json = match.group(1) + render_data_dict = json.loads(render_data_json) + note_detail = render_data_dict[0].get("status") + note_item = {"mblog": note_detail} + return note_item + else: + utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值") + return dict() + + async def get_note_image(self, image_url: str) -> bytes: + image_url = image_url[8:] # 去掉 https:// + sub_url = image_url.split("/") + image_url = "" + for i in range(len(sub_url)): + if i == 1: + image_url += "large/" # 都获取高清大图 + elif i == len(sub_url) - 1: + image_url += sub_url[i] + else: + image_url += sub_url[i] + "/" + # 微博图床对外存在防盗链,所以需要代理访问 + # 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下 + final_uri = (f"{self._image_agent_host}" + f"{image_url}") + async with httpx.AsyncClient(proxy=self.proxy) as client: + try: + response = await client.request("GET", final_uri, timeout=self.timeout) + response.raise_for_status() + if not response.reason_phrase == "OK": + utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx + utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试 + return None + + async def 
get_creator_container_info(self, creator_id: str) -> Dict: + """ + 获取用户的容器ID, 容器信息代表着真实请求的API路径 + fid_container_id:用户的微博详情API的容器ID + lfid_container_id:用户的微博列表API的容器ID + Args: + creator_id: + + Returns: { + + """ + response = await self.get(f"/u/{creator_id}", return_response=True) + m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS") + if not m_weibocn_params: + raise DataFetchError("get containerid failed") + m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params)) + return {"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0], "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]} + + async def get_creator_info_by_id(self, creator_id: str) -> Dict: + """ + 根据用户ID获取用户详情 + Args: + creator_id: + + Returns: + + """ + uri = "/api/container/getIndex" + container_info = await self.get_creator_container_info(creator_id) + if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "": + utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed") + raise DataFetchError("get containerid failed") + params = { + "jumpfrom": "weibocom", + "type": "uid", + "value": creator_id, + "containerid": container_info["fid_container_id"], + } + + user_res = await self.get(uri, params) + + if user_res.get("tabsInfo"): + tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", []) + for tab in tabs: + if tab.get("tabKey") == "weibo": + container_info["lfid_container_id"] = tab.get("containerid") + break + + user_res.update(container_info) + return user_res + + async def get_notes_by_creator( + self, + creator: str, + container_id: str, + since_id: str = "0", + ) -> Dict: + """ + 获取博主的笔记 + Args: + creator: 博主ID + container_id: 容器ID + since_id: 上一页最后一条笔记的ID + Returns: + + """ + + uri = "/api/container/getIndex" + params = { + "jumpfrom": "weibocom", + "type": "uid", + "value": creator, + "containerid": container_id, + "since_id": since_id, + } + return await self.get(uri, params) + + async def 
async def get_all_notes_by_creator_id(
    self,
    creator_id: str,
    container_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Fetch every post published by a creator, page after page.

    Args:
        creator_id: Weibo user ID.
        container_id: Container ID of the creator's post-list API.
        crawl_interval: Seconds to sleep after each page.
        callback: Optional coroutine function awaited with the posts of each
            page (cards whose ``card_type`` is 9).

    Returns:
        All post cards collected across the pages.
    """
    # The loop accounts for ten cards per page when comparing against the
    # total reported by the API.
    page_size = 10

    result: List[Dict] = []
    notes_has_more = True
    since_id = ""
    crawler_total_count = 0
    while notes_has_more:
        notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
        if not notes_res:
            utils.logger.error("[WeiboClient.get_all_notes_by_creator_id] The current creator may have been banned by weibo, so they cannot access the data.")
            break

        card_list_info = notes_res.get("cardlistInfo", {})
        since_id = card_list_info.get("since_id", "0")
        if "cards" not in notes_res:
            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] No 'cards' key found in response: {notes_res}")
            break

        cards = notes_res["cards"]
        utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] got user_id:{creator_id} notes len : {len(cards)}")
        # Only card_type 9 entries are actual posts.
        notes = [card for card in cards if card.get("card_type") == 9]
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)

        crawler_total_count += page_size
        notes_has_more = card_list_info.get("total", 0) > crawler_total_count
    return result
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/23 15:41 +# @Desc : 微博爬虫主流程代码 + +import asyncio +import os +import random +from asyncio import Task +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import ( + BrowserContext, + BrowserType, + Page, + Playwright, + async_playwright, +) + +import config +from base.base_crawler import AbstractCrawler +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import weibo as weibo_store +from tools import utils +from tools.cdp_browser import CDPBrowserManager +from var import crawler_type_var, source_keyword_var + +from .client import WeiboClient +from .exception import DataFetchError +from .field import SearchType +from .help import filter_search_result_card +from .login import WeiboLogin + + +class WeiboCrawler(AbstractCrawler): + context_page: Page + wb_client: WeiboClient + browser_context: BrowserContext + cdp_manager: Optional[CDPBrowserManager] + + def __init__(self): + self.index_url = "https://www.weibo.com" + self.mobile_index_url = "https://m.weibo.cn" + self.user_agent = utils.get_user_agent() + self.mobile_user_agent = utils.get_mobile_user_agent() + self.cdp_manager = None + + async def start(self): + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info) + + async with async_playwright() as playwright: + # 根据配置选择启动模式 + if config.ENABLE_CDP_MODE: + utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器") + self.browser_context = await self.launch_browser_with_cdp( + playwright, + playwright_proxy_format, + self.mobile_user_agent, + headless=config.CDP_HEADLESS, + ) + else: + 
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器") + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.mobile_index_url) + + # Create a client to interact with the xiaohongshu website. + self.wb_client = await self.create_weibo_client(httpx_proxy_format) + if not await self.wb_client.pong(): + login_obj = WeiboLogin( + login_type=config.LOGIN_TYPE, + login_phone="", # your phone number + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + + # 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie + utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform") + await self.context_page.goto(self.mobile_index_url) + await asyncio.sleep(2) + await self.wb_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for video and retrieve their comment information. 
async def search(self):
    """
    Search Weibo posts for every configured keyword and store the results.

    For each keyword in ``config.KEYWORDS`` the result pages are walked from
    ``config.START_PAGE`` until ``config.CRAWLER_MAX_NOTES_COUNT`` posts have
    been requested. Each post is stored, its images are fetched, and the
    comments of the page's posts are crawled afterwards.

    Returns:
        None. Returns early when ``config.WEIBO_SEARCH_TYPE`` is not one of
        the supported values.
    """
    utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
    weibo_limit_count = 10  # weibo limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
    start_page = config.START_PAGE

    # Map the configured search type name to the API enum value.
    search_types = {
        "default": SearchType.DEFAULT,
        "real_time": SearchType.REAL_TIME,
        "popular": SearchType.POPULAR,
        "video": SearchType.VIDEO,
    }
    search_type = search_types.get(config.WEIBO_SEARCH_TYPE)
    if search_type is None:
        utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
        return

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
        page = 1
        while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < start_page:
                utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
                page += 1
                continue

            utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
            search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
            # A response without "cards" yields None from .get(); treat it as
            # an empty page instead of crashing inside the card filter.
            cards = search_res.get("cards") or []
            note_id_list: List[str] = []
            for note_item in filter_search_result_card(cards):
                if not note_item:
                    continue
                mblog: Dict = note_item.get("mblog")
                if not mblog:
                    continue
                note_id_list.append(mblog.get("id"))
                await weibo_store.update_weibo_note(note_item)
                await self.get_note_images(mblog)

            page += 1
            await self.batch_get_notes_comments(note_id_list)
async def get_note_images(self, mblog: Dict):
    """
    Download and store every picture attached to one post.

    Does nothing unless ``config.ENABLE_GET_MEIDAS`` is enabled. A random
    sub-second pause follows each download.

    Args:
        mblog: The ``mblog`` payload of a post; its ``pics`` entry is iterated
            as a list of picture dicts carrying ``url`` and ``pid``.
    """
    if not config.ENABLE_GET_MEIDAS:
        utils.logger.info("[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
        return

    pics: List[Dict] = mblog.get("pics")
    if not pics:
        return

    for pic in pics:
        url = pic.get("url")
        if not url:
            continue
        content = await self.wb_client.get_note_image(url)
        await asyncio.sleep(random.random())
        if content is None:
            continue
        pic_id = pic.get("pid")
        if not pic_id:
            # Without an id the image cannot be stored; skip it rather than
            # letting a KeyError abort the whole crawl.
            utils.logger.error(f"[WeiboCrawler.get_note_images] picture without pid skipped, url: {url}")
            continue
        extension_file_name = url.split(".")[-1]
        await weibo_store.update_weibo_note_image(pic_id, content, extension_file_name)
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
    """
    Create the Weibo API client from the current browser session.

    The cookies of ``self.browser_context`` are copied into the client
    headers, so the HTTP client shares the session of the browser page.

    Args:
        httpx_proxy: Proxy URL handed to the HTTP client, or ``None``.

    Returns:
        A ``WeiboClient`` bound to ``self.context_page``.
    """
    utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
    cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
    weibo_client_obj = WeiboClient(
        proxy=httpx_proxy,
        headers={
            # Reuse the User-Agent the browser context was launched with so
            # the cookies and the UA presented to the server stay paired.
            "User-Agent": self.mobile_user_agent,
            "Cookie": cookie_str,
            "Origin": "https://m.weibo.cn",
            "Referer": "https://m.weibo.cn",
            "Content-Type": "application/json;charset=UTF-8",
        },
        playwright_page=self.context_page,
        cookie_dict=cookie_dict,
    )
    return weibo_client_obj
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to the standard launch.

    Args:
        playwright: Active Playwright instance.
        playwright_proxy: Proxy settings for the browser, or ``None``.
        user_agent: User-Agent for the browser context, or ``None``.
        headless: Whether to run the browser headless.

    Returns:
        The browser context, from the CDP manager on success or from
        ``launch_browser`` when CDP start-up fails.
    """
    try:
        self.cdp_manager = CDPBrowserManager()
        browser_context = await self.cdp_manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information for diagnostics.
        browser_info = await self.cdp_manager.get_browser_info()
        utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")

        return browser_context

    except Exception as e:
        utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
        # Release whatever the failed CDP attempt left behind, then forget
        # the manager: close() closes browser_context only when cdp_manager
        # is None, so leaving it set would leak the fallback context.
        if self.cdp_manager:
            try:
                await self.cdp_manager.cleanup()
            except Exception as cleanup_error:
                utils.logger.error(f"[WeiboCrawler] CDP cleanup failed: {cleanup_error}")
            self.cdp_manager = None
        # Fall back to the standard launch mode.
        chromium = playwright.chromium
        return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
class SearchType(Enum):
    """Search tabs of the Weibo search API, keyed by the value the API expects."""

    DEFAULT = "1"  # comprehensive
    REAL_TIME = "61"  # real-time
    POPULAR = "60"  # popular
    VIDEO = "64"  # video
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
    """
    Keep only the post cards (``card_type == 9``) of a Weibo search result.

    Both the top-level cards and the cards nested in a ``card_group`` are
    inspected; matches are returned in encounter order.

    Args:
        card_list: Cards of a search response. ``None`` is accepted and
            treated as an empty list, since a response without a ``cards``
            key yields ``None`` from ``dict.get``.

    Returns:
        The list of cards whose ``card_type`` is 9.
    """
    note_list: List[Dict] = []
    for card_item in card_list or []:
        if card_item.get("card_type") == 9:
            note_list.append(card_item)
        # "card_group" may be missing or explicitly null; both mean no children.
        note_list.extend(
            group_item
            for group_item in card_item.get("card_group") or []
            if group_item.get("card_type") == 9
        )
    return note_list
async def begin(self):
    """
    Run the login flow selected by ``config.LOGIN_TYPE``.

    Raises:
        ValueError: when the login type is not ``qrcode``, ``phone`` or
            ``cookie``.
    """
    utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
    login_flows = {
        "qrcode": self.login_by_qrcode,
        "phone": self.login_by_mobile,
        "cookie": self.login_by_cookies,
    }
    login_flow = login_flows.get(config.LOGIN_TYPE)
    if login_flow is None:
        raise ValueError(
            "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
    await login_flow()
async def login_by_qrcode(self):
    """
    Log in to Weibo by QR code and keep the browser session logged in.

    Opens the SSO sign-in page, shows the QR code in a background executor,
    then polls ``check_login_state`` until the session cookie changes. The
    process exits when no QR code is found or when polling gives up.
    """
    utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
    await self.context_page.goto(self.weibo_sso_login_url)

    # Locate the login QR code on the page.
    qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
    base64_qrcode_img = await utils.find_login_qrcode(
        self.context_page,
        selector=qrcode_img_selector
    )
    if not base64_qrcode_img:
        utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
        sys.exit()

    # Show the QR code without blocking the event loop.
    partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
    asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

    # check_login_state retries up to 600 times at 1-second intervals, so the
    # user has about 600 seconds; the previous message announced 20s.
    utils.logger.info("[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 600s")

    # Remember the session value of the not-yet-logged-in state so that a
    # change can be detected.
    current_cookie = await self.browser_context.cookies()
    _, cookie_dict = utils.convert_cookies(current_cookie)
    no_logged_in_session = cookie_dict.get("WBPSESS")

    try:
        await self.check_login_state(no_logged_in_session)
    except RetryError:
        utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
        sys.exit()

    wait_redirect_seconds = 5
    utils.logger.info(
        f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
    await asyncio.sleep(wait_redirect_seconds)
'path': "/" + }]) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/__init__.py new file mode 100644 index 0000000..f4fb0aa --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/__init__.py @@ -0,0 +1,13 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +from .core import XiaoHongShuCrawler +from .field import * diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py new file mode 100644 index 0000000..982373a --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py @@ -0,0 +1,592 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
async def _pre_headers(self, url: str, data=None) -> Dict:
    """
    Sign a request and merge the signature headers into ``self.headers``.

    Args:
        url: Request route, including the query string for GET requests.
        data: Request body for POST requests, otherwise ``None``.

    Returns:
        ``self.headers`` after the signature headers have been merged in (the
        same dict object, mutated in place).
    """
    page = self.playwright_page
    # The page's own signing function produces the raw X-s / X-t values.
    encrypted = await page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
    storage = await page.evaluate("() => window.localStorage")
    signed = sign(
        a1=self.cookie_dict.get("a1", ""),
        b1=storage.get("b1", ""),
        x_s=encrypted.get("X-s", ""),
        x_t=str(encrypted.get("X-t", "")),
    )

    self.headers.update({
        "X-S": signed["x-s"],
        "X-T": signed["x-t"],
        "x-S-Common": signed["x-s-common"],
        "X-B3-Traceid": signed["x-b3-traceid"],
    })
    return self.headers
wait=wait_fixed(1)) + async def request(self, method, url, **kwargs) -> Union[str, Any]: + """ + 封装httpx的公共请求方法,对请求响应做一些处理 + Args: + method: 请求方法 + url: 请求的URL + **kwargs: 其他请求参数,例如请求头、请求体等 + + Returns: + + """ + # return response.text + return_response = kwargs.pop("return_response", False) + async with httpx.AsyncClient(proxy=self.proxy) as client: + response = await client.request(method, url, timeout=self.timeout, **kwargs) + + if response.status_code == 471 or response.status_code == 461: + # someday someone maybe will bypass captcha + verify_type = response.headers["Verifytype"] + verify_uuid = response.headers["Verifyuuid"] + msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}" + utils.logger.error(msg) + raise Exception(msg) + + if return_response: + return response.text + data: Dict = response.json() + if data["success"]: + return data.get("data", data.get("success", {})) + elif data["code"] == self.IP_ERROR_CODE: + raise IPBlockError(self.IP_ERROR_STR) + else: + raise DataFetchError(data.get("msg", None)) + + async def get(self, uri: str, params=None) -> Dict: + """ + GET请求,对请求头签名 + Args: + uri: 请求路由 + params: 请求参数 + + Returns: + + """ + final_uri = uri + if isinstance(params, dict): + final_uri = f"{uri}?" 
async def get_note_media(self, url: str) -> Union[bytes, None]:
    """
    Download a media file (image or video) of a note.

    Args:
        url: Direct URL of the media file.

    Returns:
        The raw bytes, or ``None`` when the request fails or the response
        reason phrase is not ``OK``.
    """
    async with httpx.AsyncClient(proxy=self.proxy) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            # Covers connection errors as well as non-2xx status codes; keep
            # the original exception class name to ease debugging.
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] {exc.__class__.__name__} for {exc.request.url} - {exc}")
            return None

        if response.reason_phrase != "OK":
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
            return None
        return response.content
async def get_note_by_keyword(
    self,
    keyword: str,
    search_id: Optional[str] = None,
    page: int = 1,
    page_size: int = 20,
    sort: SearchSortType = SearchSortType.GENERAL,
    note_type: SearchNoteType = SearchNoteType.ALL,
) -> Dict:
    """
    Search notes by keyword.

    Args:
        keyword: Search keyword.
        search_id: Identifier of the search session. When omitted a fresh one
            is generated for this call; pass the same value explicitly to page
            through one search.
        page: Page number.
        page_size: Number of notes per page.
        sort: Ordering of the search results.
        note_type: Kind of notes to search for.

    Returns:
        The response data of the search API.
    """
    # A call in the signature default would be evaluated once, at definition
    # time, so every call without an explicit id would share one stale value.
    if search_id is None:
        search_id = get_search_id()

    uri = "/api/sns/web/v1/search/notes"
    data = {
        "keyword": keyword,
        "page": page,
        "page_size": page_size,
        "search_id": search_id,
        "sort": sort.value,
        "note_type": note_type.value,
    }
    return await self.post(uri, data)
async def get_comments_all_sub_comments(
    self,
    comments: List[Dict],
    xsec_token: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Fetch every second-level comment below the given first-level comments.

    Sub-comments already embedded in a first-level comment are only handed to
    ``callback``; the returned list holds the sub-comments fetched page by
    page from the sub-comment API.

    Args:
        comments: First-level comments.
        xsec_token: Verification token.
        crawl_interval: Seconds to sleep after each fetched page.
        callback: Optional coroutine function awaited as
            ``callback(note_id, comments)`` for each batch.

    Returns:
        The paged sub-comments, or an empty list when
        ``config.ENABLE_GET_SUB_COMMENTS`` is disabled.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info("[XiaoHongShuClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    result: List[Dict] = []
    for comment in comments:
        note_id = comment.get("note_id")
        sub_comments = comment.get("sub_comments")
        if sub_comments and callback:
            await callback(note_id, sub_comments)

        sub_comment_has_more = comment.get("sub_comment_has_more")
        if not sub_comment_has_more:
            continue

        root_comment_id = comment.get("id")
        sub_comment_cursor = comment.get("sub_comment_cursor")

        while sub_comment_has_more:
            comments_res = await self.get_note_sub_comments(
                note_id=note_id,
                root_comment_id=root_comment_id,
                xsec_token=xsec_token,
                num=10,
                cursor=sub_comment_cursor,
            )

            if comments_res is None:
                utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
                # Stop paging this comment: retrying with the same cursor and
                # no delay would loop forever while the API keeps failing.
                break
            sub_comment_has_more = comments_res.get("has_more", False)
            sub_comment_cursor = comments_res.get("cursor", "")
            if "comments" not in comments_res:
                utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
                break
            # Use a distinct name so the `comments` parameter is not rebound.
            page_comments = comments_res["comments"]
            if callback:
                await callback(note_id, page_comments)
            await asyncio.sleep(crawl_interval)
            result.extend(page_comments)
    return result
re.search(r"", html)[0].replace("undefined", '""') + + if state != "{}": + note_dict = transform_json_keys(state) + return note_dict["note"]["note_detail_map"][note_id]["note"] + return {} + + try: + return get_note_dict(html) + except: + return None diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py new file mode 100644 index 0000000..9c88f1c --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py @@ -0,0 +1,485 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +import asyncio +import os +import random +import time +from asyncio import Task +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import ( + BrowserContext, + BrowserType, + Page, + Playwright, + async_playwright, +) +from tenacity import RetryError + +import config +from base.base_crawler import AbstractCrawler +from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES +from model.m_xiaohongshu import NoteUrlInfo +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import xhs as xhs_store +from tools import utils +from tools.cdp_browser import CDPBrowserManager +from var import crawler_type_var, source_keyword_var + +from .client import XiaoHongShuClient +from .exception import DataFetchError +from .field import SearchSortType +from .help import parse_note_info_from_note_url, get_search_id +from .login import XiaoHongShuLogin + + +class XiaoHongShuCrawler(AbstractCrawler): + context_page: Page + xhs_client: XiaoHongShuClient + browser_context: BrowserContext + cdp_manager: Optional[CDPBrowserManager] + + def __init__(self) -> None: + self.index_url = "https://www.xiaohongshu.com" + # self.user_agent = utils.get_user_agent() + 
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" + self.cdp_manager = None + + async def start(self) -> None: + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info) + + async with async_playwright() as playwright: + # 根据配置选择启动模式 + if config.ENABLE_CDP_MODE: + utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器") + self.browser_context = await self.launch_browser_with_cdp( + playwright, + playwright_proxy_format, + self.user_agent, + headless=config.CDP_HEADLESS, + ) + else: + utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器") + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, + playwright_proxy_format, + self.user_agent, + headless=config.HEADLESS, + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url) + + # Create a client to interact with the xiaohongshu website. 
async def search(self) -> None:
    """
    Crawl notes for every configured keyword, then crawl their comments.

    ``config.KEYWORDS`` is split on commas. For each keyword, result pages are
    requested starting at ``config.START_PAGE`` until the page budget derived
    from ``config.CRAWLER_MAX_NOTES_COUNT`` is used up, the response is empty
    or reports no more results, or a ``DataFetchError`` is raised.

    Note: ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the page size (20)
    when it is configured lower than that.
    """
    utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
    page_size = 20  # fixed number of results per page
    if config.CRAWLER_MAX_NOTES_COUNT < page_size:
        config.CRAWLER_MAX_NOTES_COUNT = page_size
    first_page = config.START_PAGE

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
        search_id = get_search_id()
        page = 1
        while (page - first_page + 1) * page_size <= config.CRAWLER_MAX_NOTES_COUNT:
            # Pages before the configured start page are skipped, not fetched.
            if page < first_page:
                utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
                page += 1
                continue

            try:
                utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
                if config.SORT_TYPE != "":
                    sort_type = SearchSortType(config.SORT_TYPE)
                else:
                    sort_type = SearchSortType.GENERAL
                search_result = await self.xhs_client.get_note_by_keyword(
                    keyword=keyword,
                    search_id=search_id,
                    page=page,
                    sort=sort_type,
                )
                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{search_result}")
                if not (search_result and search_result.get("has_more", False)):
                    utils.logger.info("No more content!")
                    break

                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                detail_tasks = []
                for item in search_result.get("items", {}):
                    # Query-suggestion entries are not notes.
                    if item.get("model_type") in ("rec_query", "hot_query"):
                        continue
                    detail_tasks.append(
                        self.get_note_detail_async_task(
                            note_id=item.get("id"),
                            xsec_source=item.get("xsec_source"),
                            xsec_token=item.get("xsec_token"),
                            semaphore=semaphore,
                        )
                    )
                note_details = await asyncio.gather(*detail_tasks)

                note_ids: List[str] = []
                xsec_tokens: List[str] = []
                for detail in note_details:
                    if not detail:
                        continue
                    await xhs_store.update_xhs_note(detail)
                    await self.get_notice_media(detail)
                    note_ids.append(detail.get("note_id"))
                    xsec_tokens.append(detail.get("xsec_token"))

                page += 1
                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
                await self.batch_get_note_comments(note_ids, xsec_tokens)
            except DataFetchError:
                utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
                break
async def fetch_creator_notes_detail(self, note_list: List[Dict]):
    """
    Fetch the details of the given notes concurrently and store them.

    One detail task is created per entry of ``note_list``; all tasks share a
    semaphore sized by ``config.MAX_CONCURRENCY_NUM``. Every non-empty result
    is saved through ``xhs_store.update_xhs_note`` and then passed to
    ``get_notice_media``.
    """
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

    detail_tasks = []
    for note in note_list:
        detail_tasks.append(
            self.get_note_detail_async_task(
                note_id=note.get("note_id"),
                xsec_source=note.get("xsec_source"),
                xsec_token=note.get("xsec_token"),
                semaphore=semaphore,
            )
        )

    for detail in await asyncio.gather(*detail_tasks):
        if not detail:
            continue
        await xhs_store.update_xhs_note(detail)
        await self.get_notice_media(detail)
async def get_note_detail_async_task(
    self,
    note_id: str,
    xsec_source: str,
    xsec_token: str,
    semaphore: asyncio.Semaphore,
) -> Optional[Dict]:
    """
    Fetch the detail of one note while holding ``semaphore``.

    The API endpoint (``get_note_by_id``) is tried first; when it raises
    ``RetryError`` or returns nothing, the HTML page is parsed instead
    (``get_note_by_id_from_html``).

    Args:
        note_id: Id of the note to fetch.
        xsec_source: Source channel, stored on the returned detail.
        xsec_token: Token, stored on the returned detail.
        semaphore: Limits how many detail fetches run at the same time.

    Returns:
        The note detail dict with ``xsec_token`` and ``xsec_source`` added,
        or ``None`` when the note could not be fetched.
    """
    note_detail = None
    async with semaphore:
        try:
            utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")

            try:
                note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
            except RetryError as ex:
                # Record why the API path failed before falling back to HTML.
                utils.logger.error(
                    f"[XiaoHongShuCrawler.get_note_detail_async_task] API retries exhausted for note_id:{note_id}, "
                    f"falling back to HTML, err: {ex}"
                )

            if not note_detail:
                note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
                if not note_detail:
                    # Report failure as None, like the handlers below: callers
                    # already skip falsy results, and an exception here would
                    # escape asyncio.gather and abort the whole batch.
                    utils.logger.error(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
                    return None

            note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
            return note_detail

        except DataFetchError as ex:
            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
            return None
        except KeyError as ex:
            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}")
            return None
async def get_comments(self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore):
    """
    Crawl the comments of one note while holding ``semaphore``.

    Comments are stored through ``xhs_store.batch_update_xhs_note_comments``
    and capped at ``CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES``.
    """
    async with semaphore:
        utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
        # Without an IP proxy, wait longer between requests.
        delay = random.random() if config.ENABLE_IP_PROXY else random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
        await self.xhs_client.get_note_all_comments(
            note_id=note_id,
            xsec_token=xsec_token,
            crawl_interval=delay,
            callback=xhs_store.batch_update_xhs_note_comments,
            max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        )
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Start Chromium and return a browser context with a 1920x1080 viewport.

    When ``config.SAVE_LOGIN_STATE`` is set, a persistent context backed by a
    directory under ``<cwd>/browser_data`` is used; otherwise a fresh browser
    and context are created.
    """
    utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)

    # feat issue #14
    # Keep the login state on disk so a login is not needed on every run.
    profile_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport={
            "width": 1920,
            "height": 1080
        },
        user_agent=user_agent,
    )
async def get_note_images(self, note_item: Dict):
    """
    Download and store the images of a note. Prefer calling ``get_notice_media``.

    Each image dict that carries a non-empty ``url_default`` gets that value
    copied into its ``url`` key (the dicts inside ``note_item`` are modified
    in place). Every image with a usable ``url`` is then downloaded and saved
    as ``<index>.jpg``, where the index counts successful downloads only.

    :param note_item: note detail dict; ``note_id`` and ``image_list`` are read
    :return: None
    """
    if not config.ENABLE_GET_MEIDAS:
        return
    note_id = note_item.get("note_id")
    # ``or []`` also covers an explicit ``"image_list": None``.
    image_list: List[Dict] = note_item.get("image_list") or []
    if not image_list:
        return

    for img in image_list:
        url_default = img.get("url_default")
        # Only overwrite ``url`` with a real value; a missing key must not
        # replace an existing url with None.
        if url_default:
            img.update({"url": url_default})

    saved_count = 0
    for pic in image_list:
        url = pic.get("url")
        if not url:
            continue
        content = await self.xhs_client.get_note_media(url)
        await asyncio.sleep(random.random())
        if content is None:
            continue
        extension_file_name = f"{saved_count}.jpg"
        saved_count += 1
        await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
class DataFetchError(RequestError):
    """Signals that something went wrong while fetching data."""
class FeedType(Enum):
    """Home-feed channel identifiers.

    ``FASION`` and ``HOURSE`` are historical misspellings that remain the
    canonical member names for backward compatibility; ``FASHION`` and
    ``HOUSE`` are aliases that resolve to the same members.
    """
    # Recommended
    RECOMMEND = "homefeed_recommend"
    # Fashion / outfits
    FASION = "homefeed.fashion_v3"
    # Food
    FOOD = "homefeed.food_v3"
    # Cosmetics
    COSMETICS = "homefeed.cosmetics_v3"
    # Movies and TV
    MOVIE = "homefeed.movie_and_tv_v3"
    # Career
    CAREER = "homefeed.career_v3"
    # Relationships
    EMOTION = "homefeed.love_v3"
    # Household products
    HOURSE = "homefeed.household_product_v3"
    # Gaming
    GAME = "homefeed.gaming_v3"
    # Travel
    TRAVEL = "homefeed.travel_v3"
    # Fitness
    FITNESS = "homefeed.fitness_v3"

    # Correctly spelled aliases. They are declared after the originals, so
    # iteration order, lookup by value and ``.name`` are unchanged.
    FASHION = "homefeed.fashion_v3"
    HOUSE = "homefeed.household_product_v3"
def get_b3_trace_id():
    """Return a random 16-character string drawn from ``abcdef0123456789``."""
    hex_chars = "abcdef0123456789"
    last_index = len(hex_chars) - 1
    return "".join(hex_chars[random.randint(0, last_index)] for _ in range(16))
3244367275, 3060149565, 1994146192, 31158534, 2563907772, + 4023717930, 1907459465, 112637215, 2680153253, 3904427059, 2013776290, + 251722036, 2517215374, 3775830040, 2137656763, 141376813, 2439277719, + 3865271297, 1802195444, 476864866, 2238001368, 4066508878, 1812370925, + 453092731, 2181625025, 4111451223, 1706088902, 314042704, 2344532202, + 4240017532, 1658658271, 366619977, 2362670323, 4224994405, 1303535960, + 984961486, 2747007092, 3569037538, 1256170817, 1037604311, 2765210733, + 3554079995, 1131014506, 879679996, 2909243462, 3663771856, 1141124467, + 855842277, 2852801631, 3708648649, 1342533948, 654459306, 3188396048, + 3373015174, 1466479909, 544179635, 3110523913, 3462522015, 1591671054, + 702138776, 2966460450, 3352799412, 1504918807, 783551873, 3082640443, + 3233442989, 3988292384, 2596254646, 62317068, 1957810842, 3939845945, + 2647816111, 81470997, 1943803523, 3814918930, 2489596804, 225274430, + 2053790376, 3826175755, 2466906013, 167816743, 2097651377, 4027552580, + 2265490386, 503444072, 1762050814, 4150417245, 2154129355, 426522225, + 1852507879, 4275313526, 2312317920, 282753626, 1742555852, 4189708143, + 2394877945, 397917763, 1622183637, 3604390888, 2714866558, 953729732, + 1340076626, 3518719985, 2797360999, 1068828381, 1219638859, 3624741850, + 2936675148, 906185462, 1090812512, 3747672003, 2825379669, 829329135, + 1181335161, 3412177804, 3160834842, 628085408, 1382605366, 3423369109, + 3138078467, 570562233, 1426400815, 3317316542, 2998733608, 733239954, + 1555261956, 3268935591, 3050360625, 752459403, 1541320221, 2607071920, + 3965973030, 1969922972, 40735498, 2617837225, 3943577151, 1913087877, + 83908371, 2512341634, 3803740692, 2075208622, 213261112, 2463272603, + 3855990285, 2094854071, 198958881, 2262029012, 4057260610, 1759359992, + 534414190, 2176718541, 4139329115, 1873836001, 414664567, 2282248934, + 4279200368, 1711684554, 285281116, 2405801727, 4167216745, 1634467795, + 376229701, 2685067896, 3608007406, 1308918612, 
def b64Encode(e):
    """
    Encode a sequence of byte values to text using the module's ``lookup`` alphabet.

    Complete 3-byte groups are encoded through ``encodeChunk`` in slices of at
    most 16383 bytes; a trailing 1- or 2-byte remainder is encoded here and
    padded with ``=`` to a 4-character group.
    """
    total = len(e)
    remainder = total % 3
    aligned_end = total - remainder
    chunk_size = 16383

    pieces = [
        encodeChunk(e, start, min(start + chunk_size, aligned_end))
        for start in range(0, aligned_end, chunk_size)
    ]

    if remainder == 1:
        tail = e[total - 1]
        pieces.append(lookup[tail >> 2] + lookup[(tail << 4) & 63] + "==")
    elif remainder == 2:
        tail = (e[total - 2] << 8) + e[total - 1]
        pieces.append(
            lookup[tail >> 10]
            + lookup[63 & (tail >> 4)]
            + lookup[(tail << 2) & 63]
            + "="
        )
    return "".join(pieces)
def base36encode(number, alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """Return *number* written in the base given by ``len(alphabet)`` (36 by default).

    Negative numbers get a leading ``-``. Raises ``TypeError`` when *number*
    is not an ``int``.
    """
    if not isinstance(number, int):
        raise TypeError('number must be an integer')

    prefix = '-' if number < 0 else ''
    magnitude = -number if number < 0 else number
    base = len(alphabet)

    # Single-digit values (including zero) map straight onto the alphabet.
    if 0 <= magnitude < base:
        return prefix + alphabet[magnitude]

    digits = []
    while magnitude != 0:
        magnitude, remainder = divmod(magnitude, base)
        digits.append(alphabet[remainder])
    digits.reverse()
    return prefix + ''.join(digits)
def __init__(self,
             login_type: str,
             browser_context: BrowserContext,
             context_page: Page,
             login_phone: Optional[str] = "",
             cookie_str: str = ""
             ):
    """
    Store the objects and credentials used by the login flows.

    Args:
        login_type: Login method; written to the module-level
            ``config.LOGIN_TYPE``, which ``begin`` later reads.
        browser_context: Browser context whose cookies are inspected.
        context_page: Page on which the login is performed.
        login_phone: Phone number used by the mobile login flow.
        cookie_str: Cookie string kept for later use by this object.
    """
    # NOTE: this overwrites global configuration, not just instance state.
    config.LOGIN_TYPE = login_type
    self.browser_context = browser_context
    self.context_page = context_page
    self.login_phone = login_phone
    self.cookie_str = cookie_str
reached, raise RetryError + """ + + if "请通过验证" in await self.context_page.content(): + utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证") + + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + current_web_session = cookie_dict.get("web_session") + if current_web_session != no_logged_in_session: + return True + return False + + async def begin(self): + """Start login xiaohongshu""" + utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...") + if config.LOGIN_TYPE == "qrcode": + await self.login_by_qrcode() + elif config.LOGIN_TYPE == "phone": + await self.login_by_mobile() + elif config.LOGIN_TYPE == "cookie": + await self.login_by_cookies() + else: + raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...") + + async def login_by_mobile(self): + """Login xiaohongshu by mobile""" + utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...") + await asyncio.sleep(1) + try: + # 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮 + login_button_ele = await self.context_page.wait_for_selector( + selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button", + timeout=5000 + ) + await login_button_ele.click() + # 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的 + # 另一种是需要点击切换到手机登录的 + element = await self.context_page.wait_for_selector( + selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]', + timeout=5000 + ) + await element.click() + except Exception as e: + utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...") + + await asyncio.sleep(1) + login_container_ele = await self.context_page.wait_for_selector("div.login-container") + input_ele = await login_container_ele.query_selector("label.phone > input") + await input_ele.fill(self.login_phone) + await asyncio.sleep(0.5) + + send_btn_ele = await 
login_container_ele.query_selector("label.auth-code > span") + await send_btn_ele.click() # 点击发送验证码 + sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input") + submit_btn_ele = await login_container_ele.query_selector("div.input-container > button") + cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY) + max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 + no_logged_in_session = "" + while max_get_sms_code_time > 0: + utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...") + await asyncio.sleep(1) + sms_code_key = f"xhs_{self.login_phone}" + sms_code_value = cache_client.get(sms_code_key) + if not sms_code_value: + max_get_sms_code_time -= 1 + continue + + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + no_logged_in_session = cookie_dict.get("web_session") + + await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码 + await asyncio.sleep(0.5) + agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']") + await agree_privacy_ele.click() # 点击同意隐私协议 + await asyncio.sleep(0.5) + + await submit_btn_ele.click() # 点击登录 + + # todo ... 
应该还需要检查验证码的正确性有可能输入的验证码不正确 + break + + try: + await self.check_login_state(no_logged_in_session) + except RetryError: + utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...") + sys.exit() + + wait_redirect_seconds = 5 + utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + await asyncio.sleep(wait_redirect_seconds) + + async def login_by_qrcode(self): + """login xiaohongshu website and keep webdriver login state""" + utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...") + # login_selector = "div.login-container > div.left > div.qrcode > img" + qrcode_img_selector = "xpath=//img[@class='qrcode-img']" + # find login qrcode + base64_qrcode_img = await utils.find_login_qrcode( + self.context_page, + selector=qrcode_img_selector + ) + if not base64_qrcode_img: + utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....") + # if this website does not automatically popup login dialog box, we will manual click login button + await asyncio.sleep(0.5) + login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") + await login_button_ele.click() + base64_qrcode_img = await utils.find_login_qrcode( + self.context_page, + selector=qrcode_img_selector + ) + if not base64_qrcode_img: + sys.exit() + + # get not logged session + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + no_logged_in_session = cookie_dict.get("web_session") + + # show login qrcode + # fix issue #12 + # we need to use partial function to call show_qrcode function and run in executor + # then current asyncio event loop will not be blocked + partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) + asyncio.get_running_loop().run_in_executor(executor=None, 
func=partial_show_qrcode) + + utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s") + try: + await self.check_login_state(no_logged_in_session) + except RetryError: + utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...") + sys.exit() + + wait_redirect_seconds = 5 + utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + await asyncio.sleep(wait_redirect_seconds) + + async def login_by_cookies(self): + """login xiaohongshu website by cookies""" + utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...") + for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): + if key != "web_session": # only set web_session cookie attr + continue + await self.browser_context.add_cookies([{ + 'name': key, + 'value': value, + 'domain': ".xiaohongshu.com", + 'path': "/" + }]) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/zhihu/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/zhihu/__init__.py new file mode 100644 index 0000000..641f486 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/zhihu/__init__.py @@ -0,0 +1,13 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class ZhiHuClient(AbstractApiClient):
    """Signed HTTP client for the Zhihu web API, built on httpx."""

    def __init__(
        self,
        timeout=10,
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
    ):
        """Store request settings.

        Args:
            timeout: Per-request timeout passed to httpx.
            proxy: Proxy passed to ``httpx.AsyncClient``.
            headers: Default request headers; must contain "cookie" for signing.
            playwright_page: Accepted for interface compatibility; not stored.
            cookie_dict: Cookies as a dict; "d_c0" is required for signing.
        """
        self.proxy = proxy
        self.timeout = timeout
        self.default_headers = headers
        self.cookie_dict = cookie_dict
        self._extractor = ZhihuExtractor()

    async def _pre_headers(self, url: str) -> Dict:
        """Return a copy of the default headers with the signature headers added.

        Args:
            url: Request URI including its query string.

        Raises:
            Exception: If the "d_c0" cookie is missing.
        """
        d_c0 = self.cookie_dict.get("d_c0")
        if not d_c0:
            raise Exception("d_c0 not found in cookies")
        sign_res = sign(url, self.default_headers["cookie"])
        headers = self.default_headers.copy()
        headers['x-zst-81'] = sign_res["x-zst-81"]
        headers['x-zse-96'] = sign_res["x-zse-96"]
        return headers

    # reraise=True makes the last real error (DataFetchError, ForbiddenError,
    # transport errors) reach the caller instead of tenacity.RetryError.
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1), reraise=True)
    async def request(self, method, url, **kwargs) -> Union[str, Any]:
        """Send one HTTP request and post-process the response.

        Args:
            method: HTTP method.
            url: Full request URL.
            **kwargs: Extra httpx request arguments. ``return_response=True``
                (popped here) returns the raw body text instead of JSON.

        Returns:
            The parsed JSON dict, the body text when ``return_response`` is
            set, or ``{}`` for a 404 (a content without comments also
            answers 404).

        Raises:
            ForbiddenError: On HTTP 403.
            DataFetchError: On any other non-200 status, on a JSON body with
                an "error" field, or when the body is not valid JSON.
        """
        return_response = kwargs.pop('return_response', False)

        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

        if response.status_code != 200:
            utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
            if response.status_code == 403:
                raise ForbiddenError(response.text)
            if response.status_code == 404:
                return {}
            raise DataFetchError(response.text)

        if return_response:
            return response.text
        try:
            data: Dict = response.json()
        except json.JSONDecodeError:
            utils.logger.error(f"[ZhiHuClient.request] Request error: {response.text}")
            raise DataFetchError(response.text)
        if data.get("error"):
            utils.logger.error(f"[ZhiHuClient.request] Request error: {data}")
            raise DataFetchError(data.get("error", {}).get("message"))
        return data

    async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
        """Send a signed GET request.

        Args:
            uri: Request path.
            params: Optional dict of query parameters, URL-encoded into the
                path before signing.
            **kwargs: Forwarded to ``request``.
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri += '?' + urlencode(params)
        headers = await self._pre_headers(final_uri)
        # Column articles ("/p/...") live on a different host.
        base_url = (zhihu_constant.ZHIHU_URL if "/p/" not in uri else zhihu_constant.ZHIHU_ZHUANLAN_URL)
        return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)

    async def pong(self) -> bool:
        """Check whether the login state is still valid.

        Returns:
            True when the current-user endpoint returns both "uid" and
            "name"; False otherwise, including on any exception.
        """
        utils.logger.info("[ZhiHuClient.pong] Begin to pong zhihu...")
        ping_flag = False
        try:
            res = await self.get_current_user_info()
            if res.get("uid") and res.get("name"):
                ping_flag = True
                utils.logger.info("[ZhiHuClient.pong] Ping zhihu successfully")
            else:
                utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed, response data: {res}")
        except Exception as e:
            utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """Refresh the cookie header and cookie dict from the browser context.

        Usually called after a successful login.
        """
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.default_headers["cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def get_current_user_info(self) -> Dict:
        """Fetch the profile of the currently logged-in user."""
        params = {"include": "email,is_active,is_bind_phone"}
        return await self.get("/api/v4/me", params)

    async def get_note_by_keyword(
        self,
        keyword: str,
        page: int = 1,
        page_size: int = 20,
        sort: SearchSort = SearchSort.DEFAULT,
        note_type: SearchType = SearchType.DEFAULT,
        search_time: SearchTime = SearchTime.DEFAULT,
    ) -> List[ZhihuContent]:
        """Search contents by keyword.

        Args:
            keyword: Search keyword.
            page: 1-based page number.
            page_size: Items per page.
            sort: Sort order.
            note_type: Result type filter.
            search_time: Time-range filter.

        Returns:
            Contents extracted from the search response.
        """
        uri = "/api/v4/search_v3"
        offset = (page - 1) * page_size
        params = {
            "gk_version": "gz-gaokao",
            "t": "general",
            "q": keyword,
            "correction": 1,
            "offset": offset,
            "limit": page_size,
            "filter_fields": "",
            "lc_idx": offset,
            "show_all_topics": 0,
            "search_source": "Filter",
            "time_interval": search_time.value,
            "sort": sort.value,
            "vertical": note_type.value,
        }
        search_res = await self.get(uri, params)
        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
        return self._extractor.extract_contents_from_search(search_res)

    async def get_root_comments(
        self,
        content_id: str,
        content_type: str,
        offset: str = "",
        limit: int = 10,
        order_by: str = "score",
    ) -> Dict:
        """Fetch one page of top-level comments of a content.

        Args:
            content_id: Content id.
            content_type: Content type (answer, article, zvideo).
            offset: Paging offset token.
            limit: Page size.
            order_by: Sort order.
        """
        uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
        params = {"order": order_by, "offset": offset, "limit": limit}
        return await self.get(uri, params)

    async def get_child_comments(
        self,
        root_comment_id: str,
        offset: str = "",
        limit: int = 10,
        order_by: str = "sort",
    ) -> Dict:
        """Fetch one page of replies under a top-level comment."""
        uri = f"/api/v4/comment_v5/comment/{root_comment_id}/child_comment"
        params = {
            "order": order_by,
            "offset": offset,
            "limit": limit,
        }
        return await self.get(uri, params)

    async def get_note_all_comments(
        self,
        content: ZhihuContent,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[ZhihuComment]:
        """Page through all top-level comments of a content.

        Replies are fetched for each page as well (see
        ``get_comments_all_sub_comments``) and reach ``callback``, but only
        top-level comments are part of the returned list.

        Args:
            content: Content object (answer, article or video).
            crawl_interval: Delay in seconds between page requests.
            callback: Awaited with each batch of extracted comments.

        Returns:
            All top-level comments collected.
        """
        result: List[ZhihuComment] = []
        is_end: bool = False
        offset: str = ""
        limit: int = 10
        while not is_end:
            root_comment_res = await self.get_root_comments(content.content_id, content.content_type, offset, limit)
            if not root_comment_res:
                break
            paging_info = root_comment_res.get("paging", {})
            is_end = paging_info.get("is_end")
            offset = self._extractor.extract_offset(paging_info)
            comments = self._extractor.extract_comments(content, root_comment_res.get("data"))

            if not comments:
                break

            if callback:
                await callback(comments)

            result.extend(comments)
            await self.get_comments_all_sub_comments(content, comments, crawl_interval=crawl_interval, callback=callback)
            await asyncio.sleep(crawl_interval)
        return result

    async def get_comments_all_sub_comments(
        self,
        content: ZhihuContent,
        comments: List[ZhihuComment],
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[ZhihuComment]:
        """Page through all replies under the given top-level comments.

        Args:
            content: Content object the comments belong to.
            comments: Top-level comments to expand.
            crawl_interval: Delay in seconds between page requests.
            callback: Awaited with each batch of extracted replies.

        Returns:
            All replies collected; an empty list when
            ``config.ENABLE_GET_SUB_COMMENTS`` is off.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            return []

        all_sub_comments: List[ZhihuComment] = []
        for parent_comment in comments:
            if parent_comment.sub_comment_count == 0:
                continue

            is_end: bool = False
            offset: str = ""
            limit: int = 10
            while not is_end:
                child_comment_res = await self.get_child_comments(parent_comment.comment_id, offset, limit)
                if not child_comment_res:
                    break
                paging_info = child_comment_res.get("paging", {})
                is_end = paging_info.get("is_end")
                offset = self._extractor.extract_offset(paging_info)
                sub_comments = self._extractor.extract_comments(content, child_comment_res.get("data"))

                if not sub_comments:
                    break

                if callback:
                    await callback(sub_comments)

                all_sub_comments.extend(sub_comments)
                await asyncio.sleep(crawl_interval)
        return all_sub_comments

    async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
        """Fetch a creator's profile page and extract the creator info."""
        uri = f"/people/{url_token}"
        html_content: str = await self.get(uri, return_response=True)
        return self._extractor.extract_creator(url_token, html_content)

    async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
        """Fetch one page of a creator's answers."""
        uri = f"/api/v4/members/{url_token}/answers"
        params = {
            "include":
                "data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship",
            "offset": offset,
            "limit": limit,
            "order_by": "created"
        }
        return await self.get(uri, params)

    async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
        """Fetch one page of a creator's articles."""
        uri = f"/api/v4/members/{url_token}/articles"
        params = {
            "include":
                "data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;",
            "offset": offset,
            "limit": limit,
            "order_by": "created"
        }
        return await self.get(uri, params)

    async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
        """Fetch one page of a creator's videos."""
        uri = f"/api/v4/members/{url_token}/zvideos"
        params = {
            "include": "similar_zvideo,creation_relationship,reaction_instruction",
            "offset": offset,
            "limit": limit,
            "similar_aggregation": "true",
        }
        return await self.get(uri, params)

    async def _get_all_creator_contents(
        self,
        fetch_page: Callable[..., Any],
        creator: ZhihuCreator,
        crawl_interval: float,
        callback: Optional[Callable],
        log_label: Optional[str] = None,
    ) -> List[ZhihuContent]:
        """Page through one creator listing endpoint and collect its contents.

        Args:
            fetch_page: Coroutine function ``(url_token, offset, limit)``
                returning one response page.
            creator: Creator whose ``url_token`` is queried.
            crawl_interval: Delay in seconds between page requests.
            callback: Awaited with each page's extracted contents.
            log_label: When given, each raw page is logged under this prefix.
        """
        all_contents: List[ZhihuContent] = []
        is_end: bool = False
        offset: int = 0
        limit: int = 20
        while not is_end:
            res = await fetch_page(creator.url_token, offset, limit)
            if not res:
                break
            if log_label:
                utils.logger.info(f"{log_label} Get creator {creator.url_token} answers: {res}")
            is_end = res.get("paging", {}).get("is_end")
            page_data = res.get("data")
            # An empty page means there is nothing further; this also ends the
            # loop when the response never reports paging.is_end.
            if not page_data:
                break
            contents = self._extractor.extract_content_list_from_creator(page_data)
            if callback:
                await callback(contents)
            all_contents.extend(contents)
            offset += limit
            await asyncio.sleep(crawl_interval)
        return all_contents

    async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
        """Fetch all answers of a creator.

        Args:
            creator: Creator info.
            crawl_interval: Delay in seconds between page requests.
            callback: Awaited with each page's extracted contents.
        """
        return await self._get_all_creator_contents(
            self.get_creator_answers,
            creator,
            crawl_interval,
            callback,
            log_label="[ZhiHuClient.get_all_anwser_by_creator]",
        )

    async def get_all_articles_by_creator(
        self,
        creator: ZhihuCreator,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[ZhihuContent]:
        """Fetch all articles of a creator (same paging as the answers)."""
        return await self._get_all_creator_contents(self.get_creator_articles, creator, crawl_interval, callback)

    async def get_all_videos_by_creator(
        self,
        creator: ZhihuCreator,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[ZhihuContent]:
        """Fetch all videos of a creator (same paging as the answers)."""
        return await self._get_all_creator_contents(self.get_creator_videos, creator, crawl_interval, callback)

    async def get_answer_info(
        self,
        question_id: str,
        answer_id: str,
    ) -> Optional[ZhihuContent]:
        """Fetch an answer page and extract its content."""
        uri = f"/question/{question_id}/answer/{answer_id}"
        response_html = await self.get(uri, return_response=True)
        return self._extractor.extract_answer_content_from_html(response_html)

    async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
        """Fetch an article page and extract its content."""
        uri = f"/p/{article_id}"
        response_html = await self.get(uri, return_response=True)
        return self._extractor.extract_article_content_from_html(response_html)

    async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
        """Fetch a video page and extract its content."""
        uri = f"/zvideo/{video_id}"
        response_html = await self.get(uri, return_response=True)
        return self._extractor.extract_zvideo_content_from_html(response_html)
应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +import asyncio +import os +import random +from asyncio import Task +from typing import Dict, List, Optional, Tuple, cast + +from playwright.async_api import ( + BrowserContext, + BrowserType, + Page, + Playwright, + async_playwright, +) + +import config +from constant import zhihu as constant +from base.base_crawler import AbstractCrawler +from model.m_zhihu import ZhihuContent, ZhihuCreator +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import zhihu as zhihu_store +from tools import utils +from tools.cdp_browser import CDPBrowserManager +from var import crawler_type_var, source_keyword_var + +from .client import ZhiHuClient +from .exception import DataFetchError +from .help import ZhihuExtractor, judge_zhihu_url +from .login import ZhiHuLogin + + +class ZhihuCrawler(AbstractCrawler): + context_page: Page + zhihu_client: ZhiHuClient + browser_context: BrowserContext + cdp_manager: Optional[CDPBrowserManager] + + def __init__(self) -> None: + self.index_url = "https://www.zhihu.com" + # self.user_agent = utils.get_user_agent() + self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36" + self._extractor = ZhihuExtractor() + self.cdp_manager = None + + async def start(self) -> None: + """ + Start the crawler + Returns: + + """ + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool( + config.IP_PROXY_POOL_COUNT, enable_validate_ip=True + ) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( + ip_proxy_info + ) + + async with async_playwright() as playwright: + # 根据配置选择启动模式 + if config.ENABLE_CDP_MODE: + utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器") + 
self.browser_context = await self.launch_browser_with_cdp( + playwright, + playwright_proxy_format, + self.user_agent, + headless=config.CDP_HEADLESS, + ) + else: + utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器") + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, None, self.user_agent, headless=config.HEADLESS + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url, wait_until="domcontentloaded") + + # Create a client to interact with the zhihu website. + self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format) + if not await self.zhihu_client.pong(): + login_obj = ZhiHuLogin( + login_type=config.LOGIN_TYPE, + login_phone="", # input your phone number + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES, + ) + await login_obj.begin() + await self.zhihu_client.update_cookies( + browser_context=self.browser_context + ) + + # 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行 + utils.logger.info( + "[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右" + ) + await self.context_page.goto( + f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content" + ) + await asyncio.sleep(5) + await self.zhihu_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. 
+ await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their notes and comments + await self.get_creators_and_notes() + else: + pass + + utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...") + + async def search(self) -> None: + """Search for notes and retrieve their comment information.""" + utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords") + zhihu_limit_count = 20 # zhihu limit page fixed value + if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count + start_page = config.START_PAGE + for keyword in config.KEYWORDS.split(","): + source_keyword_var.set(keyword) + utils.logger.info( + f"[ZhihuCrawler.search] Current search keyword: {keyword}" + ) + page = 1 + while ( + page - start_page + 1 + ) * zhihu_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[ZhihuCrawler.search] Skip page {page}") + page += 1 + continue + + try: + utils.logger.info( + f"[ZhihuCrawler.search] search zhihu keyword: {keyword}, page: {page}" + ) + content_list: List[ZhihuContent] = ( + await self.zhihu_client.get_note_by_keyword( + keyword=keyword, + page=page, + ) + ) + utils.logger.info( + f"[ZhihuCrawler.search] Search contents :{content_list}" + ) + if not content_list: + utils.logger.info("No more content!") + break + + page += 1 + for content in content_list: + await zhihu_store.update_zhihu_content(content) + + await self.batch_get_content_comments(content_list) + except DataFetchError: + utils.logger.error("[ZhihuCrawler.search] Search content error") + return + + async def batch_get_content_comments(self, content_list: List[ZhihuContent]): + """ + Batch get content comments + Args: + content_list: + + Returns: + + """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info( + 
f"[ZhihuCrawler.batch_get_content_comments] Crawling comment mode is not enabled" + ) + return + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for content_item in content_list: + task = asyncio.create_task( + self.get_comments(content_item, semaphore), name=content_item.content_id + ) + task_list.append(task) + await asyncio.gather(*task_list) + + async def get_comments( + self, content_item: ZhihuContent, semaphore: asyncio.Semaphore + ): + """ + Get note comments with keyword filtering and quantity limitation + Args: + content_item: + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info( + f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}" + ) + await self.zhihu_client.get_note_all_comments( + content=content_item, + crawl_interval=random.random(), + callback=zhihu_store.batch_update_zhihu_note_comments, + ) + + async def get_creators_and_notes(self) -> None: + """ + Get creator's information and their notes and comments + Returns: + + """ + utils.logger.info( + "[ZhihuCrawler.get_creators_and_notes] Begin get xiaohongshu creators" + ) + for user_link in config.ZHIHU_CREATOR_URL_LIST: + utils.logger.info( + f"[ZhihuCrawler.get_creators_and_notes] Begin get creator {user_link}" + ) + user_url_token = user_link.split("/")[-1] + # get creator detail info from web html content + createor_info: ZhihuCreator = await self.zhihu_client.get_creator_info( + url_token=user_url_token + ) + if not createor_info: + utils.logger.info( + f"[ZhihuCrawler.get_creators_and_notes] Creator {user_url_token} not found" + ) + continue + + utils.logger.info( + f"[ZhihuCrawler.get_creators_and_notes] Creator info: {createor_info}" + ) + await zhihu_store.save_creator(creator=createor_info) + + # 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可 + + # Get all anwser information of the creator + all_content_list = await self.zhihu_client.get_all_anwser_by_creator( + creator=createor_info, + 
crawl_interval=random.random(), + callback=zhihu_store.batch_update_zhihu_contents, + ) + + # Get all articles of the creator's contents + # all_content_list = await self.zhihu_client.get_all_articles_by_creator( + # creator=createor_info, + # crawl_interval=random.random(), + # callback=zhihu_store.batch_update_zhihu_contents + # ) + + # Get all videos of the creator's contents + # all_content_list = await self.zhihu_client.get_all_videos_by_creator( + # creator=createor_info, + # crawl_interval=random.random(), + # callback=zhihu_store.batch_update_zhihu_contents + # ) + + # Get all comments of the creator's contents + await self.batch_get_content_comments(all_content_list) + + async def get_note_detail( + self, full_note_url: str, semaphore: asyncio.Semaphore + ) -> Optional[ZhihuContent]: + """ + Get note detail + Args: + full_note_url: str + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}" + ) + # judge note type + note_type: str = judge_zhihu_url(full_note_url) + if note_type == constant.ANSWER_NAME: + question_id = full_note_url.split("/")[-3] + answer_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}" + ) + return await self.zhihu_client.get_answer_info(question_id, answer_id) + + elif note_type == constant.ARTICLE_NAME: + article_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}" + ) + return await self.zhihu_client.get_article_info(article_id) + + elif note_type == constant.VIDEO_NAME: + video_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}" + ) + return await self.zhihu_client.get_video_info(video_id) + + async def get_specified_notes(self): + """ + Get the information 
and comments of the specified post + Returns: + + """ + get_note_detail_task_list = [] + for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST: + # remove query params + full_note_url = full_note_url.split("?")[0] + crawler_task = self.get_note_detail( + full_note_url=full_note_url, + semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM), + ) + get_note_detail_task_list.append(crawler_task) + + need_get_comment_notes: List[ZhihuContent] = [] + note_details = await asyncio.gather(*get_note_detail_task_list) + for index, note_detail in enumerate(note_details): + if not note_detail: + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found" + ) + continue + + note_detail = cast(ZhihuContent, note_detail) # only for type check + need_get_comment_notes.append(note_detail) + await zhihu_store.update_zhihu_content(note_detail) + + await self.batch_get_content_comments(need_get_comment_notes) + + async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient: + """Create zhihu client""" + utils.logger.info( + "[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ..." 
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch Chromium and return a browser context with a 1920x1080 viewport.

    Args:
        chromium: Playwright browser type used to launch the browser.
        playwright_proxy: proxy settings forwarded to Playwright, or None.
        user_agent: user agent applied to the context, or None.
        headless: whether the browser runs headless.

    Returns:
        A persistent context when ``config.SAVE_LOGIN_STATE`` is truthy,
        otherwise a fresh context on a newly launched browser.
    """
    utils.logger.info(
        "[ZhihuCrawler.launch_browser] Begin create browser context ..."
    )
    viewport_size = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(
            viewport=viewport_size, user_agent=user_agent
        )

    # feat issue #14
    # Keep the login state on disk so we do not have to log in on every run:
    # the profile lives under <cwd>/browser_data/<USER_DATA_DIR % PLATFORM>.
    profile_dir = os.path.join(
        os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
    )  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=viewport_size,
        user_agent=user_agent,
    )
async def close(self):
    """
    Shut the browser down.

    When a CDP manager is in use its ``cleanup()`` is awaited and the
    reference is dropped; otherwise the browser context is closed directly.
    """
    active_cdp_manager = self.cdp_manager
    if not active_cdp_manager:
        await self.browser_context.close()
    else:
        # CDP mode needs its own teardown, which the manager performs.
        await active_cdp_manager.cleanup()
        self.cdp_manager = None
    utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
class SearchTime(Enum):
    """
    Time-range filter for search results.

    Each member's value is the string used for that time range; the empty
    string means no time restriction.
    """
    DEFAULT = ""  # no time limit
    ONE_DAY = "a_day"  # within one day
    ONE_WEEK = "a_week"  # within one week
    ONE_MONTH = "a_month"  # within one month
    THREE_MONTH = "three_months"  # within three months
    HALF_YEAR = "half_a_year"  # within half a year
    ONE_YEAR = "a_year"  # within one year
def extract_contents_from_search(self, json_data: Dict) -> List[ZhihuContent]:
    """
    Extract zhihu contents from a search response.

    Only entries whose ``type`` is ``search_result`` or ``zvideo`` and that
    carry a non-empty ``object`` are kept; those objects are handed to
    ``_extract_content_list``.

    Args:
        json_data: decoded search response; the entries are read from its
            ``data`` key.

    Returns:
        The list built by ``_extract_content_list``; an empty list when
        ``json_data`` is empty.
    """
    if not json_data:
        return []

    # "data" can be missing or explicitly null; both mean "no results".
    search_result: List[Dict] = json_data.get("data") or []
    content_objects = [
        s_item.get("object")
        for s_item in search_result
        if s_item.get("type") in ("search_result", "zvideo") and s_item.get("object")
    ]
    return self._extract_content_list(content_objects)
def _extract_answer_content(self, answer: Dict) -> ZhihuContent:
    """
    Build a ZhihuContent from one answer dict.

    Args:
        answer: zhihu answer; its ``question`` entry supplies the question id
            used in the content URL.

    Returns:
        The populated ZhihuContent, including the author fields produced by
        ``_extract_content_or_comment_author``.
    """
    content = ZhihuContent()
    content.content_id = answer.get("id")
    content.content_type = answer.get("type")
    content.content_text = extract_text_from_html(answer.get("content", ""))
    content.question_id = answer.get("question").get("id")
    content.content_url = (
        f"{zhihu_constant.ZHIHU_URL}/question/{content.question_id}/answer/{content.content_id}"
    )
    content.title = extract_text_from_html(answer.get("title", ""))
    # Prefer the description; fall back to the excerpt when it is empty.
    summary_html = answer.get("description", "") or answer.get("excerpt", "")
    content.desc = extract_text_from_html(summary_html)
    content.created_time = answer.get("created_time")
    content.updated_time = answer.get("updated_time")
    content.voteup_count = answer.get("voteup_count", 0)
    content.comment_count = answer.get("comment_count", 0)

    # copy the author fields onto the content
    author = self._extract_content_or_comment_author(answer.get("author"))
    content.user_id = author.user_id
    content.user_link = author.user_link
    content.user_nickname = author.user_nickname
    content.user_avatar = author.user_avatar
    content.user_url_token = author.url_token
    return content
def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
    """
    Build a ZhihuContent from one zvideo dict.

    Two input shapes are handled: when ``zvideo["video"]`` is a dict the item
    is treated as coming from the creator-homepage video list and the URL is
    built from the video id; otherwise the URL is taken from ``video_url``.

    Args:
        zvideo: zhihu zvideo data.

    Returns:
        The populated ZhihuContent, including the author fields produced by
        ``_extract_content_or_comment_author``.
    """
    res = ZhihuContent()
    # The id must be set before the URL is built: previously content_url was
    # formatted first, while content_id still held its empty default, so the
    # creator-page URLs ended in ".../zvideo/".
    res.content_id = zvideo.get("id")
    res.content_type = zvideo.get("type")

    if isinstance(zvideo.get("video"), dict):  # item from the creator homepage video list
        res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
        res.created_time = zvideo.get("published_at")
        res.updated_time = zvideo.get("updated_at")
    else:
        res.content_url = zvideo.get("video_url")
        res.created_time = zvideo.get("created_at")

    res.title = extract_text_from_html(zvideo.get("title"))
    res.desc = extract_text_from_html(zvideo.get("description"))
    res.voteup_count = zvideo.get("voteup_count")
    res.comment_count = zvideo.get("comment_count")

    # extract author info
    author_info = self._extract_content_or_comment_author(zvideo.get("author"))
    res.user_id = author_info.user_id
    res.user_link = author_info.user_link
    res.user_nickname = author_info.user_nickname
    res.user_avatar = author_info.user_avatar
    res.user_url_token = author_info.url_token
    return res
def extract_comments(self, page_content: ZhihuContent, comments: List[Dict]) -> List[ZhihuComment]:
    """
    Convert raw comment dicts into ZhihuComment objects.

    Args:
        page_content: the content the comments belong to.
        comments: raw comment entries; only those whose ``type`` is
            ``"comment"`` are converted.

    Returns:
        The converted comments in input order; an empty list when
        ``comments`` is empty or None.
    """
    return [
        self._extract_comment(page_content, raw_comment)
        for raw_comment in (comments or [])
        if raw_comment.get("type") == "comment"
    ]
@staticmethod
def extract_offset(paging_info: Dict) -> str:
    """
    Extract the ``offset`` query parameter from the paging info's ``next`` URL.

    Args:
        paging_info: paging dict whose ``next`` entry is the URL of the next
            page; may be empty or None.

    Returns:
        The offset value, or an empty string when there is no paging info,
        no ``next`` URL, or the URL carries no ``offset`` parameter.
    """
    # Example "next" URL:
    # https://www.zhihu.com/api/v4/comment_v5/zvideos/1424368906836807681/root_comment?limit=10&offset=456770961_10125996085_0&order_by=score
    if not paging_info:
        # No paging block at all means there is no further page.
        return ""

    next_url = paging_info.get("next")
    if not next_url:
        return ""

    query_params = parse_qs(urlparse(next_url).query)
    # parse_qs maps each key to a list of values; take the first one.
    return query_params.get("offset", [""])[0]
def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Extract an answer from the ``js-initialData`` script of an answer page.

    Args:
        html_content: HTML of the page.

    Returns:
        The ZhihuContent built from the first entry under
        ``initialState.entities.answers``, or None when the script tag or
        the answers mapping is missing or empty.
    """
    page = Selector(text=html_content)
    raw_state: str = page.xpath("//script[@id='js-initialData']/text()").get(default="")
    if not raw_state:
        return None

    entities: Dict = json.loads(raw_state).get("initialState", {}).get("entities", {})
    answers: Dict = entities.get("answers", {})
    if not answers:
        return None

    # Only the first answer in the mapping is used.
    first_answer = next(iter(answers.values()))
    return self._extract_answer_content(first_answer)
def judge_zhihu_url(note_detail_url: str) -> str:
    """
    Classify a zhihu detail URL by the path marker it contains.

    Args:
        note_detail_url:
            eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer
            eg2: https://www.zhihu.com/p/123456789 # article
            eg3: https://www.zhihu.com/zvideo/123456789 # zvideo

    Returns:
        The matching content-type constant, or an empty string when no
        marker is present.
    """
    # Checked in this order; the first marker found in the URL wins.
    url_markers = (
        ("/answer/", zhihu_constant.ANSWER_NAME),
        ("/p/", zhihu_constant.ARTICLE_NAME),
        ("/zvideo/", zhihu_constant.VIDEO_NAME),
    )
    for marker, content_type in url_markers:
        if marker in note_detail_url:
            return content_type
    return ""
class ZhiHuLogin(AbstractLogin):
    """
    Log a Playwright browser context in to zhihu, by QR code or by cookies.

    A login counts as successful once the browser context holds a ``z_c0``
    cookie (see ``check_login_state``).
    """

    # Polling budget of check_login_state: attempts x interval = seconds waited.
    _LOGIN_POLL_ATTEMPTS = 600
    _LOGIN_POLL_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: "qrcode", "phone" or "cookie".
            browser_context: browser context whose cookies are read and written.
            context_page: page on which the login QR code is looked up.
            login_phone: phone number for mobile login (not implemented yet).
            cookie_str: cookie string used by cookie login.
        """
        # NOTE: the login type is stored on the shared config module, not on
        # the instance; begin() reads it back from there.
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(
        stop=stop_after_attempt(_LOGIN_POLL_ATTEMPTS),
        wait=wait_fixed(_LOGIN_POLL_INTERVAL_SECONDS),
        retry=retry_if_result(lambda value: value is False),
    )
    async def check_login_state(self) -> bool:
        """
        Report whether the browser context currently holds a ``z_c0`` cookie.

        The retry decorator re-runs this check while it returns False and
        raises ``RetryError`` once the attempt budget is used up.

        Returns:
            True when the cookie is present, otherwise False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return bool(cookie_dict.get("z_c0"))

    async def begin(self):
        """
        Start the login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: if the login type is not qrcode, phone or cookie.
        """
        utils.logger.info("[ZhiHu.begin] Begin login zhihu ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
                "[ZhiHu.begin] Invalid login type, currently only qrcode, phone or cookie are supported ..."
            )

    async def login_by_mobile(self):
        """Login zhihu by mobile (not implemented; performs no login)."""
        # todo implement login by mobile
        # Make the no-op visible instead of silently continuing unauthenticated.
        utils.logger.warning("[ZhiHu.login_by_mobile] Login by mobile is not implemented yet ...")

    async def login_by_qrcode(self):
        """
        Log in by showing the page's QR code and waiting for it to be scanned.

        Exits the process (``sys.exit``) when no QR code is found or when the
        login is not detected within the polling budget.
        """
        utils.logger.info("[ZhiHu.login_by_qrcode] Begin login zhihu by qrcode ...")
        qrcode_img_selector = "canvas.Qrcode-qrcode"
        # find login qrcode
        base64_qrcode_img = await utils.find_qrcode_img_from_canvas(
            self.context_page,
            canvas_selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[ZhiHu.login_by_qrcode] login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode
        # fix issue #12
        # show_qrcode runs in the default executor (via a partial) so the
        # current asyncio event loop is not blocked while it is displayed
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # Report the real budget of check_login_state rather than a fixed figure.
        wait_seconds = self._LOGIN_POLL_ATTEMPTS * self._LOGIN_POLL_INTERVAL_SECONDS
        utils.logger.info(
            f"[ZhiHu.login_by_qrcode] waiting for scan code login, remaining time is {wait_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[ZhiHu.login_by_qrcode] Login zhihu failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(
            f"[ZhiHu.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """Log in by adding every cookie from ``cookie_str`` to the browser context for ``.zhihu.com``."""
        utils.logger.info("[ZhiHu.login_by_cookies] Begin login zhihu by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".zhihu.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        if cookies:
            # add_cookies takes a list, so one call covers all cookies.
            await self.browser_context.add_cookies(cookies)
class TiebaNote(BaseModel):
    """
    A Baidu Tieba post.

    Fields declared with ``Field(...)`` have no default; the others fall back
    to the default shown.
    """
    note_id: str = Field(..., description="帖子ID")  # post id
    title: str = Field(..., description="帖子标题")  # post title
    desc: str = Field(default="", description="帖子描述")  # post description
    note_url: str = Field(..., description="帖子链接")  # post URL
    publish_time: str = Field(default="", description="发布时间")  # publish time
    user_link: str = Field(default="", description="用户主页链接")  # author's homepage URL
    user_nickname: str = Field(default="", description="用户昵称")  # author's nickname
    user_avatar: str = Field(default="", description="用户头像地址")  # author's avatar URL
    tieba_name: str = Field(..., description="贴吧名称")  # name of the tieba (forum)
    tieba_link: str = Field(..., description="贴吧链接")  # URL of the tieba
    # "replay" in the next two names is the existing spelling; the descriptions say "replies".
    total_replay_num: int = Field(default=0, description="回复总数")  # total number of replies
    total_replay_page: int = Field(default=0, description="回复总页数")  # total number of reply pages
    ip_location: Optional[str] = Field(default="", description="IP地理位置")  # IP geolocation
    source_keyword: str = Field(default="", description="来源关键词")  # source keyword
class TiebaCreator(BaseModel):
    """
    A Baidu Tieba creator (user profile).

    Fields declared with ``Field(...)`` have no default; the others fall back
    to the default shown.
    """
    user_id: str = Field(..., description="用户ID")  # user id
    user_name: str = Field(..., description="用户名")  # user name
    nickname: str = Field(..., description="用户昵称")  # nickname
    gender: str = Field(default="", description="用户性别")  # gender
    avatar: str = Field(..., description="用户头像地址")  # avatar URL
    ip_location: Optional[str] = Field(default="", description="IP地理位置")  # IP geolocation
    follows: int = Field(default=0, description="关注数")  # number of accounts followed
    fans: int = Field(default=0, description="粉丝数")  # number of followers
    registration_duration: str = Field(default="", description="注册时长")  # time since registration
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_kuaishou.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_kuaishou.py new file mode 100644 index 0000000..e907b1d --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_kuaishou.py @@ -0,0 +1,12 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_weibo.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_weibo.py new file mode 100644 index 0000000..e907b1d --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_weibo.py @@ -0,0 +1,12 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_xiaohongshu.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_xiaohongshu.py new file mode 100644 index 0000000..53294c6 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/model/m_xiaohongshu.py @@ -0,0 +1,21 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class NoteUrlInfo(BaseModel):
    """
    The three values identifying a note URL: note id, xsec token and xsec source.

    No field declares a default.
    """
    note_id: str = Field(title="note id")
    xsec_token: str = Field(title="xsec token")
    xsec_source: str = Field(title="xsec source")
class ZhihuCreator(BaseModel):
    """
    A zhihu creator (user profile and counters).

    Every field has a default, so an empty instance can be created and
    filled in attribute by attribute.
    """
    user_id: str = Field(default="", description="用户ID")  # user id
    user_link: str = Field(default="", description="用户主页链接")  # homepage URL
    user_nickname: str = Field(default="", description="用户昵称")  # nickname
    user_avatar: str = Field(default="", description="用户头像地址")  # avatar URL
    url_token: str = Field(default="", description="用户url_token")  # user's url_token
    gender: str = Field(default="", description="用户性别")  # gender
    ip_location: Optional[str] = Field(default="", description="IP地理位置")  # IP geolocation
    follows: int = Field(default=0, description="关注数")  # number of accounts followed
    fans: int = Field(default=0, description="粉丝数")  # number of followers
    # "anwser" is the existing spelling of this field name; the description says "answer count".
    anwser_count: int = Field(default=0, description="回答数")  # number of answers
    video_count: int = Field(default=0, description="视频数")  # number of videos
    question_count: int = Field(default=0, description="提问数")  # number of questions asked
    article_count: int = Field(default=0, description="文章数")  # number of articles
    column_count: int = Field(default=0, description="专栏数")  # number of columns
    get_voteup_count: int = Field(default=0, description="获得的赞同数")  # upvotes received
a/MindSpider/DeepSentimentCrawling/MediaCrawler/mypy.ini b/MindSpider/DeepSentimentCrawling/MediaCrawler/mypy.ini new file mode 100644 index 0000000..433f610 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/mypy.ini @@ -0,0 +1,9 @@ +[mypy] +warn_return_any = True +warn_unused_configs = True + +[mypy-cv2] +ignore_missing_imports = True + +[mypy-execjs] +ignore_missing_imports = True \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/package-lock.json b/MindSpider/DeepSentimentCrawling/MediaCrawler/package-lock.json new file mode 100644 index 0000000..933a4e4 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/package-lock.json @@ -0,0 +1,2521 @@ +{ + "name": "MediaCrawler", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "devDependencies": { + "vitepress": "^1.3.4" + } + }, + "node_modules/@algolia/autocomplete-core": { + "version": "1.9.3", + "resolved": "https://registry.npmmirror.com/@algolia/autocomplete-core/-/autocomplete-core-1.9.3.tgz", + "integrity": "sha512-009HdfugtGCdC4JdXUbVJClA0q0zh24yyePn+KUGk3rP7j8FEe/m5Yo/z65gn6nP/cM39PxpzqKrL7A6fP6PPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/autocomplete-plugin-algolia-insights": "1.9.3", + "@algolia/autocomplete-shared": "1.9.3" + } + }, + "node_modules/@algolia/autocomplete-plugin-algolia-insights": { + "version": "1.9.3", + "resolved": "https://registry.npmmirror.com/@algolia/autocomplete-plugin-algolia-insights/-/autocomplete-plugin-algolia-insights-1.9.3.tgz", + "integrity": "sha512-a/yTUkcO/Vyy+JffmAnTWbr4/90cLzw+CC3bRbhnULr/EM0fGNvM13oQQ14f2moLMcVDyAx/leczLlAOovhSZg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/autocomplete-shared": "1.9.3" + }, + "peerDependencies": { + "search-insights": ">= 1 < 3" + } + }, + "node_modules/@algolia/autocomplete-preset-algolia": { + "version": "1.9.3", + "resolved": 
"https://registry.npmmirror.com/@algolia/autocomplete-preset-algolia/-/autocomplete-preset-algolia-1.9.3.tgz", + "integrity": "sha512-d4qlt6YmrLMYy95n5TB52wtNDr6EgAIPH81dvvvW8UmuWRgxEtY0NJiPwl/h95JtG2vmRM804M0DSwMCNZlzRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/autocomplete-shared": "1.9.3" + }, + "peerDependencies": { + "@algolia/client-search": ">= 4.9.1 < 6", + "algoliasearch": ">= 4.9.1 < 6" + } + }, + "node_modules/@algolia/autocomplete-shared": { + "version": "1.9.3", + "resolved": "https://registry.npmmirror.com/@algolia/autocomplete-shared/-/autocomplete-shared-1.9.3.tgz", + "integrity": "sha512-Wnm9E4Ye6Rl6sTTqjoymD+l8DjSTHsHboVRYrKgEt8Q7UHm9nYbqhN/i0fhUYA3OAEH7WA8x3jfpnmJm3rKvaQ==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "@algolia/client-search": ">= 4.9.1 < 6", + "algoliasearch": ">= 4.9.1 < 6" + } + }, + "node_modules/@algolia/cache-browser-local-storage": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/cache-browser-local-storage/-/cache-browser-local-storage-4.24.0.tgz", + "integrity": "sha512-t63W9BnoXVrGy9iYHBgObNXqYXM3tYXCjDSHeNwnsc324r4o5UiVKUiAB4THQ5z9U5hTj6qUvwg/Ez43ZD85ww==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/cache-common": "4.24.0" + } + }, + "node_modules/@algolia/cache-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/cache-common/-/cache-common-4.24.0.tgz", + "integrity": "sha512-emi+v+DmVLpMGhp0V9q9h5CdkURsNmFC+cOS6uK9ndeJm9J4TiqSvPYVu+THUP8P/S08rxf5x2P+p3CfID0Y4g==", + "dev": true, + "license": "MIT" + }, + "node_modules/@algolia/cache-in-memory": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/cache-in-memory/-/cache-in-memory-4.24.0.tgz", + "integrity": "sha512-gDrt2so19jW26jY3/MkFg5mEypFIPbPoXsQGQWAi6TrCPsNOSEYepBMPlucqWigsmEy/prp5ug2jy/N3PVG/8w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/cache-common": "4.24.0" + } + }, + 
"node_modules/@algolia/client-account": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-account/-/client-account-4.24.0.tgz", + "integrity": "sha512-adcvyJ3KjPZFDybxlqnf+5KgxJtBjwTPTeyG2aOyoJvx0Y8dUQAEOEVOJ/GBxX0WWNbmaSrhDURMhc+QeevDsA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/client-search": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-account/node_modules/@algolia/client-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-4.24.0.tgz", + "integrity": "sha512-bc2ROsNL6w6rqpl5jj/UywlIYC21TwSSoFHKl01lYirGMW+9Eek6r02Tocg4gZ8HAw3iBvu6XQiM3BEbmEMoiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-account/node_modules/@algolia/client-search": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-4.24.0.tgz", + "integrity": "sha512-uRW6EpNapmLAD0mW47OXqTP8eiIx5F6qN9/x/7HHO6owL3N1IXqydGwW5nhDFBrV+ldouro2W1VX3XlcUXEFCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-analytics": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-analytics/-/client-analytics-4.24.0.tgz", + "integrity": "sha512-y8jOZt1OjwWU4N2qr8G4AxXAzaa8DBvyHTWlHzX/7Me1LX8OayfgHexqrsL4vSBcoMmVw2XnVW9MhL+Y2ZDJXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/client-search": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-analytics/node_modules/@algolia/client-common": { + "version": 
"4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-4.24.0.tgz", + "integrity": "sha512-bc2ROsNL6w6rqpl5jj/UywlIYC21TwSSoFHKl01lYirGMW+9Eek6r02Tocg4gZ8HAw3iBvu6XQiM3BEbmEMoiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-analytics/node_modules/@algolia/client-search": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-4.24.0.tgz", + "integrity": "sha512-uRW6EpNapmLAD0mW47OXqTP8eiIx5F6qN9/x/7HHO6owL3N1IXqydGwW5nhDFBrV+ldouro2W1VX3XlcUXEFCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-common": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-5.5.1.tgz", + "integrity": "sha512-LWW7RiOELxa6mlTJKNryTas+YxbkPQWa1K0PfzUU/NCFdYJTPtrMrwLIPO2VPGjZFKE1Mbffhn5TUHVQNAaBzQ==", + "dev": true, + "license": "MIT", + "peer": true, + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/@algolia/client-personalization": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-personalization/-/client-personalization-4.24.0.tgz", + "integrity": "sha512-l5FRFm/yngztweU0HdUzz1rC4yoWCFo3IF+dVIVTfEPg906eZg5BOd1k0K6rZx5JzyyoP4LdmOikfkfGsKVE9w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-personalization/node_modules/@algolia/client-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-4.24.0.tgz", + "integrity": 
"sha512-bc2ROsNL6w6rqpl5jj/UywlIYC21TwSSoFHKl01lYirGMW+9Eek6r02Tocg4gZ8HAw3iBvu6XQiM3BEbmEMoiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/client-search": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-5.5.1.tgz", + "integrity": "sha512-Awz1ps6dSVF1YQTsIspRvdEnlk6GoBBbPuIAV+6K7YaqdUnPLaSsr96+OGC4N1bpu3OtTcN6+nQr+GrGY6zmxw==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "@algolia/client-common": "5.5.1", + "@algolia/requester-browser-xhr": "5.5.1", + "@algolia/requester-fetch": "5.5.1", + "@algolia/requester-node-http": "5.5.1" + }, + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/@algolia/logger-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/logger-common/-/logger-common-4.24.0.tgz", + "integrity": "sha512-LLUNjkahj9KtKYrQhFKCzMx0BY3RnNP4FEtO+sBybCjJ73E8jNdaKJ/Dd8A/VA4imVHP5tADZ8pn5B8Ga/wTMA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@algolia/logger-console": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/logger-console/-/logger-console-4.24.0.tgz", + "integrity": "sha512-X4C8IoHgHfiUROfoRCV+lzSy+LHMgkoEEU1BbKcsfnV0i0S20zyy0NLww9dwVHUWNfPPxdMU+/wKmLGYf96yTg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/logger-common": "4.24.0" + } + }, + "node_modules/@algolia/recommend": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/recommend/-/recommend-4.24.0.tgz", + "integrity": "sha512-P9kcgerfVBpfYHDfVZDvvdJv0lEoCvzNlOy2nykyt5bK8TyieYyiD0lguIJdRZZYGre03WIAFf14pgE+V+IBlw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/cache-browser-local-storage": "4.24.0", + "@algolia/cache-common": "4.24.0", + "@algolia/cache-in-memory": "4.24.0", + "@algolia/client-common": "4.24.0", + 
"@algolia/client-search": "4.24.0", + "@algolia/logger-common": "4.24.0", + "@algolia/logger-console": "4.24.0", + "@algolia/requester-browser-xhr": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/requester-node-http": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/recommend/node_modules/@algolia/client-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-4.24.0.tgz", + "integrity": "sha512-bc2ROsNL6w6rqpl5jj/UywlIYC21TwSSoFHKl01lYirGMW+9Eek6r02Tocg4gZ8HAw3iBvu6XQiM3BEbmEMoiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/recommend/node_modules/@algolia/client-search": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-4.24.0.tgz", + "integrity": "sha512-uRW6EpNapmLAD0mW47OXqTP8eiIx5F6qN9/x/7HHO6owL3N1IXqydGwW5nhDFBrV+ldouro2W1VX3XlcUXEFCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/@algolia/recommend/node_modules/@algolia/requester-browser-xhr": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/requester-browser-xhr/-/requester-browser-xhr-4.24.0.tgz", + "integrity": "sha512-Z2NxZMb6+nVXSjF13YpjYTdvV3032YTBSGm2vnYvYPA6mMxzM3v5rsCiSspndn9rzIW4Qp1lPHBvuoKJV6jnAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0" + } + }, + "node_modules/@algolia/recommend/node_modules/@algolia/requester-node-http": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/requester-node-http/-/requester-node-http-4.24.0.tgz", + "integrity": "sha512-JF18yTjNOVYvU/L3UosRcvbPMGT9B+/GQWNWnenIImglzNVGpyzChkXLnrSf6uxwVNO6ESGu6oN8MqcGQcjQJw==", + 
"dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0" + } + }, + "node_modules/@algolia/requester-browser-xhr": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@algolia/requester-browser-xhr/-/requester-browser-xhr-5.5.1.tgz", + "integrity": "sha512-75w2frEp1Q3Kdb5yhMr8VksOrd+esW+DyzBaV13hXiPmPMukx2GDXu9771sZj4zeAVCHEi/fem5the65c1M+9Q==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "@algolia/client-common": "5.5.1" + }, + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/@algolia/requester-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/requester-common/-/requester-common-4.24.0.tgz", + "integrity": "sha512-k3CXJ2OVnvgE3HMwcojpvY6d9kgKMPRxs/kVohrwF5WMr2fnqojnycZkxPoEg+bXm8fi5BBfFmOqgYztRtHsQA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@algolia/requester-fetch": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@algolia/requester-fetch/-/requester-fetch-5.5.1.tgz", + "integrity": "sha512-QFIfWqEkPpJHq9UicKTmzOKP5YZWixyUrEFMMfofI+aNxKug9ALVD0ldHzOcQGsbnhmMIFSEYckqDNSilaSzYQ==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "@algolia/client-common": "5.5.1" + }, + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/@algolia/requester-node-http": { + "version": "5.5.1", + "resolved": "https://registry.npmmirror.com/@algolia/requester-node-http/-/requester-node-http-5.5.1.tgz", + "integrity": "sha512-X1VRcMYDAIRqACYi0gd85PE9oEBU2+Y3AuIDpokxXNgAiGf0f/HO3/M8lfT2AXIzPFxMJk4mQV0So8wgR94s9Q==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "@algolia/client-common": "5.5.1" + }, + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/@algolia/transporter": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/transporter/-/transporter-4.24.0.tgz", + "integrity": 
"sha512-86nI7w6NzWxd1Zp9q3413dRshDqAzSbsQjhcDhPIatEFiZrL1/TjnHL8S7jVKFePlIMzDsZWXAXwXzcok9c5oA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/cache-common": "4.24.0", + "@algolia/logger-common": "4.24.0", + "@algolia/requester-common": "4.24.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.24.8", + "resolved": "https://registry.npmmirror.com/@babel/helper-string-parser/-/helper-string-parser-7.24.8.tgz", + "integrity": "sha512-pO9KhhRcuUyGnJWwyEgnRJTSIZHiT+vMD0kPeD+so0l7mxkMT19g3pjY9GTnHySck/hDzq+dtW/4VgnMkippsQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.24.7", + "resolved": "https://registry.npmmirror.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.24.7.tgz", + "integrity": "sha512-rR+PBcQ1SMQDDyF6X0wxtG8QyLCgUB0eRAGguqRLfkCA87l7yAP7ehq8SNj96OOGTO8OBV70KhuFYcIkHXOg0w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.25.6", + "resolved": "https://registry.npmmirror.com/@babel/parser/-/parser-7.25.6.tgz", + "integrity": "sha512-trGdfBdbD0l1ZPmcJ83eNxB9rbEax4ALFTF7fN386TMYbeCQbyme5cOEXQhbGXKebwGaB/J52w1mrklMcbgy6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.25.6" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/types": { + "version": "7.25.6", + "resolved": "https://registry.npmmirror.com/@babel/types/-/types-7.25.6.tgz", + "integrity": "sha512-/l42B1qxpG6RdfYf343Uw1vmDjeNhneUXtzhojE7pDgfpEypmRhI6j1kr17XCVv4Cgl9HdAiQY2x0GwKm7rWCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.24.8", + "@babel/helper-validator-identifier": "^7.24.7", + "to-fast-properties": "^2.0.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@docsearch/css": { 
+ "version": "3.6.1", + "resolved": "https://registry.npmmirror.com/@docsearch/css/-/css-3.6.1.tgz", + "integrity": "sha512-VtVb5DS+0hRIprU2CO6ZQjK2Zg4QU5HrDM1+ix6rT0umsYvFvatMAnf97NHZlVWDaaLlx7GRfR/7FikANiM2Fg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@docsearch/js": { + "version": "3.6.1", + "resolved": "https://registry.npmmirror.com/@docsearch/js/-/js-3.6.1.tgz", + "integrity": "sha512-erI3RRZurDr1xES5hvYJ3Imp7jtrXj6f1xYIzDzxiS7nNBufYWPbJwrmMqWC5g9y165PmxEmN9pklGCdLi0Iqg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@docsearch/react": "3.6.1", + "preact": "^10.0.0" + } + }, + "node_modules/@docsearch/react": { + "version": "3.6.1", + "resolved": "https://registry.npmmirror.com/@docsearch/react/-/react-3.6.1.tgz", + "integrity": "sha512-qXZkEPvybVhSXj0K7U3bXc233tk5e8PfhoZ6MhPOiik/qUQxYC+Dn9DnoS7CxHQQhHfCvTiN0eY9M12oRghEXw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/autocomplete-core": "1.9.3", + "@algolia/autocomplete-preset-algolia": "1.9.3", + "@docsearch/css": "3.6.1", + "algoliasearch": "^4.19.1" + }, + "peerDependencies": { + "@types/react": ">= 16.8.0 < 19.0.0", + "react": ">= 16.8.0 < 19.0.0", + "react-dom": ">= 16.8.0 < 19.0.0", + "search-insights": ">= 1 < 3" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "react": { + "optional": true + }, + "react-dom": { + "optional": true + }, + "search-insights": { + "optional": true + } + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": 
"https://registry.npmmirror.com/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { 
+ "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": 
"sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": 
"https://registry.npmmirror.com/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmmirror.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.22.0.tgz", + "integrity": "sha512-/IZQvg6ZR0tAkEi4tdXOraQoWeJy9gbQ/cx4I7k9dJaCk9qrXEcdouxRVz5kZXt5C2bQ9pILoAA+KB4C/d3pfw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.22.0", + "resolved": 
"https://registry.npmmirror.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.22.0.tgz", + "integrity": "sha512-ETHi4bxrYnvOtXeM7d4V4kZWixib2jddFacJjsOjwbgYSRsyXYtZHC4ht134OsslPIcnkqT+TKV4eU8rNBKyyQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.22.0.tgz", + "integrity": "sha512-ZWgARzhSKE+gVUX7QWaECoRQsPwaD8ZR0Oxb3aUpzdErTvlEadfQpORPXkKSdKbFci9v8MJfkTtoEHnnW9Ulng==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.22.0.tgz", + "integrity": "sha512-h0ZAtOfHyio8Az6cwIGS+nHUfRMWBDO5jXB8PQCARVF6Na/G6XS2SFxDl8Oem+S5ZsHQgtsI7RT4JQnI1qrlaw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.22.0.tgz", + "integrity": "sha512-9pxQJSPwFsVi0ttOmqLY4JJ9pg9t1gKhK0JDbV1yUEETSx55fdyCjt39eBQ54OQCzAF0nVGO6LfEH1KnCPvelA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.22.0.tgz", + "integrity": "sha512-YJ5Ku5BmNJZb58A4qSEo3JlIG4d3G2lWyBi13ABlXzO41SsdnUKi3HQHe83VpwBVG4jHFTW65jOQb8qyoR+qzg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.22.0.tgz", + "integrity": "sha512-U4G4u7f+QCqHlVg1Nlx+qapZy+QoG+NV6ux+upo/T7arNGwKvKP2kmGM4W5QTbdewWFgudQxi3kDNST9GT1/mg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.22.0.tgz", + "integrity": "sha512-aQpNlKmx3amwkA3a5J6nlXSahE1ijl0L9KuIjVOUhfOh7uw2S4piR3mtpxpRtbnK809SBtyPsM9q15CPTsY7HQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.22.0.tgz", + "integrity": "sha512-9fx6Zj/7vve/Fp4iexUFRKb5+RjLCff6YTRQl4CoDhdMfDoobWmhAxQWV3NfShMzQk1Q/iCnageFyGfqnsmeqQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.22.0.tgz", + "integrity": "sha512-VWQiCcN7zBgZYLjndIEh5tamtnKg5TGxyZPWcN9zBtXBwfcGSZ5cHSdQZfQH/GB4uRxk0D3VYbOEe/chJhPGLQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.22.0.tgz", + "integrity": "sha512-EHmPnPWvyYqncObwqrosb/CpH3GOjE76vWVs0g4hWsDRUVhg61hBmlVg5TPXqF+g+PvIbqkC7i3h8wbn4Gp2Fg==", + "cpu": [ + 
"s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.22.0.tgz", + "integrity": "sha512-tsSWy3YQzmpjDKnQ1Vcpy3p9Z+kMFbSIesCdMNgLizDWFhrLZIoN21JSq01g+MZMDFF+Y1+4zxgrlqPjid5ohg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.22.0.tgz", + "integrity": "sha512-anr1Y11uPOQrpuU8XOikY5lH4Qu94oS6j0xrulHk3NkLDq19MlX8Ng/pVipjxBJ9a2l3+F39REZYyWQFkZ4/fw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.22.0.tgz", + "integrity": "sha512-7LB+Bh+Ut7cfmO0m244/asvtIGQr5pG5Rvjz/l1Rnz1kDzM02pSX9jPaS0p+90H5I1x4d1FkCew+B7MOnoatNw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.22.0.tgz", + "integrity": "sha512-+3qZ4rer7t/QsC5JwMpcvCVPRcJt1cJrYS/TMJZzXIJbxWFQEVhrIc26IhB+5Z9fT9umfVc+Es2mOZgl+7jdJQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.22.0.tgz", + "integrity": 
"sha512-YdicNOSJONVx/vuPkgPTyRoAPx3GbknBZRCOUkK84FJ/YTfs/F0vl/YsMscrB6Y177d+yDRcj+JWMPMCgshwrA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@shikijs/core": { + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/@shikijs/core/-/core-1.17.7.tgz", + "integrity": "sha512-ZnIDxFu/yvje3Q8owSHaEHd+bu/jdWhHAaJ17ggjXofHx5rc4bhpCSW+OjC6smUBi5s5dd023jWtZ1gzMu/yrw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@shikijs/engine-javascript": "1.17.7", + "@shikijs/engine-oniguruma": "1.17.7", + "@shikijs/types": "1.17.7", + "@shikijs/vscode-textmate": "^9.2.2", + "@types/hast": "^3.0.4", + "hast-util-to-html": "^9.0.2" + } + }, + "node_modules/@shikijs/engine-javascript": { + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/@shikijs/engine-javascript/-/engine-javascript-1.17.7.tgz", + "integrity": "sha512-wwSf7lKPsm+hiYQdX+1WfOXujtnUG6fnN4rCmExxa4vo+OTmvZ9B1eKauilvol/LHUPrQgW12G3gzem7pY5ckw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@shikijs/types": "1.17.7", + "@shikijs/vscode-textmate": "^9.2.2", + "oniguruma-to-js": "0.4.3" + } + }, + "node_modules/@shikijs/engine-oniguruma": { + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/@shikijs/engine-oniguruma/-/engine-oniguruma-1.17.7.tgz", + "integrity": "sha512-pvSYGnVeEIconU28NEzBXqSQC/GILbuNbAHwMoSfdTBrobKAsV1vq2K4cAgiaW1TJceLV9QMGGh18hi7cCzbVQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@shikijs/types": "1.17.7", + "@shikijs/vscode-textmate": "^9.2.2" + } + }, + "node_modules/@shikijs/transformers": { + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/@shikijs/transformers/-/transformers-1.17.7.tgz", + "integrity": "sha512-Nu7DaUT/qHDqbEsWBBqX6MyPMFbR4hUZcK11TA+zU/nPu9eDFE8v0p+n+eT4A3+3mxX6czMSF81W4QNsQ/NSpQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "shiki": "1.17.7" + } + }, + "node_modules/@shikijs/types": 
{ + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/@shikijs/types/-/types-1.17.7.tgz", + "integrity": "sha512-+qA4UyhWLH2q4EFd+0z4K7GpERDU+c+CN2XYD3sC+zjvAr5iuwD1nToXZMt1YODshjkEGEDV86G7j66bKjqDdg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@shikijs/vscode-textmate": "^9.2.2", + "@types/hast": "^3.0.4" + } + }, + "node_modules/@shikijs/vscode-textmate": { + "version": "9.2.2", + "resolved": "https://registry.npmmirror.com/@shikijs/vscode-textmate/-/vscode-textmate-9.2.2.tgz", + "integrity": "sha512-TMp15K+GGYrWlZM8+Lnj9EaHEFmOen0WJBrfa17hF7taDOYthuPPV0GWzfd/9iMij0akS/8Yw2ikquH7uVi/fg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/estree": { + "version": "1.0.5", + "resolved": "https://registry.npmmirror.com/@types/estree/-/estree-1.0.5.tgz", + "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/hast": { + "version": "3.0.4", + "resolved": "https://registry.npmmirror.com/@types/hast/-/hast-3.0.4.tgz", + "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, + "node_modules/@types/linkify-it": { + "version": "5.0.0", + "resolved": "https://registry.npmmirror.com/@types/linkify-it/-/linkify-it-5.0.0.tgz", + "integrity": "sha512-sVDA58zAw4eWAffKOaQH5/5j3XeayukzDk+ewSsnv3p4yJEZHCCzMDiZM8e0OUrRvmpGZ85jf4yDHkHsgBNr9Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/markdown-it": { + "version": "14.1.2", + "resolved": "https://registry.npmmirror.com/@types/markdown-it/-/markdown-it-14.1.2.tgz", + "integrity": "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/linkify-it": "^5", + "@types/mdurl": "^2" + } + }, + 
"node_modules/@types/mdast": { + "version": "4.0.4", + "resolved": "https://registry.npmmirror.com/@types/mdast/-/mdast-4.0.4.tgz", + "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, + "node_modules/@types/mdurl": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/@types/mdurl/-/mdurl-2.0.0.tgz", + "integrity": "sha512-RGdgjQUZba5p6QEFAVx2OGb8rQDL/cPRG7GiedRzMcJ1tYnUANBncjbSB1NRGwbvjcPeikRABz2nshyPk1bhWg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmmirror.com/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/web-bluetooth": { + "version": "0.0.20", + "resolved": "https://registry.npmmirror.com/@types/web-bluetooth/-/web-bluetooth-0.0.20.tgz", + "integrity": "sha512-g9gZnnXVq7gM7v3tJCWV/qw7w+KeOlSHAhgF9RytFyifW6AF61hdT2ucrYhPq9hLs5JIryeupHV3qGk95dH9ow==", + "dev": true, + "license": "MIT" + }, + "node_modules/@ungap/structured-clone": { + "version": "1.2.0", + "resolved": "https://registry.npmmirror.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz", + "integrity": "sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/@vitejs/plugin-vue": { + "version": "5.1.4", + "resolved": "https://registry.npmmirror.com/@vitejs/plugin-vue/-/plugin-vue-5.1.4.tgz", + "integrity": "sha512-N2XSI2n3sQqp5w7Y/AN/L2XDjBIRGqXko+eDp42sydYSBeJuSm5a1sLf8zakmo8u7tA8NmBgoDLA1HeOESjp9A==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "peerDependencies": { + "vite": "^5.0.0", + "vue": "^3.2.25" + } + }, + "node_modules/@vue/compiler-core": 
{ + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/compiler-core/-/compiler-core-3.5.6.tgz", + "integrity": "sha512-r+gNu6K4lrvaQLQGmf+1gc41p3FO2OUJyWmNqaIITaJU6YFiV5PtQSFZt8jfztYyARwqhoCayjprC7KMvT3nRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/shared": "3.5.6", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/compiler-dom/-/compiler-dom-3.5.6.tgz", + "integrity": "sha512-xRXqxDrIqK8v8sSScpistyYH0qYqxakpsIvqMD2e5sV/PXQ1mTwtXp4k42yHK06KXxKSmitop9e45Ui/3BrTEw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/compiler-core": "3.5.6", + "@vue/shared": "3.5.6" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/compiler-sfc/-/compiler-sfc-3.5.6.tgz", + "integrity": "sha512-pjWJ8Kj9TDHlbF5LywjVso+BIxCY5wVOLhkEXRhuCHDxPFIeX1zaFefKs8RYoHvkSMqRWt93a0f2gNJVJixHwg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/compiler-core": "3.5.6", + "@vue/compiler-dom": "3.5.6", + "@vue/compiler-ssr": "3.5.6", + "@vue/shared": "3.5.6", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.11", + "postcss": "^8.4.47", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-ssr": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/compiler-ssr/-/compiler-ssr-3.5.6.tgz", + "integrity": "sha512-VpWbaZrEOCqnmqjE83xdwegtr5qO/2OPUC6veWgvNqTJ3bYysz6vY3VqMuOijubuUYPRpG3OOKIh9TD0Stxb9A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.6", + "@vue/shared": "3.5.6" + } + }, + "node_modules/@vue/devtools-api": { + "version": "7.4.5", + "resolved": "https://registry.npmmirror.com/@vue/devtools-api/-/devtools-api-7.4.5.tgz", + "integrity": 
"sha512-PX9uXirHOY2P99kb1cP3DxWZojFW3acNMqd+l4i5nKcqY59trXTOfwDZXt2Qifu0OU1izAQb76Ur6NPVldF2KQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/devtools-kit": "^7.4.5" + } + }, + "node_modules/@vue/devtools-kit": { + "version": "7.4.5", + "resolved": "https://registry.npmmirror.com/@vue/devtools-kit/-/devtools-kit-7.4.5.tgz", + "integrity": "sha512-Uuki4Z6Bc/ExvtlPkeDNGSAe4580R+HPcVABfTE9TF7BTz3Nntk7vxIRUyWblZkUEcB/x+wn2uofyt5i2LaUew==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/devtools-shared": "^7.4.5", + "birpc": "^0.2.17", + "hookable": "^5.5.3", + "mitt": "^3.0.1", + "perfect-debounce": "^1.0.0", + "speakingurl": "^14.0.1", + "superjson": "^2.2.1" + } + }, + "node_modules/@vue/devtools-shared": { + "version": "7.4.5", + "resolved": "https://registry.npmmirror.com/@vue/devtools-shared/-/devtools-shared-7.4.5.tgz", + "integrity": "sha512-2XgUOkL/7QDmyYI9J7cm+rz/qBhcGv+W5+i1fhwdQ0HQ1RowhdK66F0QBuJSz/5k12opJY8eN6m03/XZMs7imQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "rfdc": "^1.4.1" + } + }, + "node_modules/@vue/reactivity": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/reactivity/-/reactivity-3.5.6.tgz", + "integrity": "sha512-shZ+KtBoHna5GyUxWfoFVBCVd7k56m6lGhk5e+J9AKjheHF6yob5eukssHRI+rzvHBiU1sWs/1ZhNbLExc5oYQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.6" + } + }, + "node_modules/@vue/runtime-core": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/runtime-core/-/runtime-core-3.5.6.tgz", + "integrity": "sha512-FpFULR6+c2lI+m1fIGONLDqPQO34jxV8g6A4wBOgne8eSRHP6PQL27+kWFIx5wNhhjkO7B4rgtsHAmWv7qKvbg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.6", + "@vue/shared": "3.5.6" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/runtime-dom/-/runtime-dom-3.5.6.tgz", + "integrity": 
"sha512-SDPseWre45G38ENH2zXRAHL1dw/rr5qp91lS4lt/nHvMr0MhsbCbihGAWLXNB/6VfFOJe2O+RBRkXU+CJF7/sw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.6", + "@vue/runtime-core": "3.5.6", + "@vue/shared": "3.5.6", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/server-renderer/-/server-renderer-3.5.6.tgz", + "integrity": "sha512-zivnxQnOnwEXVaT9CstJ64rZFXMS5ZkKxCjDQKiMSvUhXRzFLWZVbaBiNF4HGDqGNNsTgmjcCSmU6TB/0OOxLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.6", + "@vue/shared": "3.5.6" + }, + "peerDependencies": { + "vue": "3.5.6" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/@vue/shared/-/shared-3.5.6.tgz", + "integrity": "sha512-eidH0HInnL39z6wAt6SFIwBrvGOpDWsDxlw3rCgo1B+CQ1781WzQUSU3YjxgdkcJo9Q8S6LmXTkvI+cLHGkQfA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@vueuse/core": { + "version": "11.1.0", + "resolved": "https://registry.npmmirror.com/@vueuse/core/-/core-11.1.0.tgz", + "integrity": "sha512-P6dk79QYA6sKQnghrUz/1tHi0n9mrb/iO1WTMk/ElLmTyNqgDeSZ3wcDf6fRBGzRJbeG1dxzEOvLENMjr+E3fg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/web-bluetooth": "^0.0.20", + "@vueuse/metadata": "11.1.0", + "@vueuse/shared": "11.1.0", + "vue-demi": ">=0.14.10" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/@vueuse/core/node_modules/vue-demi": { + "version": "0.14.10", + "resolved": "https://registry.npmmirror.com/vue-demi/-/vue-demi-0.14.10.tgz", + "integrity": "sha512-nMZBOwuzabUO0nLgIcc6rycZEebF6eeUfaiQx9+WSk8e29IbLvPU9feI6tqW4kTo3hvoYAJkMh8n8D0fuISphg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "vue-demi-fix": "bin/vue-demi-fix.js", + "vue-demi-switch": "bin/vue-demi-switch.js" + }, + "engines": { + "node": ">=12" + }, + "funding": { + 
"url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@vue/composition-api": "^1.0.0-rc.1", + "vue": "^3.0.0-0 || ^2.6.0" + }, + "peerDependenciesMeta": { + "@vue/composition-api": { + "optional": true + } + } + }, + "node_modules/@vueuse/integrations": { + "version": "11.1.0", + "resolved": "https://registry.npmmirror.com/@vueuse/integrations/-/integrations-11.1.0.tgz", + "integrity": "sha512-O2ZgrAGPy0qAjpoI2YR3egNgyEqwG85fxfwmA9BshRIGjV4G6yu6CfOPpMHAOoCD+UfsIl7Vb1bXJ6ifrHYDDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vueuse/core": "11.1.0", + "@vueuse/shared": "11.1.0", + "vue-demi": ">=0.14.10" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "async-validator": "^4", + "axios": "^1", + "change-case": "^5", + "drauu": "^0.4", + "focus-trap": "^7", + "fuse.js": "^7", + "idb-keyval": "^6", + "jwt-decode": "^4", + "nprogress": "^0.2", + "qrcode": "^1.5", + "sortablejs": "^1", + "universal-cookie": "^7" + }, + "peerDependenciesMeta": { + "async-validator": { + "optional": true + }, + "axios": { + "optional": true + }, + "change-case": { + "optional": true + }, + "drauu": { + "optional": true + }, + "focus-trap": { + "optional": true + }, + "fuse.js": { + "optional": true + }, + "idb-keyval": { + "optional": true + }, + "jwt-decode": { + "optional": true + }, + "nprogress": { + "optional": true + }, + "qrcode": { + "optional": true + }, + "sortablejs": { + "optional": true + }, + "universal-cookie": { + "optional": true + } + } + }, + "node_modules/@vueuse/integrations/node_modules/vue-demi": { + "version": "0.14.10", + "resolved": "https://registry.npmmirror.com/vue-demi/-/vue-demi-0.14.10.tgz", + "integrity": "sha512-nMZBOwuzabUO0nLgIcc6rycZEebF6eeUfaiQx9+WSk8e29IbLvPU9feI6tqW4kTo3hvoYAJkMh8n8D0fuISphg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "vue-demi-fix": "bin/vue-demi-fix.js", + "vue-demi-switch": "bin/vue-demi-switch.js" + }, + 
"engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@vue/composition-api": "^1.0.0-rc.1", + "vue": "^3.0.0-0 || ^2.6.0" + }, + "peerDependenciesMeta": { + "@vue/composition-api": { + "optional": true + } + } + }, + "node_modules/@vueuse/metadata": { + "version": "11.1.0", + "resolved": "https://registry.npmmirror.com/@vueuse/metadata/-/metadata-11.1.0.tgz", + "integrity": "sha512-l9Q502TBTaPYGanl1G+hPgd3QX5s4CGnpXriVBR5fEZ/goI6fvDaVmIl3Td8oKFurOxTmbXvBPSsgrd6eu6HYg==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/@vueuse/shared": { + "version": "11.1.0", + "resolved": "https://registry.npmmirror.com/@vueuse/shared/-/shared-11.1.0.tgz", + "integrity": "sha512-YUtIpY122q7osj+zsNMFAfMTubGz0sn5QzE5gPzAIiCmtt2ha3uQUY1+JPyL4gRCTsLPX82Y9brNbo/aqlA91w==", + "dev": true, + "license": "MIT", + "dependencies": { + "vue-demi": ">=0.14.10" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/@vueuse/shared/node_modules/vue-demi": { + "version": "0.14.10", + "resolved": "https://registry.npmmirror.com/vue-demi/-/vue-demi-0.14.10.tgz", + "integrity": "sha512-nMZBOwuzabUO0nLgIcc6rycZEebF6eeUfaiQx9+WSk8e29IbLvPU9feI6tqW4kTo3hvoYAJkMh8n8D0fuISphg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "vue-demi-fix": "bin/vue-demi-fix.js", + "vue-demi-switch": "bin/vue-demi-switch.js" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@vue/composition-api": "^1.0.0-rc.1", + "vue": "^3.0.0-0 || ^2.6.0" + }, + "peerDependenciesMeta": { + "@vue/composition-api": { + "optional": true + } + } + }, + "node_modules/algoliasearch": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/algoliasearch/-/algoliasearch-4.24.0.tgz", + "integrity": 
"sha512-bf0QV/9jVejssFBmz2HQLxUadxk574t4iwjCKp5E7NBzwKkrDEhKPISIIjAU/p6K5qDx3qoeh4+26zWN1jmw3g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/cache-browser-local-storage": "4.24.0", + "@algolia/cache-common": "4.24.0", + "@algolia/cache-in-memory": "4.24.0", + "@algolia/client-account": "4.24.0", + "@algolia/client-analytics": "4.24.0", + "@algolia/client-common": "4.24.0", + "@algolia/client-personalization": "4.24.0", + "@algolia/client-search": "4.24.0", + "@algolia/logger-common": "4.24.0", + "@algolia/logger-console": "4.24.0", + "@algolia/recommend": "4.24.0", + "@algolia/requester-browser-xhr": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/requester-node-http": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/algoliasearch/node_modules/@algolia/client-common": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-common/-/client-common-4.24.0.tgz", + "integrity": "sha512-bc2ROsNL6w6rqpl5jj/UywlIYC21TwSSoFHKl01lYirGMW+9Eek6r02Tocg4gZ8HAw3iBvu6XQiM3BEbmEMoiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/algoliasearch/node_modules/@algolia/client-search": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-4.24.0.tgz", + "integrity": "sha512-uRW6EpNapmLAD0mW47OXqTP8eiIx5F6qN9/x/7HHO6owL3N1IXqydGwW5nhDFBrV+ldouro2W1VX3XlcUXEFCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/client-common": "4.24.0", + "@algolia/requester-common": "4.24.0", + "@algolia/transporter": "4.24.0" + } + }, + "node_modules/algoliasearch/node_modules/@algolia/requester-browser-xhr": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/requester-browser-xhr/-/requester-browser-xhr-4.24.0.tgz", + "integrity": 
"sha512-Z2NxZMb6+nVXSjF13YpjYTdvV3032YTBSGm2vnYvYPA6mMxzM3v5rsCiSspndn9rzIW4Qp1lPHBvuoKJV6jnAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0" + } + }, + "node_modules/algoliasearch/node_modules/@algolia/requester-node-http": { + "version": "4.24.0", + "resolved": "https://registry.npmmirror.com/@algolia/requester-node-http/-/requester-node-http-4.24.0.tgz", + "integrity": "sha512-JF18yTjNOVYvU/L3UosRcvbPMGT9B+/GQWNWnenIImglzNVGpyzChkXLnrSf6uxwVNO6ESGu6oN8MqcGQcjQJw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@algolia/requester-common": "4.24.0" + } + }, + "node_modules/birpc": { + "version": "0.2.17", + "resolved": "https://registry.npmmirror.com/birpc/-/birpc-0.2.17.tgz", + "integrity": "sha512-+hkTxhot+dWsLpp3gia5AkVHIsKlZybNT5gIYiDlNzJrmYPcTM9k5/w2uaj3IPpd7LlEYpmCj4Jj1nC41VhDFg==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/ccount": { + "version": "2.0.1", + "resolved": "https://registry.npmmirror.com/ccount/-/ccount-2.0.1.tgz", + "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/character-entities-html4": { + "version": "2.1.0", + "resolved": "https://registry.npmmirror.com/character-entities-html4/-/character-entities-html4-2.1.0.tgz", + "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/character-entities-legacy": { + "version": "3.0.0", + "resolved": "https://registry.npmmirror.com/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz", + "integrity": 
"sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/comma-separated-tokens": { + "version": "2.0.3", + "resolved": "https://registry.npmmirror.com/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", + "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/copy-anything": { + "version": "3.0.5", + "resolved": "https://registry.npmmirror.com/copy-anything/-/copy-anything-3.0.5.tgz", + "integrity": "sha512-yCEafptTtb4bk7GLEQoM8KVJpxAfdBJYaXyzQEgQQQgYrZiDp8SJmGKlYza6CYjEDNstAdNdKA3UuoULlEbS6w==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-what": "^4.1.8" + }, + "engines": { + "node": ">=12.13" + }, + "funding": { + "url": "https://github.com/sponsors/mesqueeb" + } + }, + "node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmmirror.com/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", + "dev": true, + "license": "MIT" + }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmmirror.com/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/devlop": { + "version": "1.1.0", + "resolved": "https://registry.npmmirror.com/devlop/-/devlop-1.1.0.tgz", + "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==", + "dev": true, + "license": "MIT", + "dependencies": { + "dequal": "^2.0.0" + }, + 
"funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmmirror.com/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.21.5", + "resolved": "https://registry.npmmirror.com/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmmirror.com/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + 
"dev": true, + "license": "MIT" + }, + "node_modules/focus-trap": { + "version": "7.6.0", + "resolved": "https://registry.npmmirror.com/focus-trap/-/focus-trap-7.6.0.tgz", + "integrity": "sha512-1td0l3pMkWJLFipobUcGaf+5DTY4PLDDrcqoSaKP8ediO/CoWCCYk/fT/Y2A4e6TNB+Sh6clRJCjOPPnKoNHnQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "tabbable": "^6.2.0" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmmirror.com/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/hast-util-to-html": { + "version": "9.0.3", + "resolved": "https://registry.npmmirror.com/hast-util-to-html/-/hast-util-to-html-9.0.3.tgz", + "integrity": "sha512-M17uBDzMJ9RPCqLMO92gNNUDuBSq10a25SDBI08iCCxmorf4Yy6sYHK57n9WAbRAAaU+DuR4W6GN9K4DFZesYg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "ccount": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-whitespace": "^3.0.0", + "html-void-elements": "^3.0.0", + "mdast-util-to-hast": "^13.0.0", + "property-information": "^6.0.0", + "space-separated-tokens": "^2.0.0", + "stringify-entities": "^4.0.0", + "zwitch": "^2.0.4" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-whitespace": { + "version": "3.0.0", + "resolved": "https://registry.npmmirror.com/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", + "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/unified" + } + }, + "node_modules/hookable": { + "version": "5.5.3", + "resolved": "https://registry.npmmirror.com/hookable/-/hookable-5.5.3.tgz", + "integrity": "sha512-Yc+BQe8SvoXH1643Qez1zqLRmbA5rCL+sSmk6TVos0LWVfNIB7PGncdlId77WzLGSIB5KaWgTaNTs2lNVEI6VQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/html-void-elements": { + "version": "3.0.0", + "resolved": "https://registry.npmmirror.com/html-void-elements/-/html-void-elements-3.0.0.tgz", + "integrity": "sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/is-what": { + "version": "4.1.16", + "resolved": "https://registry.npmmirror.com/is-what/-/is-what-4.1.16.tgz", + "integrity": "sha512-ZhMwEosbFJkA0YhFnNDgTM4ZxDRsS6HqTo7qsZM08fehyRYIYa0yHu5R6mgo1n/8MgaPBXiPimPD77baVFYg+A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.13" + }, + "funding": { + "url": "https://github.com/sponsors/mesqueeb" + } + }, + "node_modules/magic-string": { + "version": "0.30.11", + "resolved": "https://registry.npmmirror.com/magic-string/-/magic-string-0.30.11.tgz", + "integrity": "sha512-+Wri9p0QHMy+545hKww7YAu5NyzF8iomPL/RQazugQ9+Ez4Ic3mERMd8ZTX5rfK944j+560ZJi8iAwgak1Ac7A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0" + } + }, + "node_modules/mark.js": { + "version": "8.11.1", + "resolved": "https://registry.npmmirror.com/mark.js/-/mark.js-8.11.1.tgz", + "integrity": "sha512-1I+1qpDt4idfgLQG+BNWmrqku+7/2bi5nLf4YwF8y8zXvmfiTBY3PV3ZibfrjBueCByROpuBjLLFCajqkgYoLQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/mdast-util-to-hast": { + "version": "13.2.0", + "resolved": "https://registry.npmmirror.com/mdast-util-to-hast/-/mdast-util-to-hast-13.2.0.tgz", + "integrity": 
"sha512-QGYKEuUsYT9ykKBCMOEDLsU5JRObWQusAolFMeko/tYPufNkRffBAQjIE+99jbA87xv6FgmjLtwjh9wBWajwAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/mdast": "^4.0.0", + "@ungap/structured-clone": "^1.0.0", + "devlop": "^1.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "trim-lines": "^3.0.0", + "unist-util-position": "^5.0.0", + "unist-util-visit": "^5.0.0", + "vfile": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-util-character": { + "version": "2.1.0", + "resolved": "https://registry.npmmirror.com/micromark-util-character/-/micromark-util-character-2.1.0.tgz", + "integrity": "sha512-KvOVV+X1yLBfs9dCBSopq/+G1PcgT3lAK07mC4BzXi5E7ahzMAF8oIupDDJ6mievI6F+lAATkbQQlQixJfT3aQ==", + "dev": true, + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + } + }, + "node_modules/micromark-util-encode": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/micromark-util-encode/-/micromark-util-encode-2.0.0.tgz", + "integrity": "sha512-pS+ROfCXAGLWCOc8egcBvT0kf27GoWMqtdarNfDcjb6YLuV5cM3ioG45Ys2qOVqeqSbjaKg72vU+Wby3eddPsA==", + "dev": true, + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromark-util-sanitize-uri": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/micromark-util-sanitize-uri/-/micromark-util-sanitize-uri-2.0.0.tgz", + "integrity": "sha512-WhYv5UEcZrbAtlsnPuChHUAsu/iBPOVaEVsntLBIdpibO0ddy8OzavZz3iL2xVvBZOpolujSliP65Kq0/7KIYw==", + "dev": true, + "funding": [ + { + "type": 
"GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT", + "dependencies": { + "micromark-util-character": "^2.0.0", + "micromark-util-encode": "^2.0.0", + "micromark-util-symbol": "^2.0.0" + } + }, + "node_modules/micromark-util-symbol": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/micromark-util-symbol/-/micromark-util-symbol-2.0.0.tgz", + "integrity": "sha512-8JZt9ElZ5kyTnO94muPxIGS8oyElRJaiJO8EzV6ZSyGQ1Is8xwl4Q45qU5UOg+bGH4AikWziz0iN4sFLWs8PGw==", + "dev": true, + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/micromark-util-types": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/micromark-util-types/-/micromark-util-types-2.0.0.tgz", + "integrity": "sha512-oNh6S2WMHWRZrmutsRmDDfkzKtxF+bc2VxLC9dvtrDIRFln627VsFP6fLMgTryGDljgLPjkrzQSDcPrjPyDJ5w==", + "dev": true, + "funding": [ + { + "type": "GitHub Sponsors", + "url": "https://github.com/sponsors/unifiedjs" + }, + { + "type": "OpenCollective", + "url": "https://opencollective.com/unified" + } + ], + "license": "MIT" + }, + "node_modules/minisearch": { + "version": "7.1.0", + "resolved": "https://registry.npmmirror.com/minisearch/-/minisearch-7.1.0.tgz", + "integrity": "sha512-tv7c/uefWdEhcu6hvrfTihflgeEi2tN6VV7HJnCjK6VxM75QQJh4t9FwJCsA2EsRS8LCnu3W87CuGPWMocOLCA==", + "dev": true, + "license": "MIT" + }, + "node_modules/mitt": { + "version": "3.0.1", + "resolved": "https://registry.npmmirror.com/mitt/-/mitt-3.0.1.tgz", + "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.7", + "resolved": 
"https://registry.npmmirror.com/nanoid/-/nanoid-3.3.7.tgz", + "integrity": "sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/oniguruma-to-js": { + "version": "0.4.3", + "resolved": "https://registry.npmmirror.com/oniguruma-to-js/-/oniguruma-to-js-0.4.3.tgz", + "integrity": "sha512-X0jWUcAlxORhOqqBREgPMgnshB7ZGYszBNspP+tS9hPD3l13CdaXcHbgImoHUHlrvGx/7AvFEkTRhAGYh+jzjQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "regex": "^4.3.2" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/perfect-debounce": { + "version": "1.0.0", + "resolved": "https://registry.npmmirror.com/perfect-debounce/-/perfect-debounce-1.0.0.tgz", + "integrity": "sha512-xCy9V055GLEqoFaHoC1SoLIaLmWctgCUaBaWxDZ7/Zx4CTyX7cJQLJOok/orfjZAh9kEYpjJa4d0KcJmCbctZA==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.0", + "resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.1.0.tgz", + "integrity": "sha512-TQ92mBOW0l3LeMeyLV6mzy/kWr8lkd/hp3mTg7wYK7zJhuBStmGMBG0BdeDZS/dZx1IukaX6Bk11zcln25o1Aw==", + "dev": true, + "license": "ISC" + }, + "node_modules/postcss": { + "version": "8.4.47", + "resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.4.47.tgz", + "integrity": "sha512-56rxCq7G/XfB4EkXq9Egn5GCqugWvDFjafDOThIdMBsI15iqPqR5r15TfSr1YPYeEI19YeaXMCbY6u88Y76GLQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": 
"^3.3.7", + "picocolors": "^1.1.0", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/preact": { + "version": "10.24.0", + "resolved": "https://registry.npmmirror.com/preact/-/preact-10.24.0.tgz", + "integrity": "sha512-aK8Cf+jkfyuZ0ZZRG9FbYqwmEiGQ4y/PUO4SuTWoyWL244nZZh7bd5h2APd4rSNDYTBNghg1L+5iJN3Skxtbsw==", + "dev": true, + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/preact" + } + }, + "node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmmirror.com/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/regex": { + "version": "4.3.2", + "resolved": "https://registry.npmmirror.com/regex/-/regex-4.3.2.tgz", + "integrity": "sha512-kK/AA3A9K6q2js89+VMymcboLOlF5lZRCYJv3gzszXFHBr6kO6qLGzbm+UIugBEV8SMMKCTR59txoY6ctRHYVw==", + "dev": true, + "license": "MIT" + }, + "node_modules/rfdc": { + "version": "1.4.1", + "resolved": "https://registry.npmmirror.com/rfdc/-/rfdc-1.4.1.tgz", + "integrity": "sha512-q1b3N5QkRUWUl7iyylaaj3kOpIT0N2i9MqIEQXP73GVsN9cw3fdx8X63cEmWhJGi2PPCF23Ijp7ktmd39rawIA==", + "dev": true, + "license": "MIT" + }, + "node_modules/rollup": { + "version": "4.22.0", + "resolved": "https://registry.npmmirror.com/rollup/-/rollup-4.22.0.tgz", + "integrity": "sha512-W21MUIFPZ4+O2Je/EU+GP3iz7PH4pVPUXSbEZdatQnxo29+3rsUjgrJmzuAZU24z7yRAnFN6ukxeAhZh/c7hzg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.5" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.22.0", + "@rollup/rollup-android-arm64": "4.22.0", + 
"@rollup/rollup-darwin-arm64": "4.22.0", + "@rollup/rollup-darwin-x64": "4.22.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.22.0", + "@rollup/rollup-linux-arm-musleabihf": "4.22.0", + "@rollup/rollup-linux-arm64-gnu": "4.22.0", + "@rollup/rollup-linux-arm64-musl": "4.22.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.22.0", + "@rollup/rollup-linux-riscv64-gnu": "4.22.0", + "@rollup/rollup-linux-s390x-gnu": "4.22.0", + "@rollup/rollup-linux-x64-gnu": "4.22.0", + "@rollup/rollup-linux-x64-musl": "4.22.0", + "@rollup/rollup-win32-arm64-msvc": "4.22.0", + "@rollup/rollup-win32-ia32-msvc": "4.22.0", + "@rollup/rollup-win32-x64-msvc": "4.22.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/search-insights": { + "version": "2.17.2", + "resolved": "https://registry.npmmirror.com/search-insights/-/search-insights-2.17.2.tgz", + "integrity": "sha512-zFNpOpUO+tY2D85KrxJ+aqwnIfdEGi06UH2+xEb+Bp9Mwznmauqc9djbnBibJO5mpfUPPa8st6Sx65+vbeO45g==", + "dev": true, + "license": "MIT", + "peer": true + }, + "node_modules/shiki": { + "version": "1.17.7", + "resolved": "https://registry.npmmirror.com/shiki/-/shiki-1.17.7.tgz", + "integrity": "sha512-Zf6hNtWhFyF4XP5OOsXkBTEx9JFPiN0TQx4wSe+Vqeuczewgk2vT4IZhF4gka55uelm052BD5BaHavNqUNZd+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@shikijs/core": "1.17.7", + "@shikijs/engine-javascript": "1.17.7", + "@shikijs/engine-oniguruma": "1.17.7", + "@shikijs/types": "1.17.7", + "@shikijs/vscode-textmate": "^9.2.2", + "@types/hast": "^3.0.4" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/space-separated-tokens": { + "version": "2.0.2", + "resolved": 
"https://registry.npmmirror.com/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", + "integrity": "sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/speakingurl": { + "version": "14.0.1", + "resolved": "https://registry.npmmirror.com/speakingurl/-/speakingurl-14.0.1.tgz", + "integrity": "sha512-1POYv7uv2gXoyGFpBCmpDVSNV74IfsWlDW216UPjbWufNf+bSU6GdbDsxdcxtfwb4xlI3yxzOTKClUosxARYrQ==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/stringify-entities": { + "version": "4.0.4", + "resolved": "https://registry.npmmirror.com/stringify-entities/-/stringify-entities-4.0.4.tgz", + "integrity": "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg==", + "dev": true, + "license": "MIT", + "dependencies": { + "character-entities-html4": "^2.0.0", + "character-entities-legacy": "^3.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/superjson": { + "version": "2.2.1", + "resolved": "https://registry.npmmirror.com/superjson/-/superjson-2.2.1.tgz", + "integrity": "sha512-8iGv75BYOa0xRJHK5vRLEjE2H/i4lulTjzpUXic3Eg8akftYjkmQDa8JARQ42rlczXyFR3IeRoeFCc7RxHsYZA==", + "dev": true, + "license": "MIT", + "dependencies": { + "copy-anything": "^3.0.2" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tabbable": { + "version": "6.2.0", + "resolved": "https://registry.npmmirror.com/tabbable/-/tabbable-6.2.0.tgz", + "integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew==", + "dev": true, + "license": "MIT" + }, + "node_modules/to-fast-properties": { + "version": "2.0.0", + "resolved": "https://registry.npmmirror.com/to-fast-properties/-/to-fast-properties-2.0.0.tgz", + 
"integrity": "sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/trim-lines": { + "version": "3.0.1", + "resolved": "https://registry.npmmirror.com/trim-lines/-/trim-lines-3.0.1.tgz", + "integrity": "sha512-kRj8B+YHZCc9kQYdWfJB2/oUl9rA99qbowYYBtr4ui4mZyAQ2JpvVBd/6U2YloATfqBhBTSMhTpgBHtU0Mf3Rg==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/unist-util-is": { + "version": "6.0.0", + "resolved": "https://registry.npmmirror.com/unist-util-is/-/unist-util-is-6.0.0.tgz", + "integrity": "sha512-2qCTHimwdxLfz+YzdGfkqNlH0tLi9xjTnHddPmJwtIG9MGsdbutfTc4P+haPD7l7Cjxf/WZj+we5qfVPvvxfYw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-position": { + "version": "5.0.0", + "resolved": "https://registry.npmmirror.com/unist-util-position/-/unist-util-position-5.0.0.tgz", + "integrity": "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-stringify-position": { + "version": "4.0.0", + "resolved": "https://registry.npmmirror.com/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", + "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit": { + 
"version": "5.0.0", + "resolved": "https://registry.npmmirror.com/unist-util-visit/-/unist-util-visit-5.0.0.tgz", + "integrity": "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/unist-util-visit-parents": { + "version": "6.0.1", + "resolved": "https://registry.npmmirror.com/unist-util-visit-parents/-/unist-util-visit-parents-6.0.1.tgz", + "integrity": "sha512-L/PqWzfTP9lzzEa6CKs0k2nARxTdZduw3zyh8d2NVBnsyvHjSX4TWse388YrrQKbvI8w20fGjGlhgT96WwKykw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile": { + "version": "6.0.3", + "resolved": "https://registry.npmmirror.com/vfile/-/vfile-6.0.3.tgz", + "integrity": "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-message": { + "version": "4.0.2", + "resolved": "https://registry.npmmirror.com/vfile-message/-/vfile-message-4.0.2.tgz", + "integrity": "sha512-jRDZ1IMLttGj41KcZvlrYAaI3CfqpLpfpf+Mfig13viT6NKvRzWZ+lXz0Y5D60w6uJIBAOGq9mSHf0gktF0duw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-stringify-position": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vite": { + "version": "5.4.6", + "resolved": 
"https://registry.npmmirror.com/vite/-/vite-5.4.6.tgz", + "integrity": "sha512-IeL5f8OO5nylsgzd9tq4qD2QqI0k2CQLGrWD0rCN0EQJZpBK5vJAx0I+GDkMOXxQX/OfFHMuLIx6ddAxGX/k+Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/vitepress": { + "version": "1.3.4", + "resolved": "https://registry.npmmirror.com/vitepress/-/vitepress-1.3.4.tgz", + "integrity": "sha512-I1/F6OW1xl3kW4PaIMC6snxjWgf3qfziq2aqsDoFc/Gt41WbcRv++z8zjw8qGRIJ+I4bUW7ZcKFDHHN/jkH9DQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@docsearch/css": "^3.6.1", + "@docsearch/js": "^3.6.1", + "@shikijs/core": "^1.13.0", + "@shikijs/transformers": "^1.13.0", + "@types/markdown-it": "^14.1.2", + "@vitejs/plugin-vue": "^5.1.2", + "@vue/devtools-api": "^7.3.8", + "@vue/shared": "^3.4.38", + "@vueuse/core": "^11.0.0", + "@vueuse/integrations": "^11.0.0", + "focus-trap": "^7.5.4", + "mark.js": "8.11.1", + "minisearch": "^7.1.0", + "shiki": "^1.13.0", + "vite": "^5.4.1", + "vue": "^3.4.38" + }, + "bin": { + "vitepress": "bin/vitepress.js" + }, + "peerDependencies": { + "markdown-it-mathjax3": "^4", + "postcss": "^8" + }, + 
"peerDependenciesMeta": { + "markdown-it-mathjax3": { + "optional": true + }, + "postcss": { + "optional": true + } + } + }, + "node_modules/vue": { + "version": "3.5.6", + "resolved": "https://registry.npmmirror.com/vue/-/vue-3.5.6.tgz", + "integrity": "sha512-zv+20E2VIYbcJOzJPUWp03NOGFhMmpCKOfSxVTmCYyYFFko48H9tmuQFzYj7tu4qX1AeXlp9DmhIP89/sSxxhw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.6", + "@vue/compiler-sfc": "3.5.6", + "@vue/runtime-dom": "3.5.6", + "@vue/server-renderer": "3.5.6", + "@vue/shared": "3.5.6" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/zwitch": { + "version": "2.0.4", + "resolved": "https://registry.npmmirror.com/zwitch/-/zwitch-2.0.4.tgz", + "integrity": "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==", + "dev": true, + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + } + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/package.json b/MindSpider/DeepSentimentCrawling/MediaCrawler/package.json new file mode 100644 index 0000000..d7e5a5a --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/package.json @@ -0,0 +1,10 @@ +{ + "scripts": { + "docs:dev": "vitepress dev docs", + "docs:build": "vitepress build docs", + "docs:preview": "vitepress preview docs" + }, + "devDependencies": { + "vitepress": "^1.3.4" + } +} diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/__init__.py new file mode 100644 index 0000000..fe35c24 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/__init__.py @@ -0,0 +1,16 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/2 14:37 +# @Desc : IP代理池入口 +from .base_proxy import * diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/base_proxy.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/base_proxy.py new file mode 100644 index 0000000..b6f0027 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/base_proxy.py @@ -0,0 +1,75 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/2 11:18 +# @Desc : 爬虫 IP 获取实现 +# @Url : 快代理HTTP实现,官方文档:https://www.kuaidaili.com/?ref=ldwkjqipvz6c +import json +from abc import ABC, abstractmethod +from typing import List + +import config +from cache.abs_cache import AbstractCache +from cache.cache_factory import CacheFactory +from tools.utils import utils + +from .types import IpInfoModel + + +class IpGetError(Exception): + """ ip get error""" + + +class ProxyProvider(ABC): + @abstractmethod + async def get_proxy(self, num: int) -> List[IpInfoModel]: + """ + 获取 IP 的抽象方法,不同的 HTTP 代理商需要实现该方法 + :param num: 提取的 IP 数量 + :return: + """ + raise NotImplementedError + + + +class IpCache: + def __init__(self): + self.cache_client: AbstractCache = CacheFactory.create_cache(cache_type=config.CACHE_TYPE_MEMORY) + + def set_ip(self, ip_key: str, ip_value_info: str, ex: int): + """ + 设置IP并带有过期时间,到期之后由 redis 负责删除 + :param ip_key: + :param ip_value_info: + :param ex: + :return: + """ + self.cache_client.set(key=ip_key, value=ip_value_info, expire_time=ex) + + def load_all_ip(self, proxy_brand_name: str) -> List[IpInfoModel]: + """ + 从 redis 中加载所有还未过期的 IP 信息 + :param proxy_brand_name: 代理商名称 + :return: + """ + 
all_ip_list: List[IpInfoModel] = [] + all_ip_keys: List[str] = self.cache_client.keys(pattern=f"{proxy_brand_name}_*") + try: + for ip_key in all_ip_keys: + ip_value = self.cache_client.get(ip_key) + if not ip_value: + continue + all_ip_list.append(IpInfoModel(**json.loads(ip_value))) + except Exception as e: + utils.logger.error("[IpCache.load_all_ip] get ip err from redis db", e) + return all_ip_list diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/__init__.py new file mode 100644 index 0000000..1fda3ee --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/__init__.py @@ -0,0 +1,18 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/4/5 10:13 +# @Desc : +from .jishu_http_proxy import new_jisu_http_proxy +from .kuaidl_proxy import new_kuai_daili_proxy +from .wandou_http_proxy import new_wandou_http_proxy \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/jishu_http_proxy.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/jishu_http_proxy.py new file mode 100644 index 0000000..1a84c08 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/jishu_http_proxy.py @@ -0,0 +1,99 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/4/5 09:32 +# @Desc : 已废弃!!!!!倒闭了!!!极速HTTP 代理IP实现. 
请使用快代理实现(proxy/providers/kuaidl_proxy.py) +import os +from typing import Dict, List +from urllib.parse import urlencode + +import httpx + +from proxy import IpCache, IpGetError, ProxyProvider +from proxy.types import IpInfoModel +from tools import utils + + +class JiSuHttpProxy(ProxyProvider): + + def __init__(self, key: str, crypto: str, time_validity_period: int): + """ + 极速HTTP 代理IP实现 + :param key: 提取key值 (去官网注册后获取) + :param crypto: 加密签名 (去官网注册后获取) + """ + self.proxy_brand_name = "JISUHTTP" + self.api_path = "https://api.jisuhttp.com" + self.params = { + "key": key, + "crypto": crypto, + "time": time_validity_period, # IP使用时长,支持3、5、10、15、30分钟时效 + "type": "json", # 数据结果为json + "port": "2", # IP协议:1:HTTP、2:HTTPS、3:SOCKS5 + "pw": "1", # 是否使用账密验证, 1:是,0:否,否表示白名单验证;默认为0 + "se": "1", # 返回JSON格式时是否显示IP过期时间, 1:显示,0:不显示;默认为0 + } + self.ip_cache = IpCache() + + async def get_proxy(self, num: int) -> List[IpInfoModel]: + """ + :param num: + :return: + """ + + # 优先从缓存中拿 IP + ip_cache_list = self.ip_cache.load_all_ip(proxy_brand_name=self.proxy_brand_name) + if len(ip_cache_list) >= num: + return ip_cache_list[:num] + + # 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中 + need_get_count = num - len(ip_cache_list) + self.params.update({"num": need_get_count}) + ip_infos = [] + async with httpx.AsyncClient() as client: + url = self.api_path + "/fetchips" + '?' 
+ urlencode(self.params) + utils.logger.info(f"[JiSuHttpProxy.get_proxy] get ip proxy url:{url}") + response = await client.get(url, headers={ + "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler", + }) + res_dict: Dict = response.json() + if res_dict.get("code") == 0: + data: List[Dict] = res_dict.get("data") + current_ts = utils.get_unix_timestamp() + for ip_item in data: + ip_info_model = IpInfoModel( + ip=ip_item.get("ip"), + port=ip_item.get("port"), + user=ip_item.get("user"), + password=ip_item.get("pass"), + expired_time_ts=utils.get_unix_time_from_time_str(ip_item.get("expire")), + ) + ip_key = f"JISUHTTP_{ip_info_model.ip}_{ip_info_model.port}_{ip_info_model.user}_{ip_info_model.password}" + ip_value = ip_info_model.json() + ip_infos.append(ip_info_model) + self.ip_cache.set_ip(ip_key, ip_value, ex=ip_info_model.expired_time_ts - current_ts) + else: + raise IpGetError(res_dict.get("msg", "unkown err")) + return ip_cache_list + ip_infos + + +def new_jisu_http_proxy() -> JiSuHttpProxy: + """ + 构造极速HTTP实例 + Returns: + + """ + return JiSuHttpProxy( + key=os.getenv("jisu_key", ""), # 通过环境变量的方式获取极速HTTPIP提取key值 + crypto=os.getenv("jisu_crypto", ""), # 通过环境变量的方式获取极速HTTPIP提取加密签名 + time_validity_period=30 # 30分钟(最长时效) + ) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/kuaidl_proxy.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/kuaidl_proxy.py new file mode 100644 index 0000000..8ca1062 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/kuaidl_proxy.py @@ -0,0 +1,145 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/4/5 09:43 +# @Desc : 快代理HTTP实现,官方文档:https://www.kuaidaili.com/?ref=ldwkjqipvz6c +import os +import re +from typing import Dict, List + +import httpx +from pydantic import BaseModel, Field + +from proxy import IpCache, IpInfoModel, ProxyProvider +from proxy.types import ProviderNameEnum +from tools import utils + + +class KuaidailiProxyModel(BaseModel): + ip: str = Field("ip") + port: int = Field("端口") + expire_ts: int = Field("过期时间") + + +def parse_kuaidaili_proxy(proxy_info: str) -> KuaidailiProxyModel: + """ + 解析快代理的IP信息 + Args: + proxy_info: + + Returns: + + """ + proxies: List[str] = proxy_info.split(":") + if len(proxies) != 2: + raise Exception("not invalid kuaidaili proxy info") + + pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5}),(\d+)' + match = re.search(pattern, proxy_info) + if not match.groups(): + raise Exception("not match kuaidaili proxy info") + + return KuaidailiProxyModel( + ip=match.groups()[0], + port=int(match.groups()[1]), + expire_ts=int(match.groups()[2]) + ) + + +class KuaiDaiLiProxy(ProxyProvider): + def __init__(self, kdl_user_name: str, kdl_user_pwd: str, kdl_secret_id: str, kdl_signature: str): + """ + + Args: + kdl_user_name: + kdl_user_pwd: + """ + self.kdl_user_name = kdl_user_name + self.kdl_user_pwd = kdl_user_pwd + self.api_base = "https://dps.kdlapi.com/" + self.secret_id = kdl_secret_id + self.signature = kdl_signature + self.ip_cache = IpCache() + self.proxy_brand_name = ProviderNameEnum.KUAI_DAILI_PROVIDER.value + self.params = { + "secret_id": self.secret_id, + "signature": self.signature, + "pt": 1, + "format": "json", + "sep": 1, + "f_et": 1, + } + + async def get_proxy(self, num: int) -> List[IpInfoModel]: + """ + 快代理实现 + Args: + num: + + Returns: + + """ + uri = "/api/getdps/" + + # 优先从缓存中拿 IP + ip_cache_list = 
self.ip_cache.load_all_ip(proxy_brand_name=self.proxy_brand_name) + if len(ip_cache_list) >= num: + return ip_cache_list[:num] + + # 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中 + need_get_count = num - len(ip_cache_list) + self.params.update({"num": need_get_count}) + + ip_infos: List[IpInfoModel] = [] + async with httpx.AsyncClient() as client: + response = await client.get(self.api_base + uri, params=self.params) + + if response.status_code != 200: + utils.logger.error(f"[KuaiDaiLiProxy.get_proxies] statuc code not 200 and response.txt:{response.text}") + raise Exception("get ip error from proxy provider and status code not 200 ...") + + ip_response: Dict = response.json() + if ip_response.get("code") != 0: + utils.logger.error(f"[KuaiDaiLiProxy.get_proxies] code not 0 and msg:{ip_response.get('msg')}") + raise Exception("get ip error from proxy provider and code not 0 ...") + + proxy_list: List[str] = ip_response.get("data", {}).get("proxy_list") + for proxy in proxy_list: + proxy_model = parse_kuaidaili_proxy(proxy) + ip_info_model = IpInfoModel( + ip=proxy_model.ip, + port=proxy_model.port, + user=self.kdl_user_name, + password=self.kdl_user_pwd, + expired_time_ts=proxy_model.expire_ts, + + ) + ip_key = f"{self.proxy_brand_name}_{ip_info_model.ip}_{ip_info_model.port}" + self.ip_cache.set_ip(ip_key, ip_info_model.model_dump_json(), ex=ip_info_model.expired_time_ts) + ip_infos.append(ip_info_model) + + return ip_cache_list + ip_infos + + +def new_kuai_daili_proxy() -> KuaiDaiLiProxy: + """ + 构造快代理HTTP实例 + Returns: + + """ + return KuaiDaiLiProxy( + kdl_secret_id=os.getenv("kdl_secret_id", "你的快代理secert_id"), + kdl_signature=os.getenv("kdl_signature", "你的快代理签名"), + kdl_user_name=os.getenv("kdl_user_name", "你的快代理用户名"), + kdl_user_pwd=os.getenv("kdl_user_pwd", "你的快代理密码"), + ) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/wandou_http_proxy.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/wandou_http_proxy.py new file mode 100644 index 
0000000..78ae245 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/providers/wandou_http_proxy.py @@ -0,0 +1,110 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2025/7/31 +# @Desc : 豌豆HTTP 代理IP实现 +import os +from typing import Dict, List +from urllib.parse import urlencode + +import httpx + +from proxy import IpCache, IpGetError, ProxyProvider +from proxy.types import IpInfoModel +from tools import utils + + +class WanDouHttpProxy(ProxyProvider): + + def __init__(self, app_key: str, num: int = 100): + """ + 豌豆HTTP 代理IP实现 + :param app_key: 开放的app_key,可以通过用户中心获取 + :param num: 单次提取IP数量,最大100 + """ + self.proxy_brand_name = "WANDOUHTTP" + self.api_path = "https://api.wandouapp.com/" + self.params = { + "app_key": app_key, + "num": num, + } + self.ip_cache = IpCache() + + async def get_proxy(self, num: int) -> List[IpInfoModel]: + """ + :param num: + :return: + """ + + # 优先从缓存中拿 IP + ip_cache_list = self.ip_cache.load_all_ip( + proxy_brand_name=self.proxy_brand_name + ) + if len(ip_cache_list) >= num: + return ip_cache_list[:num] + + # 如果缓存中的数量不够,从IP代理商获取补上,再存入缓存中 + need_get_count = num - len(ip_cache_list) + self.params.update({"num": min(need_get_count, 100)}) # 最大100 + ip_infos = [] + async with httpx.AsyncClient() as client: + url = self.api_path + "?" 
+ urlencode(self.params) + utils.logger.info(f"[WanDouHttpProxy.get_proxy] get ip proxy url:{url}") + response = await client.get( + url, + headers={ + "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler", + }, + ) + res_dict: Dict = response.json() + if res_dict.get("code") == 200: + data: List[Dict] = res_dict.get("data", []) + current_ts = utils.get_unix_timestamp() + for ip_item in data: + ip_info_model = IpInfoModel( + ip=ip_item.get("ip"), + port=ip_item.get("port"), + user="", # 豌豆HTTP不需要用户名密码认证 + password="", + expired_time_ts=utils.get_unix_time_from_time_str( + ip_item.get("expire_time") + ), + ) + ip_key = f"WANDOUHTTP_{ip_info_model.ip}_{ip_info_model.port}" + ip_value = ip_info_model.model_dump_json() + ip_infos.append(ip_info_model) + self.ip_cache.set_ip( + ip_key, ip_value, ex=ip_info_model.expired_time_ts - current_ts + ) + else: + error_msg = res_dict.get("msg", "unknown error") + # 处理具体错误码 + error_code = res_dict.get("code") + if error_code == 10001: + error_msg = "通用错误,具体错误信息查看msg内容" + elif error_code == 10048: + error_msg = "没有可用套餐" + raise IpGetError(f"{error_msg} (code: {error_code})") + return ip_cache_list + ip_infos + + +def new_wandou_http_proxy() -> WanDouHttpProxy: + """ + 构造豌豆HTTP实例 + Returns: + + """ + return WanDouHttpProxy( + app_key=os.getenv( + "wandou_app_key", "你的豌豆HTTP app_key" + ), # 通过环境变量的方式获取豌豆HTTP app_key + ) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/proxy_ip_pool.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/proxy_ip_pool.py new file mode 100644 index 0000000..cbc8ea4 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/proxy_ip_pool.py @@ -0,0 +1,136 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2023/12/2 13:45 +# @Desc : ip代理池实现 +import random +from typing import Dict, List + +import httpx +from tenacity import retry, stop_after_attempt, wait_fixed + +import config +from proxy.providers import ( + new_kuai_daili_proxy, + new_wandou_http_proxy, +) +from tools import utils + +from .base_proxy import ProxyProvider +from .types import IpInfoModel, ProviderNameEnum + + +class ProxyIpPool: + + def __init__( + self, ip_pool_count: int, enable_validate_ip: bool, ip_provider: ProxyProvider + ) -> None: + """ + + Args: + ip_pool_count: + enable_validate_ip: + ip_provider: + """ + self.valid_ip_url = "https://echo.apifox.cn/" # 验证 IP 是否有效的地址 + self.ip_pool_count = ip_pool_count + self.enable_validate_ip = enable_validate_ip + self.proxy_list: List[IpInfoModel] = [] + self.ip_provider: ProxyProvider = ip_provider + + async def load_proxies(self) -> None: + """ + 加载IP代理 + Returns: + + """ + self.proxy_list = await self.ip_provider.get_proxy(self.ip_pool_count) + + async def _is_valid_proxy(self, proxy: IpInfoModel) -> bool: + """ + 验证代理IP是否有效 + :param proxy: + :return: + """ + utils.logger.info( + f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} is it valid " + ) + try: + # httpx 0.28.1 需要直接传入代理URL字符串,而不是字典 + if proxy.user and proxy.password: + proxy_url = f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}" + else: + proxy_url = f"http://{proxy.ip}:{proxy.port}" + + async with httpx.AsyncClient(proxy=proxy_url) as client: + response = await client.get(self.valid_ip_url) + if response.status_code == 200: + return True + else: + return False + except Exception as e: + utils.logger.info( + f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} err: {e}" + ) + raise e + + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + async def get_proxy(self) -> IpInfoModel: + """ + 从代理池中随机提取一个代理IP + :return: + """ 
+ if len(self.proxy_list) == 0: + await self._reload_proxies() + + proxy = random.choice(self.proxy_list) + self.proxy_list.remove(proxy) # 取出来一个IP就应该移出掉 + if self.enable_validate_ip: + if not await self._is_valid_proxy(proxy): + raise Exception( + "[ProxyIpPool.get_proxy] current ip invalid and again get it" + ) + return proxy + + async def _reload_proxies(self): + """ + # 重新加载代理池 + :return: + """ + self.proxy_list = [] + await self.load_proxies() + + +IpProxyProvider: Dict[str, ProxyProvider] = { + ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(), + ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(), +} + + +async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool: + """ + 创建 IP 代理池 + :param ip_pool_count: ip池子的数量 + :param enable_validate_ip: 是否开启验证IP代理 + :return: + """ + pool = ProxyIpPool( + ip_pool_count=ip_pool_count, + enable_validate_ip=enable_validate_ip, + ip_provider=IpProxyProvider.get(config.IP_PROXY_PROVIDER_NAME), + ) + await pool.load_proxies() + return pool + + +if __name__ == "__main__": + pass diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/types.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/types.py new file mode 100644 index 0000000..db8c9b1 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/proxy/types.py @@ -0,0 +1,35 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/4/5 10:18 +# @Desc : 基础类型 +from enum import Enum +from typing import Optional + +from pydantic import BaseModel, Field + + +class ProviderNameEnum(Enum): + KUAI_DAILI_PROVIDER: str = "kuaidaili" + WANDOU_HTTP_PROVIDER: str = "wandouhttp" + + +class IpInfoModel(BaseModel): + """Unified IP model""" + + ip: str = Field(title="ip") + port: int = Field(title="端口") + user: str = Field(title="IP代理认证的用户名") + protocol: str = Field(default="https://", title="代理IP的协议") + password: str = Field(title="IP代理认证用户的密码") + expired_time_ts: Optional[int] = Field(title="IP 过期时间") diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/pyproject.toml b/MindSpider/DeepSentimentCrawling/MediaCrawler/pyproject.toml new file mode 100644 index 0000000..cc36c70 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "mediacrawler" +author = "程序员阿江-Relakkes " +version = "0.1.0" +description = "A social media crawler project, support Xiaohongshu, Weibo, Zhihu, Bilibili, Douyin, BaiduTieBa etc." 
def extract_verification_code(message: str) -> str:
    """
    Extract the first 6-digit verification code from an SMS text.

    A code is a run of exactly six ASCII digits that is not directly preceded
    or followed by another digit, so longer or shorter digit runs are ignored.

    Args:
        message: raw SMS content.

    Returns:
        The first matching 6-digit code, or "" when the message has none.
    """
    # Digit lookarounds are used instead of \b: in Python's Unicode mode CJK
    # characters count as word characters, so "验证码171959" has no word
    # boundary before the code and \b[0-9]{6}\b would fail to match it.
    match = re.search(r'(?<![0-9])[0-9]{6}(?![0-9])', message)
    return match.group(0) if match else ""
@app.get("/", status_code=status.HTTP_404_NOT_FOUND)
async def not_found():
    # GET on the root path is not served: every request is answered with 404.
    # A comment is used rather than a docstring so the route's generated
    # description stays exactly as it was.
    error = HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not Found")
    raise error
-- ----------------------------
-- bilibili_video_comment (SQLite): one row per comment on a Bilibili video.
-- Indexed by comment_id and by video_id.
-- ----------------------------
DROP TABLE IF EXISTS bilibili_video_comment;
CREATE TABLE bilibili_video_comment (
    id                INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id           TEXT    DEFAULT NULL,               -- user id
    nickname          TEXT    DEFAULT NULL,               -- user nickname
    sex               TEXT    DEFAULT NULL,               -- user gender
    sign              TEXT    DEFAULT NULL,               -- user signature
    avatar            TEXT    DEFAULT NULL,               -- user avatar URL
    add_ts            INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts    INTEGER NOT NULL,                   -- record last-modified timestamp
    comment_id        TEXT    NOT NULL,                   -- comment id
    video_id          TEXT    NOT NULL,                   -- video id
    content           TEXT,                               -- comment text
    create_time       INTEGER NOT NULL,                   -- comment timestamp
    sub_comment_count TEXT    NOT NULL,                   -- number of replies (stored as text)
    parent_comment_id TEXT    DEFAULT NULL,
    like_count        TEXT    NOT NULL DEFAULT '0'        -- stored as text, defaults to '0'
);

CREATE INDEX idx_bilibili_vi_comment_41c34e ON bilibili_video_comment(comment_id);
CREATE INDEX idx_bilibili_vi_video_i_f22873 ON bilibili_video_comment(video_id);
-- ----------------------------
-- bilibili_contact_info (SQLite): one row per uploader ("up") / fan pair.
-- Indexed by up_id and by fan_id.
-- ----------------------------
DROP TABLE IF EXISTS bilibili_contact_info;
CREATE TABLE bilibili_contact_info (
    id             INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    up_id          TEXT    DEFAULT NULL,               -- uploader id
    fan_id         TEXT    DEFAULT NULL,               -- fan id
    up_name        TEXT    DEFAULT NULL,               -- uploader nickname
    fan_name       TEXT    DEFAULT NULL,               -- fan nickname
    up_sign        TEXT    DEFAULT NULL,               -- uploader signature
    fan_sign       TEXT    DEFAULT NULL,               -- fan signature
    up_avatar      TEXT    DEFAULT NULL,               -- uploader avatar URL
    fan_avatar     TEXT    DEFAULT NULL,               -- fan avatar URL
    add_ts         INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts INTEGER NOT NULL                    -- record last-modified timestamp
);

CREATE INDEX idx_bilibili_contact_info_up_id ON bilibili_contact_info(up_id);
CREATE INDEX idx_bilibili_contact_info_fan_id ON bilibili_contact_info(fan_id);
-- ----------------------------
-- douyin_aweme_comment (SQLite): one row per comment on a Douyin video.
-- Indexed by comment_id and by aweme_id.
-- ----------------------------
DROP TABLE IF EXISTS douyin_aweme_comment;
CREATE TABLE douyin_aweme_comment (
    id                INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id           TEXT    DEFAULT NULL,               -- user id
    sec_uid           TEXT    DEFAULT NULL,               -- user sec_uid
    short_user_id     TEXT    DEFAULT NULL,               -- short user id
    user_unique_id    TEXT    DEFAULT NULL,               -- unique user id
    nickname          TEXT    DEFAULT NULL,               -- user nickname
    avatar            TEXT    DEFAULT NULL,               -- user avatar URL
    user_signature    TEXT    DEFAULT NULL,               -- user signature
    ip_location       TEXT    DEFAULT NULL,               -- IP location at comment time
    add_ts            INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts    INTEGER NOT NULL,                   -- record last-modified timestamp
    comment_id        TEXT    NOT NULL,                   -- comment id
    aweme_id          TEXT    NOT NULL,                   -- video id
    content           TEXT,                               -- comment text
    create_time       INTEGER NOT NULL,                   -- comment timestamp
    sub_comment_count TEXT    NOT NULL,                   -- number of replies (stored as text)
    parent_comment_id TEXT    DEFAULT NULL,
    like_count        TEXT    NOT NULL DEFAULT '0',       -- stored as text, defaults to '0'
    pictures          TEXT    NOT NULL DEFAULT ''         -- defaults to the empty string
);

CREATE INDEX idx_douyin_awem_comment_fcd7e4 ON douyin_aweme_comment(comment_id);
CREATE INDEX idx_douyin_awem_aweme_i_c50049 ON douyin_aweme_comment(aweme_id);
-- ----------------------------
-- kuaishou_video_comment (SQLite): one row per comment on a Kuaishou video.
-- Indexed by comment_id and by video_id.
-- ----------------------------
DROP TABLE IF EXISTS kuaishou_video_comment;
CREATE TABLE kuaishou_video_comment (
    id                INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id           TEXT    DEFAULT NULL,               -- user id
    nickname          TEXT    DEFAULT NULL,               -- user nickname
    avatar            TEXT    DEFAULT NULL,               -- user avatar URL
    add_ts            INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts    INTEGER NOT NULL,                   -- record last-modified timestamp
    comment_id        TEXT    NOT NULL,                   -- comment id
    video_id          TEXT    NOT NULL,                   -- video id
    content           TEXT,                               -- comment text
    create_time       INTEGER NOT NULL,                   -- comment timestamp
    sub_comment_count TEXT    NOT NULL                    -- number of replies (stored as text)
);

CREATE INDEX idx_kuaishou_vi_comment_ed48fa ON kuaishou_video_comment(comment_id);
CREATE INDEX idx_kuaishou_vi_video_i_e50914 ON kuaishou_video_comment(video_id);
-- ----------------------------
-- weibo_note_comment (SQLite): one row per comment on a Weibo post.
-- Indexed by comment_id, note_id and create_date_time.
-- ----------------------------
DROP TABLE IF EXISTS weibo_note_comment;
CREATE TABLE weibo_note_comment (
    id                 INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id            TEXT    DEFAULT NULL,               -- user id
    nickname           TEXT    DEFAULT NULL,               -- user nickname
    avatar             TEXT    DEFAULT NULL,               -- user avatar URL
    gender             TEXT    DEFAULT NULL,               -- user gender
    profile_url        TEXT    DEFAULT NULL,               -- user profile page URL
    ip_location        TEXT    DEFAULT '',                 -- defaults to the empty string
    add_ts             INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts     INTEGER NOT NULL,                   -- record last-modified timestamp
    comment_id         TEXT    NOT NULL,                   -- comment id
    note_id            TEXT    NOT NULL,                   -- post id
    content            TEXT,                               -- comment text
    create_time        INTEGER NOT NULL,                   -- comment timestamp
    create_date_time   TEXT    NOT NULL,                   -- comment date-time, stored as text
    comment_like_count TEXT    NOT NULL,                   -- comment like count (stored as text)
    sub_comment_count  TEXT    NOT NULL,                   -- number of replies (stored as text)
    parent_comment_id  TEXT    DEFAULT NULL
);

CREATE INDEX idx_weibo_note__comment_c7611c ON weibo_note_comment(comment_id);
CREATE INDEX idx_weibo_note__note_id_24f108 ON weibo_note_comment(note_id);
CREATE INDEX idx_weibo_note__create__667fe3 ON weibo_note_comment(create_date_time);
-- ----------------------------
-- xhs_creator (SQLite): creator profile rows.
-- Only the primary key is indexed; no secondary index is declared here.
-- ----------------------------
DROP TABLE IF EXISTS xhs_creator;
CREATE TABLE xhs_creator (
    id             INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id        TEXT    NOT NULL,                   -- required
    nickname       TEXT    DEFAULT NULL,
    avatar         TEXT    DEFAULT NULL,
    ip_location    TEXT    DEFAULT NULL,
    add_ts         INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts INTEGER NOT NULL,                   -- record last-modified timestamp
    desc           TEXT,                               -- keyword used as a column name, kept unquoted as in the original
    gender         TEXT    DEFAULT NULL,
    follows        TEXT    DEFAULT NULL,               -- count kept as text
    fans           TEXT    DEFAULT NULL,               -- count kept as text
    interaction    TEXT    DEFAULT NULL,
    tag_list       TEXT
);
-- ----------------------------
-- tieba_note (SQLite): one row per Tieba post.
-- Indexed by note_id and by publish_time.
-- ----------------------------
DROP TABLE IF EXISTS tieba_note;
CREATE TABLE tieba_note (
    id                INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    note_id           TEXT    NOT NULL,
    title             TEXT    NOT NULL,
    desc              TEXT,                               -- keyword used as a column name, kept unquoted as in the original
    note_url          TEXT    NOT NULL,
    publish_time      TEXT    NOT NULL,                   -- stored as text
    user_link         TEXT    DEFAULT '',
    user_nickname     TEXT    DEFAULT '',
    user_avatar       TEXT    DEFAULT '',
    tieba_id          TEXT    DEFAULT '',
    tieba_name        TEXT    NOT NULL,
    tieba_link        TEXT    NOT NULL,
    total_replay_num  INTEGER DEFAULT 0,
    total_replay_page INTEGER DEFAULT 0,
    ip_location       TEXT    DEFAULT '',
    add_ts            INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts    INTEGER NOT NULL,                   -- record last-modified timestamp
    source_keyword    TEXT    DEFAULT ''
);

CREATE INDEX idx_tieba_note_note_id ON tieba_note(note_id);
CREATE INDEX idx_tieba_note_publish_time ON tieba_note(publish_time);
-- ----------------------------
-- tieba_creator (SQLite): creator profile rows.
-- Only the primary key is indexed; no secondary index is declared here.
-- ----------------------------
DROP TABLE IF EXISTS tieba_creator;
CREATE TABLE tieba_creator (
    id                    INTEGER PRIMARY KEY AUTOINCREMENT,  -- auto-increment row id
    user_id               TEXT    NOT NULL,                   -- required
    user_name             TEXT    NOT NULL,                   -- required
    nickname              TEXT    DEFAULT NULL,
    avatar                TEXT    DEFAULT NULL,
    ip_location           TEXT    DEFAULT NULL,
    add_ts                INTEGER NOT NULL,                   -- record creation timestamp
    last_modify_ts        INTEGER NOT NULL,                   -- record last-modified timestamp
    gender                TEXT    DEFAULT NULL,
    follows               TEXT    DEFAULT NULL,               -- count kept as text
    fans                  TEXT    DEFAULT NULL,               -- count kept as text
    registration_duration TEXT    DEFAULT NULL
);
-- ----------------------------
-- zhihu_creator (SQLite): creator profile rows, at most one row per user_id.
--
-- Uniqueness of user_id is enforced by the named unique index below. The
-- column itself no longer carries a UNIQUE constraint: SQLite implements a
-- column-level UNIQUE with its own automatic index, so declaring both made
-- every write maintain two identical indexes on the same column.
-- ----------------------------
DROP TABLE IF EXISTS zhihu_creator;
CREATE TABLE zhihu_creator (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id TEXT NOT NULL,                        -- unique via idx_zhihu_creator_user_id
    user_link TEXT NOT NULL,
    user_nickname TEXT NOT NULL,
    user_avatar TEXT NOT NULL,
    url_token TEXT NOT NULL,
    gender TEXT DEFAULT NULL,
    ip_location TEXT DEFAULT NULL,
    follows INTEGER NOT NULL DEFAULT 0,
    fans INTEGER NOT NULL DEFAULT 0,
    anwser_count INTEGER NOT NULL DEFAULT 0,      -- column name spelling kept as-is for compatibility
    video_count INTEGER NOT NULL DEFAULT 0,
    question_count INTEGER NOT NULL DEFAULT 0,
    article_count INTEGER NOT NULL DEFAULT 0,
    column_count INTEGER NOT NULL DEFAULT 0,
    get_voteup_count INTEGER NOT NULL DEFAULT 0,
    add_ts INTEGER NOT NULL,                      -- record creation timestamp
    last_modify_ts INTEGER NOT NULL               -- record last-modified timestamp
);

CREATE UNIQUE INDEX idx_zhihu_creator_user_id ON zhihu_creator(user_id);
-- ----------------------------
-- bilibili_video_comment (MySQL): one row per comment on a Bilibili video.
-- Secondary keys on comment_id and video_id.
-- ----------------------------
DROP TABLE IF EXISTS `bilibili_video_comment`;
CREATE TABLE `bilibili_video_comment` (
    `id`                int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`           varchar(64)  DEFAULT NULL COMMENT '用户ID',
    `nickname`          varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `sex`               varchar(64)  DEFAULT NULL COMMENT '用户性别',
    `sign`              text         DEFAULT NULL COMMENT '用户签名',
    `avatar`            varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `add_ts`            bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`    bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `comment_id`        varchar(64)  NOT NULL COMMENT '评论ID',
    `video_id`          varchar(64)  NOT NULL COMMENT '视频ID',
    `content`           longtext     COMMENT '评论内容',
    `create_time`       bigint       NOT NULL COMMENT '评论时间戳',
    `sub_comment_count` varchar(16)  NOT NULL COMMENT '评论回复数',
    PRIMARY KEY (`id`),
    KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`),
    KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论';
-- ----------------------------
-- bilibili_up_dynamic (MySQL): one row per dynamic (feed post) of an uploader.
-- Secondary key on dynamic_id.
-- ----------------------------
DROP TABLE IF EXISTS `bilibili_up_dynamic`;
CREATE TABLE `bilibili_up_dynamic` (
    `id`             int         NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `dynamic_id`     varchar(64) DEFAULT NULL COMMENT '动态ID',
    `user_id`        varchar(64) DEFAULT NULL COMMENT '用户ID',
    `user_name`      varchar(64) DEFAULT NULL COMMENT '用户名',
    `text`           longtext    DEFAULT NULL COMMENT '动态文本',
    `type`           varchar(64) DEFAULT NULL COMMENT '动态类型',
    `pub_ts`         bigint      DEFAULT NULL COMMENT '动态发布时间',
    `total_comments` bigint      DEFAULT NULL COMMENT '评论数',
    `total_forwards` bigint      DEFAULT NULL COMMENT '转发数',
    `total_liked`    bigint      DEFAULT NULL COMMENT '点赞数',
    `add_ts`         bigint      NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint      NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    KEY `idx_bilibili_up_dynamic_dynamic_id` (`dynamic_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站up主动态信息';
-- ----------------------------
-- douyin_aweme_comment (MySQL): one row per comment on a Douyin video.
-- Secondary keys on comment_id and aweme_id.
-- ----------------------------
DROP TABLE IF EXISTS `douyin_aweme_comment`;
CREATE TABLE `douyin_aweme_comment` (
    `id`                int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`           varchar(64)  DEFAULT NULL COMMENT '用户ID',
    `sec_uid`           varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
    `short_user_id`     varchar(64)  DEFAULT NULL COMMENT '用户短ID',
    `user_unique_id`    varchar(64)  DEFAULT NULL COMMENT '用户唯一ID',
    `nickname`          varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`            varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `user_signature`    varchar(500) DEFAULT NULL COMMENT '用户签名',
    `ip_location`       varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
    `add_ts`            bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`    bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `comment_id`        varchar(64)  NOT NULL COMMENT '评论ID',
    `aweme_id`          varchar(64)  NOT NULL COMMENT '视频ID',
    `content`           longtext     COMMENT '评论内容',
    `create_time`       bigint       NOT NULL COMMENT '评论时间戳',
    `sub_comment_count` varchar(16)  NOT NULL COMMENT '评论回复数',
    PRIMARY KEY (`id`),
    KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`),
    KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论';
-- ----------------------------
-- weibo_note (MySQL): one row per Weibo post.
-- Secondary keys on note_id, create_time and create_date_time.
--
-- Fix: ip_location was declared as
--     varchar(32) DEFAULT '发布微博的地理信息'
-- i.e. the COMMENT keyword was missing, so the column description became the
-- default VALUE stored in every row inserted without an ip_location. It now
-- defaults to the empty string (matching the SQLite schema) and carries the
-- description as a proper column COMMENT.
-- ----------------------------
DROP TABLE IF EXISTS `weibo_note`;
CREATE TABLE `weibo_note`
(
    `id`               int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`          varchar(64)  DEFAULT NULL COMMENT '用户ID',
    `nickname`         varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`           varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `gender`           varchar(12)  DEFAULT NULL COMMENT '用户性别',
    `profile_url`      varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    `ip_location`      varchar(32)  DEFAULT '' COMMENT '发布微博的地理信息',
    `add_ts`           bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`   bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `note_id`          varchar(64)  NOT NULL COMMENT '帖子ID',
    `content`          longtext     COMMENT '帖子正文内容',
    `create_time`      bigint       NOT NULL COMMENT '帖子发布时间戳',
    `create_date_time` varchar(32)  NOT NULL COMMENT '帖子发布日期时间',
    `liked_count`      varchar(16)  DEFAULT NULL COMMENT '帖子点赞数',
    `comments_count`   varchar(16)  DEFAULT NULL COMMENT '帖子评论数量',
    `shared_count`     varchar(16)  DEFAULT NULL COMMENT '帖子转发数量',
    `note_url`         varchar(512) DEFAULT NULL COMMENT '帖子详情URL',
    PRIMARY KEY (`id`),
    KEY `idx_weibo_note_note_id_f95b1a` (`note_id`),
    KEY `idx_weibo_note_create__692709` (`create_time`),
    KEY `idx_weibo_note_create__d05ed2` (`create_date_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子';
'评论回复数', + PRIMARY KEY (`id`), + KEY `idx_weibo_note__comment_c7611c` (`comment_id`), + KEY `idx_weibo_note__note_id_24f108` (`note_id`), + KEY `idx_weibo_note__create__667fe3` (`create_date_time`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子评论'; + +-- ---------------------------- +-- Table structure for xhs_creator +-- ---------------------------- +DROP TABLE IF EXISTS `xhs_creator`; +CREATE TABLE `xhs_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数', + `tag_list` longtext COMMENT '标签列表', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书博主'; + +-- ---------------------------- +-- Table structure for xhs_note +-- ---------------------------- +DROP TABLE IF EXISTS `xhs_note`; +CREATE TABLE `xhs_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)', + `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', + `desc` longtext COMMENT '笔记描述', + `video_url` longtext COMMENT '视频地址', + 
`time` bigint NOT NULL COMMENT '笔记发布时间戳', + `last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '笔记点赞数', + `collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数', + `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', + `share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数', + `image_list` longtext COMMENT '笔记封面图片列表', + `tag_list` longtext COMMENT '标签列表', + `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', + PRIMARY KEY (`id`), + KEY `idx_xhs_note_note_id_209457` (`note_id`), + KEY `idx_xhs_note_time_eaa910` (`time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记'; + +-- ---------------------------- +-- Table structure for xhs_note_comment +-- ---------------------------- +DROP TABLE IF EXISTS `xhs_note_comment`; +CREATE TABLE `xhs_note_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `content` longtext NOT NULL COMMENT '评论内容', + `sub_comment_count` int NOT NULL COMMENT '子评论数量', + `pictures` varchar(512) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`), + KEY `idx_xhs_note_co_create__204f8d` (`create_time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论'; + +-- ---------------------------- +-- alter table xhs_note_comment to support parent_comment_id +-- ---------------------------- +ALTER TABLE `xhs_note_comment` + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL 
COMMENT '父评论ID'; + +ALTER TABLE `douyin_aweme_comment` + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + +ALTER TABLE `bilibili_video_comment` + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + +ALTER TABLE `weibo_note_comment` + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + + +DROP TABLE IF EXISTS `tieba_note`; +CREATE TABLE tieba_note +( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + note_id VARCHAR(644) NOT NULL COMMENT '帖子ID', + title VARCHAR(255) NOT NULL COMMENT '帖子标题', + `desc` TEXT COMMENT '帖子描述', + note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', + publish_time VARCHAR(255) NOT NULL COMMENT '发布时间', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', + tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', + tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', + total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数', + total_replay_page INT DEFAULT 0 COMMENT '帖子回复总页数', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_note_note_id` (`note_id`), + KEY `idx_tieba_note_publish_time` (`publish_time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; + +DROP TABLE IF EXISTS `tieba_comment`; +CREATE TABLE tieba_comment +( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', + parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', + content TEXT NOT NULL COMMENT '评论内容', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', + tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', + 
tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', + publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + sub_comment_count INT DEFAULT 0 COMMENT '子评论数', + note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', + note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_comment_comment_id` (`note_id`), + KEY `idx_tieba_comment_note_id` (`note_id`), + KEY `idx_tieba_comment_publish_time` (`publish_time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; + +-- 增加搜索来源关键字字段 +alter table bilibili_video + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table douyin_aweme + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table kuaishou_video + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table weibo_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table xhs_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; +alter table tieba_note + add column `source_keyword` varchar(255) default '' comment '搜索来源关键字'; + + +DROP TABLE IF EXISTS `weibo_creator`; +CREATE TABLE `weibo_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(2) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `tag_list` longtext COMMENT '标签列表', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 
COLLATE=utf8mb4_0900_ai_ci COMMENT='微博博主'; + + +ALTER TABLE `xhs_note_comment` + ADD COLUMN `like_count` VARCHAR(64) DEFAULT NULL COMMENT '评论点赞数量'; + + +DROP TABLE IF EXISTS `tieba_creator`; +CREATE TABLE `tieba_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_name` varchar(64) NOT NULL COMMENT '用户名', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `gender` varchar(2) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `registration_duration` varchar(16) DEFAULT NULL COMMENT '吧龄', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧创作者'; + +DROP TABLE IF EXISTS `zhihu_content`; +CREATE TABLE `zhihu_content` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `content_id` varchar(64) NOT NULL COMMENT '内容ID', + `content_type` varchar(16) NOT NULL COMMENT '内容类型(article | answer | zvideo)', + `content_text` longtext COMMENT '内容文本, 如果是视频类型这里为空', + `content_url` varchar(255) NOT NULL COMMENT '内容落地链接', + `question_id` varchar(64) DEFAULT NULL COMMENT '问题ID, type为answer时有值', + `title` varchar(255) NOT NULL COMMENT '内容标题', + `desc` longtext COMMENT '内容描述', + `created_time` varchar(32) NOT NULL COMMENT '创建时间', + `updated_time` varchar(32) NOT NULL COMMENT '更新时间', + `voteup_count` int NOT NULL DEFAULT '0' COMMENT '赞同人数', + `comment_count` int NOT NULL DEFAULT '0' COMMENT '评论数量', + `source_keyword` varchar(64) DEFAULT NULL COMMENT '来源关键词', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL COMMENT '用户头像地址', + 
`user_url_token` varchar(255) NOT NULL COMMENT '用户url_token', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + KEY `idx_zhihu_content_content_id` (`content_id`), + KEY `idx_zhihu_content_created_time` (`created_time`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎内容(回答、文章、视频)'; + + +DROP TABLE IF EXISTS `zhihu_comment`; +CREATE TABLE `zhihu_comment` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `parent_comment_id` varchar(64) DEFAULT NULL COMMENT '父评论ID', + `content` text NOT NULL COMMENT '评论内容', + `publish_time` varchar(32) NOT NULL COMMENT '发布时间', + `ip_location` varchar(64) DEFAULT NULL COMMENT 'IP地理位置', + `sub_comment_count` int NOT NULL DEFAULT '0' COMMENT '子评论数', + `like_count` int NOT NULL DEFAULT '0' COMMENT '点赞数', + `dislike_count` int NOT NULL DEFAULT '0' COMMENT '踩数', + `content_id` varchar(64) NOT NULL COMMENT '内容ID', + `content_type` varchar(16) NOT NULL COMMENT '内容类型(article | answer | zvideo)', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + KEY `idx_zhihu_comment_comment_id` (`comment_id`), + KEY `idx_zhihu_comment_content_id` (`content_id`), + KEY `idx_zhihu_comment_publish_time` (`publish_time`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎评论'; + +DROP TABLE IF EXISTS `zhihu_creator`; +CREATE TABLE `zhihu_creator` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL 
COMMENT '用户头像地址', + `url_token` varchar(64) NOT NULL COMMENT '用户URL Token', + `gender` varchar(16) DEFAULT NULL COMMENT '用户性别', + `ip_location` varchar(64) DEFAULT NULL COMMENT 'IP地理位置', + `follows` int NOT NULL DEFAULT 0 COMMENT '关注数', + `fans` int NOT NULL DEFAULT 0 COMMENT '粉丝数', + `anwser_count` int NOT NULL DEFAULT 0 COMMENT '回答数', + `video_count` int NOT NULL DEFAULT 0 COMMENT '视频数', + `question_count` int NOT NULL DEFAULT 0 COMMENT '问题数', + `article_count` int NOT NULL DEFAULT 0 COMMENT '文章数', + `column_count` int NOT NULL DEFAULT 0 COMMENT '专栏数', + `get_voteup_count` int NOT NULL DEFAULT 0 COMMENT '获得的赞同数', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY `idx_zhihu_creator_user_id` (`user_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎创作者'; + + +-- add column `like_count` to douyin_aweme_comment +alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT '点赞数'; + +alter table xhs_note add column xsec_token varchar(50) default null comment '签名算法'; +alter table douyin_aweme_comment add column `pictures` varchar(500) NOT NULL DEFAULT '' COMMENT '评论图片列表'; +alter table bilibili_video_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT '点赞数'; diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/__init__.py new file mode 100644 index 0000000..771a1ba --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/__init__.py @@ -0,0 +1,15 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
from typing import Dict, List

import config
# Imported explicitly: these names used to be reachable only through the
# star-import of ``bilibili_store_impl`` below, which breaks as soon as that
# module declares ``__all__``.
from base.base_crawler import AbstractStore
from tools import utils
from var import source_keyword_var

from .bilibili_store_impl import *
from .bilibilli_store_media import *


class BiliStoreFactory:
    """Select the Bilibili store backend named by ``config.SAVE_DATA_OPTION``."""

    # Maps the configured save option to the class implementing it.
    STORES = {
        "csv": BiliCsvStoreImplement,
        "db": BiliDbStoreImplement,
        "json": BiliJsonStoreImplement,
        "sqlite": BiliSqliteStoreImplement,
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """Return a new store instance for the configured save option.

        Raises:
            ValueError: if ``config.SAVE_DATA_OPTION`` is not a key of ``STORES``.
        """
        store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if not store_class:
            raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
        return store_class()


async def update_bilibili_video(video_item: Dict):
    """Flatten a video detail payload and persist it as a content record.

    Args:
        video_item: payload holding a ``"View"`` dict with ``owner`` and
            ``stat`` sub-dicts.
    """
    video_item_view: Dict = video_item.get("View")
    video_user_info: Dict = video_item_view.get("owner")
    video_item_stat: Dict = video_item_view.get("stat")
    video_id = str(video_item_view.get("aid"))
    save_content_item = {
        "video_id": video_id,
        "video_type": "video",
        # ``or ""`` also covers a key that is present with a None value,
        # which ``.get(key, "")`` does not and which would make slicing fail.
        "title": (video_item_view.get("title") or "")[:500],
        "desc": (video_item_view.get("desc") or "")[:500],
        "create_time": video_item_view.get("pubdate"),
        "user_id": str(video_user_info.get("mid")),
        "nickname": video_user_info.get("name"),
        "avatar": video_user_info.get("face", ""),
        "liked_count": str(video_item_stat.get("like", "")),
        "disliked_count": str(video_item_stat.get("dislike", "")),
        "video_play_count": str(video_item_stat.get("view", "")),
        "video_favorite_count": str(video_item_stat.get("favorite", "")),
        "video_share_count": str(video_item_stat.get("share", "")),
        "video_coin_count": str(video_item_stat.get("coin", "")),
        "video_danmaku": str(video_item_stat.get("danmaku", "")),
        "video_comment": str(video_item_stat.get("reply", "")),
        "last_modify_ts": utils.get_current_timestamp(),
        "video_url": f"https://www.bilibili.com/video/av{video_id}",
        "video_cover_url": video_item_view.get("pic", ""),
        "source_keyword": source_keyword_var.get(),
    }
    utils.logger.info(f"[store.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{save_content_item.get('title')}")
    await BiliStoreFactory.create_store().store_content(content_item=save_content_item)


async def update_up_info(video_item: Dict):
    """Persist the uploader ("UP") profile found in a video detail payload.

    Args:
        video_item: payload holding a ``"Card"`` dict whose ``"card"`` entry
            describes the uploader.
    """
    video_item_card_list: Dict = video_item.get("Card")
    video_item_card: Dict = video_item_card_list.get("card")
    # Both sub-dicts are optional; fall back to an empty dict so a missing
    # section yields None for the field instead of an AttributeError.
    level_info: Dict = video_item_card.get("level_info") or {}
    official_verify: Dict = video_item_card.get("official_verify") or {}
    saver_up_info = {
        "user_id": str(video_item_card.get("mid")),
        "nickname": video_item_card.get("name"),
        "sex": video_item_card.get("sex"),
        "sign": video_item_card.get("sign"),
        "avatar": video_item_card.get("face"),
        "last_modify_ts": utils.get_current_timestamp(),
        "total_fans": video_item_card.get("fans"),
        "total_liked": video_item_card_list.get("like_num"),
        "user_rank": level_info.get("current_level"),
        "is_official": official_verify.get("type"),
    }
    utils.logger.info(f"[store.bilibili.update_up_info] bilibili user_id:{video_item_card.get('mid')}")
    await BiliStoreFactory.create_store().store_creator(creator=saver_up_info)


async def batch_update_bilibili_video_comments(video_id: str, comments: List[Dict]):
    """Persist every comment in ``comments`` for ``video_id``, one at a time."""
    if not comments:
        return
    for comment_item in comments:
        await update_bilibili_video_comment(video_id, comment_item)


async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
    """Flatten one comment payload and persist it.

    Args:
        video_id: id of the video the comment belongs to.
        comment_item: comment payload with ``content`` and ``member`` sub-dicts.
    """
    comment_id = str(comment_item.get("rpid"))
    parent_comment_id = str(comment_item.get("parent", 0))
    content: Dict = comment_item.get("content")
    user_info: Dict = comment_item.get("member")
    save_comment_item = {
        "comment_id": comment_id,
        "parent_comment_id": parent_comment_id,
        "create_time": comment_item.get("ctime"),
        "video_id": str(video_id),
        "content": content.get("message"),
        "user_id": user_info.get("mid"),
        "nickname": user_info.get("uname"),
        "sex": user_info.get("sex"),
        "sign": user_info.get("sign"),
        "avatar": user_info.get("avatar"),
        "sub_comment_count": str(comment_item.get("rcount", 0)),
        "like_count": comment_item.get("like", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}")
    await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item)


async def store_video(aid, video_content, extension_file_name):
    """Hand downloaded video data to the ``BilibiliVideo`` media store.

    Args:
        aid: video id.
        video_content: downloaded video payload.
        extension_file_name: file name (with extension) to store under.
    """
    await BilibiliVideo().store_video({
        "aid": aid,
        "video_content": video_content,
        "extension_file_name": extension_file_name,
    })


async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List[Dict]):
    """Persist one creator/fan relation for every entry of ``fans_list``."""
    if not fans_list:
        return
    for fan_item in fans_list:
        fan_info: Dict = {
            "id": fan_item.get("mid"),
            "name": fan_item.get("uname"),
            "sign": fan_item.get("sign"),
            "avatar": fan_item.get("face"),
        }
        await update_bilibili_creator_contact(creator_info=creator_info, fan_info=fan_info)


async def batch_update_bilibili_creator_followings(creator_info: Dict, followings_list: List[Dict]):
    """Persist one relation for every account that ``creator_info`` follows."""
    if not followings_list:
        return
    for following_item in followings_list:
        following_info: Dict = {
            "id": following_item.get("mid"),
            "name": following_item.get("uname"),
            "sign": following_item.get("sign"),
            "avatar": following_item.get("face"),
        }
        # Roles are swapped on purpose: the followed account is the "up"
        # and the crawled creator is its fan.
        await update_bilibili_creator_contact(creator_info=following_info, fan_info=creator_info)


async def batch_update_bilibili_creator_dynamics(creator_info: Dict, dynamics_list: List[Dict]):
    """Flatten and persist every dynamic (feed post) of a creator."""
    if not dynamics_list:
        return
    for dynamic_item in dynamics_list:
        modules: Dict = dynamic_item["modules"]
        module_dynamic: Dict = modules["module_dynamic"]
        dynamic_stat: Dict = modules["module_stat"]
        # ``desc`` may be missing or empty; such a dynamic is stored with empty text.
        dynamic_text: str = module_dynamic["desc"]["text"] if module_dynamic.get("desc") else ""
        dynamic_info: Dict = {
            "dynamic_id": dynamic_item["id_str"],
            "text": dynamic_text,
            # Only the last "_"-separated token of the type string is kept.
            "type": dynamic_item["type"].split("_")[-1],
            "pub_ts": modules["module_author"]["pub_ts"],
            "total_comments": dynamic_stat["comment"]["count"],
            "total_forwards": dynamic_stat["forward"]["count"],
            "total_liked": dynamic_stat["like"]["count"],
        }
        await update_bilibili_creator_dynamic(creator_info=creator_info, dynamic_info=dynamic_info)


async def update_bilibili_creator_contact(creator_info: Dict, fan_info: Dict):
    """Persist one up/fan relation.

    Both arguments must provide the keys ``id``, ``name``, ``sign`` and ``avatar``.
    """
    save_contact_item = {
        "up_id": creator_info["id"],
        "fan_id": fan_info["id"],
        "up_name": creator_info["name"],
        "fan_name": fan_info["name"],
        "up_sign": creator_info["sign"],
        "fan_sign": fan_info["sign"],
        "up_avatar": creator_info["avatar"],
        "fan_avatar": fan_info["avatar"],
        "last_modify_ts": utils.get_current_timestamp(),
    }

    await BiliStoreFactory.create_store().store_contact(contact_item=save_contact_item)


async def update_bilibili_creator_dynamic(creator_info: Dict, dynamic_info: Dict):
    """Persist one creator dynamic, tagged with the creator's id and name."""
    save_dynamic_item = {
        "dynamic_id": dynamic_info["dynamic_id"],
        "user_id": creator_info["id"],
        "user_name": creator_info["name"],
        "text": dynamic_info["text"],
        "type": dynamic_info["type"],
        "pub_ts": dynamic_info["pub_ts"],
        "total_comments": dynamic_info["total_comments"],
        "total_forwards": dynamic_info["total_forwards"],
        "total_liked": dynamic_info["total_liked"],
        "last_modify_ts": utils.get_current_timestamp(),
    }

    await BiliStoreFactory.create_store().store_dynamic(dynamic_item=save_dynamic_item)
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/1/14 19:34 +# @Desc : B站存储实现类 +import asyncio +import csv +import json +import os +import pathlib +from typing import Dict + +import aiofiles + +import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var + + +def calculate_number_of_files(file_store_path: str) -> int: + """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 + Args: + file_store_path; + Returns: + file nums + """ + if not os.path.exists(file_store_path): + return 1 + try: + return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 + except ValueError: + return 1 + +class BiliCsvStoreImplement(AbstractStore): + csv_store_path: str = "data/bilibili" + file_count:int=calculate_number_of_files(csv_store_path) + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: contents or comments + + Returns: eg: data/bilibili/search_comments_20240114.csv ... + + """ + return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + + async def save_data_to_csv(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in CSV format. 
+ Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: no returns + + """ + pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if await f.tell() == 0: + await writer.writerow(save_item.keys()) + await writer.writerow(save_item.values()) + + async def store_content(self, content_item: Dict): + """ + Bilibili content CSV storage implementation + Args: + content_item: note item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=content_item, store_type="contents") + + async def store_comment(self, comment_item: Dict): + """ + Bilibili comment CSV storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=comment_item, store_type="comments") + + async def store_creator(self, creator: Dict): + """ + Bilibili creator CSV storage implementation + Args: + creator: creator item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=creator, store_type="creators") + + async def store_contact(self, contact_item: Dict): + """ + Bilibili contact CSV storage implementation + Args: + contact_item: creator's contact item dict + + Returns: + + """ + + await self.save_data_to_csv(save_item=contact_item, store_type="contacts") + + async def store_dynamic(self, dynamic_item: Dict): + """ + Bilibili dynamic CSV storage implementation + Args: + dynamic_item: creator's dynamic item dict + + Returns: + + """ + + await self.save_data_to_csv(save_item=dynamic_item, store_type="dynamics") + + +class BiliDbStoreImplement(AbstractStore): + async def store_content(self, content_item: Dict): + """ + Bilibili content DB storage implementation + Args: + content_item: content item dict + + Returns: + + """ + + from 
.bilibili_store_sql import (add_new_content, + query_content_by_content_id, + update_content_by_content_id) + video_id = content_item.get("video_id") + video_detail: Dict = await query_content_by_content_id(content_id=video_id) + if not video_detail: + content_item["add_ts"] = utils.get_current_timestamp() + await add_new_content(content_item) + else: + await update_content_by_content_id(video_id, content_item=content_item) + + async def store_comment(self, comment_item: Dict): + """ + Bilibili content DB storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + + from .bilibili_store_sql import (add_new_comment, + query_comment_by_comment_id, + update_comment_by_comment_id) + comment_id = comment_item.get("comment_id") + comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id) + if not comment_detail: + comment_item["add_ts"] = utils.get_current_timestamp() + await add_new_comment(comment_item) + else: + await update_comment_by_comment_id(comment_id, comment_item=comment_item) + + async def store_creator(self, creator: Dict): + """ + Bilibili creator DB storage implementation + Args: + creator: creator item dict + + Returns: + + """ + + from .bilibili_store_sql import (add_new_creator, + query_creator_by_creator_id, + update_creator_by_creator_id) + creator_id = creator.get("user_id") + creator_detail: Dict = await query_creator_by_creator_id(creator_id=creator_id) + if not creator_detail: + creator["add_ts"] = utils.get_current_timestamp() + await add_new_creator(creator) + else: + await update_creator_by_creator_id(creator_id,creator_item=creator) + + async def store_contact(self, contact_item: Dict): + """ + Bilibili contact DB storage implementation + Args: + contact_item: contact item dict + + Returns: + + """ + + from .bilibili_store_sql import (add_new_contact, + query_contact_by_up_and_fan, + update_contact_by_id, ) + + up_id = contact_item.get("up_id") + fan_id = contact_item.get("fan_id") + 
contact_detail: Dict = await query_contact_by_up_and_fan(up_id=up_id, fan_id=fan_id) + if not contact_detail: + contact_item["add_ts"] = utils.get_current_timestamp() + await add_new_contact(contact_item) + else: + key_id = contact_detail.get("id") + await update_contact_by_id(id=key_id, contact_item=contact_item) + + async def store_dynamic(self, dynamic_item): + """ + Bilibili dynamic DB storage implementation + Args: + dynamic_item: dynamic item dict + + Returns: + + """ + + from .bilibili_store_sql import (add_new_dynamic, + query_dynamic_by_dynamic_id, + update_dynamic_by_dynamic_id) + + dynamic_id = dynamic_item.get("dynamic_id") + dynamic_detail = await query_dynamic_by_dynamic_id(dynamic_id=dynamic_id) + if not dynamic_detail: + dynamic_item["add_ts"] = utils.get_current_timestamp() + await add_new_dynamic(dynamic_item) + else: + await update_dynamic_by_dynamic_id(dynamic_id, dynamic_item=dynamic_item) + + +class BiliJsonStoreImplement(AbstractStore): + json_store_path: str = "data/bilibili/json" + words_store_path: str = "data/bilibili/words" + lock = asyncio.Lock() + file_count:int=calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() + + + def make_save_file_name(self, store_type: str) -> (str,str): + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. 
+ Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) + save_data = [] + + async with self.lock: + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False)) + + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + + async def store_content(self, content_item: Dict): + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") + + async def store_comment(self, comment_item: Dict): + """ + comment JSON storage implementation + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") + + async def store_creator(self, creator: Dict): + """ + creator JSON storage implementation + Args: + creator: + + Returns: + + """ + await self.save_data_to_json(creator, "creators") + + async def store_contact(self, contact_item: Dict): + """ + creator contact JSON storage implementation + Args: + contact_item: creator's contact item dict + + Returns: + + """ + + await self.save_data_to_json(save_item=contact_item, store_type="contacts") + + async def store_dynamic(self, dynamic_item: Dict): + """ + creator dynamic JSON storage implementation + Args: + dynamic_item: creator's contact item dict + + Returns: + + """ + + await 
async def store_content(self, content_item: Dict):
    """
    Upsert one Bilibili video record through the SQL helper layer.

    Args:
        content_item: content item dict; ``add_ts`` is stamped on it before a first insert

    Returns:
        None
    """
    from .bilibili_store_sql import (add_new_content,
                                     query_content_by_content_id,
                                     update_content_by_content_id)

    vid = content_item.get("video_id")
    existing: Dict = await query_content_by_content_id(content_id=vid)
    if existing:
        # A row for this video already exists: refresh it in place.
        await update_content_by_content_id(vid, content_item=content_item)
        return

    content_item["add_ts"] = utils.get_current_timestamp()
    await add_new_content(content_item)
async def store_dynamic(self, dynamic_item):
    """
    Upsert one Bilibili dynamic record through the SQL helper layer.

    Args:
        dynamic_item: dynamic item dict; ``add_ts`` is stamped on it before a first insert

    Returns:
        None
    """
    from .bilibili_store_sql import (add_new_dynamic,
                                     query_dynamic_by_dynamic_id,
                                     update_dynamic_by_dynamic_id)

    did = dynamic_item.get("dynamic_id")
    existing = await query_dynamic_by_dynamic_id(dynamic_id=did)
    if existing:
        # Known dynamic: update the stored row.
        await update_dynamic_by_dynamic_id(did, dynamic_item=dynamic_item)
        return

    dynamic_item["add_ts"] = utils.get_current_timestamp()
    await add_new_dynamic(dynamic_item)
async def query_content_by_content_id(content_id: str) -> Dict:
    """
    Fetch a single row from ``bilibili_video`` by video id.

    Args:
        content_id: value matched against the ``video_id`` column

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    # NOTE(review): the id is interpolated directly into the SQL text; if query() accepts
    # bound parameters, prefer them — confirm against the async_db implementations.
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    matches: List[Dict] = await db.query(f"select * from bilibili_video where video_id = '{content_id}'")
    if len(matches) == 0:
        return dict()
    return matches[0]
async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int:
    """
    Update an existing comment row in ``bilibili_video_comment``.

    Args:
        comment_id: value of the ``comment_id`` column identifying the row
        comment_item: column/value mapping to write

    Returns:
        The value returned by ``update_table`` (annotated as the affected row count).
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    return await db.update_table("bilibili_video_comment", comment_item, "comment_id", comment_id)
async def query_contact_by_up_and_fan(up_id: str, fan_id: str) -> Dict:
    """
    Fetch the relation row in ``bilibili_contact_info`` for an (up, fan) pair.

    Args:
        up_id: value matched against the ``up_id`` column
        fan_id: value matched against the ``fan_id`` column

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    # NOTE(review): both ids are interpolated directly into the SQL text; if query() accepts
    # bound parameters, prefer them — confirm against the async_db implementations.
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    statement: str = f"select * from bilibili_contact_info where up_id = '{up_id}' and fan_id = '{fan_id}'"
    matches: List[Dict] = await db.query(statement)
    if len(matches) == 0:
        return dict()
    return matches[0]
async def update_dynamic_by_dynamic_id(dynamic_id: str, dynamic_item: Dict) -> int:
    """
    Update an existing dynamic row in ``bilibili_up_dynamic``.

    Args:
        dynamic_id: value of the ``dynamic_id`` column identifying the row
        dynamic_item: column/value mapping to write

    Returns:
        The value returned by ``update_table`` (annotated as the affected row count).
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    return await db.update_table("bilibili_up_dynamic", dynamic_item, "dynamic_id", dynamic_id)
class BilibiliVideo(AbstractStoreVideo):
    """Writes downloaded Bilibili video payloads under ``video_store_path``, one folder per aid."""

    video_store_path: str = "data/bilibili/videos"

    async def store_video(self, video_content_item: Dict):
        """
        Unpack a video item dict and hand it to ``save_video``.

        Args:
            video_content_item: dict read for the keys ``aid``, ``video_content`` and
                ``extension_file_name``

        Returns:
            None
        """
        aid = video_content_item.get("aid")
        payload = video_content_item.get("video_content")
        file_name = video_content_item.get("extension_file_name")
        await self.save_video(aid, payload, file_name)

    def make_save_file_name(self, aid: str, extension_file_name: str) -> str:
        """
        Build the target path for a video file.

        Args:
            aid: video aid, used as the sub-folder name
            extension_file_name: file name including its extension

        Returns:
            ``<video_store_path>/<aid>/<extension_file_name>``
        """
        return f"{self.video_store_path}/{aid}/{extension_file_name}"

    async def save_video(self, aid: int, video_content: str, extension_file_name="mp4"):
        """
        Write the video payload to disk, creating the per-aid folder when needed.

        Args:
            aid: video aid
            video_content: payload written to a file opened in binary mode
            extension_file_name: file name including its extension

        Returns:
            None
        """
        aid_text = str(aid)
        target_dir = pathlib.Path(self.video_store_path + "/" + aid_text)
        target_dir.mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(aid_text, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as out:
            await out.write(video_content)
            utils.logger.info(f"[BilibiliVideoImplement.save_video] save save_video {save_file_name} success ...")
不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + +# -*- coding: utf-8 -*- +# @Author : relakkes@gmail.com +# @Time : 2024/1/14 18:46 +# @Desc : +from typing import List + +import config +from var import source_keyword_var + +from .douyin_store_impl import * +from .douyin_store_media import * + + +class DouyinStoreFactory: + STORES = { + "csv": DouyinCsvStoreImplement, + "db": DouyinDbStoreImplement, + "json": DouyinJsonStoreImplement, + "sqlite": DouyinSqliteStoreImplement, + } + + @staticmethod + def create_store() -> AbstractStore: + store_class = DouyinStoreFactory.STORES.get(config.SAVE_DATA_OPTION) + if not store_class: + raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") + return store_class() + + +def _extract_note_image_list(aweme_detail: Dict) -> List[str]: + """ + 提取笔记图片列表 + + Args: + aweme_detail (Dict): 抖音内容详情 + + Returns: + List[str]: 笔记图片列表 + """ + images_res: List[str] = [] + images: List[Dict] = aweme_detail.get("images", []) + + if not images: + return [] + + for image in images: + image_url_list = image.get("url_list", []) # download_url_list 为带水印的图片,url_list 为无水印的图片 + if image_url_list: + images_res.append(image_url_list[-1]) + + return images_res + + +def _extract_comment_image_list(comment_item: Dict) -> List[str]: + """ + 提取评论图片列表 + + Args: + comment_item (Dict): 抖音评论 + + Returns: + List[str]: 评论图片列表 + """ + images_res: List[str] = [] + image_list: List[Dict] = comment_item.get("image_list", []) + + if not image_list: + return [] + + for image in image_list: + image_url_list = image.get("origin_url", {}).get("url_list", []) + if image_url_list and len(image_url_list) > 1: + images_res.append(image_url_list[1]) + + return images_res + + +def _extract_content_cover_url(aweme_detail: Dict) -> str: + """ + 提取视频封面地址 + + Args: + aweme_detail (Dict): 抖音内容详情 + + Returns: + str: 视频封面地址 + 
""" + res_cover_url = "" + + video_item = aweme_detail.get("video", {}) + raw_cover_url_list = (video_item.get("raw_cover", {}) or video_item.get("origin_cover", {})).get("url_list", []) + if raw_cover_url_list and len(raw_cover_url_list) > 1: + res_cover_url = raw_cover_url_list[1] + + return res_cover_url + + +def _extract_video_download_url(aweme_detail: Dict) -> str: + """ + 提取视频下载地址 + + Args: + aweme_detail (Dict): 抖音视频 + + Returns: + str: 视频下载地址 + """ + video_item = aweme_detail.get("video", {}) + url_h264_list = video_item.get("play_addr_h264", {}).get("url_list", []) + url_256_list = video_item.get("play_addr_256", {}).get("url_list", []) + url_list = video_item.get("play_addr", {}).get("url_list", []) + actual_url_list = url_h264_list or url_256_list or url_list + if not actual_url_list or len(actual_url_list) < 2: + return "" + return actual_url_list[-1] + + +def _extract_music_download_url(aweme_detail: Dict) -> str: + """ + 提取音乐下载地址 + + Args: + aweme_detail (Dict): 抖音视频 + + Returns: + str: 音乐下载地址 + """ + music_item = aweme_detail.get("music", {}) + play_url = music_item.get("play_url", {}) + music_url = play_url.get("uri", "") + return music_url + + +async def update_douyin_aweme(aweme_item: Dict): + aweme_id = aweme_item.get("aweme_id") + user_info = aweme_item.get("author", {}) + interact_info = aweme_item.get("statistics", {}) + save_content_item = { + "aweme_id": aweme_id, + "aweme_type": str(aweme_item.get("aweme_type")), + "title": aweme_item.get("desc", ""), + "desc": aweme_item.get("desc", ""), + "create_time": aweme_item.get("create_time"), + "user_id": user_info.get("uid"), + "sec_uid": user_info.get("sec_uid"), + "short_user_id": user_info.get("short_id"), + "user_unique_id": user_info.get("unique_id"), + "user_signature": user_info.get("signature"), + "nickname": user_info.get("nickname"), + "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0], + "liked_count": str(interact_info.get("digg_count")), + "collected_count": 
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    """
    Store every comment of one Douyin post, one at a time and in order.

    Args:
        aweme_id: id of the post the comments belong to
        comments: raw comment dicts; an empty or missing list is a no-op

    Returns:
        None
    """
    for one_comment in comments or ():
        await update_dy_aweme_comment(aweme_id, one_comment)
async def save_creator(user_id: str, creator: Dict):
    """
    Flatten a Douyin creator payload and pass it to the configured store.

    Args:
        user_id: creator id stored as ``user_id``
        creator: raw payload; profile fields are read from its ``user`` entry

    Returns:
        None
    """
    profile = creator.get("user", {})
    avatar_uri = profile.get("avatar_300x300", {}).get("uri")
    avatar_url = f"https://p3-pc.douyinpic.com/img/{avatar_uri}" + r"~c5_300x300.jpeg?from=2956013662"
    gender_labels = {0: "未知", 1: "男", 2: "女"}

    # Key order is kept as in the original mapping (stores may rely on it for column order).
    local_db_item = {
        "user_id": user_id,
        "nickname": profile.get("nickname"),
        "gender": gender_labels.get(profile.get("gender"), "未知"),
        "avatar": avatar_url,
        "desc": profile.get("signature"),
        "ip_location": profile.get("ip_location"),
        "follows": profile.get("following_count", 0),
        "fans": profile.get("max_follower_count", 0),
        "interaction": profile.get("total_favorited", 0),
        "videos_count": profile.get("aweme_count", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.douyin.save_creator] creator:{local_db_item}")
    store = DouyinStoreFactory.create_store()
    await store.store_creator(local_db_item)
def calculate_number_of_files(file_store_path: str) -> int:
    """
    Compute the next numeric file-name prefix for a storage directory.

    File names are expected to look like ``<number>_<rest>``. Entries whose leading segment is
    not an integer (for example sub-directories such as ``json`` or ``words``) are ignored
    instead of resetting the counter.

    Args:
        file_store_path: directory that holds the saved data files

    Returns:
        1 when the directory does not exist or holds no numbered entry, otherwise the highest
        existing prefix plus one.
    """
    if not os.path.exists(file_store_path):
        return 1

    prefixes = []
    for file_name in os.listdir(file_store_path):
        try:
            prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            # Not a numbered data file; skip it rather than discarding the whole scan.
            continue
    return max(prefixes, default=0) + 1
async def save_data_to_csv(self, save_item: Dict, store_type: str):
    """
    Append one record as a CSV row to the file for ``store_type``.

    The storage directory is created when missing. The dict keys are written as a header row
    only when the file position is 0, i.e. the file is still empty.

    Args:
        save_item: record dict; keys become the header, values the row
        store_type: record category used in the file name (contents | comments | ...)

    Returns:
        None
    """
    pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
    save_file_name = self.make_save_file_name(store_type=store_type)
    async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
        # NOTE(review): csv.writer is wrapped around the aiofiles handle and writerow() is
        # awaited — presumably writerow() returns the awaitable produced by f.write(); confirm
        # before restructuring these calls.
        writer = csv.writer(f)
        if await f.tell() == 0:
            # Empty file: emit the header first.
            await writer.writerow(save_item.keys())
        await writer.writerow(save_item.values())
async def store_comment(self, comment_item: Dict):
    """
    Upsert one Douyin comment record through the SQL helper layer.

    Args:
        comment_item: comment item dict; ``add_ts`` is stamped on it before a first insert

    Returns:
        None
    """
    from .douyin_store_sql import (add_new_comment,
                                   query_comment_by_comment_id,
                                   update_comment_by_comment_id)

    cid = comment_item.get("comment_id")
    existing: Dict = await query_comment_by_comment_id(comment_id=cid)
    if existing:
        # Known comment: update the stored row.
        await update_comment_by_comment_id(cid, comment_item=comment_item)
        return

    comment_item["add_ts"] = utils.get_current_timestamp()
    await add_new_comment(comment_item)
async def save_data_to_json(self, save_item: Dict, store_type: str):
    """
    Append one record to the JSON file for ``store_type`` and optionally refresh the word cloud.

    The whole file is read, extended with ``save_item`` and rewritten (indented by 4) while
    holding the class-level lock.

    Args:
        save_item: record dict to append
        store_type: record category used in the file name (contents | comments | ...)

    Returns:
        None
    """
    pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
    save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
    save_data = []

    async with self.lock:
        if os.path.exists(save_file_name):
            async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                save_data = json.loads(await file.read())

        save_data.append(save_item)
        async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(save_data, ensure_ascii=False, indent=4))

        if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
            try:
                await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
            except Exception as exc:
                # Word-cloud generation stays best-effort, but the failure is now visible and
                # task cancellation / KeyboardInterrupt are no longer swallowed by a bare except.
                utils.logger.warning(
                    f"[DouyinJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}"
                )
async def store_comment(self, comment_item: Dict):
    """
    Upsert one Douyin comment record (SQLite variant) through the SQL helper layer.

    Args:
        comment_item: comment item dict; ``add_ts`` is stamped on it before a first insert

    Returns:
        None
    """
    from .douyin_store_sql import (add_new_comment,
                                   query_comment_by_comment_id,
                                   update_comment_by_comment_id)

    cid = comment_item.get("comment_id")
    existing: Dict = await query_comment_by_comment_id(comment_id=cid)
    if existing:
        # Known comment: update the stored row.
        await update_comment_by_comment_id(cid, comment_item=comment_item)
        return

    comment_item["add_ts"] = utils.get_current_timestamp()
    await add_new_comment(comment_item)
class DouYinImage(AbstractStoreImage):
    """Writes downloaded Douyin note images under ``image_store_path``, one folder per aweme id."""

    image_store_path: str = "data/douyin/images"

    async def store_image(self, image_content_item: Dict):
        """
        Unpack an image item dict and hand it to ``save_image``.

        Args:
            image_content_item: dict read for the keys ``aweme_id``, ``pic_content`` and
                ``extension_file_name``

        Returns:
            None
        """
        aweme_id = image_content_item.get("aweme_id")
        payload = image_content_item.get("pic_content")
        file_name = image_content_item.get("extension_file_name")
        await self.save_image(aweme_id, payload, file_name)

    def make_save_file_name(self, aweme_id: str, extension_file_name: str) -> str:
        """
        Build the target path for an image file.

        Args:
            aweme_id: aweme id, used as the sub-folder name
            extension_file_name: file name including its extension

        Returns:
            ``<image_store_path>/<aweme_id>/<extension_file_name>``
        """
        return f"{self.image_store_path}/{aweme_id}/{extension_file_name}"

    async def save_image(self, aweme_id: str, pic_content: str, extension_file_name):
        """
        Write the image payload to disk, creating the per-aweme folder when needed.

        Args:
            aweme_id: aweme id
            pic_content: payload written to a file opened in binary mode
            extension_file_name: file name including its extension

        Returns:
            None
        """
        target_dir = pathlib.Path(self.image_store_path + "/" + aweme_id)
        target_dir.mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as out:
            await out.write(pic_content)
            utils.logger.info(f"[DouYinImageStoreImplement.save_image] save image {save_file_name} success ...")
async def save_video(self, aweme_id: str, video_content: str, extension_file_name):
    """
    Write the video payload to disk, creating the per-aweme folder when needed.

    Args:
        aweme_id: aweme id, used as the sub-folder name
        video_content: payload written to a file opened in binary mode
        extension_file_name: file name including its extension

    Returns:
        None
    """
    target_dir = pathlib.Path(self.video_store_path + "/" + aweme_id)
    target_dir.mkdir(parents=True, exist_ok=True)
    save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
    async with aiofiles.open(save_file_name, 'wb') as out:
        await out.write(video_content)
        utils.logger.info(f"[DouYinVideoStoreImplement.save_video] save video {save_file_name} success ...")
+ Args: + content_id: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + sql: str = f"select * from douyin_aweme where aweme_id = '{content_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_content(content_item: Dict) -> int: + """ + 新增一条内容记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("douyin_aweme", content_item) + return last_row_id + + +async def update_content_by_content_id(content_id: str, content_item: Dict) -> int: + """ + 更新一条记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_id: + content_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("douyin_aweme", content_item, "aweme_id", content_id) + return effect_row + + + +async def query_comment_by_comment_id(comment_id: str) -> Dict: + """ + 查询一条评论内容 + Args: + comment_id: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + sql: str = f"select * from douyin_aweme_comment where comment_id = '{comment_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_comment(comment_item: Dict) -> int: + """ + 新增一条评论记录 + Args: + comment_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("douyin_aweme_comment", comment_item) + return last_row_id + + +async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int: + """ + 更新增一条评论记录 + Args: + comment_id: + comment_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = 
media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("douyin_aweme_comment", comment_item, "comment_id", comment_id) + return effect_row + + +async def query_creator_by_user_id(user_id: str) -> Dict: + """ + 查询一条创作者记录 + Args: + user_id: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + sql: str = f"select * from dy_creator where user_id = '{user_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_creator(creator_item: Dict) -> int: + """ + 新增一条创作者信息 + Args: + creator_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("dy_creator", creator_item) + return last_row_id + + +async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int: + """ + 更新一条创作者信息 + Args: + user_id: + creator_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("dy_creator", creator_item, "user_id", user_id) + return effect_row \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/__init__.py new file mode 100644 index 0000000..3ce1089 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/__init__.py @@ -0,0 +1,111 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
# 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
# @Author  : relakkes@gmail.com
# @Time    : 2024/1/14 20:03
# @Desc    :
from typing import List

import config
from var import source_keyword_var

# The star import supplies the store classes plus AbstractStore, Dict and
# utils used below.
from .kuaishou_store_impl import *


class KuaishouStoreFactory:
    """Pick the Kuaishou store backend named by ``config.SAVE_DATA_OPTION``."""

    # Save option -> store implementation class.
    STORES = {
        "csv": KuaishouCsvStoreImplement,
        "db": KuaishouDbStoreImplement,
        "json": KuaishouJsonStoreImplement,
        "sqlite": KuaishouSqliteStoreImplement
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """
        Instantiate the configured store.

        Returns:
            A new instance of the class registered for the save option.

        Raises:
            ValueError: the configured option is not a key of ``STORES``.
        """
        store_class = KuaishouStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if not store_class:
            raise ValueError(
                "[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
        return store_class()


async def update_kuaishou_video(video_item: Dict):
    """
    Flatten one Kuaishou video payload and hand it to the configured store.

    Items without ``photo.id`` are skipped silently.

    Args:
        video_item: raw payload; read for the keys ``photo``, ``author``
            and ``type``.
    """
    # ``dict.get(key, default)`` only falls back when the key is absent; an
    # explicit null in the payload would come through as None and break the
    # attribute access / slicing below, hence the ``or`` fallbacks.
    photo_info: Dict = video_item.get("photo") or {}
    video_id = photo_info.get("id")
    if not video_id:
        return
    user_info = video_item.get("author") or {}
    caption = (photo_info.get("caption") or "")[:500]
    save_content_item = {
        "video_id": video_id,
        "video_type": str(video_item.get("type")),
        "title": caption,
        "desc": caption,
        "create_time": photo_info.get("timestamp"),
        "user_id": user_info.get("id"),
        "nickname": user_info.get("name"),
        "avatar": user_info.get("headerUrl", ""),
        "liked_count": str(photo_info.get("realLikeCount")),
        "viewd_count": str(photo_info.get("viewCount")),
        "last_modify_ts": utils.get_current_timestamp(),
        "video_url": f"https://www.kuaishou.com/short-video/{video_id}",
        "video_cover_url": photo_info.get("coverUrl", ""),
        "video_play_url": photo_info.get("photoUrl", ""),
        "source_keyword": source_keyword_var.get(),
    }
    utils.logger.info(
        f"[store.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{save_content_item.get('title')}")
    await KuaishouStoreFactory.create_store().store_content(content_item=save_content_item)


async def batch_update_ks_video_comments(video_id: str, comments: List[Dict]):
    """
    Store every comment of one video, sequentially.

    Args:
        video_id: id of the video the comments belong to
        comments: raw comment payloads; an empty list is a no-op
    """
    utils.logger.info(f"[store.kuaishou.batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
    if not comments:
        return
    for comment_item in comments:
        await update_ks_video_comment(video_id, comment_item)


async def update_ks_video_comment(video_id: str, comment_item: Dict):
    """
    Flatten one comment payload and hand it to the configured store.

    Args:
        video_id: id of the video the comment belongs to
        comment_item: raw payload; read for ``commentId``, ``timestamp``,
            ``content``, ``authorId``, ``authorName``, ``headurl`` and
            ``subCommentCount``.
    """
    comment_id = comment_item.get("commentId")
    save_comment_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("timestamp"),
        "video_id": video_id,
        "content": comment_item.get("content"),
        "user_id": comment_item.get("authorId"),
        "nickname": comment_item.get("authorName"),
        "avatar": comment_item.get("headurl"),
        "sub_comment_count": str(comment_item.get("subCommentCount", 0)),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(
        f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}")
    await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item)


async def save_creator(user_id: str, creator: Dict):
    """
    Flatten one creator payload and hand it to the configured store.

    Args:
        user_id: id of the creator
        creator: raw payload; read for the keys ``ownerCount`` and ``profile``.
    """
    # Same null-vs-missing guard as in update_kuaishou_video.
    owner_count = creator.get('ownerCount') or {}
    profile = creator.get('profile') or {}

    local_db_item = {
        'user_id': user_id,
        'nickname': profile.get('user_name'),
        # NOTE(review): any gender other than "F" (including a missing one)
        # is stored as '男'; confirm that this default is intended.
        'gender': '女' if profile.get('gender') == "F" else '男',
        'avatar': profile.get('headurl'),
        'desc': profile.get('user_text'),
        'ip_location': "",
        'follows': owner_count.get("follow"),
        'fans': owner_count.get("fan"),
        'interaction': owner_count.get("photo_public"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}")
    await KuaishouStoreFactory.create_store().store_creator(local_db_item)


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_impl.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_impl.py
# new file
# mode 100644
# index 0000000..950f3a2
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_impl.py
# @@ -0,0 +1,290 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
# @Author  : relakkes@gmail.com
# @Time    : 2024/1/14 20:03
# @Desc    : Kuaishou store implementations
import asyncio
import csv
import json
import os
import pathlib
from typing import Dict, Tuple

import aiofiles

import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var


def calculate_number_of_files(file_store_path: str) -> int:
    """
    Return the next numeric file-name prefix for ``file_store_path``.

    Output files are named ``<n>_...`` so that each run writes to a fresh
    file. Entries whose prefix (the text before the first ``_``) is not an
    integer are ignored: the JSON store creates ``json`` and ``words``
    sub-directories inside the CSV directory, and a single such entry must
    not reset the numbering to 1.

    Args:
        file_store_path: directory holding the numbered files

    Returns:
        Highest numeric prefix found plus one; 1 when the directory does
        not exist or holds no numbered entry.
    """
    if not os.path.exists(file_store_path):
        return 1
    highest = 0
    for file_name in os.listdir(file_store_path):
        try:
            highest = max(highest, int(file_name.split("_")[0]))
        except ValueError:
            continue
    return highest + 1


class KuaishouCsvStoreImplement(AbstractStore):
    """Append Kuaishou contents/comments to per-run CSV files."""

    async def store_creator(self, creator: Dict):
        """Creators are not written by the CSV backend (no-op)."""
        pass

    csv_store_path: str = "data/kuaishou"
    # Evaluated once, when the class body runs.
    file_count: int = calculate_number_of_files(csv_store_path)

    def make_save_file_name(self, store_type: str) -> str:
        """
        Build the CSV path for one store type.

        Args:
            store_type: contents or comments

        Returns:
            ``<csv_store_path>/<file_count>_<crawler type>_<store_type>_<date>.csv``
        """
        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"

    async def save_data_to_csv(self, save_item: Dict, store_type: str):
        """
        Append one row to the CSV file, writing the header row first when
        the file is empty.

        Args:
            save_item: column name -> value mapping for the row
            store_type: contents or comments
        """
        pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(store_type=store_type)
        async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            # csv.writer.writerow returns whatever the file's write() returns;
            # with an aiofiles handle that is an awaitable, hence the awaits.
            writer = csv.writer(f)
            if await f.tell() == 0:
                await writer.writerow(save_item.keys())
            await writer.writerow(save_item.values())

    async def store_content(self, content_item: Dict):
        """
        Append one content row to the "contents" CSV.

        Args:
            content_item: content item dict
        """
        await self.save_data_to_csv(save_item=content_item, store_type="contents")

    async def store_comment(self, comment_item: Dict):
        """
        Append one comment row to the "comments" CSV.

        Args:
            comment_item: comment item dict
        """
        await self.save_data_to_csv(save_item=comment_item, store_type="comments")


class KuaishouDbStoreImplement(AbstractStore):
    """Insert-or-update Kuaishou rows through the ``kuaishou_store_sql`` helpers."""

    async def store_creator(self, creator: Dict):
        """Creators are not written by the DB backend (no-op)."""
        pass

    async def store_content(self, content_item: Dict):
        """
        Insert the video row, or update it when its ``video_id`` already exists.

        Args:
            content_item: content item dict; ``add_ts`` is added on insert
        """
        from .kuaishou_store_sql import (add_new_content,
                                         query_content_by_content_id,
                                         update_content_by_content_id)
        video_id = content_item.get("video_id")
        video_detail: Dict = await query_content_by_content_id(content_id=video_id)
        if not video_detail:
            content_item["add_ts"] = utils.get_current_timestamp()
            await add_new_content(content_item)
        else:
            await update_content_by_content_id(video_id, content_item=content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Insert the comment row, or update it when its ``comment_id`` already exists.

        Args:
            comment_item: comment item dict; ``add_ts`` is added on insert
        """
        from .kuaishou_store_sql import (add_new_comment,
                                         query_comment_by_comment_id,
                                         update_comment_by_comment_id)
        comment_id = comment_item.get("comment_id")
        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if not comment_detail:
            comment_item["add_ts"] = utils.get_current_timestamp()
            await add_new_comment(comment_item)
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)


class KuaishouJsonStoreImplement(AbstractStore):
    """Accumulate Kuaishou items in one JSON array file per store type and day."""

    json_store_path: str = "data/kuaishou/json"
    words_store_path: str = "data/kuaishou/words"
    # Class-level lock: serialises the read-modify-write of the JSON file
    # across all instances.
    lock = asyncio.Lock()
    file_count: int = calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

    def make_save_file_name(self, store_type: str) -> Tuple[str, str]:
        """
        Build the JSON file path and the word-cloud file prefix.

        Args:
            store_type: contents, comments or creator

        Returns:
            ``(json_file_path, words_file_prefix)``
        """
        return (
            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
        )

    async def save_data_to_json(self, save_item: Dict, store_type: str):
        """
        Append ``save_item`` to the JSON array on disk (the whole file is
        re-read and rewritten), then optionally refresh the word cloud.

        Args:
            save_item: item to append
            store_type: contents, comments or creator
        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
            if os.path.exists(save_file_name):
                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                    save_data = json.loads(await file.read())

            save_data.append(save_item)
            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
                await file.write(json.dumps(save_data, ensure_ascii=False))

            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
                # Word-cloud generation is best effort: a failure must not
                # lose the item that was just saved. Catch Exception (not a
                # bare except) so task cancellation still propagates, and
                # leave a trace in the log.
                try:
                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
                except Exception as exc:
                    utils.logger.warning(
                        f"[KuaishouJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}")

    async def store_content(self, content_item: Dict):
        """
        Append one content item to the "contents" JSON file.

        Args:
            content_item: content item dict
        """
        await self.save_data_to_json(content_item, "contents")

    async def store_comment(self, comment_item: Dict):
        """
        Append one comment item to the "comments" JSON file.

        Args:
            comment_item: comment item dict
        """
        await self.save_data_to_json(comment_item, "comments")

    async def store_creator(self, creator: Dict):
        """
        Append one creator item to the "creator" JSON file.

        Args:
            creator: creator dict
        """
        await self.save_data_to_json(creator, "creator")


class KuaishouSqliteStoreImplement(AbstractStore):
    """Insert-or-update Kuaishou rows for the "sqlite" save option."""

    async def store_content(self, content_item: Dict):
        """
        Insert the video row, or update it when its ``video_id`` already exists.

        Args:
            content_item: content item dict; ``add_ts`` is added on insert
        """
        from .kuaishou_store_sql import (add_new_content,
                                         query_content_by_content_id,
                                         update_content_by_content_id)
        video_id = content_item.get("video_id")
        video_detail: Dict = await query_content_by_content_id(content_id=video_id)
        if not video_detail:
            content_item["add_ts"] = utils.get_current_timestamp()
            await add_new_content(content_item)
        else:
            await update_content_by_content_id(video_id, content_item=content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Insert the comment row, or update it when its ``comment_id`` already exists.

        Args:
            comment_item: comment item dict; ``add_ts`` is added on insert
        """
        from .kuaishou_store_sql import (add_new_comment,
                                         query_comment_by_comment_id,
                                         update_comment_by_comment_id)
        comment_id = comment_item.get("comment_id")
        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if not comment_detail:
            comment_item["add_ts"] = utils.get_current_timestamp()
            await add_new_comment(comment_item)
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)

    async def store_creator(self, creator: Dict):
        """
        Creators are not written by the SQLite backend (no-op).

        Args:
            creator: creator dict
        """
        pass


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_sql.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_sql.py
# new file mode 100644
# index 0000000..953f00f
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/kuaishou/kuaishou_store_sql.py
# @@ -0,0 +1,114 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
# @Author  : relakkes@gmail.com
# @Time    : 2024/4/6 15:30
# @Desc    : collection of SQL helpers for the Kuaishou tables

from typing import Dict, List, Union

from async_db import AsyncMysqlDB
from async_sqlite_db import AsyncSqliteDB
from var import media_crawler_db_var

# NOTE(review): the query_* helpers below interpolate the id straight into
# the SQL text (injection / quoting hazard). Binding parameters needs the
# placeholder style of both DB wrappers, which is not visible from this
# file -- confirm before changing.


async def query_content_by_content_id(content_id: str) -> Dict:
    """
    Fetch one row from ``kuaishou_video`` by video id.

    Args:
        content_id: value matched against the ``video_id`` column

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    sql: str = f"select * from kuaishou_video where video_id = '{content_id}'"
    rows: List[Dict] = await async_db_conn.query(sql)
    return rows[0] if rows else dict()


async def add_new_content(content_item: Dict) -> int:
    """
    Insert one row into ``kuaishou_video``.

    Args:
        content_item: column name -> value mapping handed to ``item_to_table``

    Returns:
        Whatever ``item_to_table`` returns (annotated as the last row id).
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    last_row_id: int = await async_db_conn.item_to_table("kuaishou_video", content_item)
    return last_row_id


async def update_content_by_content_id(content_id: str, content_item: Dict) -> int:
    """
    Update the ``kuaishou_video`` row whose ``video_id`` equals ``content_id``.

    Args:
        content_id: video id of the row to update
        content_item: column name -> new value mapping

    Returns:
        Whatever ``update_table`` returns (annotated as the affected row count).
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    effect_row: int = await async_db_conn.update_table("kuaishou_video", content_item, "video_id", content_id)
    return effect_row


async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Fetch one row from ``kuaishou_video_comment`` by comment id.

    Args:
        comment_id: value matched against the ``comment_id`` column

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    sql: str = f"select * from kuaishou_video_comment where comment_id = '{comment_id}'"
    rows: List[Dict] = await async_db_conn.query(sql)
    return rows[0] if rows else dict()


async def add_new_comment(comment_item: Dict) -> int:
    """
    Insert one row into ``kuaishou_video_comment``.

    Args:
        comment_item: column name -> value mapping handed to ``item_to_table``

    Returns:
        Whatever ``item_to_table`` returns (annotated as the last row id).
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    last_row_id: int = await async_db_conn.item_to_table("kuaishou_video_comment", comment_item)
    return last_row_id


async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int:
    """
    Update the ``kuaishou_video_comment`` row whose ``comment_id`` matches.

    Args:
        comment_id: comment id of the row to update
        comment_item: column name -> new value mapping

    Returns:
        Whatever ``update_table`` returns (annotated as the affected row count).
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    effect_row: int = await async_db_conn.update_table("kuaishou_video_comment", comment_item, "comment_id", comment_id)
    return effect_row


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/__init__.py
# new file mode 100644
# index 0000000..e928d85
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/__init__.py
# @@ -0,0 +1,115 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
from typing import List

from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
from var import source_keyword_var

from . import tieba_store_impl
# The star import supplies the store classes plus AbstractStore, config and
# utils used below.
from .tieba_store_impl import *


class TieBaStoreFactory:
    """Pick the Tieba store backend named by ``config.SAVE_DATA_OPTION``."""

    # Save option -> store implementation class.
    STORES = {
        "csv": TieBaCsvStoreImplement,
        "db": TieBaDbStoreImplement,
        "json": TieBaJsonStoreImplement,
        "sqlite": TieBaSqliteStoreImplement
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """
        Instantiate the configured store.

        Returns:
            A new instance of the class registered for the save option.

        Raises:
            ValueError: the configured option is not a key of ``STORES``.
        """
        store_class = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if not store_class:
            # The message lists every key of STORES, "sqlite" included.
            raise ValueError(
                "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
        return store_class()


async def batch_update_tieba_notes(note_list: List[TiebaNote]):
    """
    Store every note in ``note_list``, sequentially.

    Args:
        note_list: notes to store; an empty list is a no-op
    """
    if not note_list:
        return
    for note_item in note_list:
        await update_tieba_note(note_item)


async def update_tieba_note(note_item: TiebaNote):
    """
    Stamp a note with the current source keyword and hand it to the store.

    Args:
        note_item: note model; its ``source_keyword`` attribute is overwritten
    """
    note_item.source_keyword = source_keyword_var.get()
    save_note_item = note_item.model_dump()
    save_note_item.update({"last_modify_ts": utils.get_current_timestamp()})
    utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {save_note_item}")

    await TieBaStoreFactory.create_store().store_content(save_note_item)


async def batch_update_tieba_note_comments(note_id: str, comments: List[TiebaComment]):
    """
    Store every comment of one note, sequentially.

    Args:
        note_id: id of the note the comments belong to
        comments: comments to store; an empty list is a no-op
    """
    if not comments:
        return
    for comment_item in comments:
        await update_tieba_note_comment(note_id, comment_item)


async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
    """
    Dump one comment model and hand it to the configured store.

    Args:
        note_id: id of the owning note (used for logging only)
        comment_item: comment model
    """
    save_comment_item = comment_item.model_dump()
    save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()})
    utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}")
    await TieBaStoreFactory.create_store().store_comment(save_comment_item)


async def save_creator(user_info: TiebaCreator):
    """
    Dump one creator model and hand it to the configured store.

    Args:
        user_info: creator model
    """
    local_db_item = user_info.model_dump()
    local_db_item["last_modify_ts"] = utils.get_current_timestamp()
    utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}")
    await TieBaStoreFactory.create_store().store_creator(local_db_item)


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_impl.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_impl.py
# new file mode 100644
# index 0000000..84267e4
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_impl.py
# @@ -0,0 +1,318 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5.
# 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。


# -*- coding: utf-8 -*-
import asyncio
import csv
import json
import os
import pathlib
from typing import Dict, Tuple

import aiofiles

import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var


def calculate_number_of_files(file_store_path: str) -> int:
    """
    Return the next numeric file-name prefix for ``file_store_path``.

    Output files are named ``<n>_...`` so that each run writes to a fresh
    file. Entries whose prefix (the text before the first ``_``) is not an
    integer are ignored: the JSON store creates ``json`` and ``words``
    sub-directories inside the CSV directory, and a single such entry must
    not reset the numbering to 1.

    Args:
        file_store_path: directory holding the numbered files

    Returns:
        Highest numeric prefix found plus one; 1 when the directory does
        not exist or holds no numbered entry.
    """
    if not os.path.exists(file_store_path):
        return 1
    highest = 0
    for file_name in os.listdir(file_store_path):
        try:
            highest = max(highest, int(file_name.split("_")[0]))
        except ValueError:
            continue
    return highest + 1


class TieBaCsvStoreImplement(AbstractStore):
    """Append Tieba notes/comments/creators to per-run CSV files."""

    csv_store_path: str = "data/tieba"
    # Evaluated once, when the class body runs.
    file_count: int = calculate_number_of_files(csv_store_path)

    def make_save_file_name(self, store_type: str) -> str:
        """
        Build the CSV path for one store type.

        Args:
            store_type: contents, comments or creator

        Returns:
            ``<csv_store_path>/<file_count>_<crawler type>_<store_type>_<date>.csv``
        """
        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"

    async def save_data_to_csv(self, save_item: Dict, store_type: str):
        """
        Append one row to the CSV file, writing the header row first when
        the file is empty.

        Args:
            save_item: column name -> value mapping for the row
            store_type: contents, comments or creator
        """
        pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(store_type=store_type)
        async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            # csv.writer.writerow returns whatever the file's write() returns;
            # with an aiofiles handle that is an awaitable, hence the awaits.
            writer = csv.writer(f)
            if await f.tell() == 0:
                await writer.writerow(save_item.keys())
            await writer.writerow(save_item.values())

    async def store_content(self, content_item: Dict):
        """
        Append one note row to the "contents" CSV.

        Args:
            content_item: note item dict
        """
        await self.save_data_to_csv(save_item=content_item, store_type="contents")

    async def store_comment(self, comment_item: Dict):
        """
        Append one comment row to the "comments" CSV.

        Args:
            comment_item: comment item dict
        """
        await self.save_data_to_csv(save_item=comment_item, store_type="comments")

    async def store_creator(self, creator: Dict):
        """
        Append one creator row to the "creator" CSV.

        Args:
            creator: creator dict
        """
        await self.save_data_to_csv(save_item=creator, store_type="creator")


class TieBaDbStoreImplement(AbstractStore):
    """Insert-or-update Tieba rows through the ``tieba_store_sql`` helpers."""

    async def store_content(self, content_item: Dict):
        """
        Insert the note row, or update it when its ``note_id`` already exists.

        Args:
            content_item: content item dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_content,
                                      query_content_by_content_id,
                                      update_content_by_content_id)
        note_id = content_item.get("note_id")
        note_detail: Dict = await query_content_by_content_id(content_id=note_id)
        if not note_detail:
            content_item["add_ts"] = utils.get_current_timestamp()
            await add_new_content(content_item)
        else:
            await update_content_by_content_id(note_id, content_item=content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Insert the comment row, or update it when its ``comment_id`` already exists.

        Args:
            comment_item: comment item dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_comment,
                                      query_comment_by_comment_id,
                                      update_comment_by_comment_id)
        comment_id = comment_item.get("comment_id")
        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if not comment_detail:
            comment_item["add_ts"] = utils.get_current_timestamp()
            await add_new_comment(comment_item)
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)

    async def store_creator(self, creator: Dict):
        """
        Insert the creator row, or update it when its ``user_id`` already exists.

        Args:
            creator: creator dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_creator,
                                      query_creator_by_user_id,
                                      update_creator_by_user_id)
        user_id = creator.get("user_id")
        user_detail: Dict = await query_creator_by_user_id(user_id)
        if not user_detail:
            creator["add_ts"] = utils.get_current_timestamp()
            await add_new_creator(creator)
        else:
            await update_creator_by_user_id(user_id, creator)


class TieBaJsonStoreImplement(AbstractStore):
    """Accumulate Tieba items in one JSON array file per store type and day."""

    json_store_path: str = "data/tieba/json"
    words_store_path: str = "data/tieba/words"
    # Class-level lock: serialises the read-modify-write of the JSON file
    # across all instances.
    lock = asyncio.Lock()
    file_count: int = calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

    def make_save_file_name(self, store_type: str) -> Tuple[str, str]:
        """
        Build the JSON file path and the word-cloud file prefix.

        Args:
            store_type: contents, comments or creator

        Returns:
            ``(json_file_path, words_file_prefix)``
        """
        return (
            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
        )

    async def save_data_to_json(self, save_item: Dict, store_type: str):
        """
        Append ``save_item`` to the JSON array on disk (the whole file is
        re-read and rewritten), then optionally refresh the word cloud.

        Args:
            save_item: item to append
            store_type: contents, comments or creator
        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
            if os.path.exists(save_file_name):
                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                    save_data = json.loads(await file.read())

            save_data.append(save_item)
            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
                await file.write(json.dumps(save_data, ensure_ascii=False))

            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
                # Word-cloud generation is best effort: a failure must not
                # lose the item that was just saved. Catch Exception (not a
                # bare except) so task cancellation still propagates, and
                # leave a trace in the log.
                try:
                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
                except Exception as exc:
                    utils.logger.warning(
                        f"[TieBaJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}")

    async def store_content(self, content_item: Dict):
        """
        Append one note item to the "contents" JSON file.

        Args:
            content_item: content item dict
        """
        await self.save_data_to_json(content_item, "contents")

    async def store_comment(self, comment_item: Dict):
        """
        Append one comment item to the "comments" JSON file.

        Args:
            comment_item: comment item dict
        """
        await self.save_data_to_json(comment_item, "comments")

    async def store_creator(self, creator: Dict):
        """
        Append one creator item to the "creator" JSON file.

        Args:
            creator: creator dict
        """
        await self.save_data_to_json(creator, "creator")


class TieBaSqliteStoreImplement(AbstractStore):
    """Insert-or-update Tieba rows for the "sqlite" save option."""

    async def store_content(self, content_item: Dict):
        """
        Insert the note row, or update it when its ``note_id`` already exists.

        Args:
            content_item: content item dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_content,
                                      query_content_by_content_id,
                                      update_content_by_content_id)
        note_id = content_item.get("note_id")
        note_detail: Dict = await query_content_by_content_id(content_id=note_id)
        if not note_detail:
            content_item["add_ts"] = utils.get_current_timestamp()
            await add_new_content(content_item)
        else:
            await update_content_by_content_id(note_id, content_item=content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Insert the comment row, or update it when its ``comment_id`` already exists.

        Args:
            comment_item: comment item dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_comment,
                                      query_comment_by_comment_id,
                                      update_comment_by_comment_id)
        comment_id = comment_item.get("comment_id")
        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if not comment_detail:
            comment_item["add_ts"] = utils.get_current_timestamp()
            await add_new_comment(comment_item)
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)

    async def store_creator(self, creator: Dict):
        """
        Insert the creator row, or update it when its ``user_id`` already exists.

        Args:
            creator: creator dict; ``add_ts`` is added on insert
        """
        from .tieba_store_sql import (add_new_creator,
                                      query_creator_by_user_id,
                                      update_creator_by_user_id)
        user_id = creator.get("user_id")
        user_detail: Dict = await query_creator_by_user_id(user_id)
        if not user_detail:
            creator["add_ts"] = utils.get_current_timestamp()
            await add_new_creator(creator)
        else:
            await update_creator_by_user_id(user_id, creator)


# diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_sql.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_sql.py
# new file mode 100644
# index 0000000..702ddac
# --- /dev/null
# +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/tieba/tieba_store_sql.py
# @@ -0,0 +1,156 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5.
不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +# -*- coding: utf-8 -*- +from typing import Dict, List, Union + +from async_db import AsyncMysqlDB +from async_sqlite_db import AsyncSqliteDB +from var import media_crawler_db_var + + +async def query_content_by_content_id(content_id: str) -> Dict: + """ + 查询一条内容记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_id: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + sql: str = f"select * from tieba_note where note_id = '{content_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_content(content_item: Dict) -> int: + """ + 新增一条内容记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_item: + + Returns: + + """ + async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("tieba_note", content_item) + return last_row_id + + +async def update_content_by_content_id(content_id: str, content_item: Dict) -> int: + """ + 更新一条记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) 
async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Look up one stored Tieba comment by its comment id.

    Args:
        comment_id: value matched against the ``comment_id`` column of
            ``tieba_comment``.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    # NOTE(review): comment_id is interpolated straight into the SQL text;
    # prefer bound parameters once the query() helper's placeholder style is
    # confirmed for both backends.
    matches: List[Dict] = await db.query(f"select * from tieba_comment where comment_id = '{comment_id}'")
    return matches[0] if len(matches) > 0 else dict()
async def update_weibo_note(note_item: Dict):
    """
    Normalise one Weibo search/detail card and hand it to the configured store.

    The ``mblog`` payload is flattened into the column layout used by the
    storage back ends (HTML tags are stripped from the text, counters are
    stringified, the "发布于 " prefix is removed from the region) and then
    passed to ``store_content`` of the store selected by the factory.

    Cards without an ``mblog`` payload are skipped, exactly like an empty
    ``note_item``; a missing ``user``, ``text`` or ``region_name`` no longer
    raises, so one malformed card cannot abort a whole batch.

    Args:
        note_item: raw card dict; only its ``mblog`` entry is read.

    Returns:
        None
    """
    if not note_item:
        return

    mblog: Dict = note_item.get("mblog") or {}
    if not mblog:
        # Nothing to store (e.g. a non-post card): same treatment as an empty item.
        return

    # ``or`` (rather than a .get default) also covers keys that are present but None.
    user_info: Dict = mblog.get("user") or {}
    note_id = mblog.get("id")
    content_text = mblog.get("text") or ""
    clean_text = re.sub(r"<.*?>", "", content_text)
    save_content_item = {
        # Post fields
        "note_id": note_id,
        "content": clean_text,
        "create_time": utils.rfc2822_to_timestamp(mblog.get("created_at")),
        "create_date_time": str(utils.rfc2822_to_china_datetime(mblog.get("created_at"))),
        "liked_count": str(mblog.get("attitudes_count", 0)),
        "comments_count": str(mblog.get("comments_count", 0)),
        "shared_count": str(mblog.get("reposts_count", 0)),
        "last_modify_ts": utils.get_current_timestamp(),
        "note_url": f"https://m.weibo.cn/detail/{note_id}",
        "ip_location": (mblog.get("region_name") or "").replace("发布于 ", ""),

        # Author fields
        "user_id": str(user_info.get("id")),
        "nickname": user_info.get("screen_name", ""),
        "gender": user_info.get("gender", ""),
        "profile_url": user_info.get("profile_url", ""),
        "avatar": user_info.get("profile_image_url", ""),
        "source_keyword": source_keyword_var.get(),
    }
    utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
    await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
    """
    Persist one Weibo picture through ``WeiboStoreImage``.

    Args:
        picid: picture id, passed on as ``pic_id``.
        pic_content: picture payload, passed on unchanged as ``pic_content``.
        extension_file_name: file extension, passed on as
            ``extension_file_name``.

    Returns:
        None
    """
    image_item = {
        "pic_id": picid,
        "pic_content": pic_content,
        "extension_file_name": extension_file_name,
    }
    image_store = WeiboStoreImage()
    await image_store.store_image(image_item)
def calculate_number_of_files(file_store_path: str) -> int:
    """
    Compute the next numeric file prefix for a data directory.

    File names are expected to start with ``<number>_``; the result is the
    largest such number plus one, so each run can write to a fresh file.

    Entries whose prefix is not an integer are ignored. Previously a single
    such entry (a sub-directory, a hidden file, ...) made the whole scan fall
    back to 1, which could hand out a prefix that is already in use.

    Args:
        file_store_path: directory holding the saved data files.

    Returns:
        1 when the directory does not exist or holds no numbered files,
        otherwise the highest numeric prefix plus one.
    """
    if not os.path.exists(file_store_path):
        return 1

    numbered_prefixes = []
    for file_name in os.listdir(file_store_path):
        try:
            numbered_prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            # Not a "<number>_..." name; it does not take part in the numbering.
            continue
    return max(numbered_prefixes, default=0) + 1
+ + """ + + return f"{self.csv_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + + async def save_data_to_csv(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in CSV format. + Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: no returns + + """ + pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if await f.tell() == 0: + await writer.writerow(save_item.keys()) + await writer.writerow(save_item.values()) + + async def store_content(self, content_item: Dict): + """ + Weibo content CSV storage implementation + Args: + content_item: note item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=content_item, store_type="contents") + + async def store_comment(self, comment_item: Dict): + """ + Weibo comment CSV storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=comment_item, store_type="comments") + + async def store_creator(self, creator: Dict): + """ + Weibo creator CSV storage implementation + Args: + creator: + + Returns: + + """ + await self.save_data_to_csv(save_item=creator, store_type="creators") + + +class WeiboDbStoreImplement(AbstractStore): + + async def store_content(self, content_item: Dict): + """ + Weibo content DB storage implementation + Args: + content_item: content item dict + + Returns: + + """ + + from .weibo_store_sql import (add_new_content, + query_content_by_content_id, + update_content_by_content_id) + note_id = content_item.get("note_id") + note_detail: Dict = await query_content_by_content_id(content_id=note_id) + if not note_detail: + content_item["add_ts"] = utils.get_current_timestamp() + await 
class WeiboJsonStoreImplement(AbstractStore):
    """
    JSON-file storage back end for Weibo contents, comments and creators.

    Every record is appended to a per-day JSON array file; optionally a word
    frequency / word cloud is regenerated from the accumulated records.
    """

    json_store_path: str = "data/weibo/json"
    words_store_path: str = "data/weibo/words"
    # Serialises the read-modify-write cycle on the JSON file.
    lock = asyncio.Lock()
    file_count: int = calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

    def make_save_file_name(self, store_type: str) -> (str, str):
        """
        Build the JSON file path and the word-cloud file prefix.

        Args:
            store_type: file-name component (contents | comments | creators).

        Returns:
            ``(json_file_path, words_file_prefix)``.
        """

        return (
            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
        )

    async def save_data_to_json(self, save_item: Dict, store_type: str):
        """
        Append one record to the JSON array file for the given store type.

        The whole file is read, extended and rewritten under ``lock``. When
        both ``config.ENABLE_GET_COMMENTS`` and ``config.ENABLE_GET_WORDCLOUD``
        are set, the word cloud is regenerated from the accumulated records;
        that step is best-effort and a failure never blocks storage.

        Args:
            save_item: record to persist.
            store_type: file-name component (contents | comments | creators).

        Returns:
            None
        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
            if os.path.exists(save_file_name):
                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                    save_data = json.loads(await file.read())

            save_data.append(save_item)
            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
                await file.write(json.dumps(save_data, ensure_ascii=False))

            # NOTE(review): kept inside the lock; the flattened source does not
            # show the original nesting of this step - confirm.
            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
                try:
                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
                except Exception as exc:
                    # Still best-effort, but no longer a bare ``except``: task
                    # cancellation and KeyboardInterrupt propagate again, and
                    # real failures leave a trace instead of vanishing.
                    utils.logger.warning(f"[WeiboJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}")

    async def store_content(self, content_item: Dict):
        """
        Content JSON storage implementation.

        Args:
            content_item: note record to append to the "contents" file.

        Returns:
            None
        """
        await self.save_data_to_json(content_item, "contents")

    async def store_comment(self, comment_item: Dict):
        """
        Comment JSON storage implementation.

        Args:
            comment_item: comment record to append to the "comments" file.

        Returns:
            None
        """
        await self.save_data_to_json(comment_item, "comments")

    async def store_creator(self, creator: Dict):
        """
        Creator JSON storage implementation.

        Args:
            creator: creator record to append to the "creators" file.

        Returns:
            None
        """
        await self.save_data_to_json(creator, "creators")
query_content_by_content_id(content_id=note_id) + if not note_detail: + content_item["add_ts"] = utils.get_current_timestamp() + await add_new_content(content_item) + else: + await update_content_by_content_id(note_id, content_item=content_item) + + async def store_comment(self, comment_item: Dict): + """ + Weibo comment SQLite storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + from .weibo_store_sql import (add_new_comment, + query_comment_by_comment_id, + update_comment_by_comment_id) + comment_id = comment_item.get("comment_id") + comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id) + if not comment_detail: + comment_item["add_ts"] = utils.get_current_timestamp() + await add_new_comment(comment_item) + else: + await update_comment_by_comment_id(comment_id, comment_item=comment_item) + + async def store_creator(self, creator: Dict): + """ + Weibo creator SQLite storage implementation + Args: + creator: + + Returns: + + """ + + from .weibo_store_sql import (add_new_creator, + query_creator_by_user_id, + update_creator_by_user_id) + user_id = creator.get("user_id") + user_detail: Dict = await query_creator_by_user_id(user_id) + if not user_detail: + creator["add_ts"] = utils.get_current_timestamp() + await add_new_creator(creator) + else: + await update_creator_by_user_id(user_id, creator) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/weibo/weibo_store_media.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/weibo/weibo_store_media.py new file mode 100644 index 0000000..bc6d69c --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/weibo/weibo_store_media.py @@ -0,0 +1,68 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
class WeiboStoreImage(AbstractStoreImage):
    """Stores downloaded Weibo pictures as files under ``image_store_path``."""

    image_store_path: str = "data/weibo/images"

    async def store_image(self, image_content_item: Dict):
        """
        Save the picture described by *image_content_item*.

        Args:
            image_content_item: dict read for ``pic_id``, ``pic_content`` and
                ``extension_file_name``.

        Returns:
            None
        """
        pic_id = image_content_item.get("pic_id")
        payload = image_content_item.get("pic_content")
        extension = image_content_item.get("extension_file_name")
        await self.save_image(pic_id, payload, extension)

    def make_save_file_name(self, picid: str, extension_file_name: str) -> str:
        """
        Build the target path ``<image_store_path>/<picid>.<extension>``.

        Args:
            picid: image id, used as the file stem.
            extension_file_name: file extension without the leading dot.

        Returns:
            The path the image is written to.
        """
        return f"{self.image_store_path}/{picid}.{extension_file_name}"

    async def save_image(self, picid: str, pic_content: str, extension_file_name="jpg"):
        """
        Write the image payload to disk, creating the directory if needed.

        Args:
            picid: image id, used as the file stem.
            pic_content: image payload; the file is opened in binary mode, so
                this is presumably bytes despite the ``str`` annotation -
                verify against the caller.
            extension_file_name: file extension without the leading dot.

        Returns:
            None
        """
        image_dir = pathlib.Path(self.image_store_path)
        image_dir.mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(picid, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as image_file:
            await image_file.write(pic_content)
            utils.logger.info(f"[WeiboImageStoreImplement.save_image] save image {save_file_name} success ...")
async def query_content_by_content_id(content_id: str) -> Dict:
    """
    Look up one stored Weibo note by its note id.

    Args:
        content_id: value matched against the ``note_id`` column of
            ``weibo_note``.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    # NOTE(review): content_id is interpolated straight into the SQL text;
    # prefer bound parameters once the query() helper's placeholder style is
    # confirmed for both backends.
    matches: List[Dict] = await db.query(f"select * from weibo_note where note_id = '{content_id}'")
    return matches[0] if len(matches) > 0 else dict()
async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Look up one stored Weibo comment by its comment id.

    Args:
        comment_id: value matched against the ``comment_id`` column of
            ``weibo_note_comment``.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    # NOTE(review): comment_id is interpolated straight into the SQL text;
    # prefer bound parameters once the query() helper's placeholder style is
    # confirmed for both backends.
    matches: List[Dict] = await db.query(f"select * from weibo_note_comment where comment_id = '{comment_id}'")
    return matches[0] if len(matches) > 0 else dict()
async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
    """
    Update the stored Weibo creator row identified by *user_id*.

    Args:
        user_id: value matched against the ``user_id`` column of
            ``weibo_creator``.
        creator_item: column/value mapping passed to ``update_table``.

    Returns:
        Whatever ``update_table`` returns (annotated as the affected row count).
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    return await db.update_table("weibo_creator", creator_item, "user_id", user_id)
def get_video_url_arr(note_item: Dict) -> List:
    """
    Build the list of video URLs for a note.

    Args:
        note_item: raw note payload; ``type`` and the nested ``video`` section are read.

    Returns:
        An empty list for non-video notes. Otherwise a single CDN URL built from the
        origin video key (``origin_video_key``, then ``originVideoKey``), or, when no
        key is available, the ``master_url`` of every h264 stream entry.
    """
    if note_item.get('type') != 'video':
        return []

    # Every nested section may be absent or null; treat that as "empty" rather than
    # raising AttributeError on None.
    video = note_item.get('video') or {}
    consumer = video.get('consumer') or {}

    # A missing key yields None, not '', so test truthiness: otherwise the fallbacks
    # are skipped and the URL is built from the literal text "None".
    origin_video_key = consumer.get('origin_video_key') or consumer.get('originVideoKey')
    if origin_video_key:
        return [f"http://sns-video-bd.xhscdn.com/{origin_video_key}"]

    # Fallback: stream URLs (the original comment marks these as watermarked).
    streams = ((video.get('media') or {}).get('stream') or {}).get('h264')
    if not isinstance(streams, list):
        return []
    # Entries without a URL are dropped so callers can safely ','.join() the result.
    return [
        stream.get('master_url')
        for stream in streams
        if isinstance(stream, dict) and stream.get('master_url')
    ]
async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]):
    """
    Store every comment of a Xiaohongshu note, one after another.

    Args:
        note_id: id of the note the comments belong to.
        comments: raw comment payloads; nothing happens when this is empty or None.

    Returns:
        None
    """
    # An empty/None batch degenerates to iterating an empty tuple.
    for entry in comments or ():
        await update_xhs_note_comment(note_id, entry)
async def save_creator(user_id: str, creator: Dict):
    """
    Flatten a Xiaohongshu creator payload and persist it through the configured store.

    Args:
        user_id: creator id written to the ``user_id`` field of the stored record.
        creator: raw creator payload; its ``basicInfo``, ``interactions`` and ``tags``
            sections are read. Missing or null sections are treated as empty.

    Returns:
        None
    """
    user_info = creator.get('basicInfo') or {}

    # Counters default to 0; if a type appears more than once the last entry wins.
    counts = {'follows': 0, 'fans': 0, 'interaction': 0}
    # `or []` keeps a payload without interactions from raising TypeError on iteration.
    for entry in creator.get('interactions') or []:
        kind = entry.get('type')
        if kind in ('follows', 'fans', 'interaction'):
            counts[kind] = entry.get('count')

    def get_gender(gender):
        # Mapping as used by this module: 1 -> female, 0 -> male, anything else -> unknown.
        if gender == 1:
            return '女'
        if gender == 0:
            return '男'
        return None

    # Same None-safety for tags; the result is serialised as {tagType: name}.
    tags = {tag.get('tagType'): tag.get('name') for tag in creator.get('tags') or []}

    local_db_item = {
        'user_id': user_id,  # user id
        'nickname': user_info.get('nickname'),  # nickname
        'gender': get_gender(user_info.get('gender')),  # gender
        'avatar': user_info.get('images'),  # avatar
        'desc': user_info.get('desc'),  # profile description
        'ip_location': user_info.get('ipLocation'),  # ip location
        'follows': counts['follows'],  # following count
        'fans': counts['fans'],  # follower count
        'interaction': counts['interaction'],  # interaction count
        'tag_list': json.dumps(tags, ensure_ascii=False),  # tags
        "last_modify_ts": utils.get_current_timestamp(),  # timestamp generated by the crawler when the record is written
    }
    utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
    await XhsStoreFactory.create_store().store_creator(local_db_item)
def calculate_number_of_files(file_store_path: str) -> int:
    """
    Compute the numeric prefix for the next data file in a directory.

    Files are named ``<n>_...``; the next prefix is the largest ``n`` found plus one,
    so each run writes to a fresh file.

    Args:
        file_store_path: directory that holds the data files.

    Returns:
        1 when the directory does not exist or contains no numbered entry,
        otherwise the highest numeric prefix + 1.
    """
    if not os.path.exists(file_store_path):
        return 1

    prefixes = []
    for file_name in os.listdir(file_store_path):
        # Skip entries without a numeric prefix (e.g. sub-directories or stray files).
        # Previously one such entry made the whole computation fall back to 1, so
        # later runs reused the first file name.
        try:
            prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            continue
    return max(prefixes, default=0) + 1
async def store_content(self, content_item: Dict):
    """
    Insert or update a Xiaohongshu note row (DB backend).

    Args:
        content_item: note record; its ``note_id`` selects the row. ``add_ts`` is
            added to the dict when the note is stored for the first time.

    Returns:
        None
    """
    # Imported lazily, as in the rest of this class.
    from .xhs_store_sql import (add_new_content,
                                query_content_by_content_id,
                                update_content_by_content_id)
    note_id = content_item.get("note_id")
    existing: Dict = await query_content_by_content_id(content_id=note_id)
    if existing:
        await update_content_by_content_id(note_id, content_item=content_item)
        return
    # First sighting of this note: stamp the creation time, then insert.
    content_item["add_ts"] = utils.get_current_timestamp()
    await add_new_content(content_item)
async def save_data_to_json(self, save_item: Dict, store_type: str):
    """
    Append one record to the JSON file for ``store_type``.

    The whole file is read, extended and rewritten while holding the class-level
    lock. When comment crawling and word clouds are both enabled in ``config``, the
    word-cloud generator is then run on the accumulated records (best effort).

    Args:
        save_item: record to append.
        store_type: kind of data being saved (contents | comments | creator); it
            selects the target file name.

    Returns:
        None
    """
    pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
    save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
    save_data = []

    async with self.lock:
        if os.path.exists(save_file_name):
            async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                raw = await file.read()
            # An existing but empty file (e.g. an interrupted earlier write) starts a
            # new list instead of raising JSONDecodeError.
            if raw.strip():
                save_data = json.loads(raw)

        save_data.append(save_item)
        async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(save_data, ensure_ascii=False, indent=4))

        if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
            try:
                await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
            except Exception as exc:
                # Still best effort, but no longer silent. `except Exception` also
                # lets task cancellation propagate, unlike a bare `except`.
                utils.logger.warning(
                    f"[XhsJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}"
                )
async def store_creator(self, creator: Dict):
    """
    Insert or update a Xiaohongshu creator row (SQLite backend).

    Args:
        creator: creator record; its ``user_id`` selects the row. ``add_ts`` is
            added to the dict when the creator is stored for the first time.

    Returns:
        None
    """
    from .xhs_store_sql import (add_new_creator, query_creator_by_user_id,
                                update_creator_by_user_id)
    user_id = creator.get("user_id")
    known: Dict = await query_creator_by_user_id(user_id)
    if known:
        await update_creator_by_user_id(user_id, creator)
        return
    # Unknown creator: stamp the creation time, then insert.
    creator["add_ts"] = utils.get_current_timestamp()
    await add_new_creator(creator)
class XiaoHongShuImage(AbstractStoreImage):
    """Writes downloaded Xiaohongshu note images below ``image_store_path``."""

    # Root directory; each note gets its own sub-directory named after its id.
    image_store_path: str = "data/xhs/images"

    async def store_image(self, image_content_item: Dict):
        """
        Unpack an image item and write it to disk.

        Args:
            image_content_item: dict with the keys ``notice_id``, ``pic_content``
                and ``extension_file_name``.

        Returns:
            None
        """
        notice_id = image_content_item.get("notice_id")
        pic_content = image_content_item.get("pic_content")
        extension_file_name = image_content_item.get("extension_file_name")
        await self.save_image(notice_id, pic_content, extension_file_name)

    def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
        """
        Build the target path of one image.

        Args:
            notice_id: note id, used as the sub-directory name.
            extension_file_name: image file name including its extension.

        Returns:
            ``<image_store_path>/<notice_id>/<extension_file_name>``
        """
        return f"{self.image_store_path}/{notice_id}/{extension_file_name}"

    async def save_image(self, notice_id: str, pic_content: str, extension_file_name):
        """
        Create the note directory if needed and write the image into it.

        Args:
            notice_id: note id, used as the sub-directory name.
            pic_content: image payload; the file is opened in binary mode, so this is
                presumably bytes despite the ``str`` annotation — TODO confirm.
            extension_file_name: image file name including its extension.

        Returns:
            None
        """
        note_dir = pathlib.Path(self.image_store_path + "/" + notice_id)
        note_dir.mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(notice_id, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(pic_content)
            utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")
async def save_video(self, notice_id: str, video_content: str, extension_file_name):
    """
    Create the note directory if needed and write the video into it.

    Args:
        notice_id: note id, used as the sub-directory name.
        video_content: video payload; the file is opened in binary mode, so this is
            presumably bytes despite the ``str`` annotation — TODO confirm.
        extension_file_name: video file name including its extension.

    Returns:
        None
    """
    note_dir = pathlib.Path(self.video_store_path + "/" + notice_id)
    note_dir.mkdir(parents=True, exist_ok=True)
    save_file_name = self.make_save_file_name(notice_id, extension_file_name)
    async with aiofiles.open(save_file_name, 'wb') as f:
        await f.write(video_content)
        utils.logger.info(f"[XiaoHongShuVideoStoreImplement.save_video] save video {save_file_name} success ...")
async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Look up a single Xiaohongshu comment row by comment id.

    Args:
        comment_id: value compared against the ``comment_id`` column of
            ``xhs_note_comment``.

    Returns:
        The first matching row, or an empty dict when the query yields no rows.
    """
    # NOTE(review): the id is interpolated straight into the SQL text; confirm whether
    # the DB wrapper's query() accepts bound parameters and prefer them if it does.
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    matches: List[Dict] = await db.query(
        f"select * from xhs_note_comment where comment_id = '{comment_id}'"
    )
    return matches[0] if len(matches) > 0 else dict()
async def add_new_creator(creator_item: Dict) -> int:
    """
    Insert a new Xiaohongshu creator row.

    Args:
        creator_item: column/value mapping handed to the DB wrapper's ``item_to_table``.

    Returns:
        Whatever ``item_to_table`` returns (annotated as the last inserted row id).
    """
    db: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    return await db.item_to_table("xhs_creator", creator_item)
class ZhihuStoreFactory:
    """Maps the configured save option to the matching Zhihu store implementation."""

    # Keys are the accepted values of config.SAVE_DATA_OPTION.
    STORES = {
        "csv": ZhihuCsvStoreImplement,
        "db": ZhihuDbStoreImplement,
        "json": ZhihuJsonStoreImplement,
        "sqlite": ZhihuSqliteStoreImplement
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """
        Instantiate the store selected by ``config.SAVE_DATA_OPTION``.

        Returns:
            A new instance of the selected store class.

        Raises:
            ValueError: when the configured option has no registered store.
        """
        chosen = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if chosen:
            return chosen()
        raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
async def save_creator(creator: ZhihuCreator):
    """
    Persist a Zhihu creator through the configured store.

    Args:
        creator: creator model; nothing is stored when it is falsy.

    Returns:
        None
    """
    if creator:
        record = creator.model_dump()
        # Timestamp generated here when the record is written.
        record["last_modify_ts"] = utils.get_current_timestamp()
        await ZhihuStoreFactory.create_store().store_creator(record)
def calculate_number_of_files(file_store_path: str) -> int:
    """
    Compute the numeric prefix for the next data file in a directory.

    Files are named ``<n>_...``; the next prefix is the largest ``n`` found plus one,
    so each run writes to a fresh file.

    Args:
        file_store_path: directory that holds the data files.

    Returns:
        1 when the directory does not exist or contains no numbered entry,
        otherwise the highest numeric prefix + 1.
    """
    if not os.path.exists(file_store_path):
        return 1

    prefixes = []
    for file_name in os.listdir(file_store_path):
        # Skip entries without a numeric prefix (e.g. sub-directories or stray files).
        # Previously one such entry made the whole computation fall back to 1, so
        # later runs reused the first file name.
        try:
            prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            continue
    return max(prefixes, default=0) + 1
async def store_content(self, content_item: Dict):
    """
    Insert or update a Zhihu content row (DB backend).

    Args:
        content_item: content record. ``add_ts`` is added to the dict when the
            content is stored for the first time.

    Returns:
        None
    """
    from .zhihu_store_sql import (add_new_content,
                                  query_content_by_content_id,
                                  update_content_by_content_id)
    # NOTE(review): the lookup key is "note_id"; confirm the Zhihu content dict
    # actually carries that key, otherwise every record takes the insert path.
    note_id = content_item.get("note_id")
    existing: Dict = await query_content_by_content_id(content_id=note_id)
    if existing:
        await update_content_by_content_id(note_id, content_item=content_item)
        return
    # First sighting: stamp the creation time, then insert.
    content_item["add_ts"] = utils.get_current_timestamp()
    await add_new_content(content_item)
async def save_data_to_json(self, save_item: Dict, store_type: str):
    """
    Append one record to the JSON file for ``store_type``.

    The whole file is read, extended and rewritten while holding the class-level
    lock. When comment crawling and word clouds are both enabled in ``config``, the
    word-cloud generator is then run on the accumulated records (best effort).

    Args:
        save_item: record to append.
        store_type: kind of data being saved (contents | comments | creator); it
            selects the target file name.

    Returns:
        None
    """
    pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
    save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
    save_data = []

    async with self.lock:
        if os.path.exists(save_file_name):
            async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                raw = await file.read()
            # An existing but empty file (e.g. an interrupted earlier write) starts a
            # new list instead of raising JSONDecodeError.
            if raw.strip():
                save_data = json.loads(raw)

        save_data.append(save_item)
        async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(save_data, ensure_ascii=False, indent=4))

        if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
            try:
                await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
            except Exception as exc:
                # Still best effort, but no longer silent. `except Exception` also
                # lets task cancellation propagate, unlike a bare `except`.
                utils.logger.warning(
                    f"[ZhihuJsonStoreImplement.save_data_to_json] word cloud generation failed: {exc}"
                )
not note_detail: + content_item["add_ts"] = utils.get_current_timestamp() + await add_new_content(content_item) + else: + await update_content_by_content_id(note_id, content_item=content_item) + + async def store_comment(self, comment_item: Dict): + """ + Zhihu comment SQLite storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + from .zhihu_store_sql import (add_new_comment, + query_comment_by_comment_id, + update_comment_by_comment_id) + comment_id = comment_item.get("comment_id") + comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id) + if not comment_detail: + comment_item["add_ts"] = utils.get_current_timestamp() + await add_new_comment(comment_item) + else: + await update_comment_by_comment_id(comment_id, comment_item=comment_item) + + async def store_creator(self, creator: Dict): + """ + Zhihu creator SQLite storage implementation + Args: + creator: creator dict + + Returns: + + """ + from .zhihu_store_sql import (add_new_creator, + query_creator_by_user_id, + update_creator_by_user_id) + user_id = creator.get("user_id") + user_detail: Dict = await query_creator_by_user_id(user_id) + if not user_detail: + creator["add_ts"] = utils.get_current_timestamp() + await add_new_creator(creator) + else: + await update_creator_by_user_id(user_id, creator) diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/store/zhihu/zhihu_store_sql.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/zhihu/zhihu_store_sql.py new file mode 100644 index 0000000..5c0ef89 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/store/zhihu/zhihu_store_sql.py @@ -0,0 +1,156 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 
def _quote_sql_literal(value: object, *, backslash_escapes: bool = True) -> str:
    """Render ``value`` as a single-quoted SQL string literal.

    Single quotes are doubled. When ``backslash_escapes`` is true, backslashes
    are doubled as well, because MySQL treats ``\\`` as an escape character
    inside string literals by default; SQLite does not, so callers pass
    ``False`` for it.
    """
    text = str(value)
    if backslash_escapes:
        text = text.replace("\\", "\\\\")
    return "'" + text.replace("'", "''") + "'"


async def _query_first_row(table: str, key_column: str, key_value: str) -> Dict:
    """Return the first row of ``table`` whose ``key_column`` equals ``key_value``.

    ``table`` and ``key_column`` are constants supplied by this module; only
    ``key_value`` comes from crawled data, so it is the only part that is escaped.
    Returns an empty dict when nothing matches.
    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    # AsyncSqliteDB connections get SQLite quoting; anything else is treated as MySQL.
    # NOTE(review): bound parameters would be preferable once the query()
    # signature of both DB wrappers is confirmed.
    literal = _quote_sql_literal(
        key_value,
        backslash_escapes=not isinstance(async_db_conn, AsyncSqliteDB),
    )
    sql: str = f"select * from {table} where {key_column} = {literal}"
    rows: List[Dict] = await async_db_conn.query(sql)
    if len(rows) > 0:
        return rows[0]
    return dict()


async def query_content_by_content_id(content_id: str) -> Dict:
    """
    Query one content record from ``zhihu_content``.
    Args:
        content_id: value of the ``content_id`` column to look up

    Returns:
        the matching row, or an empty dict when there is none

    """
    return await _query_first_row("zhihu_content", "content_id", content_id)


async def add_new_content(content_item: Dict) -> int:
    """
    Insert one content record into ``zhihu_content``.
    Args:
        content_item: record to insert

    Returns:
        the value returned by ``item_to_table`` (annotated as the last row id)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    last_row_id: int = await async_db_conn.item_to_table("zhihu_content", content_item)
    return last_row_id


async def update_content_by_content_id(content_id: str, content_item: Dict) -> int:
    """
    Update the ``zhihu_content`` record identified by ``content_id``.
    Args:
        content_id: value of the ``content_id`` column identifying the row
        content_item: new field values

    Returns:
        the value returned by ``update_table`` (annotated as the affected row count)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    effect_row: int = await async_db_conn.update_table("zhihu_content", content_item, "content_id", content_id)
    return effect_row


async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Query one comment record from ``zhihu_comment``.
    Args:
        comment_id: value of the ``comment_id`` column to look up

    Returns:
        the matching row, or an empty dict when there is none

    """
    return await _query_first_row("zhihu_comment", "comment_id", comment_id)


async def add_new_comment(comment_item: Dict) -> int:
    """
    Insert one comment record into ``zhihu_comment``.
    Args:
        comment_item: record to insert

    Returns:
        the value returned by ``item_to_table`` (annotated as the last row id)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    last_row_id: int = await async_db_conn.item_to_table("zhihu_comment", comment_item)
    return last_row_id


async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int:
    """
    Update the ``zhihu_comment`` record identified by ``comment_id``.
    Args:
        comment_id: value of the ``comment_id`` column identifying the row
        comment_item: new field values

    Returns:
        the value returned by ``update_table`` (annotated as the affected row count)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    effect_row: int = await async_db_conn.update_table("zhihu_comment", comment_item, "comment_id", comment_id)
    return effect_row


async def query_creator_by_user_id(user_id: str) -> Dict:
    """
    Query one creator record from ``zhihu_creator``.
    Args:
        user_id: value of the ``user_id`` column to look up

    Returns:
        the matching row, or an empty dict when there is none

    """
    return await _query_first_row("zhihu_creator", "user_id", user_id)


async def add_new_creator(creator_item: Dict) -> int:
    """
    Insert one creator record into ``zhihu_creator``.
    Args:
        creator_item: record to insert

    Returns:
        the value returned by ``item_to_table`` (annotated as the last row id)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    last_row_id: int = await async_db_conn.item_to_table("zhihu_creator", creator_item)
    return last_row_id


async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
    """
    Update the ``zhihu_creator`` record identified by ``user_id``.
    Args:
        user_id: value of the ``user_id`` column identifying the row
        creator_item: new field values

    Returns:
        the value returned by ``update_table`` (annotated as the affected row count)

    """
    async_db_conn: Union[AsyncMysqlDB, AsyncSqliteDB] = media_crawler_db_var.get()
    effect_row: int = await async_db_conn.update_table("zhihu_creator", creator_item, "user_id", user_id)
    return effect_row
class TestExpiringLocalCache(unittest.TestCase):
    """Checks ExpiringLocalCache for plain set/get, per-key expiry and expiry after a long wait."""

    def setUp(self):
        # Every test gets its own cache instance.
        self.cache = ExpiringLocalCache(cron_interval=10)

    def tearDown(self):
        # Drop the instance created in setUp.
        del self.cache

    def test_set_and_get(self):
        """A value stored with a 10 second lifetime can be read back immediately."""
        cache = self.cache
        cache.set('key', 'value', 10)
        stored = cache.get('key')
        self.assertEqual(stored, 'value')

    def test_expired_key(self):
        """A value stored with a 1 second lifetime is gone after sleeping 2 seconds."""
        cache = self.cache
        cache.set('key', 'value', 1)
        # wait for the key to expire
        time.sleep(2)
        stored = cache.get('key')
        self.assertIsNone(stored)

    def test_clear(self):
        """A value stored with an 11 second lifetime is gone after sleeping 12 seconds."""
        cache = self.cache
        # Store a single key/value pair that expires after 11 seconds.
        cache.set('key', 'value', 11)
        # Sleep 12 seconds so that the cache's scheduled task runs once.
        time.sleep(12)
        stored = cache.get('key')
        self.assertIsNone(stored)


if __name__ == '__main__':
    unittest.main()
class TestIpPool(IsolatedAsyncioTestCase):
    """Async test that fetches proxies from a freshly created IP pool."""

    async def test_ip_pool(self):
        """Create a pool of one validated IP and fetch a proxy from it three times."""
        ip_pool = await create_ip_pool(ip_pool_count=1, enable_validate_ip=True)
        print("\n")
        attempts = 3
        for _ in range(attempts):
            proxy: IpInfoModel = await ip_pool.get_proxy()
            print(proxy)
            self.assertIsNotNone(proxy.ip, msg="验证 ip 是否获取成功")
class TestRedisCache(unittest.TestCase):
    """Checks RedisCache for set/get, key expiry and key listing."""

    def setUp(self):
        # Every test talks to the cache through a fresh RedisCache object.
        self.redis_cache = RedisCache()

    def tearDown(self):
        # Nothing is cleaned up; flushing the redis database was left disabled:
        # self.redis_cache._redis_client.flushdb()
        pass

    def test_set_and_get(self):
        """A value stored with a 10 second lifetime can be read back immediately."""
        cache = self.redis_cache
        cache.set('key', 'value', 10)
        stored = cache.get('key')
        self.assertEqual(stored, 'value')

    def test_expired_key(self):
        """A value stored with a 1 second lifetime is gone after sleeping 2 seconds."""
        cache = self.redis_cache
        cache.set('key', 'value', 1)
        # wait for the key to expire
        time.sleep(2)
        stored = cache.get('key')
        self.assertIsNone(stored)

    def test_keys(self):
        """Both stored keys show up when listing keys with the '*' pattern."""
        cache = self.redis_cache
        cache.set('key1', 'value1', 10)
        cache.set('key2', 'value2', 10)
        listed = cache.keys('*')
        for expected_key in ('key1', 'key2'):
            self.assertIn(expected_key, listed)


if __name__ == '__main__':
    unittest.main()
def test_convert_cookies():
    """convert_str_cookie_to_dict turns a "k=v; k=v; " cookie string into a dict with both cookies."""
    raw_cookie_string = "a1=x000101360; webId=1190c4d3cxxxx125xxx; "
    parsed = utils.convert_str_cookie_to_dict(raw_cookie_string)
    # Checked in the same order as before: webId first, then a1.
    expected_pairs = (
        ("webId", "1190c4d3cxxxx125xxx"),
        ("a1", "x000101360"),
    )
    for cookie_name, cookie_value in expected_pairs:
        assert parsed.get(cookie_name) == cookie_value
class BrowserLauncher:
    """
    Detects and launches the user's Chrome/Edge browser with a remote-debugging port.

    Candidate install locations are listed for Windows, macOS and (as a fallback
    for any other system) Linux.
    """

    def __init__(self):
        self.system = platform.system()
        # Set by the caller to the Popen object returned by launch_browser().
        self.browser_process = None
        self.debug_port = None

    def detect_browser_paths(self) -> List[str]:
        """
        Detect the browser executables available on this system.

        Returns:
            Existing, executable browser paths in priority order.
        """
        if self.system == "Windows":
            # Common Chrome/Edge install locations on Windows
            possible_paths = [
                # Chrome
                os.path.expandvars(r"%PROGRAMFILES%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe"),
                # Edge
                os.path.expandvars(r"%PROGRAMFILES%\Microsoft\Edge\Application\msedge.exe"),
                os.path.expandvars(r"%PROGRAMFILES(X86)%\Microsoft\Edge\Application\msedge.exe"),
                # Chrome Beta/Dev/Canary
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Beta\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Dev\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome SxS\Application\chrome.exe"),
            ]
        elif self.system == "Darwin":  # macOS
            # Common Chrome/Edge install locations on macOS
            possible_paths = [
                # Chrome
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
                "/Applications/Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
                "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
                # Edge
                "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
                "/Applications/Microsoft Edge Beta.app/Contents/MacOS/Microsoft Edge Beta",
                "/Applications/Microsoft Edge Dev.app/Contents/MacOS/Microsoft Edge Dev",
                "/Applications/Microsoft Edge Canary.app/Contents/MacOS/Microsoft Edge Canary",
            ]
        else:
            # Linux and any other system
            possible_paths = [
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/google-chrome-beta",
                "/usr/bin/google-chrome-unstable",
                "/usr/bin/chromium-browser",
                "/usr/bin/chromium",
                "/snap/bin/chromium",
                "/usr/bin/microsoft-edge",
                "/usr/bin/microsoft-edge-stable",
                "/usr/bin/microsoft-edge-beta",
                "/usr/bin/microsoft-edge-dev",
            ]

        # Keep only the candidates that exist and are executable.
        return [
            path for path in possible_paths
            if os.path.isfile(path) and os.access(path, os.X_OK)
        ]

    def find_available_port(self, start_port: int = 9222) -> int:
        """
        Find a free TCP port on localhost.

        Tries ``start_port`` and the 99 ports after it.

        Raises:
            RuntimeError: when none of the 100 ports can be bound.
        """
        last_port = start_port + 99
        for port in range(start_port, last_port + 1):
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(('localhost', port))
                    return port
            except OSError:
                # Port is taken (or cannot be bound); try the next one.
                continue

        raise RuntimeError(f"无法找到可用的端口,已尝试 {start_port} 到 {last_port}")

    def _build_launch_args(self, browser_path: str, debug_port: int, headless: bool,
                           user_data_dir: Optional[str]) -> List[str]:
        """Build the browser command line used by launch_browser()."""
        args = [
            browser_path,
            f"--remote-debugging-port={debug_port}",
            # NOTE(review): 0.0.0.0 asks the browser to expose the debugging port
            # on every interface; together with --no-sandbox and
            # --disable-web-security this is only acceptable on trusted networks.
            "--remote-debugging-address=0.0.0.0",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-background-timer-throttling",
            "--disable-backgrounding-occluded-windows",
            "--disable-renderer-backgrounding",
            # One combined switch: the original passed --disable-features twice
            # (TranslateUI, then VizDisplayCompositor). Chromium is understood to
            # keep only the last occurrence of a repeated switch, which would
            # have dropped TranslateUI.
            "--disable-features=TranslateUI,VizDisplayCompositor",
            "--disable-ipc-flooding-protection",
            "--disable-hang-monitor",
            "--disable-prompt-on-repost",
            "--disable-sync",
            "--disable-web-security",  # may help with access to some sites
            "--disable-dev-shm-usage",  # avoid shared-memory problems
            "--no-sandbox",  # sandbox is switched off in CDP mode
        ]

        if headless:
            args.extend([
                "--headless",
                "--disable-gpu",
            ])
        else:
            # Extra flags used only for the non-headless launch.
            args.extend([
                "--disable-blink-features=AutomationControlled",
                "--disable-infobars",
            ])

        if user_data_dir:
            args.append(f"--user-data-dir={user_data_dir}")

        return args

    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
                       user_data_dir: Optional[str] = None) -> subprocess.Popen:
        """
        Start the browser process with remote debugging enabled.

        Args:
            browser_path: path of the browser executable.
            debug_port: port passed as ``--remote-debugging-port``.
            headless: add the headless flags when true.
            user_data_dir: optional profile directory passed as ``--user-data-dir``.

        Returns:
            The ``subprocess.Popen`` object of the started browser.

        Raises:
            Exception: whatever ``subprocess.Popen`` raises; it is logged and re-raised.
        """
        args = self._build_launch_args(browser_path, debug_port, headless, user_data_dir)

        utils.logger.info(f"[BrowserLauncher] 启动浏览器: {browser_path}")
        utils.logger.info(f"[BrowserLauncher] 调试端口: {debug_port}")
        utils.logger.info(f"[BrowserLauncher] 无头模式: {headless}")

        try:
            if self.system == "Windows":
                # CREATE_NEW_PROCESS_GROUP keeps Ctrl+C from reaching the child.
                process = subprocess.Popen(
                    args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
                )
            else:
                # start_new_session=True runs setsid() in the child, like the
                # previous preexec_fn=os.setsid, without the thread-safety
                # caveats the subprocess docs attach to preexec_fn. cleanup()
                # relies on the child leading its own process group.
                process = subprocess.Popen(
                    args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    start_new_session=True
                )

            return process

        except Exception as e:
            utils.logger.error(f"[BrowserLauncher] 启动浏览器失败: {e}")
            raise

    def wait_for_browser_ready(self, debug_port: int, timeout: int = 30) -> bool:
        """
        Block until the debugging port accepts TCP connections.

        Polls every 0.5 seconds. Returns False after ``timeout`` seconds, or as
        soon as the process stored in ``self.browser_process`` is seen to have
        exited while the port is still closed.
        """
        utils.logger.info(f"[BrowserLauncher] 等待浏览器在端口 {debug_port} 上准备就绪...")

        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1)
                    result = s.connect_ex(('localhost', debug_port))
                    if result == 0:
                        utils.logger.info(f"[BrowserLauncher] 浏览器已在端口 {debug_port} 上准备就绪")
                        return True
            except Exception:
                # Any socket error just means "not ready yet".
                pass

            # Fail fast: a browser that has already exited will never open the port.
            process = self.browser_process
            if process is not None and process.poll() is not None:
                utils.logger.error(f"[BrowserLauncher] 浏览器进程已提前退出,退出码: {process.returncode}")
                return False

            time.sleep(0.5)

        utils.logger.error(f"[BrowserLauncher] 浏览器在 {timeout} 秒内未能准备就绪")
        return False

    def get_browser_info(self, browser_path: str) -> Tuple[str, str]:
        """
        Return ``(name, version)`` for the browser at ``browser_path``.

        The name is guessed from the path; the version is the stripped stdout of
        ``<browser_path> --version``. Either part falls back to an "Unknown ..."
        placeholder.
        """
        try:
            lowered = browser_path.lower()
            # Checked in this order: chrome, edge ("msedge" also contains "edge"), chromium.
            if "chrome" in lowered:
                name = "Google Chrome"
            elif "edge" in lowered:
                name = "Microsoft Edge"
            elif "chromium" in lowered:
                name = "Chromium"
            else:
                name = "Unknown Browser"

            # Try to read the version from the executable itself.
            try:
                result = subprocess.run([browser_path, "--version"],
                                        capture_output=True, text=True, timeout=5)
                version = result.stdout.strip() if result.stdout else "Unknown Version"
            except (OSError, subprocess.SubprocessError):
                # Missing or non-executable file, or the 5 second timeout expired.
                version = "Unknown Version"

            return name, version

        except Exception:
            return "Unknown Browser", "Unknown Version"

    def cleanup(self):
        """
        Terminate the launched browser process, if there is one.

        Errors are logged, never raised. The process handle is cleared once the
        process is known to be gone.
        """
        process = self.browser_process
        if not process:
            return

        try:
            utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")

            # Only signal a process that is still running; looking up the process
            # group of an already-exited child would raise.
            if process.poll() is None:
                if self.system == "Windows":
                    # taskkill /F /T force-terminates the whole process tree.
                    subprocess.run(["taskkill", "/F", "/T", "/PID", str(process.pid)],
                                   capture_output=True)
                else:
                    import signal
                    # Kill the whole process group created at launch.
                    os.killpg(os.getpgid(process.pid), signal.SIGKILL)
                # Reap the child so it does not linger as a zombie.
                process.wait(timeout=5)

            self.browser_process = None
            utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")

        except Exception as e:
            utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
class CDPBrowserManager:
    """
    Launches a local browser through BrowserLauncher and connects Playwright to it over CDP.
    """

    def __init__(self):
        self.launcher = BrowserLauncher()
        self.browser: Optional[Browser] = None
        self.browser_context: Optional[BrowserContext] = None
        self.debug_port: Optional[int] = None

    async def launch_and_connect(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict] = None,
        user_agent: Optional[str] = None,
        headless: bool = False,
    ) -> BrowserContext:
        """
        Launch the browser and connect to it over CDP.

        Args:
            playwright: Playwright instance used for ``connect_over_cdp``.
            playwright_proxy: proxy settings; only triggers a warning when a new context is created.
            user_agent: user agent applied when a new context is created.
            headless: launch the browser headless when true.

        Returns:
            The browser context that was reused or created.

        Raises:
            Exception: any failure is logged, ``cleanup()`` is run, and the error is re-raised.
        """
        try:
            # 1. Locate the browser executable
            browser_path = await self._get_browser_path()

            # 2. Pick a free debugging port
            self.debug_port = self.launcher.find_available_port(config.CDP_DEBUG_PORT)

            # 3. Launch the browser
            await self._launch_browser(browser_path, headless)

            # 4. Connect over CDP
            await self._connect_via_cdp(playwright)

            # 5. Create (or reuse) the browser context
            browser_context = await self._create_browser_context(
                playwright_proxy, user_agent
            )

            self.browser_context = browser_context
            return browser_context

        except Exception as e:
            utils.logger.error(f"[CDPBrowserManager] CDP浏览器启动失败: {e}")
            await self.cleanup()
            raise

    async def _get_browser_path(self) -> str:
        """
        Return the browser executable to launch.

        ``config.CUSTOM_BROWSER_PATH`` wins when it points to an existing file;
        otherwise the first auto-detected browser is used.

        Raises:
            RuntimeError: when no browser can be found.
        """
        if config.CUSTOM_BROWSER_PATH:
            if os.path.isfile(config.CUSTOM_BROWSER_PATH):
                utils.logger.info(
                    f"[CDPBrowserManager] 使用自定义浏览器路径: {config.CUSTOM_BROWSER_PATH}"
                )
                return config.CUSTOM_BROWSER_PATH
            # A configured path that does not exist used to be ignored silently.
            utils.logger.warning(
                f"[CDPBrowserManager] 自定义浏览器路径不存在,将自动检测浏览器: {config.CUSTOM_BROWSER_PATH}"
            )

        # Auto-detect installed browsers
        browser_paths = self.launcher.detect_browser_paths()

        if not browser_paths:
            raise RuntimeError(
                "未找到可用的浏览器。请确保已安装Chrome或Edge浏览器,"
                "或在配置文件中设置CUSTOM_BROWSER_PATH指定浏览器路径。"
            )

        browser_path = browser_paths[0]  # first detected browser
        browser_name, browser_version = self.launcher.get_browser_info(browser_path)

        utils.logger.info(
            f"[CDPBrowserManager] 检测到浏览器: {browser_name} ({browser_version})"
        )
        utils.logger.info(f"[CDPBrowserManager] 浏览器路径: {browser_path}")

        return browser_path

    async def _test_cdp_connection(self, debug_port: int) -> bool:
        """
        Check with a plain TCP connect whether the CDP port on localhost is reachable.
        """
        try:
            # Simple socket connection test
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(5)
                result = s.connect_ex(("localhost", debug_port))
                if result == 0:
                    utils.logger.info(
                        f"[CDPBrowserManager] CDP端口 {debug_port} 可访问"
                    )
                    return True
                else:
                    utils.logger.warning(
                        f"[CDPBrowserManager] CDP端口 {debug_port} 不可访问"
                    )
                    return False
        except Exception as e:
            utils.logger.warning(f"[CDPBrowserManager] CDP连接测试失败: {e}")
            return False

    async def _launch_browser(self, browser_path: str, headless: bool):
        """
        Start the browser process and wait until its debugging port is ready.

        Raises:
            RuntimeError: when the port is not ready within ``config.BROWSER_LAUNCH_TIMEOUT`` seconds.
        """
        # Use a persistent profile directory when login state should be saved.
        user_data_dir = None
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(
                os.getcwd(),
                "browser_data",
                f"cdp_{config.USER_DATA_DIR % config.PLATFORM}",
            )
            os.makedirs(user_data_dir, exist_ok=True)
            utils.logger.info(f"[CDPBrowserManager] 用户数据目录: {user_data_dir}")

        # Launch the browser
        self.launcher.browser_process = self.launcher.launch_browser(
            browser_path=browser_path,
            debug_port=self.debug_port,
            headless=headless,
            user_data_dir=user_data_dir,
        )

        # wait_for_browser_ready() is a blocking sleep/poll loop, so it runs in
        # the default executor to keep the event loop responsive while waiting.
        loop = asyncio.get_running_loop()
        browser_ready = await loop.run_in_executor(
            None,
            self.launcher.wait_for_browser_ready,
            self.debug_port,
            config.BROWSER_LAUNCH_TIMEOUT,
        )
        if not browser_ready:
            raise RuntimeError(f"浏览器在 {config.BROWSER_LAUNCH_TIMEOUT} 秒内未能启动")

        # Give the CDP service one more second to come up completely.
        await asyncio.sleep(1)

        # A failed probe is only a warning; the real connection is still attempted.
        if not await self._test_cdp_connection(self.debug_port):
            utils.logger.warning(
                "[CDPBrowserManager] CDP连接测试失败,但将继续尝试连接"
            )

    async def _get_browser_websocket_url(self, debug_port: int) -> str:
        """
        Read ``webSocketDebuggerUrl`` from the browser's ``/json/version`` endpoint.

        Raises:
            RuntimeError: on a non-200 response or when the field is missing.
        """
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    f"http://localhost:{debug_port}/json/version", timeout=10
                )
                if response.status_code == 200:
                    data = response.json()
                    ws_url = data.get("webSocketDebuggerUrl")
                    if ws_url:
                        utils.logger.info(
                            f"[CDPBrowserManager] 获取到浏览器WebSocket URL: {ws_url}"
                        )
                        return ws_url
                    else:
                        raise RuntimeError("未找到webSocketDebuggerUrl")
                else:
                    raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
        except Exception as e:
            utils.logger.error(f"[CDPBrowserManager] 获取WebSocket URL失败: {e}")
            raise

    async def _connect_via_cdp(self, playwright: Playwright):
        """
        Connect Playwright to the launched browser over CDP and store it in ``self.browser``.

        Raises:
            RuntimeError: when the connection is reported as not connected.
        """
        try:
            # Resolve the actual WebSocket endpoint first
            ws_url = await self._get_browser_websocket_url(self.debug_port)
            utils.logger.info(f"[CDPBrowserManager] 正在通过CDP连接到浏览器: {ws_url}")

            # Connect with Playwright's connect_over_cdp
            self.browser = await playwright.chromium.connect_over_cdp(ws_url)

            if self.browser.is_connected():
                utils.logger.info("[CDPBrowserManager] 成功连接到浏览器")
                utils.logger.info(
                    f"[CDPBrowserManager] 浏览器上下文数量: {len(self.browser.contexts)}"
                )
            else:
                raise RuntimeError("CDP连接失败")

        except Exception as e:
            utils.logger.error(f"[CDPBrowserManager] CDP连接失败: {e}")
            raise

    async def _create_browser_context(
        self, playwright_proxy: Optional[Dict] = None, user_agent: Optional[str] = None
    ) -> BrowserContext:
        """
        Return the browser's first existing context, or create a new one.

        ``user_agent`` is applied only when a new context is created.

        Raises:
            RuntimeError: when no browser is connected.
        """
        if not self.browser:
            raise RuntimeError("浏览器未连接")

        contexts = self.browser.contexts

        if contexts:
            # Reuse the first existing context
            browser_context = contexts[0]
            utils.logger.info("[CDPBrowserManager] 使用现有的浏览器上下文")
        else:
            # Create a new context
            context_options = {
                "viewport": {"width": 1920, "height": 1080},
                "accept_downloads": True,
            }

            if user_agent:
                context_options["user_agent"] = user_agent
                utils.logger.info(f"[CDPBrowserManager] 设置用户代理: {user_agent}")

            # The proxy is not passed on: the browser is already running, so the
            # caller is only warned that the setting may have no effect.
            if playwright_proxy:
                utils.logger.warning(
                    "[CDPBrowserManager] 警告: CDP模式下代理设置可能不生效,"
                    "建议在浏览器启动前配置系统代理或浏览器代理扩展"
                )

            browser_context = await self.browser.new_context(**context_options)
            utils.logger.info("[CDPBrowserManager] 创建新的浏览器上下文")

        return browser_context

    async def add_stealth_script(self, script_path: str = "libs/stealth.min.js"):
        """
        Register the anti-detection init script on the current context, if both exist.
        """
        if self.browser_context and os.path.exists(script_path):
            try:
                await self.browser_context.add_init_script(path=script_path)
                utils.logger.info(
                    f"[CDPBrowserManager] 已添加反检测脚本: {script_path}"
                )
            except Exception as e:
                utils.logger.warning(f"[CDPBrowserManager] 添加反检测脚本失败: {e}")

    async def add_cookies(self, cookies: list):
        """
        Add cookies to the current context; failures are logged, not raised.
        """
        if self.browser_context:
            try:
                await self.browser_context.add_cookies(cookies)
                utils.logger.info(f"[CDPBrowserManager] 已添加 {len(cookies)} 个Cookie")
            except Exception as e:
                utils.logger.warning(f"[CDPBrowserManager] 添加Cookie失败: {e}")

    async def get_cookies(self) -> list:
        """
        Return the cookies of the current context, or an empty list when there is
        no context or reading them fails.
        """
        if self.browser_context:
            try:
                cookies = await self.browser_context.cookies()
                return cookies
            except Exception as e:
                utils.logger.warning(f"[CDPBrowserManager] 获取Cookie失败: {e}")
                return []
        return []

    async def cleanup(self):
        """
        Release the browser.

        When ``config.AUTO_CLOSE_BROWSER`` is set, the browser process is
        terminated through the launcher and the handles to it are dropped;
        otherwise the process is left running. The Playwright context and
        browser connection are deliberately not closed here (that code was
        already disabled).
        """
        try:
            if config.AUTO_CLOSE_BROWSER:
                self.launcher.cleanup()
                # Do not keep references to a browser that was just terminated.
                self.browser_context = None
                self.browser = None
            else:
                utils.logger.info(
                    "[CDPBrowserManager] 浏览器进程保持运行(AUTO_CLOSE_BROWSER=False)"
                )

        except Exception as e:
            utils.logger.error(f"[CDPBrowserManager] 清理资源时出错: {e}")

    def is_connected(self) -> bool:
        """
        Return True when a browser is set and Playwright reports it as connected.
        """
        return self.browser is not None and self.browser.is_connected()

    async def get_browser_info(self) -> Dict[str, Any]:
        """
        Return version, context count, debug port and connection state of the browser.

        Returns an empty dict when no browser is connected or reading the
        information fails.
        """
        if not self.browser:
            return {}

        try:
            version = self.browser.version
            contexts_count = len(self.browser.contexts)

            return {
                "version": version,
                "contexts_count": contexts_count,
                "debug_port": self.debug_port,
                "is_connected": self.is_connected(),
            }
        except Exception as e:
            utils.logger.warning(f"[CDPBrowserManager] 获取浏览器信息失败: {e}")
            return {}
async def find_login_qrcode(page: Page, selector: str) -> str:
    """Find the login QR code image behind ``selector``.

    When the element's ``src`` contains an http(s) URL, the image is downloaded
    and returned base64-encoded; otherwise the ``src`` value is returned as is.
    Any failure is logged and an empty string is returned.
    """
    try:
        elements = await page.wait_for_selector(
            selector=selector,
        )
        login_qrcode_img = str(await elements.get_property("src"))  # type: ignore
        if "http://" in login_qrcode_img or "https://" in login_qrcode_img:
            async with httpx.AsyncClient(follow_redirects=True) as client:
                utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}")
                resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()})
                if resp.status_code == 200:
                    image_data = resp.content
                    base64_image = base64.b64encode(image_data).decode('utf-8')
                    return base64_image
                raise Exception(f"fetch login image url failed, response message:{resp.text}")
        return login_qrcode_img

    except Exception as e:
        # Report through the module logger (as the success path does) instead of print().
        utils.logger.error(f"[find_login_qrcode] failed to get qrcode: {e}")
        return ""


async def find_qrcode_img_from_canvas(page: Page, canvas_selector: str) -> str:
    """
    find qrcode image from canvas element
    Args:
        page: page that contains the canvas
        canvas_selector: selector of the canvas element

    Returns:
        base64-encoded screenshot of the canvas element

    """

    # Wait until the canvas element is present
    canvas = await page.wait_for_selector(canvas_selector)

    # Take a screenshot of just that element
    screenshot = await canvas.screenshot()

    # Encode the screenshot as base64 text
    base64_image = base64.b64encode(screenshot).decode('utf-8')
    return base64_image


def show_qrcode(qr_code) -> None:  # type: ignore
    """parse base64 encode qrcode image and show it"""
    # Drop a "data:...;base64," style prefix if there is one.
    if "," in qr_code:
        qr_code = qr_code.split(",")[1]
    qr_code = base64.b64decode(qr_code)
    image = Image.open(BytesIO(qr_code))

    # Add a square border around the QR code and display it within the border to improve scanning accuracy.
    width, height = image.size
    new_image = Image.new('RGB', (width + 20, height + 20), color=(255, 255, 255))
    new_image.paste(image, (10, 10))
    draw = ImageDraw.Draw(new_image)
    draw.rectangle((0, 0, width + 19, height + 19), outline=(0, 0, 0), width=1)
    # pop() with a default instead of ``del``: the option lives on the viewer
    # class, so after the first call the key is gone and ``del`` raised KeyError
    # on every later call.
    # NOTE(review): the reason for removing "save_all" is not visible here - confirm.
    ImageShow.UnixViewer.options.pop("save_all", None)
    new_image.show()


def get_user_agent() -> str:
    """Return one desktop Chrome user agent string picked at random."""
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.5112.79 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.5060.53 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.4844.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5112.79 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5060.53 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.4844.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5112.79 Safari/537.36"
    ]
    return random.choice(ua_list)
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
    """Parse a ``Cookie`` header style string into a name -> value dict.

    Args:
        cookie_str: Pairs of the form ``name=value`` separated by ``;``.

    Returns:
        A dict mapping each cookie name to its value. Empty segments and
        segments without any ``=`` are skipped; when a name repeats, the
        last occurrence wins. An empty or ``None`` input yields ``{}``.
    """
    cookie_dict: Dict[str, str] = dict()
    if not cookie_str:
        return cookie_dict
    for cookie in cookie_str.split(";"):
        cookie = cookie.strip()
        if not cookie:
            continue
        # Split on the FIRST "=" only: cookie values routinely contain "="
        # themselves (base64 padding, nested key=value payloads), and
        # splitting on every "=" used to discard such cookies entirely.
        name, separator, value = cookie.partition("=")
        if not separator:
            continue
        cookie_dict[name] = value
    return cookie_dict
def extract_text_from_html(html: str) -> str:
    """Extract the visible text from an HTML fragment.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents; every other tag is stripped while its inner text is kept.

    Args:
        html: HTML source. May be empty or ``None``.

    Returns:
        The remaining text with leading/trailing whitespace removed, or
        ``""`` when ``html`` is falsy.
    """
    if not html:
        return ""

    # Remove script and style elements *including their bodies*. The pattern
    # must be anchored on the matching closing tag (</\1>): a bare lazy
    # ".*?" at the end matches the empty string and would strip only the
    # opening tag, leaking script/CSS source into the extracted text.
    clean_html = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        '',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove all other tags
    clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
    return clean_text
def get_tracks(distance, seconds, ease_func) -> Tuple[List[int], List[int]]:
    """Sample an easing curve into per-step slider movements.

    The interval ``[0, seconds)`` is sampled every 0.1 s; at each sample the
    easing function is evaluated on the normalized time ``t / seconds`` and
    scaled by ``distance``.

    Args:
        distance: Total distance the slider has to travel.
        seconds: Duration of the movement.
        ease_func: Name of an easing function defined in this module
            (e.g. ``"ease_out_expo"``), or a callable taking the normalized
            time and returning the normalized progress.

    Returns:
        ``(offsets, tracks)``: ``offsets`` holds the rounded absolute
        position at each sample, ``tracks`` the delta from the previous
        position. Both lists start with a leading ``0``.

    Raises:
        KeyError: If ``ease_func`` is a name that is not defined in this
            module.
    """
    # Resolve the easing function once; it does not change between samples.
    ease = ease_func if callable(ease_func) else globals()[ease_func]
    tracks = [0]
    offsets = [0]
    for t in np.arange(0.0, seconds, 0.1):
        offset = round(ease(t / seconds) * distance)
        tracks.append(offset - offsets[-1])
        offsets.append(offset)
    return offsets, tracks
class Slide:
    """
    Locate the gap of a slider captcha by template-matching the slider piece
    against the background image.

    copy from https://blog.csdn.net/weixin_43582101 thanks for author
    update: relakkes
    """
    def __init__(self, gap, bg, gap_size=None, bg_size=None, out=None):
        """
        :param gap: path or URL of the gap (slider piece) image
        :param bg: path or URL of the background image that contains the gap
        :param gap_size: resize target for the gap image, defaults to (68, 68)
        :param bg_size: resize target for the background image, defaults to (340, 212)
        :param out: path of the annotated result image, defaults to
                    ``<cwd>/temp_image/out.jpg``
        """
        # Working directory for the downloaded / generated images.
        self.img_dir = os.path.join(os.getcwd(), 'temp_image')
        if not os.path.exists(self.img_dir):
            os.makedirs(self.img_dir)

        bg_resize = bg_size if bg_size else (340, 212)
        gap_size = gap_size if gap_size else (68, 68)
        self.bg = self.check_is_img_path(bg, 'bg', resize=bg_resize)
        self.gap = self.check_is_img_path(gap, 'gap', resize=gap_size)
        self.out = out if out else os.path.join(self.img_dir, 'out.jpg')

    @staticmethod
    def check_is_img_path(img, img_type, resize):
        """
        Return a local file path for ``img``.

        When ``img`` starts with ``http`` it is downloaded, decoded, resized
        to ``resize`` (if truthy) and written to ``./temp_image/<img_type>.jpg``;
        a non-200 response raises ``Exception``. Any other value is returned
        unchanged (and is not resized).
        """
        if img.startswith('http'):
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;"
                          "q=0.8,application/signed-exchange;v=b3;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7,ja;q=0.6",
                "AbstractCache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Host": urlparse(img).hostname,
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.164 Safari/537.36",
            }
            img_res = httpx.get(img, headers=headers)
            if img_res.status_code == 200:
                # NOTE(review): relative path; matches self.img_dir only while
                # the working directory is unchanged since __init__ — confirm.
                img_path = f'./temp_image/{img_type}.jpg'
                # Decode the raw response bytes into an image array.
                image = np.asarray(bytearray(img_res.content), dtype="uint8")
                image = cv2.imdecode(image, cv2.IMREAD_COLOR)
                if resize:
                    image = cv2.resize(image, dsize=resize)
                cv2.imwrite(img_path, image)
                return img_path
            else:
                raise Exception(f"保存{img_type}图片失败")
        else:
            return img

    @staticmethod
    def clear_white(img):
        """Crop away the blank area of an image (mainly the blank margin of the slider piece)."""
        img = cv2.imread(img)
        rows, cols, channel = img.shape
        # NOTE(review): the 255 starting values presumably assume an image no
        # larger than 255 px per side (the default gap size is 68x68) — confirm.
        min_x = 255
        min_y = 255
        max_x = 0
        max_y = 0
        # Here x is the row index and y the column index.
        for x in range(1, rows):
            for y in range(1, cols):
                # A pixel counts as content when its channel values are not
                # all identical.
                t = set(img[x, y])
                if len(t) >= 2:
                    if x <= min_x:
                        min_x = x
                    elif x >= max_x:
                        max_x = x

                    if y <= min_y:
                        min_y = y
                    elif y >= max_y:
                        max_y = y
        img1 = img[min_x:max_x, min_y: max_y]
        return img1

    def template_match(self, tpl, target):
        """
        Match ``tpl`` inside ``target``, write the annotated ``target`` to
        ``self.out`` and return the x coordinate of the best match.
        """
        th, tw = tpl.shape[:2]
        result = cv2.matchTemplate(target, tpl, cv2.TM_CCOEFF_NORMED)
        # Find the positions of the minimum and maximum values in the result matrix
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
        tl = max_loc
        br = (tl[0] + tw, tl[1] + th)
        # Draw a rectangle to mark the matched region
        # target: the image to draw on
        # tl: top-left corner of the rectangle
        # br: bottom-right corner (top-left plus template width/height)
        # (0,0,255): border color
        # 2: border thickness
        cv2.rectangle(target, tl, br, (0, 0, 255), 2)
        cv2.imwrite(self.out, target)
        return tl[0]

    @staticmethod
    def image_edge_detection(img):
        """Return the Canny edge map of ``img`` (thresholds 100 and 200)."""
        edges = cv2.Canny(img, 100, 200)
        return edges

    def discern(self):
        """
        Run the full pipeline: crop the slider piece, edge-detect both images,
        template-match them and return the x position of the gap.
        """
        img1 = self.clear_white(self.gap)
        img1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY)
        slide = self.image_edge_detection(img1)

        # NOTE(review): the second argument of cv2.imread is a read flag, but
        # COLOR_RGB2GRAY is a cvtColor conversion code — cv2.IMREAD_GRAYSCALE
        # was presumably intended; confirm before changing, since it alters
        # the input passed to Canny.
        back = cv2.imread(self.bg, cv2.COLOR_RGB2GRAY)
        back = self.image_edge_detection(back)

        slide_pic = cv2.cvtColor(slide, cv2.COLOR_GRAY2RGB)
        back_pic = cv2.cvtColor(back, cv2.COLOR_GRAY2RGB)
        x = self.template_match(slide_pic, back_pic)
        # Return the x coordinate, i.e. the position of the slider within the image
        return x
def get_time_str_from_unix_time(unixtime):
    """
    Unix integer timestamp ==> local date-time string.

    Values greater than 1000000000000 are treated as milliseconds and
    scaled down to seconds before formatting.
    :param unixtime: timestamp in seconds or milliseconds
    :return: local time formatted with '%Y-%m-%d %X'
    """
    is_millis = int(unixtime) > 1000000000000
    seconds = int(unixtime) / 1000 if is_millis else unixtime
    local_struct = time.localtime(seconds)
    return time.strftime('%Y-%m-%d %X', local_struct)
def rfc2822_to_timestamp(rfc2822_time):
    """Convert a time string with a UTC offset into a Unix timestamp.

    Args:
        rfc2822_time: Time text in the form ``"%a %b %d %H:%M:%S %z %Y"``,
            e.g. ``"Sat Dec 23 17:12:54 +0800 2023"``.

    Returns:
        Seconds since the Unix epoch as an ``int``.

    Raises:
        ValueError: If the text does not match the expected format.
    """
    # Expected input layout (weekday, month, day, time, UTC offset, year)
    rfc2822_format = "%a %b %d %H:%M:%S %z %Y"

    # %z is mandatory in the format, so the parsed datetime is always
    # timezone-aware and already carries the correct UTC offset.
    dt_object = datetime.strptime(rfc2822_time, rfc2822_format)

    # An aware datetime converts to an absolute timestamp directly.
    # Overwriting tzinfo with UTC (the previous approach) discarded the
    # parsed offset and shifted the result by that offset, e.g. 8 hours
    # for "+0800".
    return int(dt_object.timestamp())
def str2bool(v):
    """Interpret a command-line value as a boolean.

    Booleans are returned unchanged. Strings are compared case-insensitively:
    'yes', 'true', 't', 'y', '1' map to True and 'no', 'false', 'f', 'n', '0'
    map to False. Any other string raises ``argparse.ArgumentTypeError``.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    truthy = {'yes', 'true', 't', 'y', '1'}
    falsy = {'no', 'false', 'f', 'n', '0'}
    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
class AsyncWordCloudGenerator:
    """Build word-frequency files and word-cloud images from crawled text.

    Tokenization uses jieba, with the words from ``config.CUSTOM_WORDS``
    registered as custom words and the entries of ``config.STOP_WORDS_FILE``
    filtered out. Rendering is serialized through the module-level
    ``plot_lock``.
    """

    def __init__(self):
        logging.getLogger('jieba').setLevel(logging.WARNING)
        self.stop_words_file = config.STOP_WORDS_FILE
        self.lock = asyncio.Lock()
        self.stop_words = self.load_stop_words()
        self.custom_words = config.CUSTOM_WORDS
        # Only the keys (the words themselves) are registered with jieba.
        for word in self.custom_words:
            jieba.add_word(word)

    def load_stop_words(self):
        """Read the stop-word file (one word per line) into a set."""
        with open(self.stop_words_file, 'r', encoding='utf-8') as f:
            return set(f.read().strip().split('\n'))

    async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
        """Write ``<prefix>_word_freq.json`` and, if possible, the word cloud.

        Args:
            data: Iterable of dicts, each with a ``'content'`` text entry.
            save_words_prefix: Path prefix for the generated files.
        """
        all_text = ' '.join(item['content'] for item in data)
        words = [word for word in jieba.lcut(all_text) if word not in self.stop_words and len(word.strip()) > 0]
        word_freq = Counter(words)

        # Save word frequency to file
        freq_file = f"{save_words_prefix}_word_freq.json"
        async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))

        # Skip rendering instead of queueing when another render is running.
        if plot_lock.locked():
            utils.logger.info("Skipping word cloud generation as the lock is held.")
            return

        await self.generate_word_cloud(word_freq, save_words_prefix)

    async def generate_word_cloud(self, word_freq, save_words_prefix):
        """Render the 20 most frequent words to ``<prefix>_word_cloud.png``.

        Args:
            word_freq: Mapping of word -> occurrence count.
            save_words_prefix: Path prefix for the generated image.
        """
        # ``async with`` guarantees the lock is released even when rendering
        # raises; a bare acquire()/release() pair would leave the lock held
        # forever, and every later call would be skipped as "lock is held".
        async with plot_lock:
            top_20_word_freq = {word: freq for word, freq in
                                sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:20]}
            wordcloud = WordCloud(
                font_path=config.FONT_PATH,
                width=800,
                height=400,
                background_color='white',
                max_words=200,
                stopwords=self.stop_words,
                colormap='viridis',
                contour_color='steelblue',
                contour_width=1
            ).generate_from_frequencies(top_20_word_freq)

            # Save word cloud image
            plt.figure(figsize=(10, 5), facecolor='white')
            try:
                plt.imshow(wordcloud, interpolation='bilinear')

                plt.axis('off')
                plt.tight_layout(pad=0)
                plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
            finally:
                # Always close the figure so a failed save does not leak it.
                plt.close()
platform_machine != 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform != 'darwin' and sys_platform != 'linux')", +] + +[[package]] +name = "aiofiles" +version = "23.2.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/41/cfed10bc64d774f497a86e5ede9248e1d062db675504b41c320954d99641/aiofiles-23.2.1.tar.gz", hash = "sha256:84ec2218d8419404abcb9f0c02df3f34c6e0a68ed41072acfb1cef5cbc29051a", size = 32072 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/19/5af6804c4cc0fed83f47bff6e413a98a36618e7d40185cd36e69737f3b0e/aiofiles-23.2.1-py3-none-any.whl", hash = "sha256:19297512c647d4b27a2cf7c34caa7e405c0d60b5560618a29a9fe027b18b0107", size = 15727 }, +] + +[[package]] +name = "aiomysql" +version = "0.2.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pymysql" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/76/2c5b55e4406a1957ffdfd933a94c2517455291c97d2b81cec6813754791a/aiomysql-0.2.0.tar.gz", hash = "sha256:558b9c26d580d08b8c5fd1be23c5231ce3aeff2dadad989540fee740253deb67", size = 114706 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/42/87/c982ee8b333c85b8ae16306387d703a1fcdfc81a2f3f15a24820ab1a512d/aiomysql-0.2.0-py3-none-any.whl", hash = "sha256:b7c26da0daf23a5ec5e0b133c03d20657276e4eae9b73e040b72787f6f6ade0a", size = 44215 }, +] + +[[package]] +name = "aiosqlite" +version = "0.21.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/7d/8bca2bf9a247c2c5dfeec1d7a5f40db6518f88d314b8bca9da29670d2671/aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3", size = 
13454 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792 }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + +[[package]] +name = "anyio" +version = "4.9.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916 }, +] + +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, +] + +[[package]] +name = "certifi" +version = "2025.6.15" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650 }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/95/28/9901804da60055b406e1a1c5ba7aac1276fb77f1dde635aabfc7fd84b8ab/charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941", size = 201818 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/d9/9b/892a8c8af9110935e5adcbb06d9c6fe741b6bb02608c6513983048ba1a18/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd", size = 144649 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7b/a5/4179abd063ff6414223575e008593861d62abfc22455b5d1a44995b7c101/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6", size = 155045 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/95/bc08c7dfeddd26b4be8c8287b9bb055716f31077c8b0ea1cd09553794665/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d", size = 147356 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/2d/7a5b635aa65284bf3eab7653e8b4151ab420ecbae918d3e359d1947b4d61/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86", size = 149471 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/38/51fc6ac74251fd331a8cfdb7ec57beba8c23fd5493f1050f71c87ef77ed0/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c", size = 151317 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/17/edee1e32215ee6e9e46c3e482645b46575a44a2d72c7dfd49e49f60ce6bf/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0", size = 146368 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/26/2c/ea3e66f2b5f21fd00b2825c94cafb8c326ea6240cd80a91eb09e4a285830/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef", size = 154491 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/47/7be7fa972422ad062e909fd62460d45c3ef4c141805b7078dbab15904ff7/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6", size = 157695 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2f/42/9f02c194da282b2b340f28e5fb60762de1151387a36842a92b533685c61e/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366", size = 154849 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/44/89cacd6628f31fb0b63201a618049be4be2a7435a31b55b5eb1c3674547a/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db", size = 150091 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1f/79/4b8da9f712bc079c0f16b6d67b099b0b8d808c2292c937f267d816ec5ecc/charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a", size = 98445 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7d/d7/96970afb4fb66497a40761cdf7bd4f6fca0fc7bafde3a84f836c1f57a926/charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509", size = 105782 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/2d/fb55fdf41964ec782febbf33cb64be480a6b8f16ded2dbe8db27a405c09f/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214", size = 146626 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/73/6ede2ec59bce19b3edf4209d70004253ec5f4e319f9a2e3f2f15601ed5f7/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a", size = 148567 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/14/957d03c6dc343c04904530b6bef4e5efae5ec7d7990a7cbb868e4595ee30/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd", size = 150957 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0d/c8/8174d0e5c10ccebdcb1b53cc959591c4c722a3ad92461a273e86b9f5a302/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981", size = 145408 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/58/aa/8904b84bc8084ac19dc52feb4f5952c6df03ffb460a887b42615ee1382e8/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c", size = 153399 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c2/26/89ee1f0e264d201cb65cf054aca6038c03b1a0c6b4ae998070392a3ce605/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b", size = 156815 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/07/68e95b4b345bad3dbbd3a8681737b4338ff2c9df29856a6d6d23ac4c73cb/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d", size = 154537 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/1a/5eefc0ce04affb98af07bc05f3bac9094513c0e23b0562d64af46a06aae4/charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f", size = 149565 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/37/a0/2410e5e6032a174c95e0806b1a6585eb21e12f445ebe239fac441995226a/charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c", size = 98357 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/4f/c02d5c493967af3eda9c771ad4d2bbc8df6f99ddbeb37ceea6e8716a32bc/charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e", size = 105776 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/f8/dfb01ff6cc9af38552c69c9027501ff5a5117c4cc18dcd27cb5259fa1888/charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4", size = 201671 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/32/fb/74e26ee556a9dbfe3bd264289b67be1e6d616329403036f6507bb9f3f29c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7", size = 144744 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ad/06/8499ee5aa7addc6f6d72e068691826ff093329fe59891e83b092ae4c851c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836", size = 154993 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/a2/5e4c187680728219254ef107a6949c60ee0e9a916a5dadb148c7ae82459c/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597", size = 147382 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4c/fe/56aca740dda674f0cc1ba1418c4d84534be51f639b5f98f538b332dc9a95/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7", size = 149536 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/53/13/db2e7779f892386b589173dd689c1b1e304621c5792046edd8a978cbf9e0/charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f", size = 151349 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/69/35/e52ab9a276186f729bce7a0638585d2982f50402046e4b0faa5d2c3ef2da/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba", size = 146365 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/a6/d8/af7333f732fc2e7635867d56cb7c349c28c7094910c72267586947561b4b/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12", size = 154499 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/3d/a5b2e48acef264d71e036ff30bcc49e51bde80219bb628ba3e00cf59baac/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518", size = 157735 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/d8/23e2c112532a29f3eef374375a8684a4f3b8e784f62b01da931186f43494/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5", size = 154786 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/57/93e0169f08ecc20fe82d12254a200dfaceddc1c12a4077bf454ecc597e33/charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3", size = 150203 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/9d/9bf2b005138e7e060d7ebdec7503d0ef3240141587651f4b445bdf7286c2/charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = "sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471", size = 98436 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6d/24/5849d46cf4311bbf21b424c443b09b459f5b436b1558c04e45dbb7cc478b/charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e", size = 105772 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626 }, +] + +[[package]] +name = "click" +version = "8.1.8" +source = { registry = 
"https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version < '3.10' and platform_machine == 'arm64' and sys_platform == 'darwin'", + "python_full_version < '3.10' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, +] + +[[package]] +name = "click" +version = "8.2.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' 
and sys_platform != 'linux')", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.10.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "contourpy" +version = "1.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version < '3.10' and platform_machine == 'arm64' and sys_platform == 'darwin'", + "python_full_version < '3.10' and platform_machine == 'aarch64' and sys_platform == 
'linux'", + "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/f6/31a8f28b4a2a4fa0e01085e542f3081ab0588eff8e589d39d775172c9792/contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4", size = 13464370 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/e0/be8dcc796cfdd96708933e0e2da99ba4bb8f9b2caa9d560a50f3f09a65f3/contourpy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:880ea32e5c774634f9fcd46504bf9f080a41ad855f4fef54f5380f5133d343c7", size = 265366 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/d6/c953b400219443535d412fcbbc42e7a5e823291236bc0bb88936e3cc9317/contourpy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:76c905ef940a4474a6289c71d53122a4f77766eef23c03cd57016ce19d0f7b42", size = 249226 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6f/b4/6fffdf213ffccc28483c524b9dad46bb78332851133b36ad354b856ddc7c/contourpy-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92f8557cbb07415a4d6fa191f20fd9d2d9eb9c0b61d1b2f52a8926e43c6e9af7", size = 308460 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cf/6c/118fc917b4050f0afe07179a6dcbe4f3f4ec69b94f36c9e128c4af480fb8/contourpy-1.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36f965570cff02b874773c49bfe85562b47030805d7d8360748f3eca570f4cab", size = 347623 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f9/a4/30ff110a81bfe3abf7b9673284d21ddce8cc1278f6f77393c91199da4c90/contourpy-1.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacd81e2d4b6f89c9f8a5b69b86490152ff39afc58a95af002a398273e5ce589", size = 317761 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/e6/d11966962b1aa515f5586d3907ad019f4b812c04e4546cc19ebf62b5178e/contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69375194457ad0fad3a839b9e29aa0b0ed53bb54db1bfb6c3ae43d111c31ce41", size = 322015 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/e3/182383743751d22b7b59c3c753277b6aee3637049197624f333dac5b4c80/contourpy-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a52040312b1a858b5e31ef28c2e865376a386c60c0e248370bbea2d3f3b760d", size = 1262672 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/53/974400c815b2e605f252c8fb9297e2204347d1755a5374354ee77b1ea259/contourpy-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3faeb2998e4fcb256542e8a926d08da08977f7f5e62cf733f3c211c2a5586223", size = 1321688 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/29/99f849faed5593b2926a68a31882af98afbeac39c7fdf7de491d9c85ec6a/contourpy-1.3.0-cp310-cp310-win32.whl", hash = "sha256:36e0cff201bcb17a0a8ecc7f454fe078437fa6bda730e695a92f2d9932bd507f", size = 171145 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/97/3f89bba79ff6ff2b07a3cbc40aa693c360d5efa90d66e914f0ff03b95ec7/contourpy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:87ddffef1dbe5e669b5c2440b643d3fdd8622a348fe1983fad7a0f0ccb1cd67b", size = 216019 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/1f/9375917786cb39270b0ee6634536c0e22abf225825602688990d8f5c6c19/contourpy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fa4c02abe6c446ba70d96ece336e621efa4aecae43eaa9b030ae5fb92b309ad", size = 266356 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/05/46/9256dd162ea52790c127cb58cfc3b9e3413a6e3478917d1f811d420772ec/contourpy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:834e0cfe17ba12f79963861e0f908556b2cedd52e1f75e6578801febcc6a9f49", size = 250915 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/5d/3056c167fa4486900dfbd7e26a2fdc2338dc58eee36d490a0ed3ddda5ded/contourpy-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbc4c3217eee163fa3984fd1567632b48d6dfd29216da3ded3d7b844a8014a66", size = 310443 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/c2/1a612e475492e07f11c8e267ea5ec1ce0d89971be496c195e27afa97e14a/contourpy-1.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4865cd1d419e0c7a7bf6de1777b185eebdc51470800a9f42b9e9decf17762081", size = 348548 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/45/cf/2c2fc6bb5874158277b4faf136847f0689e1b1a1f640a36d76d52e78907c/contourpy-1.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:303c252947ab4b14c08afeb52375b26781ccd6a5ccd81abcdfc1fafd14cf93c1", size = 319118 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/03/33/003065374f38894cdf1040cef474ad0546368eea7e3a51d48b8a423961f8/contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637f674226be46f6ba372fd29d9523dd977a291f66ab2a74fbeb5530bb3f445d", size = 323162 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/42/80/e637326e85e4105a802e42959f56cff2cd39a6b5ef68d5d9aee3ea5f0e4c/contourpy-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76a896b2f195b57db25d6b44e7e03f221d32fe318d03ede41f8b4d9ba1bff53c", size = 1265396 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/3b/8cbd6416ca1bbc0202b50f9c13b2e0b922b64be888f9d9ee88e6cfabfb51/contourpy-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e1fd23e9d01591bab45546c089ae89d926917a66dceb3abcf01f6105d927e2cb", size = 
1324297 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/2c/021a7afaa52fe891f25535506cc861c30c3c4e5a1c1ce94215e04b293e72/contourpy-1.3.0-cp311-cp311-win32.whl", hash = "sha256:d402880b84df3bec6eab53cd0cf802cae6a2ef9537e70cf75e91618a3801c20c", size = 171808 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/2f/804f02ff30a7fae21f98198828d0857439ec4c91a96e20cf2d6c49372966/contourpy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:6cb6cc968059db9c62cb35fbf70248f40994dfcd7aa10444bbf8b3faeb7c2d67", size = 217181 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c9/92/8e0bbfe6b70c0e2d3d81272b58c98ac69ff1a4329f18c73bd64824d8b12e/contourpy-1.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:570ef7cf892f0afbe5b2ee410c507ce12e15a5fa91017a0009f79f7d93a1268f", size = 267838 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/04/33351c5d5108460a8ce6d512307690b023f0cfcad5899499f5c83b9d63b1/contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:da84c537cb8b97d153e9fb208c221c45605f73147bd4cadd23bdae915042aad6", size = 251549 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/51/3d/aa0fe6ae67e3ef9f178389e4caaaa68daf2f9024092aa3c6032e3d174670/contourpy-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0be4d8425bfa755e0fd76ee1e019636ccc7c29f77a7c86b4328a9eb6a26d0639", size = 303177 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/56/c3/c85a7e3e0cab635575d3b657f9535443a6f5d20fac1a1911eaa4bbe1aceb/contourpy-1.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c0da700bf58f6e0b65312d0a5e695179a71d0163957fa381bb3c1f72972537c", size = 341735 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/8d/20f7a211a7be966a53f474bc90b1a8202e9844b3f1ef85f3ae45a77151ee/contourpy-1.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb8b141bb00fa977d9122636b16aa67d37fd40a3d8b52dd837e536d64b9a4d06", size = 314679 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/6e/be/524e377567defac0e21a46e2a529652d165fed130a0d8a863219303cee18/contourpy-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3634b5385c6716c258d0419c46d05c8aa7dc8cb70326c9a4fb66b69ad2b52e09", size = 320549 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/96/fdb2552a172942d888915f3a6663812e9bc3d359d53dafd4289a0fb462f0/contourpy-1.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dce35502151b6bd35027ac39ba6e5a44be13a68f55735c3612c568cac3805fd", size = 1263068 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/25/632eab595e3140adfa92f1322bf8915f68c932bac468e89eae9974cf1c00/contourpy-1.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea348f053c645100612b333adc5983d87be69acdc6d77d3169c090d3b01dc35", size = 1322833 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/e3/69738782e315a1d26d29d71a550dbbe3eb6c653b028b150f70c1a5f4f229/contourpy-1.3.0-cp312-cp312-win32.whl", hash = "sha256:90f73a5116ad1ba7174341ef3ea5c3150ddf20b024b98fb0c3b29034752c8aeb", size = 172681 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0c/89/9830ba00d88e43d15e53d64931e66b8792b46eb25e2050a88fec4a0df3d5/contourpy-1.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b11b39aea6be6764f84360fce6c82211a9db32a7c7de8fa6dd5397cf1d079c3b", size = 218283 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/53/a1/d20415febfb2267af2d7f06338e82171824d08614084714fb2c1dac9901f/contourpy-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3e1c7fa44aaae40a2247e2e8e0627f4bea3dd257014764aa644f319a5f8600e3", size = 267879 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/aa/45/5a28a3570ff6218d8bdfc291a272a20d2648104815f01f0177d103d985e1/contourpy-1.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:364174c2a76057feef647c802652f00953b575723062560498dc7930fc9b1cb7", size = 251573 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/39/1c/d3f51540108e3affa84f095c8b04f0aa833bb797bc8baa218a952a98117d/contourpy-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32b238b3b3b649e09ce9aaf51f0c261d38644bdfa35cbaf7b263457850957a84", size = 303184 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/00/56/1348a44fb6c3a558c1a3a0cd23d329d604c99d81bf5a4b58c6b71aab328f/contourpy-1.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d51fca85f9f7ad0b65b4b9fe800406d0d77017d7270d31ec3fb1cc07358fdea0", size = 340262 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/23/00d665ba67e1bb666152131da07e0f24c95c3632d7722caa97fb61470eca/contourpy-1.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:732896af21716b29ab3e988d4ce14bc5133733b85956316fb0c56355f398099b", size = 313806 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/42/3cf40f7040bb8362aea19af9a5fb7b32ce420f645dd1590edcee2c657cd5/contourpy-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d73f659398a0904e125280836ae6f88ba9b178b2fed6884f3b1f95b989d2c8da", size = 319710 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/32/f3bfa3fc083b25e1a7ae09197f897476ee68e7386e10404bdf9aac7391f0/contourpy-1.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c6c7c2408b7048082932cf4e641fa3b8ca848259212f51c8c59c45aa7ac18f14", size = 1264107 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1c/1e/1019d34473a736664f2439542b890b2dc4c6245f5c0d8cdfc0ccc2cab80c/contourpy-1.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f317576606de89da6b7e0861cf6061f6146ead3528acabff9236458a6ba467f8", size = 1322458 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/85/4f8bfd83972cf8909a4d36d16b177f7b8bdd942178ea4bf877d4a380a91c/contourpy-1.3.0-cp313-cp313-win32.whl", hash = "sha256:31cd3a85dbdf1fc002280c65caa7e2b5f65e4a973fcdf70dd2fdcb9868069294", size = 172643 }, + { 
url = "https://pypi.tuna.tsinghua.edu.cn/packages/cc/4a/fb3c83c1baba64ba90443626c228ca14f19a87c51975d3b1de308dd2cf08/contourpy-1.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:4553c421929ec95fb07b3aaca0fae668b2eb5a5203d1217ca7c34c063c53d087", size = 218301 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/65/702f4064f397821fea0cb493f7d3bc95a5d703e20954dce7d6d39bacf378/contourpy-1.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:345af746d7766821d05d72cb8f3845dfd08dd137101a2cb9b24de277d716def8", size = 278972 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/85/21f5bba56dba75c10a45ec00ad3b8190dbac7fd9a8a8c46c6116c933e9cf/contourpy-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3bb3808858a9dc68f6f03d319acd5f1b8a337e6cdda197f02f4b8ff67ad2057b", size = 263375 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0a/64/084c86ab71d43149f91ab3a4054ccf18565f0a8af36abfa92b1467813ed6/contourpy-1.3.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:420d39daa61aab1221567b42eecb01112908b2cab7f1b4106a52caaec8d36973", size = 307188 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/ff/d61a4c288dc42da0084b8d9dc2aa219a850767165d7d9a9c364ff530b509/contourpy-1.3.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d63ee447261e963af02642ffcb864e5a2ee4cbfd78080657a9880b8b1868e18", size = 345644 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/aa/00d2313d35ec03f188e8f0786c2fc61f589306e02fdc158233697546fd58/contourpy-1.3.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:167d6c890815e1dac9536dca00828b445d5d0df4d6a8c6adb4a7ec3166812fa8", size = 317141 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/6a/b5242c8cb32d87f6abf4f5e3044ca397cb1a76712e3fa2424772e3ff495f/contourpy-1.3.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:710a26b3dc80c0e4febf04555de66f5fd17e9cf7170a7b08000601a10570bda6", size = 323469 
}, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6f/a6/73e929d43028a9079aca4bde107494864d54f0d72d9db508a51ff0878593/contourpy-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:75ee7cb1a14c617f34a51d11fa7524173e56551646828353c4af859c56b766e2", size = 1260894 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/1e/1e726ba66eddf21c940821df8cf1a7d15cb165f0682d62161eaa5e93dae1/contourpy-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:33c92cdae89ec5135d036e7218e69b0bb2851206077251f04a6c4e0e21f03927", size = 1314829 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/e3/b9f72758adb6ef7397327ceb8b9c39c75711affb220e4f53c745ea1d5a9a/contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a11077e395f67ffc2c44ec2418cfebed032cd6da3022a94fc227b6faf8e2acb8", size = 265518 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/22/19f5b948367ab5260fb41d842c7a78dae645603881ea6bc39738bcfcabf6/contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8134301d7e204c88ed7ab50028ba06c683000040ede1d617298611f9dc6240c", size = 249350 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/76/0c7d43263dd00ae21a91a24381b7e813d286a3294d95d179ef3a7b9fb1d7/contourpy-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e12968fdfd5bb45ffdf6192a590bd8ddd3ba9e58360b29683c6bb71a7b41edca", size = 309167 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/96/3b/cadff6773e89f2a5a492c1a8068e21d3fccaf1a1c1df7d65e7c8e3ef60ba/contourpy-1.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd2a0fc506eccaaa7595b7e1418951f213cf8255be2600f1ea1b61e46a60c55f", size = 348279 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/86/158cc43aa549d2081a955ab11c6bdccc7a22caacc2af93186d26f5f48746/contourpy-1.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfb5c62ce023dfc410d6059c936dcf96442ba40814aefbfa575425a3a7f19dc", size = 318519 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/05/11/57335544a3027e9b96a05948c32e566328e3a2f84b7b99a325b7a06d2b06/contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68a32389b06b82c2fdd68276148d7b9275b5f5cf13e5417e4252f6d1a34f72a2", size = 321922 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/e3/02114f96543f4a1b694333b92a6dcd4f8eebbefcc3a5f3bbb1316634178f/contourpy-1.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94e848a6b83da10898cbf1311a815f770acc9b6a3f2d646f330d57eb4e87592e", size = 1258017 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f3/3b/bfe4c81c6d5881c1c643dde6620be0b42bf8aab155976dd644595cfab95c/contourpy-1.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d78ab28a03c854a873787a0a42254a0ccb3cb133c672f645c9f9c8f3ae9d0800", size = 1316773 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/17/c52d2970784383cafb0bd918b6fb036d98d96bbf0bc1befb5d1e31a07a70/contourpy-1.3.0-cp39-cp39-win32.whl", hash = "sha256:81cb5ed4952aae6014bc9d0421dec7c5835c9c8c31cdf51910b708f548cf58e5", size = 171353 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/53/23/db9f69676308e094d3c45f20cc52e12d10d64f027541c995d89c11ad5c75/contourpy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:14e262f67bd7e6eb6880bc564dcda30b15e351a594657e55b7eec94b6ef72843", size = 211817 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/09/60e486dc2b64c94ed33e58dcfb6f808192c03dfc5574c016218b9b7680dc/contourpy-1.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fe41b41505a5a33aeaed2a613dccaeaa74e0e3ead6dd6fd3a118fb471644fd6c", size = 261886 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/20/b57f9f7174fcd439a7789fb47d764974ab646fa34d1790551de386457a8e/contourpy-1.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca7e17a65f72a5133bdbec9ecf22401c62bcf4821361ef7811faee695799779", size = 311008 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/74/fc/5040d42623a1845d4f17a418e590fd7a79ae8cb2bad2b2f83de63c3bdca4/contourpy-1.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1ec4dc6bf570f5b22ed0d7efba0dfa9c5b9e0431aeea7581aa217542d9e809a4", size = 215690 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/24/dc3dcd77ac7460ab7e9d2b01a618cb31406902e50e605a8d6091f0a8f7cc/contourpy-1.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:00ccd0dbaad6d804ab259820fa7cb0b8036bda0686ef844d24125d8287178ce0", size = 261894 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b1/db/531642a01cfec39d1682e46b5457b07cf805e3c3c584ec27e2a6223f8f6c/contourpy-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca947601224119117f7c19c9cdf6b3ab54c5726ef1d906aa4a69dfb6dd58102", size = 311099 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/38/1e/94bda024d629f254143a134eead69e21c836429a2a6ce82209a00ddcb79a/contourpy-1.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6ec93afeb848a0845a18989da3beca3eec2c0f852322efe21af1931147d12cb", size = 215838 }, +] + +[[package]] +name = "contourpy" +version = "1.3.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == 
'3.10.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.10.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/a3/da4153ec8fe25d263aa48c1a4cbde7f49b59af86f0b6f7862788c60da737/contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934", size = 268551 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2f/6c/330de89ae1087eb622bfca0177d32a7ece50c3ef07b28002de4757d9d875/contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989", size = 253399 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/bd/20c6726b1b7f81a8bee5271bed5c165f0a8e1f572578a9d27e2ccb763cb2/contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d", size = 312061 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/fc/a9665c88f8a2473f823cf1ec601de9e5375050f1958cbb356cdf06ef1ab6/contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9", size = 351956 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/25/eb/9f0a0238f305ad8fb7ef42481020d6e20cf15e46be99a1fcf939546a177e/contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512", size = 320872 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/5c/1ee32d1c7956923202f00cf8d2a14a62ed7517bdc0ee1e55301227fc273c/contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631", size = 325027 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/83/bf/9baed89785ba743ef329c2b07fd0611d12bfecbedbdd3eeecf929d8d3b52/contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f", size = 1306641 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/cc/74e5e83d1e35de2d28bd97033426b450bc4fd96e092a1f7a63dc7369b55d/contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2", size = 1374075 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0c/42/17f3b798fd5e033b46a16f8d9fcb39f1aba051307f5ebf441bad1ecf78f8/contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0", size = 177534 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/ec/5162b8582f2c994721018d0c9ece9dc6ff769d298a8ac6b6a652c307e7df/contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a", size = 221188 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/b9/ede788a0b56fc5b071639d06c33cb893f68b1178938f3425debebe2dab78/contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445", size = 269636 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e6/75/3469f011d64b8bbfa04f709bfc23e1dd71be54d05b1b083be9f5b22750d1/contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773", size = 254636 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/2f/95adb8dae08ce0ebca4fd8e7ad653159565d9739128b2d5977806656fcd2/contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1", size = 313053 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c3/a6/8ccf97a50f31adfa36917707fe39c9a0cbc24b3bbb58185577f119736cc9/contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43", size = 352985 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/b6/7925ab9b77386143f39d9c3243fdd101621b4532eb126743201160ffa7e6/contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab", size = 323750 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c2/f3/20c5d1ef4f4748e52d60771b8560cf00b69d5c6368b5c2e9311bcfa2a08b/contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7", size = 326246 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/e5/9dae809e7e0b2d9d70c52b3d24cba134dd3dad979eb3e5e71f5df22ed1f5/contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83", size = 1308728 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/e2/4a/0058ba34aeea35c0b442ae61a4f4d4ca84d6df8f91309bc2d43bb8dd248f/contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd", size = 1375762 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/33/7174bdfc8b7767ef2c08ed81244762d93d5c579336fc0b51ca57b33d1b80/contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f", size = 178196 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5e/fe/4029038b4e1c4485cef18e480b0e2cd2d755448bb071eb9977caac80b77b/contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878", size = 222017 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/f7/44785876384eff370c251d58fd65f6ad7f39adce4a093c934d4a67a7c6b6/contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2", size = 271580 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/93/3b/0004767622a9826ea3d95f0e9d98cd8729015768075d61f9fea8eeca42a8/contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15", size = 255530 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/bb/7bd49e1f4fa805772d9fd130e0d375554ebc771ed7172f48dfcd4ca61549/contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92", size = 307688 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/97/e1d5dbbfa170725ef78357a9a0edc996b09ae4af170927ba8ce977e60a5f/contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87", size = 347331 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/6f/66/e69e6e904f5ecf6901be3dd16e7e54d41b6ec6ae3405a535286d4418ffb4/contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415", size = 318963 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/32/b8a1c8965e4f72482ff2d1ac2cd670ce0b542f203c8e1d34e7c3e6925da7/contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe", size = 323681 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/c6/12a7e6811d08757c7162a541ca4c5c6a34c0f4e98ef2b338791093518e40/contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441", size = 1308674 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/8a/bebe5a3f68b484d3a2b8ffaf84704b3e343ef1addea528132ef148e22b3b/contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e", size = 1380480 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/db/fcd325f19b5978fb509a7d55e06d99f5f856294c1991097534360b307cf1/contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912", size = 178489 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/c8/fadd0b92ffa7b5eb5949bf340a63a4a496a6930a6c37a7ba0f12acb076d6/contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73", size = 223042 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/61/5673f7e364b31e4e7ef6f61a4b5121c5f170f941895912f773d95270f3a2/contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb", size = 271630 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ff/66/a40badddd1223822c95798c55292844b7e871e50f6bfd9f158cb25e0bd39/contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08", size = 255670 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1e/c7/cf9fdee8200805c9bc3b148f49cb9482a4e3ea2719e772602a425c9b09f8/contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c", size = 306694 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/e7/ccb9bec80e1ba121efbffad7f38021021cda5be87532ec16fd96533bb2e0/contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f", size = 345986 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/49/ca13bb2da90391fa4219fdb23b078d6065ada886658ac7818e5441448b78/contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85", size = 318060 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c8/65/5245ce8c548a8422236c13ffcdcdada6a2a812c361e9e0c70548bb40b661/contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841", size = 322747 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/72/30/669b8eb48e0a01c660ead3752a25b44fdb2e5ebc13a55782f639170772f9/contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422", size = 1308895 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/5a/b569f4250decee6e8d54498be7bdf29021a4c256e77fe8138c8319ef8eb3/contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef", size = 
1379098 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/ba/b227c3886d120e60e41b28740ac3617b2f2b971b9f601c835661194579f1/contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f", size = 178535 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/6e/2fed56cd47ca739b43e892707ae9a13790a486a3173be063681ca67d2262/contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9", size = 223096 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/4c/e76fe2a03014a7c767d79ea35c86a747e9325537a8b7627e0e5b3ba266b4/contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f", size = 285090 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7b/e2/5aba47debd55d668e00baf9651b721e7733975dc9fc27264a62b0dd26eb8/contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739", size = 268643 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/37/cd45f1f051fe6230f751cc5cdd2728bb3a203f5619510ef11e732109593c/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823", size = 310443 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/a2/36ea6140c306c9ff6dd38e3bcec80b3b018474ef4d17eb68ceecd26675f4/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5", size = 349865 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/95/b7/2fc76bc539693180488f7b6cc518da7acbbb9e3b931fd9280504128bf956/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532", size = 321162 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f4/10/76d4f778458b0aa83f96e59d65ece72a060bacb20cfbee46cf6cd5ceba41/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b", size = 327355 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/43/a3/10cf483ea683f9f8ab096c24bad3cce20e0d1dd9a4baa0e2093c1c962d9d/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52", size = 1307935 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/73/69dd9a024444489e22d86108e7b913f3528f56cfc312b5c5727a44188471/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd", size = 1372168 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/1b/96d586ccf1b1a9d2004dd519b25fbf104a11589abfd05484ff12199cca21/contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1", size = 189550 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/e6/6000d0094e8a5e32ad62591c8609e269febb6e4db83a1c75ff8868b42731/contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69", size = 238214 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/33/05/b26e3c6ecc05f349ee0013f0bb850a761016d89cec528a98193a48c34033/contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c", size = 265681 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/25/ac07d6ad12affa7d1ffed11b77417d0a6308170f44ff20fa1d5aa6333f03/contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16", size = 315101 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/8f/4d/5bb3192bbe9d3f27e3061a6a8e7733c9120e203cb8515767d30973f71030/contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad", size = 220599 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/c0/91f1215d0d9f9f343e4773ba6c9b89e8c0cc7a64a6263f21139da639d848/contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0", size = 266807 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/79/6be7e90c955c0487e7712660d6cead01fa17bff98e0ea275737cc2bc8e71/contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5", size = 318729 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791 }, +] + +[[package]] +name = "cssselect" +version = "1.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 }, +] + +[[package]] +name = "cycler" +version = "0.12.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30" }, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674 }, +] + +[[package]] +name = "fastapi" +version = "0.110.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/50/59805603199f4f18b1ba73de2d146b568cd83d68e807cad6a4746ce84ea9/fastapi-0.110.2.tar.gz", hash = "sha256:b53d673652da3b65e8cd787ad214ec0fe303cad00d2b529b86ce7db13f17518d", size = 11992951 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ad/0f/feb7fd8957714498fc4a6be7f13408869619f868f418698a2d934afa82a7/fastapi-0.110.2-py3-none-any.whl", hash = "sha256:239403f2c0a3dda07a9420f95157a7f014ddb2b770acdbc984f9bdf3ead7afdb", size = 91870 }, +] + +[[package]] +name = 
"fonttools" +version = "4.58.4" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/5a/1124b2c8cb3a8015faf552e92714040bcdbc145dfa29928891b02d147a18/fonttools-4.58.4.tar.gz", hash = "sha256:928a8009b9884ed3aae17724b960987575155ca23c6f0b8146e400cc9e0d44ba", size = 3525026 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ed/86/d22c24caa574449b56e994ed1a96d23b23af85557fb62a92df96439d3f6c/fonttools-4.58.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:834542f13fee7625ad753b2db035edb674b07522fcbdd0ed9e9a9e2a1034467f", size = 2748349 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/b8/384aca93856def00e7de30341f1e27f439694857d82c35d74a809c705ed0/fonttools-4.58.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e6c61ce330142525296170cd65666e46121fc0d44383cbbcfa39cf8f58383df", size = 2318565 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/f2/273edfdc8d9db89ecfbbf659bd894f7e07b6d53448b19837a4bdba148d17/fonttools-4.58.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9c75f8faa29579c0fbf29b56ae6a3660c6c025f3b671803cb6a9caa7e4e3a98", size = 4838855 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/fa/403703548c093c30b52ab37e109b369558afa221130e67f06bef7513f28a/fonttools-4.58.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:88dedcedbd5549e35b2ea3db3de02579c27e62e51af56779c021e7b33caadd0e", size = 4767637 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/a8/3380e1e0bff6defb0f81c9abf274a5b4a0f30bc8cab4fd4e346c6f923b4c/fonttools-4.58.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ae80a895adab43586f4da1521d58fd4f4377cef322ee0cc205abcefa3a5effc3", size = 4819397 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cd/1b/99e47eb17a8ca51d808622a4658584fa8f340857438a4e9d7ac326d4a041/fonttools-4.58.4-cp310-cp310-musllinux_1_2_x86_64.whl", 
hash = "sha256:0d3acc7f0d151da116e87a182aefb569cf0a3c8e0fd4c9cd0a7c1e7d3e7adb26", size = 4926641 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/75/415254408f038e35b36c8525fc31feb8561f98445688dd2267c23eafd7a2/fonttools-4.58.4-cp310-cp310-win32.whl", hash = "sha256:1244f69686008e7e8d2581d9f37eef330a73fee3843f1107993eb82c9d306577", size = 2201917 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/69/f019a15ed2946317c5318e1bcc8876f8a54a313848604ad1d4cfc4c07916/fonttools-4.58.4-cp310-cp310-win_amd64.whl", hash = "sha256:2a66c0af8a01eb2b78645af60f3b787de5fe5eb1fd8348163715b80bdbfbde1f", size = 2246327 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/7b/cc6e9bb41bab223bd2dc70ba0b21386b85f604e27f4c3206b4205085a2ab/fonttools-4.58.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3841991c9ee2dc0562eb7f23d333d34ce81e8e27c903846f0487da21e0028eb", size = 2768901 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/15/98d75df9f2b4e7605f3260359ad6e18e027c11fa549f74fce567270ac891/fonttools-4.58.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3c98f91b6a9604e7ffb5ece6ea346fa617f967c2c0944228801246ed56084664", size = 2328696 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/c8/dc92b80f5452c9c40164e01b3f78f04b835a00e673bd9355ca257008ff61/fonttools-4.58.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ab9f891eb687ddf6a4e5f82901e00f992e18012ca97ab7acd15f13632acd14c1", size = 5018830 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/48/8322cf177680505d6b0b6062e204f01860cb573466a88077a9b795cb70e8/fonttools-4.58.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:891c5771e8f0094b7c0dc90eda8fc75e72930b32581418f2c285a9feedfd9a68", size = 4960922 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/14/e0/2aff149ed7eb0916de36da513d473c6fff574a7146891ce42de914899395/fonttools-4.58.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:43ba4d9646045c375d22e3473b7d82b18b31ee2ac715cd94220ffab7bc2d5c1d", size = 4997135 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e6/6f/4d9829b29a64a2e63a121cb11ecb1b6a9524086eef3e35470949837a1692/fonttools-4.58.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33d19f16e6d2ffd6669bda574a6589941f6c99a8d5cfb9f464038244c71555de", size = 5108701 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6f/1e/2d656ddd1b0cd0d222f44b2d008052c2689e66b702b9af1cd8903ddce319/fonttools-4.58.4-cp311-cp311-win32.whl", hash = "sha256:b59e5109b907da19dc9df1287454821a34a75f2632a491dd406e46ff432c2a24", size = 2200177 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fb/83/ba71ad053fddf4157cb0697c8da8eff6718d059f2a22986fa5f312b49c92/fonttools-4.58.4-cp311-cp311-win_amd64.whl", hash = "sha256:3d471a5b567a0d1648f2e148c9a8bcf00d9ac76eb89e976d9976582044cc2509", size = 2247892 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/3c/1d1792bfe91ef46f22a3d23b4deb514c325e73c17d4f196b385b5e2faf1c/fonttools-4.58.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:462211c0f37a278494e74267a994f6be9a2023d0557aaa9ecbcbfce0f403b5a6", size = 2754082 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/1f/2b261689c901a1c3bc57a6690b0b9fc21a9a93a8b0c83aae911d3149f34e/fonttools-4.58.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0c7a12fb6f769165547f00fcaa8d0df9517603ae7e04b625e5acb8639809b82d", size = 2321677 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fe/6b/4607add1755a1e6581ae1fc0c9a640648e0d9cdd6591cc2d581c2e07b8c3/fonttools-4.58.4-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d42c63020a922154add0a326388a60a55504629edc3274bc273cd3806b4659f", size = 4896354 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/cd/95/34b4f483643d0cb11a1f830b72c03fdd18dbd3792d77a2eb2e130a96fada/fonttools-4.58.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8f2b4e6fd45edc6805f5f2c355590b092ffc7e10a945bd6a569fc66c1d2ae7aa", size = 4941633 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/81/ac/9bafbdb7694059c960de523e643fa5a61dd2f698f3f72c0ca18ae99257c7/fonttools-4.58.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f155b927f6efb1213a79334e4cb9904d1e18973376ffc17a0d7cd43d31981f1e", size = 4886170 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/44/a3a3b70d5709405f7525bb7cb497b4e46151e0c02e3c8a0e40e5e9fe030b/fonttools-4.58.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e38f687d5de97c7fb7da3e58169fb5ba349e464e141f83c3c2e2beb91d317816", size = 5037851 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/cb/e8923d197c78969454eb876a4a55a07b59c9c4c46598f02b02411dc3b45c/fonttools-4.58.4-cp312-cp312-win32.whl", hash = "sha256:636c073b4da9db053aa683db99580cac0f7c213a953b678f69acbca3443c12cc", size = 2187428 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/e6/fe50183b1a0e1018e7487ee740fa8bb127b9f5075a41e20d017201e8ab14/fonttools-4.58.4-cp312-cp312-win_amd64.whl", hash = "sha256:82e8470535743409b30913ba2822e20077acf9ea70acec40b10fcf5671dceb58", size = 2236649 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/4f/c05cab5fc1a4293e6bc535c6cb272607155a0517700f5418a4165b7f9ec8/fonttools-4.58.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5f4a64846495c543796fa59b90b7a7a9dff6839bd852741ab35a71994d685c6d", size = 2745197 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/d3/49211b1f96ae49308f4f78ca7664742377a6867f00f704cdb31b57e4b432/fonttools-4.58.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e80661793a5d4d7ad132a2aa1eae2e160fbdbb50831a0edf37c7c63b2ed36574", size = 2317272 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/b2/11/c9972e46a6abd752a40a46960e431c795ad1f306775fc1f9e8c3081a1274/fonttools-4.58.4-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fe5807fc64e4ba5130f1974c045a6e8d795f3b7fb6debfa511d1773290dbb76b", size = 4877184 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/24/5017c01c9ef8df572cc9eaf9f12be83ad8ed722ff6dc67991d3d752956e4/fonttools-4.58.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b610b9bef841cb8f4b50472494158b1e347d15cad56eac414c722eda695a6cfd", size = 4939445 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/79/b0/538cc4d0284b5a8826b4abed93a69db52e358525d4b55c47c8cef3669767/fonttools-4.58.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2daa7f0e213c38f05f054eb5e1730bd0424aebddbeac094489ea1585807dd187", size = 4878800 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/9b/a891446b7a8250e65bffceb248508587958a94db467ffd33972723ab86c9/fonttools-4.58.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:66cccb6c0b944496b7f26450e9a66e997739c513ffaac728d24930df2fd9d35b", size = 5021259 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/b2/c4d2872cff3ace3ddd1388bf15b76a1d8d5313f0a61f234e9aed287e674d/fonttools-4.58.4-cp313-cp313-win32.whl", hash = "sha256:94d2aebb5ca59a5107825520fde596e344652c1f18170ef01dacbe48fa60c889", size = 2185824 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/98/57/cddf8bcc911d4f47dfca1956c1e3aeeb9f7c9b8e88b2a312fe8c22714e0b/fonttools-4.58.4-cp313-cp313-win_amd64.whl", hash = "sha256:b554bd6e80bba582fd326ddab296e563c20c64dca816d5e30489760e0c41529f", size = 2236382 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/45/20/787d70ba4cb831706fa587c56ee472a88ebc28752be660f4b58e598af6fc/fonttools-4.58.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ca773fe7812e4e1197ee4e63b9691e89650ab55f679e12ac86052d2fe0d152cd", size 
= 2754537 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/a5/ccb7ef1b8ab4bbf48f7753b6df512b61e73af82cd27aa486a03d6afb8635/fonttools-4.58.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e31289101221910f44245472e02b1a2f7d671c6d06a45c07b354ecb25829ad92", size = 2321715 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/5c/b361a7eae95950afaadb7049f55b214b619cb5368086cb3253726fe0c478/fonttools-4.58.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90c9e3c01475bb9602cb617f69f02c4ba7ab7784d93f0b0d685e84286f4c1a10", size = 4819004 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d5/2f/3006fbb1f57704cd60af82fb8127788cfb102f12d39c39fb5996af595cf3/fonttools-4.58.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e00a826f2bc745a010341ac102082fe5e3fb9f0861b90ed9ff32277598813711", size = 4749072 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c2/42/ea79e2c3d5e4441e4508d6456b268a7de275452f3dba3a13fc9d73f3e03d/fonttools-4.58.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bc75e72e9d2a4ad0935c59713bd38679d51c6fefab1eadde80e3ed4c2a11ea84", size = 4802023 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/70/90a196f57faa2bcd1485710c6d08eedceca500cdf2166640b3478e72072c/fonttools-4.58.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f57a795e540059ce3de68508acfaaf177899b39c36ef0a2833b2308db98c71f1", size = 4911103 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cb/3f/a7d38e606e98701dbcb6198406c8b554a77ed06c5b21e425251813fd3775/fonttools-4.58.4-cp39-cp39-win32.whl", hash = "sha256:a7d04f64c88b48ede655abcf76f2b2952f04933567884d99be7c89e0a4495131", size = 1471393 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/37/6e/08158deaebeb5b0c7a0fb251ca6827defb5f5159958a23ba427e0b677e95/fonttools-4.58.4-cp39-cp39-win_amd64.whl", hash = "sha256:5a8bc5dfd425c89b1c38380bc138787b0a830f761b82b37139aa080915503b69", size = 1515901 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/0b/2f/c536b5b9bb3c071e91d536a4d11f969e911dbb6b227939f4c5b0bca090df/fonttools-4.58.4-py3-none-any.whl", hash = "sha256:a10ce13a13f26cbb9f37512a4346bb437ad7e002ff6fa966a7ce7ff5ac3528bd", size = 1114660 }, +] + +[[package]] +name = "greenlet" +version = "3.0.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/14/3bddb1298b9a6786539ac609ba4b7c9c0842e12aa73aaa4d8d73ec8f8185/greenlet-3.0.3.tar.gz", hash = "sha256:43374442353259554ce33599da8b692d5aa96f8976d567d4badf263371fbe491", size = 182013 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/64/bea53c592e3e45799f7c8039a8ee7d6883c518eafef1fcae60beb776070f/greenlet-3.0.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9da2bd29ed9e4f15955dd1595ad7bc9320308a3b766ef7f837e23ad4b4aac31a", size = 270098 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/d6/408ad9603339db28ce334021b1403dfcfbcb7501a435d49698408d928de7/greenlet-3.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d353cadd6083fdb056bb46ed07e4340b0869c305c8ca54ef9da3421acbdf6881", size = 651930 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/90/5b14670653f7363fb3e1665f8da6d64bd4c31d53a796d09ef69f48be7273/greenlet-3.0.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dca1e2f3ca00b84a396bc1bce13dd21f680f035314d2379c4160c98153b2059b", size = 667643 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/17/e8e72cabfb5a906c0d976d7fbcc88310df292beea0f816efbefdaf694284/greenlet-3.0.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ed7fb269f15dc662787f4119ec300ad0702fa1b19d2135a37c2c4de6fadfd4a", size = 659188 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1c/2f/64628f6ae48e05f585e0eb3fb7399b52e240ef99f602107b445bf6be23ef/greenlet-3.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:dd4f49ae60e10adbc94b45c0b5e6a179acc1736cf7a90160b404076ee283cf83", size = 662673 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/35/945d5b10648fec9b20bcc6df8952d20bb3bba76413cd71c1fdbee98f5616/greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:73a411ef564e0e097dbe7e866bb2dda0f027e072b04da387282b02c308807405", size = 616002 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/00/27e2da76b926e9b5a2c97d3f4c0baf1b7d8181209d3026c0171f621ae6c0/greenlet-3.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7f362975f2d179f9e26928c5b517524e89dd48530a0202570d55ad6ca5d8a56f", size = 1150603 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/65/506e0a80931170b0dac1a03d36b7fc299f3fa3576235b916718602fff2c3/greenlet-3.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:649dde7de1a5eceb258f9cb00bdf50e978c9db1b996964cd80703614c86495eb", size = 1176756 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/76/e1ee9f290bb0d46b09704c2fb0e609cae329eb308ad404c0ee6fa1ecb8a5/greenlet-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:68834da854554926fbedd38c76e60c4a2e3198c6fbed520b106a8986445caaf9", size = 292349 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/20/68a278a6f93fa36e21cfc3d7599399a8a831225644eb3b6b18755cd3d6fc/greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:b1b5667cced97081bf57b8fa1d6bfca67814b0afd38208d52538316e9422fc61", size = 271666 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/b4/90e06e07c78513ab03855768200bdb35c8e764e805b3f14fb488e56f82dc/greenlet-3.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52f59dd9c96ad2fc0d5724107444f76eb20aaccb675bf825df6435acb7703559", size = 657689 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f6/a2/0ed21078039072f9dc738bbf3af12b103a84106b1385ac4723841f846ce7/greenlet-3.0.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:afaff6cf5200befd5cec055b07d1c0a5a06c040fe5ad148abcd11ba6ab9b114e", size = 673009 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/42/11/42ad6b1104c357826bbee7d7b9e4f24dbd9fde94899a03efb004aab62963/greenlet-3.0.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe754d231288e1e64323cfad462fcee8f0288654c10bdf4f603a39ed923bef33", size = 667432 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bb/6b/384dee7e0121cbd1757bdc1824a5ee28e43d8d4e3f99aa59521f629442fe/greenlet-3.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2797aa5aedac23af156bbb5a6aa2cd3427ada2972c828244eb7d1b9255846379", size = 667442 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/1f/12d5a6cc26e8b483c2e7975f9c22e088ac735c0d8dcb8a8f72d31a4e5f04/greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7f009caad047246ed379e1c4dbcb8b020f0a390667ea74d2387be2998f58a22", size = 620032 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/ec/85b647e59e0f137c7792a809156f413e38379cf7f3f2e1353c37f4be4026/greenlet-3.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c5e1536de2aad7bf62e27baf79225d0d64360d4168cf2e6becb91baf1ed074f3", size = 1154218 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/ed/1e5f4bca691a81700e5a88e86d6f0e538acb10188cd2cc17140e523255ef/greenlet-3.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:894393ce10ceac937e56ec00bb71c4c2f8209ad516e96033e4b3b1de270e200d", size = 1180754 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/47/79/26d54d7d700ef65b689fc2665a40846d13e834da0486674a8d4f0f371a47/greenlet-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:1ea188d4f49089fc6fb283845ab18a2518d279c7cd9da1065d7a84e991748728", size = 292822 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/2f/461615adc53ba81e99471303b15ac6b2a6daa8d2a0f7f77fd15605e16d5b/greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl", hash = 
"sha256:70fb482fdf2c707765ab5f0b6655e9cfcf3780d8d87355a063547b41177599be", size = 273085 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/55/2c3cfa3cdbb940cf7321fbcf544f0e9c74898eed43bf678abf416812d132/greenlet-3.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4d1ac74f5c0c0524e4a24335350edad7e5f03b9532da7ea4d3c54d527784f2e", size = 660514 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/38/77/efb21ab402651896c74f24a172eb4d7479f9f53898bd5e56b9e20bb24ffd/greenlet-3.0.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149e94a2dd82d19838fe4b2259f1b6b9957d5ba1b25640d2380bea9c5df37676", size = 674295 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/3a/92f188ace0190f0066dca3636cf1b09481d0854c46e92ec5e29c7cefe5b1/greenlet-3.0.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15d79dd26056573940fcb8c7413d84118086f2ec1a8acdfa854631084393efcc", size = 669395 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/0f/847ed02cdfce10f0e6e3425cd054296bddb11a17ef1b34681fa01a055187/greenlet-3.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b7db1ebff4ba09aaaeae6aa491daeb226c8150fc20e836ad00041bcb11230", size = 670455 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bd/37/56b0da468a85e7704f3b2bc045015301bdf4be2184a44868c71f6dca6fe2/greenlet-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fcd2469d6a2cf298f198f0487e0a5b1a47a42ca0fa4dfd1b6862c999f018ebbf", size = 625692 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/68/b5f4084c0a252d7e9c0d95fc1cfc845d08622037adb74e05be3a49831186/greenlet-3.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f672519db1796ca0d8753f9e78ec02355e862d0998193038c7073045899f305", size = 1152597 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/a4/fa/31e22345518adcd69d1d6ab5087a12c178aa7f3c51103f6d5d702199d243/greenlet-3.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2516a9957eed41dd8f1ec0c604f1cdc86758b587d964668b5b196a9db5bfcde6", size = 1181043 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/53/80/3d94d5999b4179d91bcc93745d1b0815b073d61be79dd546b840d17adb18/greenlet-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:bba5387a6975598857d86de9eac14210a49d554a77eb8261cc68b7d082f78ce2", size = 293635 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/8a/f5140c8713f919af0e98e6aaa40cb20edaaf3739d18c4a077581e2422ac4/greenlet-3.0.3-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:da70d4d51c8b306bb7a031d5cff6cc25ad253affe89b70352af5f1cb68e74b53", size = 269242 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cf/5b/2de4a398840d3b4d99c4a3476cda0d82badfa349f3f89846ada2e32e9500/greenlet-3.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086152f8fbc5955df88382e8a75984e2bb1c892ad2e3c80a2508954e52295257", size = 650174 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/c3/06ca5f34b01af6d6e2fd2f97c0ad3673123a442bf4a3add548d374b1cc7c/greenlet-3.0.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d73a9fe764d77f87f8ec26a0c85144d6a951a6c438dfe50487df5595c6373eac", size = 666285 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/92/f11dbbcf33809421447b24d2eefee0575c59c8569d6d03f7ca4d2b34d56f/greenlet-3.0.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7dcbe92cc99f08c8dd11f930de4d99ef756c3591a5377d1d9cd7dd5e896da71", size = 658521 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9d/ea/8bc7ed08ba274bdaff08f2cb546d832b8f44af267e03ca6e449840486915/greenlet-3.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1551a8195c0d4a68fac7a4325efac0d541b48def35feb49d803674ac32582f61", size = 660753 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/af/05/b7e068070a6c143f34dfcd7e9144684271b8067e310f6da68269580db1d8/greenlet-3.0.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64d7675ad83578e3fc149b617a444fab8efdafc9385471f868eb5ff83e446b8b", size = 614348 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/82/9737e7dee4ccb9e1be2a8f17cf760458be2c36c6ff7bbaef55cbe279e729/greenlet-3.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b37eef18ea55f2ffd8f00ff8fe7c8d3818abd3e25fb73fae2ca3b672e333a7a6", size = 1149569 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/4b/965a542baf157f23912e466b50fa9c49dd66132d9495d201e6c607ea16f2/greenlet-3.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:77457465d89b8263bca14759d7c1684df840b6811b2499838cc5b040a8b5b113", size = 1176361 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/70/2f99bdcb4e3912d844dee279e077ee670ec43161d96670a9dfad16b89dd1/greenlet-3.0.3-cp39-cp39-win32.whl", hash = "sha256:57e8974f23e47dac22b83436bdcf23080ade568ce77df33159e019d161ce1d1e", size = 272960 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c3/80/01ff837bc7122d049971960123d749ed16adbd43cbc008afdb780a40e3fa/greenlet-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c5ee858cfe08f34712f548c3c363e807e7186f03ad7a5039ebadb29e8c6be067", size = 290843 }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 }, +] + +[[package]] 
+name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784 }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = 
"sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "importlib-resources" +version = "6.5.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cf/8c/f834fbf984f691b4f7ff60f50b514cc3de5cc08abfc3295564dd89c5e2e7/importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c", size = 44693 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/ed/1f1afb2e9e7f38a545d628f864d562a5ae64fe6f7a10e28ffb9b185b4e89/importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec", size = 37461 }, +] + +[[package]] +name = "jieba" +version = "0.42.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz", hash = "sha256:055ca12f62674fafed09427f176506079bc135638a14e23e25be909131928db2", size = 19214172 } + +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256 }, +] + +[[package]] +name = "kiwisolver" +version = "1.4.7" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = 
[ + "python_full_version < '3.10' and platform_machine == 'arm64' and sys_platform == 'darwin'", + "python_full_version < '3.10' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/4d/2255e1c76304cbd60b48cee302b66d1dde4468dc5b1160e4b7cb43778f2a/kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60", size = 97286 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/14/fc943dd65268a96347472b4fbe5dcc2f6f55034516f80576cd0dd3a8930f/kiwisolver-1.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8a9c83f75223d5e48b0bc9cb1bf2776cf01563e00ade8775ffe13b0b6e1af3a6", size = 122440 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1e/46/e68fed66236b69dd02fcdb506218c05ac0e39745d696d22709498896875d/kiwisolver-1.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58370b1ffbd35407444d57057b57da5d6549d2d854fa30249771775c63b5fe17", size = 65758 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/fa/65de49c85838681fc9cb05de2a68067a683717321e01ddafb5b8024286f0/kiwisolver-1.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa0abdf853e09aff551db11fce173e2177d00786c688203f52c87ad7fcd91ef9", size = 64311 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/42/9c/cc8d90f6ef550f65443bad5872ffa68f3dee36de4974768628bea7c14979/kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8d53103597a252fb3ab8b5845af04c7a26d5e7ea8122303dd7a021176a87e8b9", size = 1637109 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/55/91/0a57ce324caf2ff5403edab71c508dd8f648094b18cfbb4c8cc0fde4a6ac/kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:88f17c5ffa8e9462fb79f62746428dd57b46eb931698e42e990ad63103f35e6c", size = 1617814 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/5d/c36140313f2510e20207708adf36ae4919416d697ee0236b0ddfb6fd1050/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a9ca9c710d598fd75ee5de59d5bda2684d9db36a9f50b6125eaea3969c2599", size = 1400881 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/56/d0/786e524f9ed648324a466ca8df86298780ef2b29c25313d9a4f16992d3cf/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f4d742cb7af1c28303a51b7a27aaee540e71bb8e24f68c736f6f2ffc82f2bf05", size = 1512972 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/5a/77851f2f201e6141d63c10a0708e996a1363efaf9e1609ad0441b343763b/kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28c7fea2196bf4c2f8d46a0415c77a1c480cc0724722f23d7410ffe9842c407", size = 1444787 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/06/5f/1f5eaab84355885e224a6fc8d73089e8713dc7e91c121f00b9a1c58a2195/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e968b84db54f9d42046cf154e02911e39c0435c9801681e3fc9ce8a3c4130278", size = 2199212 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b5/28/9152a3bfe976a0ae21d445415defc9d1cd8614b2910b7614b30b27a47270/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0c18ec74c0472de033e1bebb2911c3c310eef5649133dd0bedf2a169a1b269e5", size = 2346399 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/f6/453d1904c52ac3b400f4d5e240ac5fec25263716723e44be65f4d7149d13/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:8f0ea6da6d393d8b2e187e6a5e3fb81f5862010a40c3945e2c6d12ae45cfb2ad", size = 2308688 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/9a/d4968499441b9ae187e81745e3277a8b4d7c60840a52dc9d535a7909fac3/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:f106407dda69ae456dd1227966bf445b157ccc80ba0dff3802bb63f30b74e895", size = 2445493 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/c9/032267192e7828520dacb64dfdb1d74f292765f179e467c1cba97687f17d/kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ec80df401cfee1457063732d90022f93951944b5b58975d34ab56bb150dfb3", size = 2262191 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/ad/db0aedb638a58b2951da46ddaeecf204be8b4f5454df020d850c7fa8dca8/kiwisolver-1.4.7-cp310-cp310-win32.whl", hash = "sha256:71bb308552200fb2c195e35ef05de12f0c878c07fc91c270eb3d6e41698c3bcc", size = 46644 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/ca/d0f7b7ffbb0be1e7c2258b53554efec1fd652921f10d7d85045aff93ab61/kiwisolver-1.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:44756f9fd339de0fb6ee4f8c1696cfd19b2422e0d70b4cefc1cc7f1f64045a8c", size = 55877 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/6c/cfcc128672f47a3e3c0d918ecb67830600078b025bfc32d858f2e2d5c6a4/kiwisolver-1.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:78a42513018c41c2ffd262eb676442315cbfe3c44eed82385c2ed043bc63210a", size = 48347 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/44/77429fa0a58f941d6e1c58da9efe08597d2e86bf2b2cce6626834f49d07b/kiwisolver-1.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d2b0e12a42fb4e72d509fc994713d099cbb15ebf1103545e8a45f14da2dfca54", size = 122442 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e5/20/8c75caed8f2462d63c7fd65e16c832b8f76cda331ac9e615e914ee80bac9/kiwisolver-1.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a8781ac3edc42ea4b90bc23e7d37b665d89423818e26eb6df90698aa2287c95", size = 65762 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f4/98/fe010f15dc7230f45bc4cf367b012d651367fd203caaa992fd1f5963560e/kiwisolver-1.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46707a10836894b559e04b0fd143e343945c97fd170d69a2d26d640b4e297935", size = 64319 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/1b/b5d618f4e58c0675654c1e5051bcf42c776703edb21c02b8c74135541f60/kiwisolver-1.4.7-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef97b8df011141c9b0f6caf23b29379f87dd13183c978a30a3c546d2c47314cb", size = 1334260 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b8/01/946852b13057a162a8c32c4c8d2e9ed79f0bb5d86569a40c0b5fb103e373/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ab58c12a2cd0fc769089e6d38466c46d7f76aced0a1f54c77652446733d2d02", size = 1426589 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/70/d1/c9f96df26b459e15cf8a965304e6e6f4eb291e0f7a9460b4ad97b047561e/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:803b8e1459341c1bb56d1c5c010406d5edec8a0713a0945851290a7930679b51", size = 1541080 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/73/2686990eb8b02d05f3de759d6a23a4ee7d491e659007dd4c075fede4b5d0/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9a9e8a507420fe35992ee9ecb302dab68550dedc0da9e2880dd88071c5fb052", size = 1470049 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/4b/2db7af3ed3af7c35f388d5f53c28e155cd402a55432d800c543dc6deb731/kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18077b53dc3bb490e330669a99920c5e6a496889ae8c63b58fbc57c3d7f33a18", size = 1426376 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/83/2857317d04ea46dc5d115f0df7e676997bbd968ced8e2bd6f7f19cfc8d7f/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:6af936f79086a89b3680a280c47ea90b4df7047b5bdf3aa5c524bbedddb9e545", size = 2222231 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0d/b5/866f86f5897cd4ab6d25d22e403404766a123f138bd6a02ecb2cdde52c18/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3abc5b19d24af4b77d1598a585b8a719beb8569a71568b66f4ebe1fb0449460b", size = 2368634 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/ee/73de8385403faba55f782a41260210528fe3273d0cddcf6d51648202d6d0/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:933d4de052939d90afbe6e9d5273ae05fb836cc86c15b686edd4b3560cc0ee36", size = 2329024 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/e7/cd101d8cd2cdfaa42dc06c433df17c8303d31129c9fdd16c0ea37672af91/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:65e720d2ab2b53f1f72fb5da5fb477455905ce2c88aaa671ff0a447c2c80e8e3", size = 2468484 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/72/84f09d45a10bc57a40bb58b81b99d8f22b58b2040c912b7eb97ebf625bf2/kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3bf1ed55088f214ba6427484c59553123fdd9b218a42bbc8c6496d6754b1e523", size = 2284078 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d2/d4/71828f32b956612dc36efd7be1788980cb1e66bfb3706e6dec9acad9b4f9/kiwisolver-1.4.7-cp311-cp311-win32.whl", hash = "sha256:4c00336b9dd5ad96d0a558fd18a8b6f711b7449acce4c157e7343ba92dd0cf3d", size = 46645 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/65/d43e9a20aabcf2e798ad1aff6c143ae3a42cf506754bcb6a7ed8259c8425/kiwisolver-1.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:929e294c1ac1e9f615c62a4e4313ca1823ba37326c164ec720a803287c4c499b", size = 56022 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/b3/9f75a2e06f1b4ca00b2b192bc2b739334127d27f1d0625627ff8479302ba/kiwisolver-1.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:e33e8fbd440c917106b237ef1a2f1449dfbb9b6f6e1ce17c94cd6a1e0d438376", size = 48536 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/97/9c/0a11c714cf8b6ef91001c8212c4ef207f772dd84540104952c45c1f0a249/kiwisolver-1.4.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5360cc32706dab3931f738d3079652d20982511f7c0ac5711483e6eab08efff2", size = 121808 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/d8/0fe8c5f5d35878ddd135f44f2af0e4e1d379e1c7b0716f97cdcb88d4fd27/kiwisolver-1.4.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942216596dc64ddb25adb215c3c783215b23626f8d84e8eff8d6d45c3f29f75a", size = 65531 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/c5/57fa58276dfdfa612241d640a64ca2f76adc6ffcebdbd135b4ef60095098/kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48b571ecd8bae15702e4f22d3ff6a0f13e54d3d00cd25216d5e7f658242065ee", size = 63894 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/e9/26d3edd4c4ad1c5b891d8747a4f81b1b0aba9fb9721de6600a4adc09773b/kiwisolver-1.4.7-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad42ba922c67c5f219097b28fae965e10045ddf145d2928bfac2eb2e17673640", size = 1369296 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/67/3f4850b5e6cffb75ec40577ddf54f7b82b15269cc5097ff2e968ee32ea7d/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:612a10bdae23404a72941a0fc8fa2660c6ea1217c4ce0dbcab8a8f6543ea9e7f", size = 1461450 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/be/86cbb9c9a315e98a8dc6b1d23c43cffd91d97d49318854f9c37b0e41cd68/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e838bba3a3bac0fe06d849d29772eb1afb9745a59710762e4ba3f4cb8424483", size = 1579168 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/00/65061acf64bd5fd34c1f4ae53f20b43b0a017a541f242a60b135b9d1e301/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:22f499f6157236c19f4bbbd472fa55b063db77a16cd74d49afe28992dff8c258", size = 1507308 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/e4/c0b6746fd2eb62fe702118b3ca0cb384ce95e1261cfada58ff693aeec08a/kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693902d433cf585133699972b6d7c42a8b9f8f826ebcaf0132ff55200afc599e", size = 1464186 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0a/0f/529d0a9fffb4d514f2782c829b0b4b371f7f441d61aa55f1de1c614c4ef3/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4e77f2126c3e0b0d055f44513ed349038ac180371ed9b52fe96a32aa071a5107", size = 2247877 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/e1/66603ad779258843036d45adcbe1af0d1a889a07af4635f8b4ec7dccda35/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:657a05857bda581c3656bfc3b20e353c232e9193eb167766ad2dc58b56504948", size = 2404204 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/61/de5fb1ca7ad1f9ab7970e340a5b833d735df24689047de6ae71ab9d8d0e7/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4bfa75a048c056a411f9705856abfc872558e33c055d80af6a380e3658766038", size = 2352461 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/d2/0edc00a852e369827f7e05fd008275f550353f1f9bcd55db9363d779fc63/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:34ea1de54beef1c104422d210c47c7d2a4999bdecf42c7b5718fbe59a4cac383", size = 2501358 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/84/15/adc15a483506aec6986c01fb7f237c3aec4d9ed4ac10b756e98a76835933/kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:90da3b5f694b85231cf93586dad5e90e2d71b9428f9aad96952c99055582f520", size = 2314119 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/08/3a5bb2c53c89660863a5aa1ee236912269f2af8762af04a2e11df851d7b2/kiwisolver-1.4.7-cp312-cp312-win32.whl", hash = 
"sha256:18e0cca3e008e17fe9b164b55735a325140a5a35faad8de92dd80265cd5eb80b", size = 46367 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/93/c05f0a6d825c643779fc3c70876bff1ac221f0e31e6f701f0e9578690d70/kiwisolver-1.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:58cb20602b18f86f83a5c87d3ee1c766a79c0d452f8def86d925e6c60fbf7bfb", size = 55884 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d2/f9/3828d8f21b6de4279f0667fb50a9f5215e6fe57d5ec0d61905914f5b6099/kiwisolver-1.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:f5a8b53bdc0b3961f8b6125e198617c40aeed638b387913bf1ce78afb1b0be2a", size = 48528 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/06/7da99b04259b0f18b557a4effd1b9c901a747f7fdd84cf834ccf520cb0b2/kiwisolver-1.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2e6039dcbe79a8e0f044f1c39db1986a1b8071051efba3ee4d74f5b365f5226e", size = 121913 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/f5/b8a370d1aa593c17882af0a6f6755aaecd643640c0ed72dcfd2eafc388b9/kiwisolver-1.4.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a1ecf0ac1c518487d9d23b1cd7139a6a65bc460cd101ab01f1be82ecf09794b6", size = 65627 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/fc/6c0374f7503522539e2d4d1b497f5ebad3f8ed07ab51aed2af988dd0fb65/kiwisolver-1.4.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ab9ccab2b5bd5702ab0803676a580fffa2aa178c2badc5557a84cc943fcf750", size = 63888 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/3e/0b7172793d0f41cae5c923492da89a2ffcd1adf764c16159ca047463ebd3/kiwisolver-1.4.7-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f816dd2277f8d63d79f9c8473a79fe54047bc0467754962840782c575522224d", size = 1369145 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/92/47d050d6f6aced2d634258123f2688fbfef8ded3c5baf2c79d94d91f1f58/kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cf8bcc23ceb5a1b624572a1623b9f79d2c3b337c8c455405ef231933a10da379", size = 1461448 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/1b/8f80b18e20b3b294546a1adb41701e79ae21915f4175f311a90d042301cf/kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dea0bf229319828467d7fca8c7c189780aa9ff679c94539eed7532ebe33ed37c", size = 1578750 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/fe/fe8e72f3be0a844f257cadd72689c0848c6d5c51bc1d60429e2d14ad776e/kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c06a4c7cf15ec739ce0e5971b26c93638730090add60e183530d70848ebdd34", size = 1507175 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/39/fa/cdc0b6105d90eadc3bee525fecc9179e2b41e1ce0293caaf49cb631a6aaf/kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:913983ad2deb14e66d83c28b632fd35ba2b825031f2fa4ca29675e665dfecbe1", size = 1463963 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/5c/0c03c4e542720c6177d4f408e56d1c8315899db72d46261a4e15b8b33a41/kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5337ec7809bcd0f424c6b705ecf97941c46279cf5ed92311782c7c9c2026f07f", size = 2248220 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/ee/55ef86d5a574f4e767df7da3a3a7ff4954c996e12d4fbe9c408170cd7dcc/kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c26ed10c4f6fa6ddb329a5120ba3b6db349ca192ae211e882970bfc9d91420b", size = 2404463 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/6d/73ad36170b4bff4825dc588acf4f3e6319cb97cd1fb3eb04d9faa6b6f212/kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c619b101e6de2222c1fcb0531e1b17bbffbe54294bfba43ea0d411d428618c27", size = 2352842 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/16/fa531ff9199d3b6473bb4d0f47416cdb08d556c03b8bc1cccf04e756b56d/kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_s390x.whl", 
hash = "sha256:073a36c8273647592ea332e816e75ef8da5c303236ec0167196793eb1e34657a", size = 2501635 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/7e/aa9422e78419db0cbe75fb86d8e72b433818f2e62e2e394992d23d23a583/kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3ce6b2b0231bda412463e152fc18335ba32faf4e8c23a754ad50ffa70e4091ee", size = 2314556 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/b2/15f7f556df0a6e5b3772a1e076a9d9f6c538ce5f05bd590eca8106508e06/kiwisolver-1.4.7-cp313-cp313-win32.whl", hash = "sha256:f4c9aee212bc89d4e13f58be11a56cc8036cabad119259d12ace14b34476fd07", size = 46364 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/db/32e897e43a330eee8e4770bfd2737a9584b23e33587a0812b8e20aac38f7/kiwisolver-1.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:8a3ec5aa8e38fc4c8af308917ce12c536f1c88452ce554027e55b22cbbfbff76", size = 55887 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c8/a4/df2bdca5270ca85fd25253049eb6708d4127be2ed0e5c2650217450b59e9/kiwisolver-1.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:76c8094ac20ec259471ac53e774623eb62e6e1f56cd8690c67ce6ce4fcb05650", size = 48530 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/88/37ea0ea64512997b13d69772db8dcdc3bfca5442cda3a5e4bb943652ee3e/kiwisolver-1.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9362ecfca44c863569d3d3c033dbe8ba452ff8eed6f6b5806382741a1334bd", size = 122449 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4e/45/5a5c46078362cb3882dcacad687c503089263c017ca1241e0483857791eb/kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8df2eb9b2bac43ef8b082e06f750350fbbaf2887534a5be97f6cf07b19d9583", size = 65757 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8a/be/a6ae58978772f685d48dd2e84460937761c53c4bbd84e42b0336473d9775/kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f32d6edbc638cde7652bd690c3e728b25332acbadd7cad670cc4a02558d9c417", size = 64312 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f4/04/18ef6f452d311e1e1eb180c9bf5589187fa1f042db877e6fe443ef10099c/kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e2e6c39bd7b9372b0be21456caab138e8e69cc0fc1190a9dfa92bd45a1e6e904", size = 1626966 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/b1/40655f6c3fa11ce740e8a964fa8e4c0479c87d6a7944b95af799c7a55dfe/kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dda56c24d869b1193fcc763f1284b9126550eaf84b88bbc7256e15028f19188a", size = 1607044 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/93/af67dbcfb9b3323bbd2c2db1385a7139d8f77630e4a37bb945b57188eb2d/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79849239c39b5e1fd906556c474d9b0439ea6792b637511f3fe3a41158d89ca8", size = 1391879 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/6f/d60770ef98e77b365d96061d090c0cd9e23418121c55fff188fa4bdf0b54/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e3bc157fed2a4c02ec468de4ecd12a6e22818d4f09cde2c31ee3226ffbefab2", size = 1504751 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fa/3a/5f38667d313e983c432f3fcd86932177519ed8790c724e07d77d1de0188a/kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3da53da805b71e41053dc670f9a820d1157aae77b6b944e08024d17bcd51ef88", size = 1436990 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cb/3b/1520301a47326e6a6043b502647e42892be33b3f051e9791cc8bb43f1a32/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8705f17dfeb43139a692298cb6637ee2e59c0194538153e83e9ee0c75c2eddde", size = 2191122 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cf/c4/eb52da300c166239a2233f1f9c4a1b767dfab98fae27681bfb7ea4873cb6/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = 
"sha256:82a5c2f4b87c26bb1a0ef3d16b5c4753434633b83d365cc0ddf2770c93829e3c", size = 2338126 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/cb/42b92fd5eadd708dd9107c089e817945500685f3437ce1fd387efebc6d6e/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce8be0466f4c0d585cdb6c1e2ed07232221df101a4c6f28821d2aa754ca2d9e2", size = 2298313 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4f/eb/be25aa791fe5fc75a8b1e0c965e00f942496bc04635c9aae8035f6b76dcd/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:409afdfe1e2e90e6ee7fc896f3df9a7fec8e793e58bfa0d052c8a82f99c37abb", size = 2437784 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/22/30a66be7f3368d76ff95689e1c2e28d382383952964ab15330a15d8bfd03/kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5b9c3f4ee0b9a439d2415012bd1b1cc2df59e4d6a9939f4d669241d30b414327", size = 2253988 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/d3/5f2ecb94b5211c8a04f218a76133cc8d6d153b0f9cd0b45fad79907f0689/kiwisolver-1.4.7-cp39-cp39-win32.whl", hash = "sha256:a79ae34384df2b615eefca647a2873842ac3b596418032bef9a7283675962644", size = 46980 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/17/cd10d020578764ea91740204edc6b3236ed8106228a46f568d716b11feb2/kiwisolver-1.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:cf0438b42121a66a3a667de17e779330fc0f20b0d97d59d2f2121e182b0505e4", size = 55847 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/91/84/32232502020bd78d1d12be7afde15811c64a95ed1f606c10456db4e4c3ac/kiwisolver-1.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:764202cc7e70f767dab49e8df52c7455e8de0df5d858fa801a11aa0d882ccf3f", size = 48494 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/59/741b79775d67ab67ced9bb38552da688c0305c16e7ee24bba7a2be253fb7/kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94252291e3fe68001b1dd747b4c0b3be12582839b95ad4d1b641924d68fd4643", size = 59491 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/58/cc/fb239294c29a5656e99e3527f7369b174dd9cc7c3ef2dea7cb3c54a8737b/kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b7dfa3b546da08a9f622bb6becdb14b3e24aaa30adba66749d38f3cc7ea9706", size = 57648 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/ef/2f009ac1f7aab9f81efb2d837301d255279d618d27b6015780115ac64bdd/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd3de6481f4ed8b734da5df134cd5a6a64fe32124fe83dde1e5b5f29fe30b1e6", size = 84257 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/81/e1/c64f50987f85b68b1c52b464bb5bf73e71570c0f7782d626d1eb283ad620/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a91b5f9f1205845d488c928e8570dcb62b893372f63b8b6e98b863ebd2368ff2", size = 80906 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/71/1687c5c0a0be2cee39a5c9c389e546f9c6e215e46b691d00d9f646892083/kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fa14dbd66b8b8f470d5fc79c089a66185619d31645f9b0773b88b19f7223c4", size = 79951 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d5/df/ce37d9b26f07ab90880923c94d12a6ff4d27447096b4c849bfc4339ccfdf/kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b01aac285f91ca889c800042c35ad3b239e704b150cfd3382adfc9dcc780e39", size = 58666 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/d3/e4b04f43bc629ac8e186b77b2b1a251cdfa5b7610fa189dc0db622672ce6/kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:48be928f59a1f5c8207154f935334d374e79f2b5d212826307d072595ad76a2e", size = 57088 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/1c/752df58e2d339e670a535514d2db4fe8c842ce459776b8080fbe08ebb98e/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f37cfe618a117e50d8c240555331160d73d0411422b59b5ee217843d7b693608", size = 84321 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/f8/fe6484e847bc6e238ec9f9828089fb2c0bb53f2f5f3a79351fde5b565e4f/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:599b5c873c63a1f6ed7eead644a8a380cfbdf5db91dcb6f85707aaab213b1674", size = 80776 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9b/57/d7163c0379f250ef763aba85330a19feefb5ce6cb541ade853aaba881524/kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:801fa7802e5cfabe3ab0c81a34c323a319b097dfb5004be950482d882f3d7225", size = 79984 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/95/4a103776c265d13b3d2cd24fb0494d4e04ea435a8ef97e1b2c026d43250b/kiwisolver-1.4.7-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0c6c43471bc764fad4bc99c5c2d6d16a676b1abf844ca7c8702bdae92df01ee0", size = 55811 }, +] + +[[package]] +name = "kiwisolver" +version = "1.4.8" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == 
'3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.10.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/47/5f/4d8e9e852d98ecd26cdf8eaf7ed8bc33174033bba5e07001b289f07308fd/kiwisolver-1.4.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db", size = 124623 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/70/7f5af2a18a76fe92ea14675f8bd88ce53ee79e37900fa5f1a1d8e0b42998/kiwisolver-1.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c72941acb7b67138f35b879bbe85be0f6c6a70cab78fe3ef6db9c024d9223e5b", size = 66720 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/13/e15f804a142353aefd089fadc8f1d985561a15358c97aca27b0979cb0785/kiwisolver-1.4.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce2cf1e5688edcb727fdf7cd1bbd0b6416758996826a8be1d958f91880d0809d", size = 65413 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/6d/67d36c4d2054e83fb875c6b59d0809d5c530de8148846b1370475eeeece9/kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c8bf637892dc6e6aad2bc6d4d69d08764166e5e3f69d469e55427b6ac001b19d", size = 1650826 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/de/c6/7b9bb8044e150d4d1558423a1568e4f227193662a02231064e3824f37e0a/kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:034d2c891f76bd3edbdb3ea11140d8510dca675443da7304205a2eaa45d8334c", size = 1628231 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/38/ad10d437563063eaaedbe2c3540a71101fc7fb07a7e71f855e93ea4de605/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d47b28d1dfe0793d5e96bce90835e17edf9a499b53969b03c6c47ea5985844c3", size = 1408938 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/ce/c0106b3bd7f9e665c5f5bc1e07cc95b5dabd4e08e3dad42dbe2faad467e7/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb158fe28ca0c29f2260cca8c43005329ad58452c36f0edf298204de32a9a3ed", size = 1422799 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/87/efb704b1d75dc9758087ba374c0f23d3254505edaedd09cf9d247f7878b9/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5536185fce131780ebd809f8e623bf4030ce1b161353166c49a3c74c287897f", size = 1354362 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/b3/fd760dc214ec9a8f208b99e42e8f0130ff4b384eca8b29dd0efc62052176/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:369b75d40abedc1da2c1f4de13f3482cb99e3237b38726710f4a793432b1c5ff", size = 2222695 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/09/a27fb36cca3fc01700687cc45dae7a6a5f8eeb5f657b9f710f788748e10d/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:641f2ddf9358c80faa22e22eb4c9f54bd3f0e442e038728f500e3b978d00aa7d", size = 2370802 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/c3/ba0a0346db35fe4dc1f2f2cf8b99362fbb922d7562e5f911f7ce7a7b60fa/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:d561d2d8883e0819445cfe58d7ddd673e4015c3c57261d7bdcd3710d0d14005c", size = 2334646 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/41/52/942cf69e562f5ed253ac67d5c92a693745f0bed3c81f49fc0cbebe4d6b00/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1732e065704b47c9afca7ffa272f845300a4eb959276bf6970dc07265e73b605", size = 2467260 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/26/2d9668f30d8a494b0411d4d7d4ea1345ba12deb6a75274d58dd6ea01e951/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bcb1ebc3547619c3b58a39e2448af089ea2ef44b37988caf432447374941574e", size = 2288633 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/98/99/0dd05071654aa44fe5d5e350729961e7bb535372935a45ac89a8924316e6/kiwisolver-1.4.8-cp310-cp310-win_amd64.whl", hash = "sha256:89c107041f7b27844179ea9c85d6da275aa55ecf28413e87624d033cf1f6b751", size = 71885 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/fc/822e532262a97442989335394d441cd1d0448c2e46d26d3e04efca84df22/kiwisolver-1.4.8-cp310-cp310-win_arm64.whl", hash = "sha256:b5773efa2be9eb9fcf5415ea3ab70fc785d598729fd6057bea38d539ead28271", size = 65175 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/da/ed/c913ee28936c371418cb167b128066ffb20bbf37771eecc2c97edf8a6e4c/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84", size = 124635 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4c/45/4a7f896f7467aaf5f56ef093d1f329346f3b594e77c6a3c327b2d415f521/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561", size = 66717 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5f/b4/c12b3ac0852a3a68f94598d4c8d569f55361beef6159dce4e7b624160da2/kiwisolver-1.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7", size = 65413 }, + 
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/98/1df4089b1ed23d83d410adfdc5947245c753bddfbe06541c4aae330e9e70/kiwisolver-1.4.8-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03", size = 1343994 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/bf/b4b169b050c8421a7c53ea1ea74e4ef9c335ee9013216c558a047f162d20/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954", size = 1434804 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/5a/e13bd341fbcf73325ea60fdc8af752addf75c5079867af2e04cc41f34434/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79", size = 1450690 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9b/4f/5955dcb376ba4a830384cc6fab7d7547bd6759fe75a09564910e9e3bb8ea/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6", size = 1376839 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/97/5edbed69a9d0caa2e4aa616ae7df8127e10f6586940aa683a496c2c280b9/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0", size = 1435109 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/fc/e756382cb64e556af6c1809a1bbb22c141bbc2445049f2da06b420fe52bf/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab", size = 2245269 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/15/e59e45829d7f41c776d138245cabae6515cb4eb44b418f6d4109c478b481/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc", size = 2393468 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/39/483558c2a913ab8384d6e4b66a932406f87c95a6080112433da5ed668559/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25", size = 2355394 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/aa/efad1fbca6570a161d29224f14b082960c7e08268a133fe5dc0f6906820e/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc", size = 2490901 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c9/4f/15988966ba46bcd5ab9d0c8296914436720dd67fca689ae1a75b4ec1c72f/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67", size = 2312306 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2d/27/bdf1c769c83f74d98cbc34483a972f221440703054894a37d174fba8aa68/kiwisolver-1.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34", size = 71966 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/c9/9642ea855604aeb2968a8e145fc662edf61db7632ad2e4fb92424be6b6c0/kiwisolver-1.4.8-cp311-cp311-win_arm64.whl", hash = "sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2", size = 65311 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/aa/cea685c4ab647f349c3bc92d2daf7ae34c8e8cf405a6dcd3a497f58a2ac3/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502", size = 124152 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/0b/8db6d2e2452d60d5ebc4ce4b204feeb16176a851fd42462f66ade6808084/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31", size = 
66555 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/60/26/d6a0db6785dd35d3ba5bf2b2df0aedc5af089962c6eb2cbf67a15b81369e/kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb", size = 65067 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c9/ed/1d97f7e3561e09757a196231edccc1bcf59d55ddccefa2afc9c615abd8e0/kiwisolver-1.4.8-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111793b232842991be367ed828076b03d96202c19221b5ebab421ce8bcad016f", size = 1378443 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/61/39d30b99954e6b46f760e6289c12fede2ab96a254c443639052d1b573fbc/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:257af1622860e51b1a9d0ce387bf5c2c4f36a90594cb9514f55b074bcc787cfc", size = 1472728 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0c/3e/804163b932f7603ef256e4a715e5843a9600802bb23a68b4e08c8c0ff61d/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69b5637c3f316cab1ec1c9a12b8c5f4750a4c4b71af9157645bf32830e39c03a", size = 1478388 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8a/9e/60eaa75169a154700be74f875a4d9961b11ba048bef315fbe89cb6999056/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:782bb86f245ec18009890e7cb8d13a5ef54dcf2ebe18ed65f795e635a96a1c6a", size = 1413849 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bc/b3/9458adb9472e61a998c8c4d95cfdfec91c73c53a375b30b1428310f923e4/kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc978a80a0db3a66d25767b03688f1147a69e6237175c0f4ffffaaedf744055a", size = 1475533 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/7a/0a42d9571e35798de80aef4bb43a9b672aa7f8e58643d7bd1950398ffb0a/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:36dbbfd34838500a31f52c9786990d00150860e46cd5041386f217101350f0d3", size = 2268898 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d9/07/1255dc8d80271400126ed8db35a1795b1a2c098ac3a72645075d06fe5c5d/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:eaa973f1e05131de5ff3569bbba7f5fd07ea0595d3870ed4a526d486fe57fa1b", size = 2425605 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/84/df/5a3b4cf13780ef6f6942df67b138b03b7e79e9f1f08f57c49957d5867f6e/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a66f60f8d0c87ab7f59b6fb80e642ebb29fec354a4dfad687ca4092ae69d04f4", size = 2375801 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8f/10/2348d068e8b0f635c8c86892788dac7a6b5c0cb12356620ab575775aad89/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858416b7fb777a53f0c59ca08190ce24e9abbd3cffa18886a5781b8e3e26f65d", size = 2520077 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/d8/014b89fee5d4dce157d814303b0fce4d31385a2af4c41fed194b173b81ac/kiwisolver-1.4.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:085940635c62697391baafaaeabdf3dd7a6c3643577dde337f4d66eba021b2b8", size = 2338410 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bd/72/dfff0cc97f2a0776e1c9eb5bef1ddfd45f46246c6533b0191887a427bca5/kiwisolver-1.4.8-cp312-cp312-win_amd64.whl", hash = "sha256:01c3d31902c7db5fb6182832713d3b4122ad9317c2c5877d0539227d96bb2e50", size = 71853 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/85/220d13d914485c0948a00f0b9eb419efaf6da81b7d72e88ce2391f7aed8d/kiwisolver-1.4.8-cp312-cp312-win_arm64.whl", hash = "sha256:a3c44cb68861de93f0c4a8175fbaa691f0aa22550c331fefef02b618a9dcb476", size = 65424 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/79/b3/e62464a652f4f8cd9006e13d07abad844a47df1e6537f73ddfbf1bc997ec/kiwisolver-1.4.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1c8ceb754339793c24aee1c9fb2485b5b1f5bb1c2c214ff13368431e51fc9a09", size = 
124156 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/2d/f13d06998b546a2ad4f48607a146e045bbe48030774de29f90bdc573df15/kiwisolver-1.4.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a62808ac74b5e55a04a408cda6156f986cefbcf0ada13572696b507cc92fa1", size = 66555 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/e3/b8bd14b0a54998a9fd1e8da591c60998dc003618cb19a3f94cb233ec1511/kiwisolver-1.4.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:68269e60ee4929893aad82666821aaacbd455284124817af45c11e50a4b42e3c", size = 65071 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/1c/6c86f6d85ffe4d0ce04228d976f00674f1df5dc893bf2dd4f1928748f187/kiwisolver-1.4.8-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34d142fba9c464bc3bbfeff15c96eab0e7310343d6aefb62a79d51421fcc5f1b", size = 1378053 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4e/b9/1c6e9f6dcb103ac5cf87cb695845f5fa71379021500153566d8a8a9fc291/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc373e0eef45b59197de815b1b28ef89ae3955e7722cc9710fb91cd77b7f47", size = 1472278 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/81/aca1eb176de671f8bda479b11acdc42c132b61a2ac861c883907dde6debb/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:77e6f57a20b9bd4e1e2cedda4d0b986ebd0216236f0106e55c28aea3d3d69b16", size = 1478139 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/f4/e081522473671c97b2687d380e9e4c26f748a86363ce5af48b4a28e48d06/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08e77738ed7538f036cd1170cbed942ef749137b1311fa2bbe2a7fda2f6bf3cc", size = 1413517 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8f/e9/6a7d025d8da8c4931522922cd706105aa32b3291d1add8c5427cdcd66e63/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a5ce1e481a74b44dd5e92ff03ea0cb371ae7a0268318e202be06c8f04f4f1246", size = 1474952 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/13/13fa685ae167bee5d94b415991c4fc7bb0a1b6ebea6e753a87044b209678/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fc2ace710ba7c1dfd1a3b42530b62b9ceed115f19a1656adefce7b1782a37794", size = 2269132 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/92/bb7c9395489b99a6cb41d502d3686bac692586db2045adc19e45ee64ed23/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3452046c37c7692bd52b0e752b87954ef86ee2224e624ef7ce6cb21e8c41cc1b", size = 2425997 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ed/12/87f0e9271e2b63d35d0d8524954145837dd1a6c15b62a2d8c1ebe0f182b4/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7e9a60b50fe8b2ec6f448fe8d81b07e40141bfced7f896309df271a0b92f80f3", size = 2376060 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/02/6e/c8af39288edbce8bf0fa35dee427b082758a4b71e9c91ef18fa667782138/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:918139571133f366e8362fa4a297aeba86c7816b7ecf0bc79168080e2bd79957", size = 2520471 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/78/df381bc7b26e535c91469f77f16adcd073beb3e2dd25042efd064af82323/kiwisolver-1.4.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e063ef9f89885a1d68dd8b2e18f5ead48653176d10a0e324e3b0030e3a69adeb", size = 2338793 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/dc/c1abe38c37c071d0fc71c9a474fd0b9ede05d42f5a458d584619cfd2371a/kiwisolver-1.4.8-cp313-cp313-win_amd64.whl", hash = "sha256:a17b7c4f5b2c51bb68ed379defd608a03954a1845dfed7cc0117f1cc8a9b7fd2", size = 71855 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a0/b6/21529d595b126ac298fdd90b705d87d4c5693de60023e0efcb4f387ed99e/kiwisolver-1.4.8-cp313-cp313-win_arm64.whl", hash = "sha256:3cd3bc628b25f74aedc6d374d5babf0166a92ff1317f46267f12d2ed54bc1d30", size = 
65430 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/bd/b89380b7298e3af9b39f49334e3e2a4af0e04819789f04b43d560516c0c8/kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:370fd2df41660ed4e26b8c9d6bbcad668fbe2560462cba151a721d49e5b6628c", size = 126294 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/83/41/5857dc72e5e4148eaac5aa76e0703e594e4465f8ab7ec0fc60e3a9bb8fea/kiwisolver-1.4.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:84a2f830d42707de1d191b9490ac186bf7997a9495d4e9072210a1296345f7dc", size = 67736 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/d1/be059b8db56ac270489fb0b3297fd1e53d195ba76e9bbb30e5401fa6b759/kiwisolver-1.4.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7a3ad337add5148cf51ce0b55642dc551c0b9d6248458a757f98796ca7348712", size = 66194 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/83/4b73975f149819eb7dcf9299ed467eba068ecb16439a98990dcb12e63fdd/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7506488470f41169b86d8c9aeff587293f530a23a23a49d6bc64dab66bedc71e", size = 1465942 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/2c/30a5cdde5102958e602c07466bce058b9d7cb48734aa7a4327261ac8e002/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f0121b07b356a22fb0414cec4666bbe36fd6d0d759db3d37228f496ed67c880", size = 1595341 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/9b/1e71db1c000385aa069704f5990574b8244cce854ecd83119c19e83c9586/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d6d6bd87df62c27d4185de7c511c6248040afae67028a8a22012b010bc7ad062", size = 1598455 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/92/c8fec52ddf06231b31cbb779af77e99b8253cd96bd135250b9498144c78b/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:291331973c64bb9cce50bbe871fb2e675c4331dab4f31abe89f175ad7679a4d7", size = 1522138 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/51/9eb7e2cd07a15d8bdd976f6190c0164f92ce1904e5c0c79198c4972926b7/kiwisolver-1.4.8-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:893f5525bb92d3d735878ec00f781b2de998333659507d29ea4466208df37bed", size = 1582857 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/95/c5a00387a5405e68ba32cc64af65ce881a39b98d73cc394b24143bebc5b8/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b47a465040146981dc9db8647981b8cb96366fbc8d452b031e4f8fdffec3f26d", size = 2293129 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/83/eeb7af7d706b8347548313fa3a3a15931f404533cc54fe01f39e830dd231/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:99cea8b9dd34ff80c521aef46a1dddb0dcc0283cf18bde6d756f1e6f31772165", size = 2421538 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/f9/27e94c1b3eb29e6933b6986ffc5fa1177d2cd1f0c8efc5f02c91c9ac61de/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:151dffc4865e5fe6dafce5480fab84f950d14566c480c08a53c663a0020504b6", size = 2390661 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d9/d4/3c9735faa36ac591a4afcc2980d2691000506050b7a7e80bcfe44048daa7/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:577facaa411c10421314598b50413aa1ebcf5126f704f1e5d72d7e4e9f020d90", size = 2546710 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4c/fa/be89a49c640930180657482a74970cdcf6f7072c8d2471e1babe17a222dc/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85", size = 2349213 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1f/f9/ae81c47a43e33b93b0a9819cac6723257f5da2a5a60daf46aa5c7226ea85/kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:e7a019419b7b510f0f7c9dceff8c5eae2392037eae483a7f9162625233802b0a", size = 60403 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/58/ca/f92b5cb6f4ce0c1ebfcfe3e2e42b96917e16f7090e45b21102941924f18f/kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:286b18e86682fd2217a48fc6be6b0f20c1d0ed10958d8dc53453ad58d7be0bf8", size = 58657 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/28/ae0240f732f0484d3a4dc885d055653c47144bdf59b670aae0ec3c65a7c8/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4191ee8dfd0be1c3666ccbac178c5a05d5f8d689bbe3fc92f3c4abec817f8fe0", size = 84948 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/eb/78d50346c51db22c7203c1611f9b513075f35c4e0e4877c5dde378d66043/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd2785b9391f2873ad46088ed7599a6a71e762e1ea33e87514b1a441ed1da1c", size = 81186 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/43/f8/7259f18c77adca88d5f64f9a522792e178b2691f3748817a8750c2d216ef/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c07b29089b7ba090b6f1a669f1411f27221c3662b3a1b7010e67b59bb5a6f10b", size = 80279 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/1d/50ad811d1c5dae091e4cf046beba925bcae0a610e79ae4c538f996f63ed5/kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b", size = 71762 }, +] + +[[package]] +name = "lxml" +version = "6.0.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/ed/60eb6fa2923602fba988d9ca7c5cdbd7cf25faa795162ed538b527a35411/lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72", size = 4096938 } +wheels = [ + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/4b/e9/9c3ca02fbbb7585116c2e274b354a2d92b5c70561687dd733ec7b2018490/lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:35bc626eec405f745199200ccb5c6b36f202675d204aa29bb52e27ba2b71dea8", size = 8399057 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/25/10a6e9001191854bf283515020f3633b1b1f96fd1b39aa30bf8fff7aa666/lxml-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:246b40f8a4aec341cbbf52617cad8ab7c888d944bfe12a6abd2b1f6cfb6f6082", size = 4569676 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/a5/378033415ff61d9175c81de23e7ad20a3ffb614df4ffc2ffc86bc6746ffd/lxml-6.0.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2793a627e95d119e9f1e19720730472f5543a6d84c50ea33313ce328d870f2dd", size = 5291361 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/a6/19c87c4f3b9362b08dc5452a3c3bce528130ac9105fc8fff97ce895ce62e/lxml-6.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:46b9ed911f36bfeb6338e0b482e7fe7c27d362c52fde29f221fddbc9ee2227e7", size = 5008290 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/d1/e9b7ad4b4164d359c4d87ed8c49cb69b443225cb495777e75be0478da5d5/lxml-6.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b4790b558bee331a933e08883c423f65bbcd07e278f91b2272489e31ab1e2b4", size = 5163192 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/56/d6/b3eba234dc1584744b0b374a7f6c26ceee5dc2147369a7e7526e25a72332/lxml-6.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2030956cf4886b10be9a0285c6802e078ec2391e1dd7ff3eb509c2c95a69b76", size = 5076973 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/47/897142dd9385dcc1925acec0c4afe14cc16d310ce02c41fcd9010ac5d15d/lxml-6.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:4d23854ecf381ab1facc8f353dcd9adeddef3652268ee75297c1164c987c11dc", size = 5297795 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fb/db/551ad84515c6f415cea70193a0ff11d70210174dc0563219f4ce711655c6/lxml-6.0.0-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:43fe5af2d590bf4691531b1d9a2495d7aab2090547eaacd224a3afec95706d76", size = 4776547 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e0/14/c4a77ab4f89aaf35037a03c472f1ccc54147191888626079bd05babd6808/lxml-6.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74e748012f8c19b47f7d6321ac929a9a94ee92ef12bc4298c47e8b7219b26541", size = 5124904 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/70/b4/12ae6a51b8da106adec6a2e9c60f532350a24ce954622367f39269e509b1/lxml-6.0.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:43cfbb7db02b30ad3926e8fceaef260ba2fb7df787e38fa2df890c1ca7966c3b", size = 4805804 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/b6/2e82d34d49f6219cdcb6e3e03837ca5fb8b7f86c2f35106fb8610ac7f5b8/lxml-6.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34190a1ec4f1e84af256495436b2d196529c3f2094f0af80202947567fdbf2e7", size = 5323477 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/e6/b83ddc903b05cd08a5723fefd528eee84b0edd07bdf87f6c53a1fda841fd/lxml-6.0.0-cp310-cp310-win32.whl", hash = "sha256:5967fe415b1920a3877a4195e9a2b779249630ee49ece22021c690320ff07452", size = 3613840 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/af/874fb368dd0c663c030acb92612341005e52e281a102b72a4c96f42942e1/lxml-6.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f3389924581d9a770c6caa4df4e74b606180869043b9073e2cec324bad6e306e", size = 3993584 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/f4/d296bc22c17d5607653008f6dd7b46afdfda12efd31021705b507df652bb/lxml-6.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:522fe7abb41309e9543b0d9b8b434f2b630c5fdaf6482bee642b34c8c70079c8", size = 3681400 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/7c/23/828d4cc7da96c611ec0ce6147bbcea2fdbde023dc995a165afa512399bbf/lxml-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4ee56288d0df919e4aac43b539dd0e34bb55d6a12a6562038e8d6f3ed07f9e36", size = 8438217 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/33/5ac521212c5bcb097d573145d54b2b4a3c9766cda88af5a0e91f66037c6e/lxml-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8dd6dd0e9c1992613ccda2bcb74fc9d49159dbe0f0ca4753f37527749885c25", size = 4590317 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/2e/45b7ca8bee304c07f54933c37afe7dd4d39ff61ba2757f519dcc71bc5d44/lxml-6.0.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:d7ae472f74afcc47320238b5dbfd363aba111a525943c8a34a1b657c6be934c3", size = 5221628 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/23/526d19f7eb2b85da1f62cffb2556f647b049ebe2a5aa8d4d41b1fb2c7d36/lxml-6.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5592401cdf3dc682194727c1ddaa8aa0f3ddc57ca64fd03226a430b955eab6f6", size = 4949429 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/cc/f6be27a5c656a43a5344e064d9ae004d4dcb1d3c9d4f323c8189ddfe4d13/lxml-6.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58ffd35bd5425c3c3b9692d078bf7ab851441434531a7e517c4984d5634cd65b", size = 5087909 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/e6/8ec91b5bfbe6972458bc105aeb42088e50e4b23777170404aab5dfb0c62d/lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f720a14aa102a38907c6d5030e3d66b3b680c3e6f6bc95473931ea3c00c59967", size = 5031713 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/33/cf/05e78e613840a40e5be3e40d892c48ad3e475804db23d4bad751b8cadb9b/lxml-6.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:c2a5e8d207311a0170aca0eb6b160af91adc29ec121832e4ac151a57743a1e1e", size = 5232417 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/8c/6b306b3e35c59d5f0b32e3b9b6b3b0739b32c0dc42a295415ba111e76495/lxml-6.0.0-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:2dd1cc3ea7e60bfb31ff32cafe07e24839df573a5e7c2d33304082a5019bcd58", size = 4681443 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/43/0bd96bece5f7eea14b7220476835a60d2b27f8e9ca99c175f37c085cb154/lxml-6.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cfcf84f1defed7e5798ef4f88aa25fcc52d279be731ce904789aa7ccfb7e8d2", size = 5074542 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/3d/32103036287a8ca012d8518071f8852c68f2b3bfe048cef2a0202eb05910/lxml-6.0.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a52a4704811e2623b0324a18d41ad4b9fabf43ce5ff99b14e40a520e2190c851", size = 4729471 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/a8/7be5d17df12d637d81854bd8648cd329f29640a61e9a72a3f77add4a311b/lxml-6.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c16304bba98f48a28ae10e32a8e75c349dd742c45156f297e16eeb1ba9287a1f", size = 5256285 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cd/d0/6cb96174c25e0d749932557c8d51d60c6e292c877b46fae616afa23ed31a/lxml-6.0.0-cp311-cp311-win32.whl", hash = "sha256:f8d19565ae3eb956d84da3ef367aa7def14a2735d05bd275cd54c0301f0d0d6c", size = 3612004 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/77/6ad43b165dfc6dead001410adeb45e88597b25185f4479b7ca3b16a5808f/lxml-6.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b2d71cdefda9424adff9a3607ba5bbfc60ee972d73c21c7e3c19e71037574816", size = 4003470 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a0/bc/4c50ec0eb14f932a18efc34fc86ee936a66c0eb5f2fe065744a2da8a68b2/lxml-6.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:8a2e76efbf8772add72d002d67a4c3d0958638696f541734304c7f28217a9cab", size = 3682477 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/89/c3/d01d735c298d7e0ddcedf6f028bf556577e5ab4f4da45175ecd909c79378/lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78718d8454a6e928470d511bf8ac93f469283a45c354995f7d19e77292f26108", size = 8429515 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/06/37/0e3eae3043d366b73da55a86274a590bae76dc45aa004b7042e6f97803b1/lxml-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:84ef591495ffd3f9dcabffd6391db7bb70d7230b5c35ef5148354a134f56f2be", size = 4601387 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/28/e1a9a881e6d6e29dda13d633885d13acb0058f65e95da67841c8dd02b4a8/lxml-6.0.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2930aa001a3776c3e2601cb8e0a15d21b8270528d89cc308be4843ade546b9ab", size = 5228928 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/55/2cb24ea48aa30c99f805921c1c7860c1f45c0e811e44ee4e6a155668de06/lxml-6.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563", size = 4952289 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/c0/b25d9528df296b9a3306ba21ff982fc5b698c45ab78b94d18c2d6ae71fd9/lxml-6.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7", size = 5111310 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/af/681a8b3e4f668bea6e6514cbcb297beb6de2b641e70f09d3d78655f4f44c/lxml-6.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7", size = 5025457 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/b6/3a7971aa05b7be7dfebc7ab57262ec527775c2c3c5b2f43675cac0458cad/lxml-6.0.0-cp312-cp312-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991", size = 5657016 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/69/f8/693b1a10a891197143c0673fcce5b75fc69132afa81a36e4568c12c8faba/lxml-6.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da", size = 5257565 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/96/e08ff98f2c6426c98c8964513c5dab8d6eb81dadcd0af6f0c538ada78d33/lxml-6.0.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e", size = 4713390 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/83/6184aba6cc94d7413959f6f8f54807dc318fdcd4985c347fe3ea6937f772/lxml-6.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741", size = 5066103 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/01/8bf1f4035852d0ff2e36a4d9aacdbcc57e93a6cd35a54e05fa984cdf73ab/lxml-6.0.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3", size = 4791428 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/31/c0267d03b16954a85ed6b065116b621d37f559553d9339c7dcc4943a76f1/lxml-6.0.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16", size = 5678523 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5c/f7/5495829a864bc5f8b0798d2b52a807c89966523140f3d6fa3a58ab6720ea/lxml-6.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0", size = 5281290 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/79/56/6b8edb79d9ed294ccc4e881f4db1023af56ba451909b9ce79f2a2cd7c532/lxml-6.0.0-cp312-cp312-win32.whl", hash = "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a", size = 3613495 
}, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/1e/cc32034b40ad6af80b6fd9b66301fc0f180f300002e5c3eb5a6110a93317/lxml-6.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3", size = 4014711 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/55/10/dc8e5290ae4c94bdc1a4c55865be7e1f31dfd857a88b21cbba68b5fea61b/lxml-6.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:8cb26f51c82d77483cdcd2b4a53cda55bbee29b3c2f3ddeb47182a2a9064e4eb", size = 3674431 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/79/21/6e7c060822a3c954ff085e5e1b94b4a25757c06529eac91e550f3f5cd8b8/lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6da7cd4f405fd7db56e51e96bff0865b9853ae70df0e6720624049da76bde2da", size = 8414372 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/f6/051b1607a459db670fc3a244fa4f06f101a8adf86cda263d1a56b3a4f9d5/lxml-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b34339898bb556a2351a1830f88f751679f343eabf9cf05841c95b165152c9e7", size = 4593940 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/74/dd595d92a40bda3c687d70d4487b2c7eff93fd63b568acd64fedd2ba00fe/lxml-6.0.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:51a5e4c61a4541bd1cd3ba74766d0c9b6c12d6a1a4964ef60026832aac8e79b3", size = 5214329 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/46/3572761efc1bd45fcafb44a63b3b0feeb5b3f0066886821e94b0254f9253/lxml-6.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81", size = 4947559 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/8a/5e40de920e67c4f2eef9151097deb9b52d86c95762d8ee238134aff2125d/lxml-6.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1", size = 5102143 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/7c/4b/20555bdd75d57945bdabfbc45fdb1a36a1a0ff9eae4653e951b2b79c9209/lxml-6.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24", size = 5021931 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/6e/cf03b412f3763d4ca23b25e70c96a74cfece64cec3addf1c4ec639586b13/lxml-6.0.0-cp313-cp313-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a", size = 5645469 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/dd/39c8507c16db6031f8c1ddf70ed95dbb0a6d466a40002a3522c128aba472/lxml-6.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29", size = 5247467 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/56/732d49def0631ad633844cfb2664563c830173a98d5efd9b172e89a4800d/lxml-6.0.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4", size = 4720601 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8f/7f/6b956fab95fa73462bca25d1ea7fc8274ddf68fb8e60b78d56c03b65278e/lxml-6.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca", size = 5060227 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/06/e851ac2924447e8b15a294855caf3d543424364a143c001014d22c8ca94c/lxml-6.0.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf", size = 4790637 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/06/d4/fd216f3cd6625022c25b336c7570d11f4a43adbaf0a56106d3d496f727a7/lxml-6.0.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f", size = 5662049 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/52/03/0e764ce00b95e008d76b99d432f1807f3574fb2945b496a17807a1645dbd/lxml-6.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef", size = 5272430 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5f/01/d48cc141bc47bc1644d20fe97bbd5e8afb30415ec94f146f2f76d0d9d098/lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181", size = 3612896 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/87/6456b9541d186ee7d4cb53bf1b9a0d7f3b1068532676940fdd594ac90865/lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e", size = 4013132 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/42/85b3aa8f06ca0d24962f8100f001828e1f1f1a38c954c16e71154ed7d53a/lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03", size = 3672642 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/04/a53941fb0d7c60eed08301942c70aa63650a59308d15e05eb823acbce41d/lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261", size = 8407699 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/d2/e1d4526e903afebe147f858322f1c0b36e44969d5c87e5d243c23f81987f/lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316", size = 4574678 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/aa/b0a8ee233c00f2f437dbb6e7bd2df115a996d8211b7d03f4ab029b8e3378/lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512", size = 5292694 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/53/7f/e6f377489b2ac4289418b879c34ed664e5a1174b2a91590936ec4174e773/lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f", size = 5009177 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/05/ae239e997374680741b768044545251a29abc21ada42248638dbed749a0a/lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a", size = 5163787 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/da/4f27222570d008fd2386e19d6923af6e64c317ee6116bbb2b98247f98f31/lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93", size = 5075755 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1f/65/12552caf7b3e3b9b9aba12349370dc53a36d4058e4ed482811f1d262deee/lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281", size = 5297070 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/6a/f053a8369fdf4e3b8127a6ffb079c519167e684e956a1281392c5c3679b6/lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922", size = 4779864 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/7b/b2a392ad34ce37a17d1cf3aec303e15125768061cf0e355a92d292d20d37/lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0", size = 5122039 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/0e/6459ff8ae7d87188e1f99f11691d0f32831caa6429599c3b289de9f08b21/lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a", size = 4805117 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ca/78/4186f573805ff623d28a8736788a3b29eeaf589afdcf0233de2c9bb9fc50/lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5", size = 5322300 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/97/352e07992901473529c8e19dbfdba6430ba6a37f6b46a4d0fa93321f8fee/lxml-6.0.0-cp39-cp39-win32.whl", hash = "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256", size = 3615832 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/71/93/8f3b880e2618e548fb0ca157349abb526d81cb4f01ef5ea3a0f22bd4d0df/lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae", size = 4038551 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/8a/046cbf5b262dd2858c6e65833339100fd5f1c017b37b26bc47c92d4584d7/lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402", size = 3684237 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/e1/2c22a3cff9e16e1d717014a1e6ec2bf671bf56ea8716bb64466fcf820247/lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72", size = 3898804 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/3a/d68cbcb4393a2a0a867528741fafb7ce92dac5c9f4a1680df98e5e53e8f5/lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4", size = 4216406 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/15/8f/d9bfb13dff715ee3b2a1ec2f4a021347ea3caf9aba93dea0cfe54c01969b/lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc", size = 4326455 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/01/8b/fde194529ee8a27e6f5966d7eef05fa16f0567e4a8e8abc3b855ef6b3400/lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8", size = 4268788 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/a8/3b8e2581b4f8370fc9e8dc343af4abdfadd9b9229970fc71e67bd31c7df1/lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065", size = 4411394 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/a5/899a4719e02ff4383f3f96e5d1878f882f734377f10dfb69e73b5f223e44/lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141", size = 3517946 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/93/e3/ef14f1d23aea1dec1eccbe2c07a93b6d0be693fd9d5f248a47155e436701/lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5", size = 3892325 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/8a/1410b9e1ec43f606f9aac0661d09892509d86032e229711798906e1b5e7a/lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06", size = 4210839 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/79/cb/6696ce0d1712c5ae94b18bdf225086a5fb04b23938ac4d2011b323b3860b/lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9", size = 4321235 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f3/98/04997f61d720cf320a0daee66b3096e3a3b57453e15549c14b87058c2acd/lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca", size = 4265071 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e6/86/e5f6fa80154a5f5bf2c1e89d6265892299942edeb115081ca72afe7c7199/lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734", size = 4406816 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/a6/ae69e0e6f5fb6293eb8cbfbf8a259e37d71608bbae3658a768dd26b69f3e/lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349", size = 3515499 }, +] + +[[package]] +name = "matplotlib" +version = "3.9.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "contourpy", version = "1.3.0", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "importlib-resources", marker = "python_full_version < '3.10'" }, + { name = "kiwisolver", version = "1.4.7", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "kiwisolver", version = "1.4.8", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" 
}, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/a4/a7236bf8b0137deff48737c6ccf2154ef4486e57c6a5b7c309bf515992bd/matplotlib-3.9.0.tar.gz", hash = "sha256:e6d29ea6c19e34b30fb7d88b7081f869a03014f66fe06d62cc77d5a6ea88ed7a", size = 36069890 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/03/a0/669c37c6e6737de909c19eb30d7b17d1d6be6d896aa2f5dc63e66231b7f4/matplotlib-3.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2bcee1dffaf60fe7656183ac2190bd630842ff87b3153afb3e384d966b57fe56", size = 7883911 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/1f/a0f1a692af13b85335a9d7bd226fc0cae8d0062f1fb940980bc9b38d3b5c/matplotlib-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3f988bafb0fa39d1074ddd5bacd958c853e11def40800c5824556eb630f94d3b", size = 7765903 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/3d/58182994c955ff2fc722f883e96ad9de3439d3ead668fce33ad1c3fe4242/matplotlib-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe428e191ea016bb278758c8ee82a8129c51d81d8c4bc0846c09e7e8e9057241", size = 8183679 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/68/16e7b9154fae61fb29f0f3450b39b855b89e6d2c598d67302e70f96883af/matplotlib-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaf3978060a106fab40c328778b148f590e27f6fa3cd15a19d6892575bce387d", size = 8296303 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/66/ad8d69aa13fd6e1b09fe7b91b512d07eaf175a0b0e7c4bcba87e8d2e01d6/matplotlib-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2e7f03e5cbbfacdd48c8ea394d365d91ee8f3cae7e6ec611409927b5ed997ee4", size = 8594927 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b9/55/6138ad64c789bad13d18e0240da75e73dbd364fdc0aa670fff87a5eef5ab/matplotlib-3.9.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:13beb4840317d45ffd4183a778685e215939be7b08616f431c7795276e067463", size = 7954080 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/49/569b50eb5e5a75b61f7a0bacb6029e9ea9c8a1190df55a39a31789244e09/matplotlib-3.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:063af8587fceeac13b0936c42a2b6c732c2ab1c98d38abc3337e430e1ff75e38", size = 7893678 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/b4/c1700c8b2ff8d379c187f37055e61bd7a611eb2c544466600a7734793d54/matplotlib-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9a2fa6d899e17ddca6d6526cf6e7ba677738bf2a6a9590d702c277204a7c6152", size = 7775027 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bc/9e/b09513717f60071fefcb28c7c783aa658f939f3d4ba1cefb6c05138c6657/matplotlib-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:550cdda3adbd596078cca7d13ed50b77879104e2e46392dcd7c75259d8f00e85", size = 8192694 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/41/f1/115e7c79b4506b4f0533acba742babd9718ff92eeca6d4205843173b6173/matplotlib-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76cce0f31b351e3551d1f3779420cf8f6ec0d4a8cf9c0237a3b549fd28eb4abb", size = 8307002 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/a2/5c1a64d188c4cae7368ebb8c28a354e3f262cb86b28c38ffa6ee3ad532ba/matplotlib-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c53aeb514ccbbcbab55a27f912d79ea30ab21ee0531ee2c09f13800efb272674", size = 8600548 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/c8/6936e8c7b279a5abac82f399d8d72ac25da530cf5f78a0e40063e492558c/matplotlib-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5be985db2596d761cdf0c2eaf52396f26e6a64ab46bd8cd810c48972349d1be", size = 7963606 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/43/54b7dfd91ed33da92973dc5d50231ef7b2d0622c8ae72babbad26bc1a319/matplotlib-3.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:c79f3a585f1368da6049318bdf1f85568d8d04b2e89fc24b7e02cc9b62017382", size = 7884612 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4c/88/15bbb864b0d871707294ff325f9ffd0dfa486db2637eb34dd5f8dcf5b9bf/matplotlib-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bdd1ecbe268eb3e7653e04f451635f0fb0f77f07fd070242b44c076c9106da84", size = 7769852 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/af/8ed9b852fc041fc5bd101f9964682874ccbf24f9c08323edee6a1600eb04/matplotlib-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d38e85a1a6d732f645f1403ce5e6727fd9418cd4574521d5803d3d94911038e5", size = 8185646 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/ff/da311c1e679eed54d3aed67754a4e859bd3b773060c2fa187962e60fcb85/matplotlib-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a490715b3b9984fa609116481b22178348c1a220a4499cda79132000a79b4db", size = 8298411 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/db/8c/1014baa6776503914865d87e1e8a803ee9faa7b722ca5e655463b79c966e/matplotlib-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8146ce83cbc5dc71c223a74a1996d446cd35cfb6a04b683e1446b7e6c73603b7", size = 8591196 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/91/febbb6c1063ae05a62fdbe038c2917b348b1b35f0482cee4738e6870a44a/matplotlib-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:d91a4ffc587bacf5c4ce4ecfe4bcd23a4b675e76315f2866e588686cc97fccdf", size = 7968581 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/17/7fae59bf7c5ff97abaea6baad2d21cc3f68aed2c82323b0cdaac743959d5/matplotlib-3.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:616fabf4981a3b3c5a15cd95eba359c8489c4e20e03717aea42866d8d0465956", size = 7884763 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/2b/3b82a88735fb2fff513990963ce288f67b78b08c9ec528210387fb3a2757/matplotlib-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:cd53c79fd02f1c1808d2cfc87dd3cf4dbc63c5244a58ee7944497107469c8d8a", size = 7766727 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8a/de/ed28038ba354617c442a8b09c1fc4848ac50460747577c4ebfb5ef71de53/matplotlib-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06a478f0d67636554fa78558cfbcd7b9dba85b51f5c3b5a0c9be49010cf5f321", size = 8185424 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/6d/45837c5b3d0005a5a9b04729b218a16bf3aa195701c6b33b2cc39ae943b6/matplotlib-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81c40af649d19c85f8073e25e5806926986806fa6d54be506fbf02aef47d5a89", size = 8298763 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/96/92/7a534d63958f6ec837857b112b50ac29996e60f39d4b55ae39c6e64c8a1a/matplotlib-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52146fc3bd7813cc784562cb93a15788be0b2875c4655e2cc6ea646bfa30344b", size = 8597376 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/58/ea/7f029057338138643d745fd4a88d745f8ec810ef81652cc43103324ec549/matplotlib-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:0fc51eaa5262553868461c083d9adadb11a6017315f3a757fc45ec6ec5f02888", size = 7949603 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/af/824734d3b5381688da89748c0ad71a3c414bf5322f55ccbb049197ce5d19/matplotlib-3.9.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bd4f2831168afac55b881db82a7730992aa41c4f007f1913465fb182d6fb20c0", size = 7873642 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/5f/2bb0cd3a28f1d4ede70d3d47ded36dcf0d0cbe012bcafc4cd8053eb53d1c/matplotlib-3.9.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:290d304e59be2b33ef5c2d768d0237f5bd132986bdcc66f80bc9bcc300066a03", size = 7757288 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/62/ce7c20b5bbf49bbbd679400d8c38a17d40f0eaaece364f7e602fe8112d75/matplotlib-3.9.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7ff2e239c26be4f24bfa45860c20ffccd118d270c5b5d081fa4ea409b5469fcd", size = 8298454 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/d4/668a809e3e12cb20fc73f34f7fd886a314e512073484bea48fa3480687ad/matplotlib-3.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:af4001b7cae70f7eaacfb063db605280058246de590fa7874f00f62259f2df7e", size = 7974664 }, +] + +[[package]] +name = "mediacrawler" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "aiofiles" }, + { name = "aiomysql" }, + { name = "aiosqlite" }, + { name = "fastapi" }, + { name = "httpx" }, + { name = "jieba" }, + { name = "matplotlib" }, + { name = "opencv-python" }, + { name = "pandas" }, + { name = "parsel" }, + { name = "pillow" }, + { name = "playwright" }, + { name = "pydantic" }, + { name = "pyexecjs" }, + { name = "python-dotenv" }, + { name = "redis" }, + { name = "requests" }, + { name = "tenacity" }, + { name = "uvicorn" }, + { name = "wordcloud" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiofiles", specifier = "~=23.2.1" }, + { name = "aiomysql", specifier = "==0.2.0" }, + { name = "aiosqlite", specifier = ">=0.21.0" }, + { name = "fastapi", specifier = "==0.110.2" }, + { name = "httpx", specifier = "==0.28.1" }, + { name = "jieba", specifier = "==0.42.1" }, + { name = "matplotlib", specifier = "==3.9.0" }, + { name = "opencv-python", specifier = ">=4.11.0.86" }, + { name = "pandas", specifier = "==2.2.3" }, + { name = "parsel", specifier = "==1.9.1" }, + { name = "pillow", specifier = "==9.5.0" }, + { name = "playwright", specifier = "==1.45.0" }, + { name = "pydantic", specifier = "==2.5.2" }, + { name = "pyexecjs", specifier = "==1.5.1" }, + { name = "python-dotenv", specifier = "==1.0.1" }, + { name = "redis", specifier = "~=4.6.0" }, + { name = "requests", specifier = "==2.32.3" }, + { name = "tenacity", specifier = "==8.2.2" }, + { name = "uvicorn", specifier = "==0.29.0" }, + { name = "wordcloud", specifier = "==1.9.3" }, +] + 
+[[package]] +name = "numpy" +version = "2.0.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version < '3.10' and platform_machine == 'arm64' and sys_platform == 'darwin'", + "python_full_version < '3.10' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.10' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784 }, +] + +[[package]] +name = "numpy" +version = "2.2.6" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version == '3.10.*' and sys_platform == 'darwin'", + "python_full_version == '3.10.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.10.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666 }, +] + +[[package]] +name = "numpy" +version = "2.3.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine 
!= 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version == '3.11.*' and sys_platform == 'darwin'", + "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/19/d7c972dfe90a353dbd3efbbe1d14a5951de80c99c9dc1b93cd998d51dc0f/numpy-2.3.1.tar.gz", hash = "sha256:1ec9ae20a4226da374362cca3c62cd753faf2f951440b0e3b98e93c235441d2b", size = 20390372 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/c7/87c64d7ab426156530676000c94784ef55676df2f13b2796f97722464124/numpy-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ea9e48336a402551f52cd8f593343699003d2353daa4b72ce8d34f66b722070", size = 21199346 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/58/0e/0966c2f44beeac12af8d836e5b5f826a407cf34c45cb73ddcdfce9f5960b/numpy-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ccb7336eaf0e77c1635b232c141846493a588ec9ea777a7c24d7166bb8533ae", size = 14361143 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7d/31/6e35a247acb1bfc19226791dfc7d4c30002cd4e620e11e58b0ddf836fe52/numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0bb3a4a61e1d327e035275d2a993c96fa786e4913aa089843e6a2d9dd205c66a", size = 5378989 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/25/93b621219bb6f5a2d4e713a824522c69ab1f06a57cd571cda70e2e31af44/numpy-2.3.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:e344eb79dab01f1e838ebb67aab09965fb271d6da6b00adda26328ac27d4a66e", size = 6912890 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ef/60/6b06ed98d11fb32e27fb59468b42383f3877146d3ee639f733776b6ac596/numpy-2.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:467db865b392168ceb1ef1ffa6f5a86e62468c43e0cfb4ab6da667ede10e58db", size = 14569032 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/c9/9bec03675192077467a9c7c2bdd1f2e922bd01d3a69b15c3a0fdcd8548f6/numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:afed2ce4a84f6b0fc6c1ce734ff368cbf5a5e24e8954a338f3bdffa0718adffb", size = 16930354 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/e2/5756a00cabcf50a3f527a0c968b2b4881c62b1379223931853114fa04cda/numpy-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0025048b3c1557a20bc80d06fdeb8cc7fc193721484cca82b2cfa072fec71a93", size = 15879605 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/86/a471f65f0a86f1ca62dcc90b9fa46174dd48f50214e5446bc16a775646c5/numpy-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5ee121b60aa509679b682819c602579e1df14a5b07fe95671c8849aad8f2115", size = 18666994 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/43/a6/482a53e469b32be6500aaf61cfafd1de7a0b0d484babf679209c3298852e/numpy-2.3.1-cp311-cp311-win32.whl", hash = "sha256:a8b740f5579ae4585831b3cf0e3b0425c667274f82a484866d2adf9570539369", size = 6603672 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/fb/bb613f4122c310a13ec67585c70e14b03bfc7ebabd24f4d5138b97371d7c/numpy-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:d4580adadc53311b163444f877e0789f1c8861e2698f6b2a4ca852fda154f3ff", size = 13024015 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/51/58/2d842825af9a0c041aca246dc92eb725e1bc5e1c9ac89712625db0c4e11c/numpy-2.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:ec0bdafa906f95adc9a0c6f26a4871fa753f25caaa0e032578a30457bff0af6a", size = 10456989 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/c6/56/71ad5022e2f63cfe0ca93559403d0edef14aea70a841d640bd13cdba578e/numpy-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2959d8f268f3d8ee402b04a9ec4bb7604555aeacf78b360dc4ec27f1d508177d", size = 20896664 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/25/65/2db52ba049813670f7f987cc5db6dac9be7cd95e923cc6832b3d32d87cef/numpy-2.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:762e0c0c6b56bdedfef9a8e1d4538556438288c4276901ea008ae44091954e29", size = 14131078 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/dd/28fa3c17b0e751047ac928c1e1b6990238faad76e9b147e585b573d9d1bd/numpy-2.3.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:867ef172a0976aaa1f1d1b63cf2090de8b636a7674607d514505fb7276ab08fc", size = 5112554 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c9/fc/84ea0cba8e760c4644b708b6819d91784c290288c27aca916115e3311d17/numpy-2.3.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:4e602e1b8682c2b833af89ba641ad4176053aaa50f5cacda1a27004352dde943", size = 6646560 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/b2/512b0c2ddec985ad1e496b0bd853eeb572315c0f07cd6997473ced8f15e2/numpy-2.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8e333040d069eba1652fb08962ec5b76af7f2c7bce1df7e1418c8055cf776f25", size = 14260638 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/45/c51cb248e679a6c6ab14b7a8e3ead3f4a3fe7425fc7a6f98b3f147bec532/numpy-2.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e7cbf5a5eafd8d230a3ce356d892512185230e4781a361229bd902ff403bc660", size = 16632729 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/ff/feb4be2e5c09a3da161b412019caf47183099cbea1132fd98061808c2df2/numpy-2.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5f1b8f26d1086835f442286c1d9b64bb3974b0b1e41bb105358fd07d20872952", size = 15565330 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/bc/6d/ceafe87587101e9ab0d370e4f6e5f3f3a85b9a697f2318738e5e7e176ce3/numpy-2.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ee8340cb48c9b7a5899d1149eece41ca535513a9698098edbade2a8e7a84da77", size = 18361734 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/19/0fb49a3ea088be691f040c9bf1817e4669a339d6e98579f91859b902c636/numpy-2.3.1-cp312-cp312-win32.whl", hash = "sha256:e772dda20a6002ef7061713dc1e2585bc1b534e7909b2030b5a46dae8ff077ab", size = 6320411 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b1/3e/e28f4c1dd9e042eb57a3eb652f200225e311b608632bc727ae378623d4f8/numpy-2.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:cfecc7822543abdea6de08758091da655ea2210b8ffa1faf116b940693d3df76", size = 12734973 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/a8/8a5e9079dc722acf53522b8f8842e79541ea81835e9b5483388701421073/numpy-2.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:7be91b2239af2658653c5bb6f1b8bccafaf08226a258caf78ce44710a0160d30", size = 10191491 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/bd/35ad97006d8abff8631293f8ea6adf07b0108ce6fec68da3c3fcca1197f2/numpy-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25a1992b0a3fdcdaec9f552ef10d8103186f5397ab45e2d25f8ac51b1a6b97e8", size = 20889381 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/4f/df5923874d8095b6062495b39729178eef4a922119cee32a12ee1bd4664c/numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7dea630156d39b02a63c18f508f85010230409db5b2927ba59c8ba4ab3e8272e", size = 14152726 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/0f/a1f269b125806212a876f7efb049b06c6f8772cf0121139f97774cd95626/numpy-2.3.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:bada6058dd886061f10ea15f230ccf7dfff40572e99fef440a4a857c8728c9c0", size = 5105145 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/6d/63/a7f7fd5f375b0361682f6ffbf686787e82b7bbd561268e4f30afad2bb3c0/numpy-2.3.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:a894f3816eb17b29e4783e5873f92faf55b710c2519e5c351767c51f79d8526d", size = 6639409 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/0d/1854a4121af895aab383f4aa233748f1df4671ef331d898e32426756a8a6/numpy-2.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:18703df6c4a4fee55fd3d6e5a253d01c5d33a295409b03fda0c86b3ca2ff41a1", size = 14257630 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/30/af1b277b443f2fb08acf1c55ce9d68ee540043f158630d62cef012750f9f/numpy-2.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5902660491bd7a48b2ec16c23ccb9124b8abfd9583c5fdfa123fe6b421e03de1", size = 16627546 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/ec/3b68220c277e463095342d254c61be8144c31208db18d3fd8ef02712bcd6/numpy-2.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:36890eb9e9d2081137bd78d29050ba63b8dab95dff7912eadf1185e80074b2a0", size = 15562538 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/2b/4014f2bcc4404484021c74d4c5ee8eb3de7e3f7ac75f06672f8dcf85140a/numpy-2.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a780033466159c2270531e2b8ac063704592a0bc62ec4a1b991c7c40705eb0e8", size = 18360327 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/8d/2ddd6c9b30fcf920837b8672f6c65590c7d92e43084c25fc65edc22e93ca/numpy-2.3.1-cp313-cp313-win32.whl", hash = "sha256:39bff12c076812595c3a306f22bfe49919c5513aa1e0e70fac756a0be7c2a2b8", size = 6312330 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/c8/beaba449925988d415efccb45bf977ff8327a02f655090627318f6398c7b/numpy-2.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d5ee6eec45f08ce507a6570e06f2f879b374a552087a4179ea7838edbcbfa42", size = 12731565 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/0b/c3/5c0c575d7ec78c1126998071f58facfc124006635da75b090805e642c62e/numpy-2.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:0c4d9e0a8368db90f93bd192bfa771ace63137c3488d198ee21dfb8e7771916e", size = 10190262 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/19/a029cd335cf72f79d2644dcfc22d90f09caa86265cbbde3b5702ccef6890/numpy-2.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b0b5397374f32ec0649dd98c652a1798192042e715df918c20672c62fb52d4b8", size = 20987593 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/25/91/8ea8894406209107d9ce19b66314194675d31761fe2cb3c84fe2eeae2f37/numpy-2.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c5bdf2015ccfcee8253fb8be695516ac4457c743473a43290fd36eba6a1777eb", size = 14300523 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/7f/06187b0066eefc9e7ce77d5f2ddb4e314a55220ad62dd0bfc9f2c44bac14/numpy-2.3.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d70f20df7f08b90a2062c1f07737dd340adccf2068d0f1b9b3d56e2038979fee", size = 5227993 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/ec/a926c293c605fa75e9cfb09f1e4840098ed46d2edaa6e2152ee35dc01ed3/numpy-2.3.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:2fb86b7e58f9ac50e1e9dd1290154107e47d1eef23a0ae9145ded06ea606f992", size = 6736652 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/62/d68e52fb6fde5586650d4c0ce0b05ff3a48ad4df4ffd1b8866479d1d671d/numpy-2.3.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:23ab05b2d241f76cb883ce8b9a93a680752fbfcbd51c50eff0b88b979e471d8c", size = 14331561 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/ec/b74d3f2430960044bdad6900d9f5edc2dc0fb8bf5a0be0f65287bf2cbe27/numpy-2.3.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ce2ce9e5de4703a673e705183f64fd5da5bf36e7beddcb63a25ee2286e71ca48", size = 16693349 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/0d/15/def96774b9d7eb198ddadfcbd20281b20ebb510580419197e225f5c55c3e/numpy-2.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c4913079974eeb5c16ccfd2b1f09354b8fed7e0d6f2cab933104a09a6419b1ee", size = 15642053 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/57/c3203974762a759540c6ae71d0ea2341c1fa41d84e4971a8e76d7141678a/numpy-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:010ce9b4f00d5c036053ca684c77441f2f2c934fd23bee058b4d6f196efd8280", size = 18434184 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/8a/ccdf201457ed8ac6245187850aff4ca56a79edbea4829f4e9f14d46fa9a5/numpy-2.3.1-cp313-cp313t-win32.whl", hash = "sha256:6269b9edfe32912584ec496d91b00b6d34282ca1d07eb10e82dfc780907d6c2e", size = 6440678 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/7e/7f431d8bd8eb7e03d79294aed238b1b0b174b3148570d03a8a8a8f6a0da9/numpy-2.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2a809637460e88a113e186e87f228d74ae2852a2e0c44de275263376f17b5bdc", size = 12870697 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/ca/af82bf0fad4c3e573c6930ed743b5308492ff19917c7caaf2f9b6f9e2e98/numpy-2.3.1-cp313-cp313t-win_arm64.whl", hash = "sha256:eccb9a159db9aed60800187bc47a6d3451553f0e1b08b068d8b277ddfbb9b244", size = 10260376 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/34/facc13b9b42ddca30498fc51f7f73c3d0f2be179943a4b4da8686e259740/numpy-2.3.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ad506d4b09e684394c42c966ec1527f6ebc25da7f4da4b1b056606ffe446b8a3", size = 21070637 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/65/b6/41b705d9dbae04649b529fc9bd3387664c3281c7cd78b404a4efe73dcc45/numpy-2.3.1-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:ebb8603d45bc86bbd5edb0d63e52c5fd9e7945d3a503b77e486bd88dde67a19b", size = 5304087 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/7a/b4/fe3ac1902bff7a4934a22d49e1c9d71a623204d654d4cc43c6e8fe337fcb/numpy-2.3.1-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:15aa4c392ac396e2ad3d0a2680c0f0dee420f9fed14eef09bdb9450ee6dcb7b7", size = 6817588 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/ee/89bedf69c36ace1ac8f59e97811c1f5031e179a37e4821c3a230bf750142/numpy-2.3.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c6e0bf9d1a2f50d2b65a7cf56db37c095af17b59f6c132396f7c6d5dd76484df", size = 14399010 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/15/08/e00e7070ede29b2b176165eba18d6f9784d5349be3c0c1218338e79c27fd/numpy-2.3.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:eabd7e8740d494ce2b4ea0ff05afa1b7b291e978c0ae075487c51e8bd93c0c68", size = 16752042 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/48/6b/1c6b515a83d5564b1698a61efa245727c8feecf308f4091f565988519d20/numpy-2.3.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e610832418a2bc09d974cc9fecebfa51e9532d6190223bc5ef6a7402ebf3b5cb", size = 12927246 }, +] + +[[package]] +name = "opencv-python" +version = "4.11.0.86" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956 } +wheels = [ + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/05/4d/53b30a2a3ac1f75f65a59eb29cf2ee7207ce64867db47036ad61743d5a23/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a", size = 37326322 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/84/0a67490741867eacdfa37bc18df96e08a9d579583b419010d7f3da8ff503/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66", size = 56723197 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f3/bd/29c126788da65c1fb2b5fb621b7fed0ed5f9122aa22a0868c5e2c15c6d23/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202", size = 42230439 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/8b/90eb44a40476fa0e71e05a0283947cfd74a5d36121a11d926ad6f3193cc4/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d", size = 62986597 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044 }, +] + +[[package]] +name = "packaging" +version = "25.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = 
"sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 }, +] + +[[package]] +name = "pandas" +version = "2.2.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 11618011 }, +] + +[[package]] +name = "parsel" +version = "1.9.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "cssselect" }, + { name = "jmespath" }, + { name = "lxml" }, + { name = "packaging" }, + { name = "w3lib" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/87/bd/b982085f091367ca25ccb61f2d127655a0daac1716ecfde014ab7c538116/parsel-1.9.1.tar.gz", hash = "sha256:14e00dc07731c9030db620c195fcae884b5b4848e9f9c523c6119f708ccfa9ac", size = 51225 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/7e/e3f1a7ff69303a4e08a8742a285406e5786650d8218ff194743eff292a1e/parsel-1.9.1-py2.py3-none-any.whl", hash = "sha256:c4a777ee6c3ff5e39652b58e351c5cf02c12ff420d05b07a7966aebb68ab1700", size = 17116 }, +] + +[[package]] +name = "pillow" +version = "9.5.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { 
url = "https://pypi.tuna.tsinghua.edu.cn/packages/00/d5/4903f310765e0ff2b8e91ffe55031ac6af77d982f0156061e20a4d1a8b2d/Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1", size = 50488147 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1b/bc/cff591742feea45f88a3b8a83f7cab4a1dcdb4bcdfc51a06d92f96c81165/Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16", size = 3395758 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/38/06/de304914ecd2c911939a28579546bd4d9b6ae0b3c07ce5fe9bd7d100eb34/Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa", size = 3077111 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/57/7864b6a22acb5f1d4b70af8c92cbd5e3af25f4d5869c24cd8074ca1f3593/Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38", size = 3112529 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/62/88/46a35f690ee4f8b08aef5fdb47f63d29c34f6874834155e52bf4456d9566/Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062", size = 3386670 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/1d/26a56ed1deae695a8c7d13fb514284ba8b9fd62bab9ebe6d6b474523b8b0/Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e", size = 3308572 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/36/d22b0fac821a14572fdb9a8015b2bf19ee81eaa560ea25a6772760c86a30/Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5", size = 3163999 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/25/6b/d3c35d207c9c0b6c2f855420f62e64ef43d348e8c797ad1c32b9f2106a19/Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d", size = 3415623 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/6a/a7df39c502caeadd942d8bf97bc2fdfc819fbdc7499a2ab05e7db43611ac/Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903", size = 3350658 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/ad/d29c8c48498da680521665b8483beb78a9343269bbd0730970e9396b01f0/Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a", size = 3414574 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/93/54/9d7f01fd3fe4069c88827728646e3c8f1aff0995e8422d841b38f034f39a/Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44", size = 2211916 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/14/0030e542f2acfea43635e55584c114e6cfd94d342393a5f71f74c172dc35/Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb", size = 2511474 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/a8/3c2d737d856eb9cd8c18e78f6fe0ed08a2805bded74cbb0455584859023b/Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32", size = 3395792 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/15/310cde63cb15a091de889ded26281924cf9cfa5c000b36b06bd0c7f50261/Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c", size = 3077092 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/17/66/20db69c0361902a2f6ee2086d3e83c70133e3fb4cb31470e59a8ed37184e/Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3", size = 3112543 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5c/a8/ff526cdec6b56eb20c992e7083f02c8065049ed1e62fbc159390d7a3dd5e/Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a", size = 3386654 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/70/e9a45a2e9c58c23e023fcda5af9686f5b42c718cc9bc86194e0025cf0ec5/Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1", size = 3308566 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/a5/ee306d6cc53c9a30c23ba2313b43b67fdf76c611ca5afd0cdd62922cbd3e/Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99", size = 3164027 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/59/e6bd2c3715ace343d9739276ceed79657fe116923238d102cf731ab463dd/Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625", size = 3415610 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/6d/9beb596ba5a5e61081c843187bcdbb42a5c9a9ef552751b554894247da7a/Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579", size = 3350704 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1e/e4/de633d85be3b3c770c554a37a89e8273069bd19c34b15a419c2795600310/Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296", size = 3414604 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/46/a0/e410f655300932308e70e883dd60c0c51e6f74bed138641ea9193e64fd7c/Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec", size = 2211929 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0c/02/7729c8aecbc525b560c7eb283ffa34c6f5a6d0ed6d1339570c65a3e63088/Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4", size = 2511551 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b9/8b/d38cc68796be4ac238db327682a1acfbc5deccf64a150aa44ee1efbaafae/Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089", size = 2489206 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/38/b7bcbab3bfe1946ba9cf71c1fa03e541b498069457be49eadcdc229412ef/Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb", size = 2211914 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/8a/f4cf3f32bc554f9260b645ea1151449ac13525796d3d1a42076d75945d8d/Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b", size = 2511483 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/46/672289c0ff87733fb93854dedf3a8d65642a25c0bfc88e7f6d722f9161a5/Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66", size = 3395750 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/70/9259e93534d01f846f7d0501f19bb7d8cc1751741bc20826fc8d3a20fe32/Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e", size = 3077133 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/95/62/8a943681db5f6588498ed86aa1568dd31c63f6afdabe50841589fc662c68/Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115", size = 3112534 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/43/0892913d499c8df2c88dee69d59e77de19e0c51754a9be82023880641c09/Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3", size = 3386725 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/fc/48a51c0fe2a00d5def57b9981a1e0f8339b516351da7a51500383d833bc8/Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef", size = 3308605 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/b7/f9faf80e3c93b02712c5748f10c75a8948e74eca61ec2408f7e1d4c9dd16/Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705", size = 3164057 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/2b/57915b8af178e2c20bfd403ffed4521947881f9dbbfbaba48210dc59b9d7/Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1", size = 3415613 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/2a/f3ed578595f8486ee2cc07434460097d89aedd406a3db849b890ca8ec416/Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a", size = 3350667 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/a2/f2d0d584d45100a5419fd70a1233ade8f12469ffe6e8e3acd40364beaadb/Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865", size = 3414552 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/51/3a/a6701b987007aaa43559b7d8510629845b25686f09a0eb29f8946a62d767/Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964", size = 2229361 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/69/72/48cc52bff8731cf72bc4101e34dc44807a410c171f921afb582a511da50e/Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d", size = 2538580 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/35/92032a00f41bea9bf93f19d48f15daac27d1365c0038fe22dc4e7fc7c8b0/Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572", size = 3349772 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/ce/d39869c22904558ce32e664904cf72f13a9d47703b72392e881d9e7b6082/Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe", size = 3281583 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/75/4a382d1567efc6f4e3054f693167f8ce2d1ad939c5f6f12aa5c50f74b997/Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1", size = 3222603 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/51/d2/c10f72c44e000d08e41f822083cf322bb59afa7ed01ae7e3e47875b47600/Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7", size = 3298174 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/02/4a/d362f7f44f1e5801c6726f0eaaeaf869d0d43c554b717072b2c5540cefb4/Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799", size = 2538628 }, +] + +[[package]] +name = "playwright" +version = "1.45.0" +source = { 
registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/ee/5ffdf7557b601a00fb4e0cc29b140cefc584e8d394ea21cc3a4eb05b4d45/playwright-1.45.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:7d49aee5907d8e72060f04bc299cb6851c2dc44cb227540ade89d7aa529e907a", size = 34747285 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/4e/d7df2eb27e2f228ec5fb2be453d3c780001ab9b3bd0f7ec5b6568adfe46e/playwright-1.45.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:210c9f848820f58b5b5ed48047748620b780ca3acc3e2b7560dafb2bfdd6d90a", size = 33064579 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/4f/2f364f0586d1c0e0b82ee1118505fc5a495f2b385e34575967b8a5fc8302/playwright-1.45.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:13b5398831f5499580e819ddc996633446a93bf88029e89451e51da188e16ae3", size = 34747281 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/3e/d68323360efc63c9565f8f9bc84d1074bb2d4726ec5bb45bd8c9fb72fe1b/playwright-1.45.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:0ba5a39f25fb9b9cf1bd48678f44536a29f6d83376329de2dee1567dac220afe", size = 37861759 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/b8/329494128bc9c90ad4a705e11e33bc1d90e79faf90ee27673c568ad0c8ba/playwright-1.45.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b09fa76614ba2926d45a4c0581f710c13652d5e32290ba6a1490fbafff7f0be8", size = 37605929 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a5/73/cde84614e51de244fe0cd1d0be7edb4cdedee7d7306f9646468807a22b6b/playwright-1.45.0-py3-none-win32.whl", hash = "sha256:97a7d53af89af54208b69c051046b462675fcf5b93f7fbfb7c0fa7f813424ee2", size = 29692683 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/87/0f/c8dcadb2f0dcfdab6052d5ecf57ccf19b439c0adc29fc510ed0830349345/playwright-1.45.0-py3-none-win_amd64.whl", hash = 
"sha256:701db496928429aec103739e48e3110806bd5cf49456cc95b89f28e1abda71da", size = 29692683 }, +] + +[[package]] +name = "pydantic" +version = "2.5.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/41/3c8108f79fb7da2d2b17f35744232af4ffcd9e764ebe1e3fd4b26669b325/pydantic-2.5.2.tar.gz", hash = "sha256:ff177ba64c6faf73d7afa2e8cad38fd456c0dbe01c9954e71038001cd15a6edd", size = 652158 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0a/2b/64066de1c4cf3d4ed623beeb3bbf3f8d0cc26661f1e7d180ec5eb66b75a5/pydantic-2.5.2-py3-none-any.whl", hash = "sha256:80c50fb8e3dcecfddae1adbcc00ec5822918490c99ab31f6cf6140ca1c1429f0", size = 381874 }, +] + +[[package]] +name = "pydantic-core" +version = "2.14.5" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/26/cffb93fe9c6b5a91c497f37fae14a4b073ecbc47fc36a9979c7aa888b245/pydantic_core-2.14.5.tar.gz", hash = "sha256:6d30226dfc816dd0fdf120cae611dd2215117e4f9b124af8c60ab9093b6e8e71", size = 360131 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c0/d2/b31c030802f29c35fa0c8ab92891bee9dcedd2793df560041b6d38f5fd49/pydantic_core-2.14.5-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:7e88f5696153dc516ba6e79f82cc4747e87027205f0e02390c21f7cb3bd8abfd", size = 1861818 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/9b/5246600a17467ad8071174250d7727b34f5dc0dfe74abf3e99dbdf1beee1/pydantic_core-2.14.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4641e8ad4efb697f38a9b64ca0523b557c7931c5f84e0fd377a9a3b05121f0de", size = 1735406 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/05/7b/9083133f247b9f712f5718c66b3e39194ea679fbe85567bf4dc9d08557bb/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:774de879d212db5ce02dfbf5b0da9a0ea386aeba12b0b95674a4ce0593df3d07", size = 1829696 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7d/de/df454233c7960a899846f037209204df1d8ab761bb81a7561abb4daf2288/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ebb4e035e28f49b6f1a7032920bb9a0c064aedbbabe52c543343d39341a5b2a3", size = 1854038 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/53/d4ae1f5273cbc83d5a4c158916a9235c1bfc8194be63958b4b5ff11bf838/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b53e9ad053cd064f7e473a5f29b37fc4cc9dc6d35f341e6afc0155ea257fc911", size = 2006079 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/ef/4fd3b40a82ea729a2566575aeec119449b0bf1b4c13d9255e8ac2a40a58b/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aa1768c151cf562a9992462239dfc356b3d1037cc5a3ac829bb7f3bda7cc1f9", size = 2985938 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/f5/3e59681bd53955da311a7f4efbb6315d01006e9d18b8a06b527a22d3d923/pydantic_core-2.14.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eac5c82fc632c599f4639a5886f96867ffced74458c7db61bc9a66ccb8ee3113", size = 2069435 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/df/5159aa30c4b2128f14634f3b3e9e19df228364c2107cda7910d058cc1bca/pydantic_core-2.14.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2ae91f50ccc5810b2f1b6b858257c9ad2e08da70bf890dee02de1775a387c66", size = 1915671 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/43/d94f10d82ccffc86bd69bfac73c54589703008236d63965dd40005a80af9/pydantic_core-2.14.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:6b9ff467ffbab9110e80e8c8de3bcfce8e8b0fd5661ac44a09ae5901668ba997", size = 2010232 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/11/f3e35b74745b5167df5f1dc15bd2368dbaa9e70d2ad8438a0c9485b78da5/pydantic_core-2.14.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:61ea96a78378e3bd5a0be99b0e5ed00057b71f66115f5404d0dae4819f495093", size = 2136951 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/e8/d2a534d8c555f6e375296f7d534405dbc247b0da91f1c067cdca5220d95f/pydantic_core-2.14.5-cp310-none-win32.whl", hash = "sha256:bb4c2eda937a5e74c38a41b33d8c77220380a388d689bcdb9b187cf6224c9720", size = 1730860 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/9f/bd9a41853a8ad6854cf126e72bb19a4849f79efe2d544b1a44f5351b9748/pydantic_core-2.14.5-cp310-none-win_amd64.whl", hash = "sha256:b7851992faf25eac90bfcb7bfd19e1f5ffa00afd57daec8a0042e63c74a4551b", size = 1887073 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/00/bd693e0bf24fa016c7194ac9ca671903b0938a5aa2603f7b5779070a15a0/pydantic_core-2.14.5-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:4e40f2bd0d57dac3feb3a3aed50f17d83436c9e6b09b16af271b6230a2915459", size = 1858563 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ab/43/77d8f56eb332e84097f18fc294346d214e9f0d22fb9ec67ebed4b8e90e35/pydantic_core-2.14.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ab1cdb0f14dc161ebc268c09db04d2c9e6f70027f3b42446fa11c153521c0e88", size = 1735080 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/09/90f5a03ab19e21601c6fec11fc9dea30e3228731e12b2f75f58d02430b85/pydantic_core-2.14.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aae7ea3a1c5bb40c93cad361b3e869b180ac174656120c42b9fadebf685d121b", size = 1826884 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/11/3f332887a888217e28b23c115c343ef89ccf5f49bbbd88d9317c707b00ac/pydantic_core-2.14.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:60b7607753ba62cf0739177913b858140f11b8af72f22860c28eabb2f0a61937", size = 1851648 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/41/0a/1c0372929f3723587d66c188cbdd0c47d269447e0ac8f231f0db0f9bb03c/pydantic_core-2.14.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2248485b0322c75aee7565d95ad0e16f1c67403a470d02f94da7344184be770f", size = 2003694 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/e6/8887679b7f923290db2638bf80733c609aaefaae29b9fe99b83f800c1910/pydantic_core-2.14.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:823fcc638f67035137a5cd3f1584a4542d35a951c3cc68c6ead1df7dac825c26", size = 2984596 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/ed/ee221482b51f368884ea6453f3784eeaeb17f5b737589d39d68a89bffde7/pydantic_core-2.14.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96581cfefa9123accc465a5fd0cc833ac4d75d55cc30b633b402e00e7ced00a6", size = 2068579 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/cf/2f6e6410ae735c11df32c391948a6c601a22f40f414b5dfc24f2def8c064/pydantic_core-2.14.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a33324437018bf6ba1bb0f921788788641439e0ed654b233285b9c69704c27b4", size = 1913142 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/81/f5452ccf3b15aa280188fbf2b6ab39ed700623df4fcc28675f19eee9634a/pydantic_core-2.14.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9bd18fee0923ca10f9a3ff67d4851c9d3e22b7bc63d1eddc12f439f436f2aada", size = 2007100 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/ab/79c2126e5504a3f0ecc0b1d97768594f9baa090134b0053309a2d938efaa/pydantic_core-2.14.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:853a2295c00f1d4429db4c0fb9475958543ee80cfd310814b5c0ef502de24dda", size = 2134232 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/2f/eb/4b07b31c4a728b02cae14cc2a447ebd460dfdf7076fe56a074ff7e27be4f/pydantic_core-2.14.5-cp311-none-win32.whl", hash = "sha256:cb774298da62aea5c80a89bd58c40205ab4c2abf4834453b5de207d59d2e1651", size = 1730376 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/a1/36cea283ded0641e8c374cdcacfdab035c102467ac5ec721b7527c8ac1cf/pydantic_core-2.14.5-cp311-none-win_amd64.whl", hash = "sha256:e87fc540c6cac7f29ede02e0f989d4233f88ad439c5cdee56f693cc9c1c78077", size = 1886861 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e6/7c/af522a1bce278dda0f0fdc9e64a081af51cbfedeafe44cbb6a4cc8617dad/pydantic_core-2.14.5-cp311-none-win_arm64.whl", hash = "sha256:57d52fa717ff445cb0a5ab5237db502e6be50809b43a596fb569630c665abddf", size = 1848339 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/dd/fc81e3ea962a356a705fa06965a7dbc0b204da014f238df95f1cd276bfab/pydantic_core-2.14.5-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:e60f112ac88db9261ad3a52032ea46388378034f3279c643499edb982536a093", size = 1851038 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/45/5eef8d36c2bf4c63e73e598fe523a0bc15069a97994481e27bef933ff423/pydantic_core-2.14.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6e227c40c02fd873c2a73a98c1280c10315cbebe26734c196ef4514776120aeb", size = 1713179 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9e/f3/9e3d334976b5051cd18e3feef06516ead3230efb8b9af8514bc52b2795b1/pydantic_core-2.14.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0cbc7fff06a90bbd875cc201f94ef0ee3929dfbd5c55a06674b60857b8b85ed", size = 1823940 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/89/5c/e0584d534863639757e05479a3c1172550e3d3dab0c39b79e41692d1804d/pydantic_core-2.14.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:103ef8d5b58596a731b690112819501ba1db7a36f4ee99f7892c40da02c3e189", size = 1834525 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/9c/52/2fc8b7e07f360993bc3d5f9ea743aac9f59287002035887c7d4f45bc6fb6/pydantic_core-2.14.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c949f04ecad823f81b1ba94e7d189d9dfb81edbb94ed3f8acfce41e682e48cef", size = 1994576 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/ff/72d57544a70f4f37a06c40cfe1c4a038bc21db308e916a277faa1854a1d8/pydantic_core-2.14.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1452a1acdf914d194159439eb21e56b89aa903f2e1c65c60b9d874f9b950e5d", size = 3039433 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8f/af/b202d44845f89e9c997f2f351be35a76ff78304eb926b1bdb33929de40db/pydantic_core-2.14.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb4679d4c2b089e5ef89756bc73e1926745e995d76e11925e3e96a76d5fa51fc", size = 2063479 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cb/96/27421976cde52555eb20636d59743621d4fa3bba278a0e4dbb4751e3f5c1/pydantic_core-2.14.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf9d3fe53b1ee360e2421be95e62ca9b3296bf3f2fb2d3b83ca49ad3f925835e", size = 1919434 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/6f/52cb83061430628878c34fdb199ccc8313a104f1390d99bff4a29b2ff6fe/pydantic_core-2.14.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:70f4b4851dbb500129681d04cc955be2a90b2248d69273a787dda120d5cf1f69", size = 2004054 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/10/89/bbb9bb3bd59b1cb36a87c2f6b6e3b2858fdb6ac438539f67a6c93a91ba5e/pydantic_core-2.14.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:59986de5710ad9613ff61dd9b02bdd2f615f1a7052304b79cc8fa2eb4e336d2d", size = 2126498 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/80/b7678c547b947cec35c136d88baf315fa6837500d9f8ce7353347f50a521/pydantic_core-2.14.5-cp312-none-win32.whl", hash = 
"sha256:699156034181e2ce106c89ddb4b6504c30db8caa86e0c30de47b3e0654543260", size = 1741746 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/60/5a/3161e1a1c138407cd2037b12ecdbe29f4890ccda1c0a0be69438c7d0065d/pydantic_core-2.14.5-cp312-none-win_amd64.whl", hash = "sha256:5baab5455c7a538ac7e8bf1feec4278a66436197592a9bed538160a2e7d11e36", size = 1874829 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cf/b7/9bacf7f9439f785b2fe6d8199e28ad75ad25406f97f33c0186274a48a36d/pydantic_core-2.14.5-cp312-none-win_arm64.whl", hash = "sha256:e47e9a08bcc04d20975b6434cc50bf82665fbc751bcce739d04a3120428f3e27", size = 1844793 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/e1/c33fcdbdad7f5c29376fa2e57f8d60f966c44fc77fc36a70d0ae03bbe813/pydantic_core-2.14.5-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:a6a16f4a527aae4f49c875da3cdc9508ac7eef26e7977952608610104244e1b7", size = 1861617 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/d2/4820db26970effb5d6fdee68f578585448b2eb6dd7344ab55b20958a0874/pydantic_core-2.14.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:abf058be9517dc877227ec3223f0300034bd0e9f53aebd63cf4456c8cb1e0863", size = 1739010 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4f/10/c44d89cb2fa31a27766aeb39b11380ad2e01bdab7f4bf63b18dfea20ec00/pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:49b08aae5013640a3bfa25a8eebbd95638ec3f4b2eaf6ed82cf0c7047133f03b", size = 1829873 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/5c/cc41dad06acd213f093581454812d6bb20311524ecf265f893e05e4fbe84/pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c2d97e906b4ff36eb464d52a3bc7d720bd6261f64bc4bcdbcd2c557c02081ed2", size = 1853845 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/83/ae5698f7a8121599b239ea547f58f7b135e299e87cfe1a88fb1e6319d57c/pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3128e0bbc8c091ec4375a1828d6118bc20404883169ac95ffa8d983b293611e6", size = 2005879 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/bb/923eeeb3e87ba9024e311e0f3d1f0a4baad609ed7bfc7da7341e95981bd4/pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88e74ab0cdd84ad0614e2750f903bb0d610cc8af2cc17f72c28163acfcf372a4", size = 2987768 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2a/b7/f85e5fd4504fae0df3eadd4bf9e0c495ecbdb804dc9be65653119454571e/pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c339dabd8ee15f8259ee0f202679b6324926e5bc9e9a40bf981ce77c038553db", size = 2069500 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d2/d7/0f13f8cce749c4c5484ddfe60239bcce21a2a6cdcea250f13ae471cb86cb/pydantic_core-2.14.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3387277f1bf659caf1724e1afe8ee7dbc9952a82d90f858ebb931880216ea955", size = 1915855 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/b8/7f1ca7c80dcb44bd525ba5e5feba5e45be686daeee535b434628be0f6cd7/pydantic_core-2.14.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ba6b6b3846cfc10fdb4c971980a954e49d447cd215ed5a77ec8190bc93dd7bc5", size = 2008972 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/1c/d9ba54c20c76706eb04491187d2d22ce56982ec3d999c6915ceb16755ebd/pydantic_core-2.14.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ca61d858e4107ce5e1330a74724fe757fc7135190eb5ce5c9d0191729f033209", size = 2136776 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4e/77/02bb9e292fdce2c25cf690a5d7a63487eaf264ff200ecba03ffeff3376da/pydantic_core-2.14.5-cp39-none-win32.whl", hash = "sha256:ec1e72d6412f7126eb7b2e3bfca42b15e6e389e1bc88ea0069d0cc1742f477c6", size = 1730674 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/41/64/43de643a6d2d157a8ac508a7c2a6a9746c941a659a6c64e00ebd13d5db4f/pydantic_core-2.14.5-cp39-none-win_amd64.whl", hash = 
"sha256:c0b97ec434041827935044bbbe52b03d6018c2897349670ff8fe11ed24d1d4ab", size = 1888170 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/00/47/88baa62574f06e2dd5b9c0285b5b9b300c79e3d808c5d5a81f04e0817b82/pydantic_core-2.14.5-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:79e0a2cdbdc7af3f4aee3210b1172ab53d7ddb6a2d8c24119b5706e622b346d0", size = 1857942 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/c6/01758bde5022817fd202ee9de506ea5ba3cedc9aa4b421edabda0d1b9fa4/pydantic_core-2.14.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:678265f7b14e138d9a541ddabbe033012a2953315739f8cfa6d754cc8063e8ca", size = 1741454 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a9/2b/f1dca235271785f19e0f3696b31140d6a69ff5349970253c034f9c603b8e/pydantic_core-2.14.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95b15e855ae44f0c6341ceb74df61b606e11f1087e87dcb7482377374aac6abe", size = 1819377 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/0f/bb0bd20e5bbabdf99d0a25858cf77b74926826a75d0458dc4842cf360ea5/pydantic_core-2.14.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:09b0e985fbaf13e6b06a56d21694d12ebca6ce5414b9211edf6f17738d82b0f8", size = 1952395 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/a4/fcb082e0723f9e4fcdbc5564879255c7f6de1f3d4d6acdd1b8799a86aa97/pydantic_core-2.14.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3ad873900297bb36e4b6b3f7029d88ff9829ecdc15d5cf20161775ce12306f8a", size = 1903315 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/32/0a6ee79ed34e8934a54548495883017dfaf3fc742b0d0d02afa154f1f49d/pydantic_core-2.14.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:2d0ae0d8670164e10accbeb31d5ad45adb71292032d0fdb9079912907f0085f4", size = 2000022 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/fb/84/f7e4556343ea0a483fa4e18505efaf10002581d2e980867a5b1ed22bfd21/pydantic_core-2.14.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d37f8ec982ead9ba0a22a996129594938138a1503237b87318392a48882d50b7", size = 2130197 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/1b/eb3861748a1669865f7b01dd73dedc185f1e2dad84c56a0fd00672e7fac8/pydantic_core-2.14.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:35613015f0ba7e14c29ac6c2483a657ec740e5ac5758d993fdd5870b07a61d8b", size = 1991642 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/ba/f3eee66c90f2e4f468fc01cace46ec633f9d47d53e1610ef3bc6003fc936/pydantic_core-2.14.5-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:7f8210297b04e53bc3da35db08b7302a6a1f4889c79173af69b72ec9754796b8", size = 1857920 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/be/6be1245f78b72da970cf52cf4c55d8abcfd1655114d122ee6cf5641fc3f5/pydantic_core-2.14.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:8c8a8812fe6f43a3a5b054af6ac2d7b8605c7bcab2804a8a7d68b53f3cd86e00", size = 1741506 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/84/01/079cd694491f1e05a1caae15a2ee32321a8fa748a34a183f6a38bf885af9/pydantic_core-2.14.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:206ed23aecd67c71daf5c02c3cd19c0501b01ef3cbf7782db9e4e051426b3d0d", size = 1819445 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/03/99/f7eb0cc34ea21e94aa0610a9c0794064847adc38ab824c8722e9fe35ebba/pydantic_core-2.14.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2027d05c8aebe61d898d4cffd774840a9cb82ed356ba47a90d99ad768f39789", size = 1952396 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ed/b0/afd8f57e4ac5eaa4f1562b6f04cf10140cd6596c97d378aae2af6a236234/pydantic_core-2.14.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:40180930807ce806aa71eda5a5a5447abb6b6a3c0b4b3b1b1962651906484d68", size = 1903310 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/cf/1348242330768c4014ba26c51a847c23db105da6b21bdcefbc9087926af3/pydantic_core-2.14.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:615a0a4bff11c45eb3c1996ceed5bdaa2f7b432425253a7c2eed33bb86d80abc", size = 1999952 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/e9/ffaec12924f90d4f2f589b0f6f510b671a561b02dce47ce9fad559b41ac3/pydantic_core-2.14.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5e412d717366e0677ef767eac93566582518fe8be923361a5c204c1a62eaafe", size = 2130223 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/08/01/ced0c6a1ac6737cfddbe8e81ec73278f3ec6e2627890fbf052b3ece56b48/pydantic_core-2.14.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:513b07e99c0a267b1d954243845d8a833758a6726a3b5d8948306e3fe14675e3", size = 1991649 }, +] + +[[package]] +name = "pyee" +version = "11.1.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/22/b4c7f3d9579204a014c4eda0e019e6bfe56af52a96cacc82004b60eec079/pyee-11.1.0.tar.gz", hash = "sha256:b53af98f6990c810edd9b56b87791021a8f54fd13db4edd1142438d44ba2263f", size = 29806 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/16/cc/5cea8a0a0d3deb90b5a0d39ad1a6a1ccaa40a9ea86d793eb8a49d32a6ed0/pyee-11.1.0-py3-none-any.whl", hash = "sha256:5d346a7d0f861a4b2e6c47960295bd895f816725b27d656181947346be98d7c1", size = 15263 }, +] + +[[package]] +name = "pyexecjs" +version = "1.5.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/8e/aedef81641c8dca6fd0fb7294de5bed9c45f3397d67fddf755c1042c2642/PyExecJS-1.5.1.tar.gz", hash = 
"sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c", size = 13344 } + +[[package]] +name = "pymysql" +version = "1.1.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/8f/ce59b5e5ed4ce8512f879ff1fa5ab699d211ae2495f1adaa5fbba2a1eada/pymysql-1.1.1.tar.gz", hash = "sha256:e127611aaf2b417403c60bf4dc570124aeb4a57f5f37b8e95ae399a42f904cd0", size = 47678 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0c/94/e4181a1f6286f545507528c78016e00065ea913276888db2262507693ce5/PyMySQL-1.1.1-py3-none-any.whl", hash = "sha256:4de15da4c61dc132f4fb9ab763063e693d521a80fd0e87943b9a453dd4c19d6c", size = 44972 }, +] + +[[package]] +name = "pyparsing" +version = "3.2.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bb/22/f1129e69d94ffff626bdb5c835506b3a5b4f3d070f17ea295e12c2c6f60f/pyparsing-3.2.3.tar.gz", hash = "sha256:b9c13f1ab8b3b542f72e28f634bad4de758ab3ce4546e4301970ad6fa77c38be", size = 1088608 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120 }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = 
"sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, +] + +[[package]] +name = "python-dotenv" +version = "1.0.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bc/57/e84d88dfe0aec03b7a2d4327012c1627ab5f03652216c63d49846d7a6c58/python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", size = 39115 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, +] + +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225 }, +] + +[[package]] +name = "redis" +version = "4.6.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "async-timeout", marker = "python_full_version <= '3.11.2'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/88/63d802c2b18dd9eaa5b846cbf18917c6b2882f20efda398cc16a7500b02c/redis-4.6.0.tar.gz", hash = "sha256:585dc516b9eb042a619ef0a39c3d7d55fe81bdb4df09a52c9cdde0d07bf1aa7d", size = 4561721 } +wheels = [ + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/20/2e/409703d645363352a20c944f5d119bdae3eb3034051a53724a7c5fee12b8/redis-4.6.0-py3-none-any.whl", hash = "sha256:e2b03db868160ee4591de3cb90d40ebb50a90dd302138775937f6a42b7ed183c", size = 241149 }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size 
= 20372 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] + +[[package]] +name = "starlette" +version = "0.37.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/b5/6bceb93ff20bd7ca36e6f7c540581abb18f53130fabb30ba526e26fd819b/starlette-0.37.2.tar.gz", hash = "sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823", size = 2843736 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/18/31fa32ed6c68ba66220204ef0be798c349d0a20c1901f9d4a794e08c76d8/starlette-0.37.2-py3-none-any.whl", hash = "sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee", size = 71908 }, +] + +[[package]] +name = "tenacity" +version = "8.2.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/f0/6ccd8854f4421ce1f227caf3421d9be2979aa046939268c9300030c0d250/tenacity-8.2.2.tar.gz", hash = "sha256:43af037822bd0029025877f3b2d97cc4d7bb0c2991000a3d59d71517c5c969e0", size = 40186 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/b0/c23bd61e1b32c9b96fbca996c87784e196a812da8d621d8d04851f6c8181/tenacity-8.2.2-py3-none-any.whl", hash = "sha256:2f277afb21b851637e8f52e6a613ff08734c347dc19ade928e519d7d2d8569b0", size = 24390 }, +] + +[[package]] +name = "typing-extensions" +version = "4.14.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = 
"sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839 }, +] + +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839 }, +] + +[[package]] +name = "urllib3" +version = "2.5.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795 }, +] + +[[package]] +name = "uvicorn" +version = "0.29.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "click", version = "8.1.8", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "click", version = "8.2.1", source = { registry = 
"https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "h11" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/8d/5005d39cd79c9ae87baf7d7aafdcdfe0b13aa69d9a1e3b7f1c984a2ac6d2/uvicorn-0.29.0.tar.gz", hash = "sha256:6a69214c0b6a087462412670b3ef21224fa48cae0e452b5883e8e8bdfdd11dd0", size = 40894 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/f5/cbb16fcbe277c1e0b8b3ddd188f2df0e0947f545c49119b589643632d156/uvicorn-0.29.0-py3-none-any.whl", hash = "sha256:2c2aac7ff4f4365c206fd773a39bf4ebd1047c238f8b8268ad996829323473de", size = 60813 }, +] + +[[package]] +name = "w3lib" +version = "2.3.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/7d/1172cfaa1e29beb9bf938e484c122b3bdc82e8e37b17a4f753ba6d6e009f/w3lib-2.3.1.tar.gz", hash = "sha256:5c8ac02a3027576174c2b61eb9a2170ba1b197cae767080771b6f1febda249a4", size = 49531 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/58/dd/56f0d8af71e475ed194d702f8b4cf9cea812c95e82ad823d239023c6558c/w3lib-2.3.1-py3-none-any.whl", hash = "sha256:9ccd2ae10c8c41c7279cd8ad4fe65f834be894fe7bfdd7304b991fd69325847b", size = 21751 }, +] + +[[package]] +name = "wordcloud" +version = "1.9.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, +] +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/7c/60/5f927145b65de0f299079db846c89fa031d56e4df9764607add12a03714e/wordcloud-1.9.3.tar.gz", hash = "sha256:a9aa738d63ed674a40f0cc31adb83f4ca5fc195f03a6aff6e010d1f5807d1c58", size = 27563752 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/03/c9/eedf685caa682eefb71c9e827def4604d699da7c3cbf50aa3f577553c33e/wordcloud-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fce423a24e6ca1b89b2770a7c6917d6e26f04bcfefa601cf61819b2fc0770c4", size = 172579 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/2d/6850fc318aca19bf153cba2a7e0db3060412a46611d8792056bf407bf144/wordcloud-1.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b6adfc1465b9176b8bc602745dd3ed8ea782b006a81cb59eab3dde92ad9f94c", size = 168439 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/be/1a7a488f5edcfae6746ffb91e792a1795b6cc058364ea6888b3878d3476f/wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad6db37a6f5abeba51a5d503228ea320d4f2fa774864103e7b24acd9dd86fd0e", size = 511098 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/ac/d5836caf274b67a6ed7fa8271e119b3f6d5ee8938949b419c5d64b0140c4/wordcloud-1.9.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5e74ac99e9582873d7ee28bd03e125dcf73ae46666d55fb4c13e82e90c0e074a", size = 494706 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2d/b8/e351b54fec0f52ce53d229e99bc3b8541fa9a7d59e8dade60a22db63abe6/wordcloud-1.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4001317c0e3b5cb6fd106228ddcd27524d1caf9ae468b3c2c2fc571c6ce56b22", size = 492381 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c3/22/82aa9a1136833ea11ad19f668ec2ce955b1fbf2f92d116e704b29c28a9ed/wordcloud-1.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5f86042e5ce12e2795798033a56f0246906b4d7d9027d554b6cd951ce2fd342a", size = 513278 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/b0/58/6e1158f2a967208b91762afb3687533009ddf13b1bed54458e6ce93ecb03/wordcloud-1.9.3-cp310-cp310-win32.whl", hash = "sha256:3b90f0390c0a05ba4b4580fb765a3d45d8d21519b50ca5006d6dbdc2a0b86507", size = 290173 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/7a/cad3b21a91cc55abe06f70146b564f20d9db2aee9631fdee580283bd5e1e/wordcloud-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:6f7977285df9254b8704d3f895c06814a6183c6c89e140d6281848c076635e91", size = 299990 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/39/4a/2684863905b72fceea24edaa10e3e16bc742a184042f463f3b09883d75b0/wordcloud-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7ced0d5c946d82cfc778febafe3eedeb0bae07dd57ea4f21fe06b9ec8225ab31", size = 172394 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/66/937d1d73389c0b501c928c4d8513653063d2b40272dff70d0e283d8b9144/wordcloud-1.9.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6f5499e6360219e61808dc0d2b00cd5104f78a82d2ae8f7986df04731713835f", size = 168299 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/ad/c22887189c23cc133ae7cbad48c71312f1a962b960fe7f51fd792808a307/wordcloud-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb1e8bb7d60f7a90fa8439c7b56dd1df60766115fd57480ac0d83ca5204e0117", size = 548338 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/27/0b81f0160883f366678d59df4a6e8cb1921934b2f8086de8dfb9540b0b10/wordcloud-1.9.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e33328044db5c01487f2a3a023b5476947942dacd6a5dc8c217fa039f6c5bd9", size = 528633 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/08/3390a827a6f3f4605d35f2fbb755c9e5d7a08b904b63266bff17fa255c72/wordcloud-1.9.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:998dc0dc8fcbff88f566f17cb5e0eb3bb21fcafd387b0670be6c14feacaf4cdc", size = 523490 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/58/3c/e803ad055fe511e33f091e5aedfdbb7f82adf67eb96dcbd80f075f95a6a1/wordcloud-1.9.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e1a1c3cfa86b605a19711ec58920ccb694dca9d5c9d00b373f4d5952d63793e9", size = 549004 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6d/a7/dff6e4f86739e383a5922be6198930c6705d953f37a2815ddfa6b11e3880/wordcloud-1.9.3-cp311-cp311-win32.whl", hash = "sha256:f504e3291256c0b6fca044602f8f0e5cb56b7c33724cde9d279c4077fa5b6d27", size = 289823 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f5/b0/247159f61c5d5d6647171bef84430b7efad4db504f0229674024f3a4f7f2/wordcloud-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:103c9b0465e1cf5b7a38b49ab1c3a0b0301762fa56602ac79287f9d22b46ade3", size = 300164 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/c0/399a255a2306867cc876de471547c8f727d940c2636c1311d354898e5851/wordcloud-1.9.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dfea303fc1dec4811e4a5671a8021a89724b6fa70639d059ad30c492932be447", size = 174069 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/03/cb/8228555eff393b3e86be815ecb5d506fc18ccfe3b8427b5c1ac2944c4ced/wordcloud-1.9.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:512f3c9a2e8579269a33ac9219d042fd0cc5a3a524ee68079238a3e4efe2b879", size = 169434 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/83/17f69e39c76913064d1ee60e08f9fb3eacf019c2085f5bdfd2b87bc304b9/wordcloud-1.9.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d00d916509a17b432032161d492ed7f30b2ebd921303090fe1d2b57011a49cc0", size = 541632 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/fe/1e877865f6e2119c044ab706a31ff20228764f702af12c15fc592f9667f1/wordcloud-1.9.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d5e0e7bbd269a62baa63ea2175faea4d74435c0ad828f3d5999fa4c33ebe0629", size = 519625 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/a4/1b/6bc50f44fd7e51feb9765ed4a6f81fcb8eeafb1c4a1d4853a069a9d98055/wordcloud-1.9.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:483aa4f8d17b9744a3b238269593d1794b962fc757a72a9e7e8468c2665cffb7", size = 516073 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/80/f9cafd71562e168b093ba4373917d9c1d51fa4cbbd015b2b6ee770f07357/wordcloud-1.9.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:64b342a79553970fa04083761d041067323219ad62b5550a496e42436d23cbb3", size = 544135 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/c0/bc14fd7fa96e5b544aac4e9e65b5dd6f753d72184da35e35eb0b24c4dde4/wordcloud-1.9.3-cp312-cp312-win32.whl", hash = "sha256:419acfe0b1d1227b9e3e14ec1bb6c40fd7fa652df4adf81f0ba3e00daca500b5", size = 291251 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/a0/b8fa5f2d7147a7675e2cab99108f7d8d524b67481f81f289cdb2b64ed1ab/wordcloud-1.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:2061a9978a6243107ce1a8a9fa24f421b03a0f7e620769b6f5075857e75aa615", size = 301393 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/18/9e2bc9d5ee2c88514f368c8ccc82aae4f07392ccda41dc1706fe4cf52a0e/wordcloud-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:888d088f54a897b8597da2fae3954d74b1f7251f7d311bbcc30ec3c6987d3605", size = 173182 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/08/3834d47dc30f3add3899e5bdc5a371194d23e649e2923ce15a66c9903976/wordcloud-1.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:daa6cfa11ce24e7eb4e42dc896dae4f74ae2166cf90ec997996300566e6811d1", size = 168970 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/52/4fb51dde6c6a57669501e51ef205feb7520ce7b2dda100b30bb588e02866/wordcloud-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:387dc2bd528ff6bb661451f2a9fd4ccf74b86072d7a2c868285d4c0cf26abeb4", size = 513648 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/1e/39/e40fd13613cf73c522b7fd7bdb2f6572c5399b24db792e68fc508cda8d0b/wordcloud-1.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40c32a324319db610b40f387a2a0b42d091817958a5272e0a4c4eb6a158588b5", size = 497488 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3e/b9/d3536ce9da99aafdf527ab830a14ff12c24173b13b9cb377077cf5459c19/wordcloud-1.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8078c6c58db4ccb893f120354e7e08bc48a5a5aac3e764f9008bc96a769b208c", size = 495569 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/5e/ea1812ec366f512498060e5c70e60efcdb39bbcebe1ca7ff41929e421e61/wordcloud-1.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:81f15eb60abc1676808bb85e2edfdbdc0a9011383f2a729c1c2a0cb941516768", size = 515969 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/58/57/ba66e9b8680e9b0d76a1c87d2386ffa86472a568aa193c6707a4857a0639/wordcloud-1.9.3-cp39-cp39-win32.whl", hash = "sha256:1d1680bf6c3d1b2f8e3bd02ccfa868fee2655fe13cf5b9e9905251050448fbbd", size = 290872 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/67/ebbc7e0c6da1753d7c30c2ca47c6bd11dbf2259cfa9c98151f4ab3d8b950/wordcloud-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:c0f458681e4d49be36064f21bfb1dc8d8c3021fe30e474ee634666b4f84fd851", size = 300584 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/44/fc3055b4b5acf9301414c0f6d9df3fe70f4705b108e36481f2b93bec75d0/wordcloud-1.9.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:baea9ac88ec1ab317461c75834b64ad5dad12a02c4f2384dd546eac3c316dbbb", size = 157279 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7f/2c/c5153f6ae80132f42ab183064f532375d9440e0484395a983974b5e2f54d/wordcloud-1.9.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6956b9f0d0eb14a12f46d41aebb4e7ad2d4c2ec417cc7c586bebd2ddc9c8311", size = 172027 }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/b8/36/2b8dc0a4a89864b69355edf57367dbfc3e18d69f429bb4ca07a51ae276bc/wordcloud-1.9.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d221b4d0d1d2a1d79286c41d8a4c0ce70065488f153e5d81cc0be7fb494ff10f", size = 172956 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bd/fa/034250802f186bb0b5584e2e21dc77e4909bfe77a257956294f771833b4c/wordcloud-1.9.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:db39dbe91dd31ffb667edcd496f4eeb85ceea397fef4ad51d0766ab934088cc7", size = 295201 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/99/a9/d37ce3634ee743819ebfc8e75a01c6e13f2be401c12f3b5af41cbc9a8866/wordcloud-1.9.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0c8e18c4afa025819332efffe8008267a83a9c54fe72ae1bc889ddce0eec470d", size = 157112 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/5c/59ee9e6e59fc052cfe626f50deecdf81e6e1ce2c3153156be41d592c7bd4/wordcloud-1.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4df25cb5dd347e43d53e02a009418f5776e7651063aff991865da8f6336bf193", size = 171926 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/ac/717c260789242dddd74822deb1234a6e5376169af8bd27dc90292a4d63c4/wordcloud-1.9.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53489ad22d58be3896ec16ed47604832e393224c89f7d7eed040096b07141ac4", size = 172815 }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/de/6d244b5e9ca970869343755a1e43e2b3c165dd9d08908f5c788014b974ec/wordcloud-1.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:61de4a5f3bfd33e0cb013cce6143bcf71959f3cd8536650b90134d745a553c2c", size = 295032 }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547 } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276 }, +] diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/var.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/var.py new file mode 100644 index 0000000..98a1d1f --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/var.py @@ -0,0 +1,25 @@ +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +from asyncio.tasks import Task +from contextvars import ContextVar +from typing import List + +import aiomysql + +from async_db import AsyncMysqlDB + +request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="") +crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="") +comment_tasks_var: ContextVar[List[Task]] = ContextVar("comment_tasks", default=[]) +media_crawler_db_var: ContextVar[AsyncMysqlDB] = ContextVar("media_crawler_db_var") +db_conn_pool_var: ContextVar[aiomysql.Pool] = ContextVar("db_conn_pool_var") +source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="") \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/keyword_manager.py b/MindSpider/DeepSentimentCrawling/keyword_manager.py new file mode 100644 index 0000000..1f1267b --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/keyword_manager.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +DeepSentimentCrawling模块 - 关键词管理器 
class KeywordManager:
    """Load extracted topic keywords from MySQL for the platform crawlers.

    The manager reads rows from the ``daily_topics`` table (one row per
    ``extract_date`` with a JSON-encoded ``keywords`` column) and turns them
    into plain keyword lists. It can be used as a context manager so the
    connection is closed when the ``with`` block ends.
    """

    # Substrings that mark a keyword as a good match for each platform code.
    _PLATFORM_PREFERENCES = {
        'xhs': ['美妆', '时尚', '生活', '美食', '旅游', '购物', '健康', '养生'],
        'dy': ['娱乐', '音乐', '舞蹈', '搞笑', '美食', '生活', '科技', '教育'],
        'ks': ['生活', '搞笑', '农村', '美食', '手工', '音乐', '娱乐'],
        'bili': ['科技', '游戏', '动漫', '学习', '编程', '数码', '科普'],
        'wb': ['热点', '新闻', '娱乐', '明星', '社会', '时事', '科技'],
        'tieba': ['游戏', '动漫', '学习', '生活', '兴趣', '讨论'],
        'zhihu': ['知识', '学习', '科技', '职场', '投资', '教育', '思考']
    }

    def __init__(self):
        """Create the manager and open the database connection immediately."""
        self.connection = None
        self.connect()

    def connect(self):
        """Open a PyMySQL connection using the settings in the ``config`` module.

        The connection is created with ``autocommit=True`` and ``DictCursor``,
        so every fetched row is a ``dict`` keyed by column name.

        Raises:
            Exception: whatever ``pymysql.connect`` raised, re-raised after
                the failure has been printed.
        """
        try:
            self.connection = pymysql.connect(
                host=config.DB_HOST,
                port=config.DB_PORT,
                user=config.DB_USER,
                password=config.DB_PASSWORD,
                database=config.DB_NAME,
                charset=config.DB_CHARSET,
                autocommit=True,
                cursorclass=DictCursor
            )
            print(f"关键词管理器成功连接到数据库: {config.DB_NAME}")
        except Exception as e:
            print(f"关键词管理器数据库连接失败: {e}")
            raise

    def get_latest_keywords(self, target_date: Optional[date] = None,
                            max_keywords: int = 100) -> List[str]:
        """Return the keywords to crawl, preferring the given day's topics.

        Lookup order:
          1. keywords stored for ``target_date``;
          2. the de-duplicated keywords of the last 7 days (most recent first);
          3. the built-in default keyword list.

        Whenever more than ``max_keywords`` are available, a random sample of
        exactly ``max_keywords`` is returned.

        Args:
            target_date: Day to look up; defaults to today.
            max_keywords: Upper bound on the number of returned keywords.

        Returns:
            A list of keyword strings.
        """
        if not target_date:
            target_date = date.today()

        print(f"正在获取 {target_date} 的关键词...")

        # First choice: keywords extracted for exactly the requested day.
        topics_data = self.get_daily_topics(target_date)

        if topics_data and topics_data.get('keywords'):
            keywords = topics_data['keywords']
            print(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")

            if len(keywords) > max_keywords:
                keywords = random.sample(keywords, max_keywords)
                print(f"随机选择了 {max_keywords} 个关键词")

            return keywords

        # Second choice: merge whatever the last few days produced.
        print(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
        recent_topics = self.get_recent_topics(days=7)

        all_keywords = []
        for topic in recent_topics:
            if topic.get('keywords'):
                all_keywords.extend(topic['keywords'])

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is deterministic (a plain set() would shuffle the keywords).
        unique_keywords = list(dict.fromkeys(all_keywords))

        # Only stop here when the recent rows actually yielded keywords;
        # otherwise fall through to the defaults instead of returning [].
        if unique_keywords:
            if len(unique_keywords) > max_keywords:
                unique_keywords = random.sample(unique_keywords, max_keywords)

            print(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
            return unique_keywords

        # Last resort: the hard-coded default list.
        print("没有找到任何关键词数据,使用默认关键词")
        return self._get_default_keywords()

    def get_daily_topics(self, extract_date: Optional[date] = None) -> Optional[Dict]:
        """Fetch the ``daily_topics`` row for one day.

        Args:
            extract_date: Day to look up; defaults to today.

        Returns:
            The row as a dict with ``keywords`` decoded from JSON, or ``None``
            when no row exists or when the query/decoding fails (the failure
            is printed, not raised).
        """
        if not extract_date:
            extract_date = date.today()

        try:
            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
            # The with-block closes the cursor even if execute() raises.
            with self.connection.cursor() as cursor:
                cursor.execute(query, (extract_date,))
                result = cursor.fetchone()

            if not result:
                return None

            result['keywords'] = json.loads(result['keywords'])
            return result

        except Exception as e:
            print(f"获取话题分析失败: {e}")
            return None

    def get_recent_topics(self, days: int = 7) -> List[Dict]:
        """Fetch the ``daily_topics`` rows of the last ``days`` days, newest first.

        Args:
            days: Size of the look-back window passed to ``DATE_SUB``.

        Returns:
            A list of row dicts with ``keywords`` decoded from JSON. A row
            whose ``keywords`` value cannot be decoded keeps its other columns
            and gets an empty keyword list. Returns ``[]`` when the query
            itself fails (the failure is printed, not raised).
        """
        try:
            query = """
                SELECT * FROM daily_topics
                WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                ORDER BY extract_date DESC
            """
            with self.connection.cursor() as cursor:
                cursor.execute(query, (days,))
                results = list(cursor.fetchall())

            for result in results:
                try:
                    result['keywords'] = json.loads(result['keywords'])
                except (TypeError, ValueError) as e:
                    # One NULL/malformed row must not discard every other row.
                    print(f"话题记录的关键词无法解析,已按空列表处理: {e}")
                    result['keywords'] = []

            return results

        except Exception as e:
            print(f"获取最近话题分析失败: {e}")
            return []

    def _get_default_keywords(self) -> List[str]:
        """Return the hard-coded fallback keyword list."""
        return [
            "科技", "人工智能", "AI", "编程", "互联网",
            "创业", "投资", "理财", "股市", "经济",
            "教育", "学习", "考试", "大学", "就业",
            "健康", "养生", "运动", "美食", "旅游",
            "时尚", "美妆", "购物", "生活", "家居",
            "电影", "音乐", "游戏", "娱乐", "明星",
            "新闻", "热点", "社会", "政策", "环保"
        ]

    def get_all_keywords_for_platforms(self, platforms: List[str],
                                       target_date: Optional[date] = None,
                                       max_keywords: int = 100) -> List[str]:
        """Return one keyword list that every platform in ``platforms`` shares.

        Args:
            platforms: Platform codes; only used for the progress message.
            target_date: Day to look up; defaults to today.
            max_keywords: Upper bound on the number of returned keywords.

        Returns:
            The keyword list produced by :meth:`get_latest_keywords`.
        """
        keywords = self.get_latest_keywords(target_date, max_keywords)

        if keywords:
            print(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
            print(f"每个关键词将在所有平台上进行爬取")

        return keywords

    def get_keywords_for_platform(self, platform: str,
                                  target_date: Optional[date] = None,
                                  max_keywords: int = 50) -> List[str]:
        """Return the keywords for one platform (identical for all platforms).

        Args:
            platform: Platform code; only used for the progress message.
            target_date: Day to look up; defaults to today.
            max_keywords: Upper bound on the number of returned keywords.

        Returns:
            The keyword list produced by :meth:`get_latest_keywords`.
        """
        keywords = self.get_latest_keywords(target_date, max_keywords)

        print(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
        return keywords

    def _filter_keywords_by_platform(self, keywords: List[str], platform: str) -> List[str]:
        """Put the keywords that match a platform's preferences first.

        A keyword matches when it contains any preference substring of the
        platform. If fewer than half of the keywords match, the non-matching
        ones are appended after the matching ones; otherwise only the
        matching keywords are returned. Platforms without preferences get the
        input list back unchanged.

        Args:
            keywords: Candidate keywords.
            platform: Platform code such as ``'xhs'`` or ``'bili'``.

        Returns:
            The filtered (and possibly re-ordered) keyword list.
        """
        preferred_keywords = self._PLATFORM_PREFERENCES.get(platform, [])
        if not preferred_keywords:
            return keywords

        filtered = []
        remaining = []
        for keyword in keywords:
            if any(pref in keyword for pref in preferred_keywords):
                filtered.append(keyword)
            else:
                remaining.append(keyword)

        # Too few platform-specific hits: top up with every other keyword.
        if len(filtered) < len(keywords) // 2:
            filtered.extend(remaining)

        return filtered

    def distribute_keywords_by_platform(self, keywords: List[str],
                                        platforms: List[str]) -> Dict[str, List[str]]:
        """Build a per-platform keyword list from one shared keyword list.

        This is the method the module's ``__main__`` self-test calls; each
        platform receives the result of :meth:`_filter_keywords_by_platform`.

        Args:
            keywords: Candidate keywords shared by all platforms.
            platforms: Platform codes to build lists for.

        Returns:
            A dict mapping each platform code to its own keyword list.
        """
        return {
            platform: list(self._filter_keywords_by_platform(keywords, platform))
            for platform in platforms
        }

    def get_crawling_summary(self, target_date: Optional[date] = None) -> Dict:
        """Summarise what topic data exists for one day.

        Args:
            target_date: Day to look up; defaults to today.

        Returns:
            A dict with the keys ``date``, ``keywords_count``, ``summary`` and
            ``has_data``; ``has_data`` is ``False`` when no row was found.
        """
        if not target_date:
            target_date = date.today()

        topics_data = self.get_daily_topics(target_date)

        if topics_data:
            return {
                'date': target_date,
                'keywords_count': len(topics_data.get('keywords', [])),
                'summary': topics_data.get('summary', ''),
                'has_data': True
            }

        return {
            'date': target_date,
            'keywords_count': 0,
            'summary': '暂无数据',
            'has_data': False
        }

    def close(self):
        """Close the database connection; calling it again is a no-op."""
        if self.connection:
            self.connection.close()
            # Drop the handle so close() + __exit__ never closes twice.
            self.connection = None
            print("关键词管理器数据库连接已关闭")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()


if __name__ == "__main__":
    # Manual smoke test; needs a reachable database.
    with KeywordManager() as km:
        # Fetch keywords
        keywords = km.get_latest_keywords(max_keywords=20)
        print(f"获取到的关键词: {keywords}")

        # Per-platform distribution
        platforms = ['xhs', 'dy', 'bili']
        distribution = km.distribute_keywords_by_platform(keywords, platforms)
        for platform, kws in distribution.items():
            print(f"{platform}: {kws}")

        # Crawling summary
        summary = km.get_crawling_summary()
        print(f"爬取摘要: {summary}")

    print("关键词管理器测试完成!")
int = 50, + max_notes_per_platform: int = 50, + login_type: str = "qrcode") -> Dict: + """ + 执行每日爬取任务 + + Args: + target_date: 目标日期,默认为今天 + platforms: 要爬取的平台列表,默认为所有支持的平台 + max_keywords_per_platform: 每个平台最大关键词数量 + max_notes_per_platform: 每个平台最大爬取内容数量 + login_type: 登录方式 + + Returns: + 爬取结果统计 + """ + if not target_date: + target_date = date.today() + + if not platforms: + platforms = self.supported_platforms + + print(f"🚀 开始执行 {target_date} 的深度情感爬取任务") + print(f"目标平台: {platforms}") + + # 1. 获取关键词摘要 + summary = self.keyword_manager.get_crawling_summary(target_date) + print(f"📊 关键词摘要: {summary}") + + if not summary['has_data']: + print("⚠️ 没有找到话题数据,无法进行爬取") + return {"success": False, "error": "没有话题数据"} + + # 2. 获取关键词(不分配,所有平台使用相同关键词) + print(f"\n📝 获取关键词...") + keywords = self.keyword_manager.get_latest_keywords(target_date, max_keywords_per_platform) + + if not keywords: + print("⚠️ 没有找到关键词,无法进行爬取") + return {"success": False, "error": "没有关键词"} + + print(f" 获取到 {len(keywords)} 个关键词") + print(f" 将在 {len(platforms)} 个平台上爬取每个关键词") + print(f" 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}") + + # 3. 执行全平台关键词爬取 + print(f"\n🔄 开始全平台关键词爬取...") + crawl_results = self.platform_crawler.run_multi_platform_crawl_by_keywords( + keywords, platforms, login_type, max_notes_per_platform + ) + + # 4. 
生成最终报告 + final_report = { + "date": target_date.isoformat(), + "summary": summary, + "crawl_results": crawl_results, + "success": crawl_results["successful_tasks"] > 0 + } + + print(f"\n✅ 深度情感爬取任务完成!") + print(f" 日期: {target_date}") + print(f" 成功任务: {crawl_results['successful_tasks']}/{crawl_results['total_tasks']}") + print(f" 总关键词: {crawl_results['total_keywords']} 个") + print(f" 总平台: {crawl_results['total_platforms']} 个") + print(f" 总内容: {crawl_results['total_notes']} 条") + + return final_report + + def run_platform_crawling(self, platform: str, target_date: date = None, + max_keywords: int = 50, max_notes: int = 50, + login_type: str = "qrcode") -> Dict: + """ + 执行单个平台的爬取任务 + + Args: + platform: 平台名称 + target_date: 目标日期 + max_keywords: 最大关键词数量 + max_notes: 最大爬取内容数量 + login_type: 登录方式 + + Returns: + 爬取结果 + """ + if platform not in self.supported_platforms: + raise ValueError(f"不支持的平台: {platform}") + + if not target_date: + target_date = date.today() + + print(f"🎯 开始执行 {platform} 平台的爬取任务 ({target_date})") + + # 获取关键词 + keywords = self.keyword_manager.get_keywords_for_platform( + platform, target_date, max_keywords + ) + + if not keywords: + print(f"⚠️ 没有找到 {platform} 平台的关键词") + return {"success": False, "error": "没有关键词"} + + print(f"📝 准备爬取 {len(keywords)} 个关键词") + + # 执行爬取 + result = self.platform_crawler.run_crawler( + platform, keywords, login_type, max_notes + ) + + return result + + def list_available_topics(self, days: int = 7): + """列出最近可用的话题""" + print(f"📋 最近 {days} 天的话题数据:") + + recent_topics = self.keyword_manager.db_manager.get_recent_topics(days) + + if not recent_topics: + print(" 暂无话题数据") + return + + for topic in recent_topics: + extract_date = topic['extract_date'] + keywords_count = len(topic.get('keywords', [])) + summary_preview = topic.get('summary', '')[:100] + "..." 
if len(topic.get('summary', '')) > 100 else topic.get('summary', '') + + print(f" 📅 {extract_date}: {keywords_count} 个关键词") + print(f" 摘要: {summary_preview}") + print() + + def show_platform_guide(self): + """显示平台使用指南""" + print("🔧 平台爬取指南:") + print() + + platform_info = { + 'xhs': '小红书 - 美妆、生活、时尚内容为主', + 'dy': '抖音 - 短视频、娱乐、生活内容', + 'ks': '快手 - 生活、娱乐、农村题材内容', + 'bili': 'B站 - 科技、学习、游戏、动漫内容', + 'wb': '微博 - 热点新闻、明星、社会话题', + 'tieba': '百度贴吧 - 兴趣讨论、游戏、学习', + 'zhihu': '知乎 - 知识问答、深度讨论' + } + + for platform, desc in platform_info.items(): + print(f" {platform}: {desc}") + + print() + print("💡 使用建议:") + print(" 1. 首次使用需要扫码登录各平台") + print(" 2. 建议先测试单个平台,确认登录正常") + print(" 3. 爬取数量不宜过大,避免被限制") + print(" 4. 可以使用 --test 模式进行小规模测试") + + def close(self): + """关闭资源""" + if self.keyword_manager: + self.keyword_manager.close() + +def main(): + """命令行入口""" + parser = argparse.ArgumentParser(description="DeepSentimentCrawling - 基于话题的深度情感爬取") + + # 基本参数 + parser.add_argument("--date", type=str, help="目标日期 (YYYY-MM-DD),默认为今天") + parser.add_argument("--platform", type=str, choices=['xhs', 'dy', 'ks', 'bili', 'wb', 'tieba', 'zhihu'], + help="指定单个平台进行爬取") + parser.add_argument("--platforms", type=str, nargs='+', + choices=['xhs', 'dy', 'ks', 'bili', 'wb', 'tieba', 'zhihu'], + help="指定多个平台进行爬取") + + # 爬取参数 + parser.add_argument("--max-keywords", type=int, default=50, + help="每个平台最大关键词数量 (默认: 50)") + parser.add_argument("--max-notes", type=int, default=50, + help="每个平台最大爬取内容数量 (默认: 50)") + parser.add_argument("--login-type", type=str, choices=['qrcode', 'phone', 'cookie'], + default='qrcode', help="登录方式 (默认: qrcode)") + + # 功能参数 + parser.add_argument("--list-topics", action="store_true", help="列出最近的话题数据") + parser.add_argument("--days", type=int, default=7, help="查看最近几天的话题 (默认: 7)") + parser.add_argument("--guide", action="store_true", help="显示平台使用指南") + parser.add_argument("--test", action="store_true", help="测试模式 (少量数据)") + + args = parser.parse_args() + + # 解析日期 + target_date = None + if 
args.date: + try: + target_date = datetime.strptime(args.date, "%Y-%m-%d").date() + except ValueError: + print("❌ 日期格式错误,请使用 YYYY-MM-DD 格式") + return + + # 创建爬取实例 + crawler = DeepSentimentCrawling() + + try: + # 显示指南 + if args.guide: + crawler.show_platform_guide() + return + + # 列出话题 + if args.list_topics: + crawler.list_available_topics(args.days) + return + + # 测试模式调整参数 + if args.test: + args.max_keywords = min(args.max_keywords, 10) + args.max_notes = min(args.max_notes, 10) + print("测试模式:限制关键词和内容数量") + + # 单平台爬取 + if args.platform: + result = crawler.run_platform_crawling( + args.platform, target_date, args.max_keywords, + args.max_notes, args.login_type + ) + + if result['success']: + print(f"\n{args.platform} 爬取成功!") + else: + print(f"\n{args.platform} 爬取失败: {result.get('error', '未知错误')}") + + return + + # 多平台爬取 + platforms = args.platforms if args.platforms else None + result = crawler.run_daily_crawling( + target_date, platforms, args.max_keywords, + args.max_notes, args.login_type + ) + + if result['success']: + print(f"\n多平台爬取任务完成!") + else: + print(f"\n多平台爬取失败: {result.get('error', '未知错误')}") + + except KeyboardInterrupt: + print("\n用户中断操作") + except Exception as e: + print(f"\n执行出错: {e}") + finally: + crawler.close() + +if __name__ == "__main__": + main() diff --git a/MindSpider/DeepSentimentCrawling/platform_crawler.py b/MindSpider/DeepSentimentCrawling/platform_crawler.py new file mode 100644 index 0000000..fc6b467 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/platform_crawler.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +DeepSentimentCrawling模块 - 平台爬虫管理器 +负责配置和调用MediaCrawler进行多平台爬取 +""" + +import os +import sys +import subprocess +import tempfile +from datetime import datetime +from pathlib import Path +from typing import List, Dict, Optional +import json + +# 添加项目根目录到路径 +project_root = Path(__file__).parent.parent +sys.path.append(str(project_root)) + +try: + import config +except ImportError: + raise 
ImportError("无法导入config.py配置文件") + +class PlatformCrawler: + """平台爬虫管理器""" + + def __init__(self): + """初始化平台爬虫管理器""" + self.mediacrawler_path = Path(__file__).parent / "MediaCrawler" + self.supported_platforms = ['xhs', 'dy', 'ks', 'bili', 'wb', 'tieba', 'zhihu'] + self.crawl_stats = {} + + # 确保MediaCrawler目录存在 + if not self.mediacrawler_path.exists(): + raise FileNotFoundError(f"MediaCrawler目录不存在: {self.mediacrawler_path}") + + print(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}") + + def configure_mediacrawler_db(self): + """配置MediaCrawler使用我们的MySQL数据库""" + try: + # 修改MediaCrawler的数据库配置 + db_config_path = self.mediacrawler_path / "config" / "db_config.py" + + # 读取原始配置 + with open(db_config_path, 'r', encoding='utf-8') as f: + content = f.read() + + # 替换数据库配置 + new_config = f'''# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 5. 不得用于任何非法或不当的用途。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 + + +import os + +# mysql config - 使用MindSpider的数据库配置 +MYSQL_DB_PWD = "{config.DB_PASSWORD}" +MYSQL_DB_USER = "{config.DB_USER}" +MYSQL_DB_HOST = "{config.DB_HOST}" +MYSQL_DB_PORT = {config.DB_PORT} +MYSQL_DB_NAME = "{config.DB_NAME}" + + +# redis config +REDIS_DB_HOST = "127.0.0.1" # your redis host +REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password +REDIS_DB_PORT = os.getenv("REDIS_DB_PORT", 6379) # your redis port +REDIS_DB_NUM = os.getenv("REDIS_DB_NUM", 0) # your redis db num + +# cache type +CACHE_TYPE_REDIS = "redis" +CACHE_TYPE_MEMORY = "memory" + +# sqlite config +SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")''' + + # 写入新配置 + with open(db_config_path, 'w', encoding='utf-8') as f: + f.write(new_config) + + print("已配置MediaCrawler使用MindSpider数据库") + return True + + except Exception as e: + print(f"配置MediaCrawler数据库失败: {e}") + return False + + def 
create_base_config(self, platform: str, keywords: List[str], + crawler_type: str = "search", max_notes: int = 50) -> bool: + """ + 创建MediaCrawler的基础配置 + + Args: + platform: 平台名称 + keywords: 关键词列表 + crawler_type: 爬取类型 + max_notes: 最大爬取数量 + + Returns: + 是否配置成功 + """ + try: + base_config_path = self.mediacrawler_path / "config" / "base_config.py" + + # 将关键词列表转换为逗号分隔的字符串 + keywords_str = ",".join(keywords) + + # 读取原始配置文件 + with open(base_config_path, 'r', encoding='utf-8') as f: + content = f.read() + + # 修改关键配置项 + lines = content.split('\n') + new_lines = [] + + for line in lines: + if line.startswith('PLATFORM = '): + new_lines.append(f'PLATFORM = "{platform}" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu') + elif line.startswith('KEYWORDS = '): + new_lines.append(f'KEYWORDS = "{keywords_str}" # 关键词搜索配置,以英文逗号分隔') + elif line.startswith('CRAWLER_TYPE = '): + new_lines.append(f'CRAWLER_TYPE = "{crawler_type}" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)') + elif line.startswith('SAVE_DATA_OPTION = '): + new_lines.append('SAVE_DATA_OPTION = "db" # csv or db or json or sqlite') + elif line.startswith('CRAWLER_MAX_NOTES_COUNT = '): + new_lines.append(f'CRAWLER_MAX_NOTES_COUNT = {max_notes}') + elif line.startswith('ENABLE_GET_COMMENTS = '): + new_lines.append('ENABLE_GET_COMMENTS = True') + elif line.startswith('CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = '): + new_lines.append('CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 20') + elif line.startswith('HEADLESS = '): + new_lines.append('HEADLESS = True') # 使用无头模式 + else: + new_lines.append(line) + + # 写入新配置 + with open(base_config_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(new_lines)) + + print(f"已配置 {platform} 平台,关键词数量: {len(keywords)}") + return True + + except Exception as e: + print(f"创建基础配置失败: {e}") + return False + + def run_crawler(self, platform: str, keywords: List[str], + login_type: str = "qrcode", max_notes: int = 50) -> Dict: + """ + 运行爬虫 + + Args: + platform: 平台名称 + keywords: 关键词列表 + 
login_type: 登录方式 + max_notes: 最大爬取数量 + + Returns: + 爬取结果统计 + """ + if platform not in self.supported_platforms: + raise ValueError(f"不支持的平台: {platform}") + + if not keywords: + raise ValueError("关键词列表不能为空") + + print(f"\n开始爬取平台: {platform}") + print(f"关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)") + + start_time = datetime.now() + + try: + # 配置数据库 + if not self.configure_mediacrawler_db(): + return {"success": False, "error": "数据库配置失败"} + + # 创建基础配置 + if not self.create_base_config(platform, keywords, "search", max_notes): + return {"success": False, "error": "基础配置创建失败"} + + # 构建命令 + cmd = [ + sys.executable, "main.py", + "--platform", platform, + "--lt", login_type, + "--type", "search", + "--save_data_option", "db" + ] + + print(f"执行命令: {' '.join(cmd)}") + + # 切换到MediaCrawler目录并执行 + result = subprocess.run( + cmd, + cwd=self.mediacrawler_path, + timeout=1800 # 30分钟超时 + ) + + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + # 创建统计信息 + crawl_stats = { + "platform": platform, + "keywords_count": len(keywords), + "duration_seconds": duration, + "start_time": start_time.isoformat(), + "end_time": end_time.isoformat(), + "return_code": result.returncode, + "success": result.returncode == 0, + "notes_count": 0, + "comments_count": 0, + "errors_count": 0 + } + + # 保存统计信息 + self.crawl_stats[platform] = crawl_stats + + if result.returncode == 0: + print(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒") + else: + print(f"❌ {platform} 爬取失败,返回码: {result.returncode}") + + return crawl_stats + + except subprocess.TimeoutExpired: + print(f"❌ {platform} 爬取超时") + return {"success": False, "error": "爬取超时", "platform": platform} + except Exception as e: + print(f"❌ {platform} 爬取异常: {e}") + return {"success": False, "error": str(e), "platform": platform} + + def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict: + """解析爬取输出,提取统计信息""" + stats = { + "notes_count": 0, + "comments_count": 0, + 
"errors_count": 0, + "login_required": False + } + + # 解析输出行 + for line in output_lines: + if "条笔记" in line or "条内容" in line: + try: + # 提取数字 + import re + numbers = re.findall(r'\d+', line) + if numbers: + stats["notes_count"] = int(numbers[0]) + except: + pass + elif "条评论" in line: + try: + import re + numbers = re.findall(r'\d+', line) + if numbers: + stats["comments_count"] = int(numbers[0]) + except: + pass + elif "登录" in line or "扫码" in line: + stats["login_required"] = True + + # 解析错误行 + for line in error_lines: + if "error" in line.lower() or "异常" in line: + stats["errors_count"] += 1 + + return stats + + def run_multi_platform_crawl_by_keywords(self, keywords: List[str], platforms: List[str], + login_type: str = "qrcode", max_notes_per_keyword: int = 50) -> Dict: + """ + 基于关键词的多平台爬取 - 每个关键词在所有平台上都进行爬取 + + Args: + keywords: 关键词列表 + platforms: 平台列表 + login_type: 登录方式 + max_notes_per_keyword: 每个关键词在每个平台的最大爬取数量 + + Returns: + 总体爬取统计 + """ + print(f"\n🚀 开始全平台关键词爬取") + print(f" 关键词数量: {len(keywords)}") + print(f" 平台数量: {len(platforms)}") + print(f" 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}") + + total_stats = { + "total_keywords": len(keywords), + "total_platforms": len(platforms), + "total_tasks": len(keywords) * len(platforms), + "successful_tasks": 0, + "failed_tasks": 0, + "total_notes": 0, + "total_comments": 0, + "keyword_results": {}, + "platform_summary": {} + } + + # 初始化平台统计 + for platform in platforms: + total_stats["platform_summary"][platform] = { + "successful_keywords": 0, + "failed_keywords": 0, + "total_notes": 0, + "total_comments": 0 + } + + # 对每个平台一次性爬取所有关键词 + for platform in platforms: + print(f"\n📝 在 {platform} 平台爬取所有关键词") + print(f" 关键词: {', '.join(keywords[:5])}{'...' 
if len(keywords) > 5 else ''}") + + try: + # 一次性传递所有关键词给平台 + result = self.run_crawler(platform, keywords, login_type, max_notes_per_keyword) + + if result.get("success"): + total_stats["successful_tasks"] += len(keywords) + total_stats["platform_summary"][platform]["successful_keywords"] = len(keywords) + + notes_count = result.get("notes_count", 0) + comments_count = result.get("comments_count", 0) + + total_stats["total_notes"] += notes_count + total_stats["total_comments"] += comments_count + total_stats["platform_summary"][platform]["total_notes"] = notes_count + total_stats["platform_summary"][platform]["total_comments"] = comments_count + + # 为每个关键词记录结果 + for keyword in keywords: + if keyword not in total_stats["keyword_results"]: + total_stats["keyword_results"][keyword] = {} + total_stats["keyword_results"][keyword][platform] = result + + print(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论") + else: + total_stats["failed_tasks"] += len(keywords) + total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) + + # 为每个关键词记录失败结果 + for keyword in keywords: + if keyword not in total_stats["keyword_results"]: + total_stats["keyword_results"][keyword] = {} + total_stats["keyword_results"][keyword][platform] = result + + print(f" ❌ 失败: {result.get('error', '未知错误')}") + + except Exception as e: + total_stats["failed_tasks"] += len(keywords) + total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords) + error_result = {"success": False, "error": str(e)} + + # 为每个关键词记录异常结果 + for keyword in keywords: + if keyword not in total_stats["keyword_results"]: + total_stats["keyword_results"][keyword] = {} + total_stats["keyword_results"][keyword][platform] = error_result + + print(f" ❌ 异常: {e}") + + # 打印详细统计 + print(f"\n📊 全平台关键词爬取完成!") + print(f" 总任务: {total_stats['total_tasks']}") + print(f" 成功: {total_stats['successful_tasks']}") + print(f" 失败: {total_stats['failed_tasks']}") + print(f" 成功率: 
{total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%") + print(f" 总内容: {total_stats['total_notes']} 条") + print(f" 总评论: {total_stats['total_comments']} 条") + + print(f"\n📈 各平台统计:") + for platform, stats in total_stats["platform_summary"].items(): + success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0 + print(f" {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), " + f"{stats['total_notes']} 条内容") + + return total_stats + + def get_crawl_statistics(self) -> Dict: + """获取爬取统计信息""" + return { + "platforms_crawled": list(self.crawl_stats.keys()), + "total_platforms": len(self.crawl_stats), + "detailed_stats": self.crawl_stats + } + + def save_crawl_log(self, log_path: str = None): + """保存爬取日志""" + if not log_path: + log_path = f"crawl_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + + try: + with open(log_path, 'w', encoding='utf-8') as f: + json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2) + print(f"爬取日志已保存到: {log_path}") + except Exception as e: + print(f"保存爬取日志失败: {e}") + +if __name__ == "__main__": + # 测试平台爬虫管理器 + crawler = PlatformCrawler() + + # 测试配置 + test_keywords = ["科技", "AI", "编程"] + result = crawler.run_crawler("xhs", test_keywords, max_notes=5) + + print(f"测试结果: {result}") + print("平台爬虫管理器测试完成!") diff --git a/MindSpider/LICENSE b/MindSpider/LICENSE new file mode 100644 index 0000000..b7d972b --- /dev/null +++ b/MindSpider/LICENSE @@ -0,0 +1,81 @@ +MIT License + +Copyright (c) 2025 666ghj + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and 
this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +NON-COMMERCIAL LEARNING LICENSE 1.1 + +Copyright (c) [2024] [relakkes@gmail.com] + +WHEREAS: +1. The copyright owner owns and controls the copyright of this software and related documentation files (hereinafter referred to as the "Software"); +2. The user wishes to use the Software for learning purposes; +3. The copyright owner is willing to authorize the user to use the Software under the conditions stated in this license; + +NOW, THEREFORE, the parties, in compliance with relevant laws and regulations, agree to the following terms: + +SCOPE OF AUTHORIZATION: +1. The copyright owner hereby grants any natural person or legal entity (hereinafter referred to as the "User") accepting this license a free, non-exclusive, non-transferable right to use, copy, modify, and merge the Software for non-commercial learning purposes, subject to the following conditions. + +CONDITIONS: +1. The User must include the above copyright notice and this license statement in all reasonably prominent locations of the Software and its copies. +2. The Software is limited to learning and research purposes only, and may not be used for large-scale crawling or activities that disrupt platform operations. +3. Without the written consent of the copyright owner, the Software may not be used for any commercial purposes or to cause improper influence on third parties. + +DISCLAIMER: +1. 
The Software is provided "AS IS," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. +2. In no event shall the copyright owner be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this Software, even if advised of the possibility of such damage. + +APPLICABLE LAW: +1. The interpretation and enforcement of this license shall comply with local laws and regulations. +2. Any disputes arising from or related to this license shall be resolved through friendly negotiation between the parties; if negotiation fails, either party may submit the dispute to the people's court where the copyright owner is located for resolution. + +This license constitutes the entire agreement between the parties regarding the Software, superseding and merging all prior discussions, communications, and agreements, whether oral or written. + + +非商业学习使用许可证 1.1 + +版权所有 (c) [2024] [relakkes@gmail.com] + +鉴于: +1. 版权所有者拥有和控制本软件和相关文档文件(以下简称“软件”)的版权; +2. 使用者希望使用该软件进行学习; +3. 版权所有者愿意在本许可证所述的条件下授权使用者使用该软件; + +现因此,双方遵循相关法律法规,同意如下条款: + +授权范围: +1. 版权所有者特此免费授予接受本许可证的任何自然人或法人(以下简称“使用者”)非独占的、不可转让的权利,在非商业学习目的下使用、复制、修改、合并本软件,前提是遵守以下条件。 + +条件: +1. 使用者必须在软件及其副本的所有合理显著位置包含上述版权声明和本许可证声明。 +2. 本软件仅限用于学习和研究目的,不得用于大规模爬虫或对平台造成运营干扰的行为。 +3. 未经版权所有者书面同意,不得将本软件用于任何商业用途或对第三方造成不当影响。 + +免责声明: +1. 本软件按“现状”提供,不提供任何形式的明示或暗示保证,包括但不限于对适销性、特定用途的适用性和非侵权的保证。 +2. 在任何情况下,版权所有者均不对因使用本软件而产生的,或在任何方式上与本软件有关的任何直接、间接、偶然、特殊、示例性或后果性损害负责(包括但不限于采购替代品或服务;使用、数据或利润的损失;或业务中断),无论这些损害是如何引起的,以及无论是通过合同、严格责任还是侵权行为(包括疏忽或其他方式)产生的,即使已被告知此类损害的可能性。 + +适用法律: +1. 本许可证的解释和执行应遵循当地法律法规。 +2. 
因本许可证引起的或与之相关的任何争议,双方应友好协商解决;协商不成时,任何一方可将争议提交至版权所有者所在地的人民法院诉讼解决。 + +本许可证构成双方之间关于本软件的完整协议,取代并合并以前的讨论、交流和协议,无论是口头还是书面的。 diff --git a/MindSpider/README.md b/MindSpider/README.md new file mode 100644 index 0000000..5500ce8 --- /dev/null +++ b/MindSpider/README.md @@ -0,0 +1,503 @@ +# MindSpider - 专为舆情分析设计的AI爬虫 + +> 免责声明: +> 本仓库的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。 + +## 项目概述 + +MindSpider是一个基于Agent技术的智能舆情爬虫系统,通过AI自动识别热点话题,并在多个社交媒体平台进行精准的内容爬取。系统采用模块化设计,能够实现从话题发现到内容收集的全自动化流程。 + +两步走爬取: + +- 模块一:Search Agent从包括微博、知乎、github、酷安等 **13个** 社媒平台、技术论坛识别热点新闻,并维护一个每日话题分析表。 +- 模块二:全平台爬虫深度爬取每个话题的细粒度舆情反馈。 + +### 技术架构 + +- **编程语言**: Python 3.9+ +- **AI框架**: 默认Deepseek,可以接入多种api (话题提取与分析) +- **爬虫框架**: Playwright (浏览器自动化) +- **数据库**: MySQL (数据持久化存储) +- **并发处理**: AsyncIO (异步并发爬取) + +## 项目结构 + +``` +MindSpider/ +├── BroadTopicExtraction/ # 话题提取模块 +│ ├── database_manager.py # 数据库管理器 +│ ├── get_today_news.py # 新闻采集器 +│ ├── main.py # 模块主入口 +│ └── topic_extractor.py # AI话题提取器 +│ +├── DeepSentimentCrawling/ # 深度爬取模块 +│ ├── keyword_manager.py # 关键词管理器 +│ ├── main.py # 模块主入口 +│ ├── platform_crawler.py # 平台爬虫管理器 +│ └── MediaCrawler/ # 多平台爬虫核心 +│ ├── base/ # 基础类 +│ ├── cache/ # 缓存系统 +│ ├── config/ # 配置文件 +│ ├── media_platform/ # 各平台实现 +│ │ ├── bilibili/ # B站爬虫 +│ │ ├── douyin/ # 抖音爬虫 +│ │ ├── kuaishou/ # 快手爬虫 +│ │ ├── tieba/ # 贴吧爬虫 +│ │ ├── weibo/ # 微博爬虫 +│ │ ├── xhs/ # 小红书爬虫 +│ │ └── zhihu/ # 知乎爬虫 +│ ├── model/ # 数据模型 +│ ├── proxy/ # 代理管理 +│ ├── store/ # 存储层 +│ └── tools/ # 工具集 +│ +├── schema/ # 数据库架构 +│ ├── db_manager.py # 数据库管理 +│ ├── init_database.py # 初始化脚本 +│ └── mindspider_tables.sql # 表结构定义 +│ +├── config.py # 全局配置文件 +├── main.py # 系统主入口 +├── requirements.txt # 依赖列表 +└── README.md # 项目文档 +``` + +## 系统工作流程 + +### 整体架构流程图 + +```mermaid +flowchart TB + Start[开始] --> CheckConfig{检查配置} + CheckConfig -->|配置无效| ConfigError[配置错误
                                                                                                                          请检查config.py] + CheckConfig -->|配置有效| InitDB[初始化数据库] + + InitDB --> BroadTopic[BroadTopicExtraction
                                                                                                                          话题提取模块] + + BroadTopic --> CollectNews[收集热点新闻] + CollectNews --> |多平台采集| NewsSource{新闻源} + NewsSource --> Weibo[微博热搜] + NewsSource --> Zhihu[知乎热榜] + NewsSource --> Bilibili[B站热门] + NewsSource --> Toutiao[今日头条] + NewsSource --> Other[其他平台...] + + Weibo --> SaveNews[保存新闻到数据库] + Zhihu --> SaveNews + Bilibili --> SaveNews + Toutiao --> SaveNews + Other --> SaveNews + + SaveNews --> ExtractTopic[AI话题提取] + ExtractTopic --> |DeepSeek API| GenerateKeywords[生成关键词列表] + GenerateKeywords --> GenerateSummary[生成新闻摘要] + GenerateSummary --> SaveTopics[保存话题数据] + + SaveTopics --> DeepCrawl[DeepSentimentCrawling
                                                                                                                          深度爬取模块] + + DeepCrawl --> LoadKeywords[加载关键词] + LoadKeywords --> PlatformSelect{选择爬取平台} + + PlatformSelect --> XHS[小红书爬虫] + PlatformSelect --> DY[抖音爬虫] + PlatformSelect --> KS[快手爬虫] + PlatformSelect --> BILI[B站爬虫] + PlatformSelect --> WB[微博爬虫] + PlatformSelect --> TB[贴吧爬虫] + PlatformSelect --> ZH[知乎爬虫] + + XHS --> Login{需要登录?} + DY --> Login + KS --> Login + BILI --> Login + WB --> Login + TB --> Login + ZH --> Login + + Login -->|是| QRCode[扫码登录] + Login -->|否| Search[关键词搜索] + QRCode --> Search + + Search --> CrawlContent[爬取内容] + CrawlContent --> ParseData[解析数据] + ParseData --> SaveContent[保存到数据库] + + SaveContent --> MoreKeywords{还有更多关键词?} + MoreKeywords -->|是| LoadKeywords + MoreKeywords -->|否| GenerateReport[生成爬取报告] + + GenerateReport --> End[结束] + + style Start fill:#90EE90 + style End fill:#FFB6C1 + style BroadTopic fill:#87CEEB,stroke:#000,stroke-width:3px + style DeepCrawl fill:#DDA0DD,stroke:#000,stroke-width:3px + style ExtractTopic fill:#FFD700 + style ConfigError fill:#FF6347 +``` + +### 工作流程说明 + +#### 1. BroadTopicExtraction(话题提取模块) + +该模块负责每日热点话题的自动发现和提取: + +1. **新闻采集**:从多个主流平台(微博、知乎、B站等)自动采集热点新闻 +2. **AI分析**:使用DeepSeek API对新闻进行智能分析 +3. **话题提取**:自动识别热点话题并生成相关关键词 +4. **数据存储**:将话题和关键词保存到MySQL数据库 + +#### 2. DeepSentimentCrawling(深度爬取模块) + +基于提取的话题关键词,在各大社交平台进行深度内容爬取: + +1. **关键词加载**:从数据库读取当日提取的关键词 +2. **平台爬取**:使用Playwright在7大平台进行自动化爬取 +3. **内容解析**:提取帖子、评论、互动数据等 +4. **情感分析**:对爬取内容进行情感倾向分析 +5. **数据持久化**:将所有数据结构化存储到数据库 + +## 数据库架构 + +### 核心数据表 + +1. **daily_news** - 每日新闻表 + - 存储从各平台采集的热点新闻 + - 包含标题、链接、描述、排名等信息 + +2. **daily_topics** - 每日话题表 + - 存储AI提取的话题和关键词 + - 包含话题名称、描述、关键词列表等 + +3. **topic_news_relation** - 话题新闻关联表 + - 记录话题与新闻的关联关系 + - 包含关联度得分 + +4. **crawling_tasks** - 爬取任务表 + - 管理各平台的爬取任务 + - 记录任务状态、进度、结果等 + +5. 
**平台内容表**(继承自MediaCrawler) + - xhs_note - 小红书笔记 + - douyin_aweme - 抖音视频 + - kuaishou_video - 快手视频 + - bilibili_video - B站视频 + - weibo_note - 微博帖子 + - tieba_note - 贴吧帖子 + - zhihu_content - 知乎内容 + +## 安装部署 + +### 环境要求 + +- Python 3.9 或更高版本 +- MySQL 5.7 或更高版本 +- Conda环境:pytorch_python11(推荐) +- 操作系统:Windows/Linux/macOS + +### 1. 克隆项目 + +```bash +git clone https://github.com/yourusername/MindSpider.git +cd MindSpider +``` + +### 2. 创建并激活Conda环境 + +```bash +conda create -n pytorch_python11 python=3.11 +conda activate pytorch_python11 +``` + +### 3. 安装依赖 + +```bash +# 安装Python依赖 +pip install -r requirements.txt + +# 安装Playwright浏览器驱动 +playwright install +``` + +### 4. 配置系统 + +编辑 `config.py` 文件,设置数据库和API配置: + +```python +# MySQL数据库配置 +DB_HOST = "your_database_host" +DB_PORT = 3306 +DB_USER = "your_username" +DB_PASSWORD = "your_password" +DB_NAME = "mindspider" +DB_CHARSET = "utf8mb4" + +# DeepSeek API密钥 +DEEPSEEK_API_KEY = "your_deepseek_api_key" +``` + +### 5. 初始化系统 + +```bash +# 检查系统状态 +python main.py --status + +# 初始化数据库表 +python main.py --setup +``` + +## 使用指南 + +### 完整流程 + +```bash +# 1. 运行话题提取(获取热点新闻和关键词) +python main.py --broad-topic + +# 2. 运行爬虫(基于关键词爬取各平台内容) +python main.py --deep-sentiment --test + +# 或者一次性运行完整流程 +python main.py --complete --test +``` + +### 单独使用模块 + +```bash +# 只获取今日热点和关键词 +python main.py --broad-topic + +# 只爬取特定平台 +python main.py --deep-sentiment --platforms xhs dy --test + +# 指定日期 +python main.py --broad-topic --date 2024-01-15 +``` + +## 爬虫配置(重要) + +### 平台登录配置 + +**首次使用每个平台都需要登录,这是最关键的步骤:** + +1. **小红书登录** +```bash +# 测试小红书爬取(会弹出二维码) +python main.py --deep-sentiment --platforms xhs --test +# 用小红书APP扫码登录,登录成功后会自动保存状态 +``` + +2. **抖音登录** +```bash +# 测试抖音爬取 +python main.py --deep-sentiment --platforms dy --test +# 用抖音APP扫码登录 +``` + +3. 
**其他平台同理** +```bash +# 快手 +python main.py --deep-sentiment --platforms ks --test + +# B站 +python main.py --deep-sentiment --platforms bili --test + +# 微博 +python main.py --deep-sentiment --platforms wb --test + +# 贴吧 +python main.py --deep-sentiment --platforms tieba --test + +# 知乎 +python main.py --deep-sentiment --platforms zhihu --test +``` + +### 登录问题排除 + +**如果登录失败或卡住:** + +1. **检查网络**:确保能正常访问对应平台 +2. **关闭无头模式**:编辑 `DeepSentimentCrawling/MediaCrawler/config/base_config.py` + ```python + HEADLESS = False # 改为False,可以看到浏览器界面 + ``` +3. **手动处理验证**:有些平台可能需要手动滑动验证码 +4. **重新登录**:删除 `DeepSentimentCrawling/MediaCrawler/browser_data/` 目录重新登录 + +### 爬取参数调整 + +在实际使用前建议调整爬取参数: + +```bash +# 小规模测试(推荐先这样测试) +python main.py --complete --test + +# 调整爬取数量 +python main.py --complete --max-keywords 20 --max-notes 30 +``` + +### 高级功能 + +#### 1. 指定日期操作 +```bash +# 提取指定日期的话题 +python main.py --broad-topic --date 2024-01-15 + +# 爬取指定日期的内容 +python main.py --deep-sentiment --date 2024-01-15 +``` + +#### 2. 指定平台爬取 +```bash +# 只爬取小红书和抖音 +python main.py --deep-sentiment --platforms xhs dy --test + +# 爬取所有平台的特定数量内容 +python main.py --deep-sentiment --max-keywords 30 --max-notes 20 +``` + +## 常用参数 + +```bash +--status # 检查项目状态 +--setup # 初始化项目 +--broad-topic # 话题提取 +--deep-sentiment # 爬虫模块 +--complete # 完整流程 +--test # 测试模式(少量数据) +--platforms xhs dy # 指定平台 +--date 2024-01-15 # 指定日期 +``` + +## 支持的平台 + +| 代码 | 平台 | 代码 | 平台 | +|-----|-----|-----|-----| +| xhs | 小红书 | wb | 微博 | +| dy | 抖音 | tieba | 贴吧 | +| ks | 快手 | zhihu | 知乎 | +| bili | B站 | | | + +## 常见问题 + +### 1. 爬虫登录失败 +```bash +# 问题:二维码不显示或登录失败 +# 解决:关闭无头模式,手动登录 +# 编辑:DeepSentimentCrawling/MediaCrawler/config/base_config.py +HEADLESS = False + +# 重新运行登录 +python main.py --deep-sentiment --platforms xhs --test +``` + +### 2. 数据库连接失败 +```bash +# 检查配置 +python main.py --status + +# 检查config.py中的数据库配置是否正确 +``` + +### 3. playwright安装失败 +```bash +# 重新安装 +pip install playwright +playwright install +``` + +### 4. 
爬取数据为空 +- 确保平台已经登录成功 +- 检查关键词是否存在(先运行话题提取) +- 使用测试模式验证:`--test` + +### 5. API调用失败 +- 检查DeepSeek API密钥是否正确 +- 确认API额度是否充足 + +## 注意事项 + +1. **首次使用必须先登录各平台** +2. **建议先用测试模式验证** +3. **遵守平台使用规则** +4. **仅供学习研究使用** + +## 项目开发指南 + +### 扩展新的新闻源 + +在 `BroadTopicExtraction/get_today_news.py` 中添加新的新闻源: + +```python +async def get_new_platform_news(self) -> List[Dict]: + """获取新平台的热点新闻""" + # 实现新闻采集逻辑 + pass +``` + +### 扩展新的爬虫平台 + +1. 在 `DeepSentimentCrawling/MediaCrawler/media_platform/` 下创建新平台目录 +2. 实现平台的核心功能模块: + - `client.py`: API客户端 + - `core.py`: 爬虫核心逻辑 + - `login.py`: 登录逻辑 + - `field.py`: 数据字段定义 + +### 数据库扩展 + +如需添加新的数据表或字段,请更新 `schema/mindspider_tables.sql` 并运行: + +```bash +python schema/init_database.py +``` + +## 性能优化建议 + +1. **数据库优化** + - 定期清理历史数据 + - 为高频查询字段建立索引 + - 考虑使用分区表管理大量数据 + +2. **爬取优化** + - 合理设置爬取间隔避免被限制 + - 使用代理池提高稳定性 + - 控制并发数避免资源耗尽 + +3. **系统优化** + - 使用Redis缓存热点数据 + - 异步任务队列处理耗时操作 + - 定期监控系统资源使用 + +## API接口说明 + +系统提供Python API供二次开发: + +```python +from BroadTopicExtraction import BroadTopicExtraction +from DeepSentimentCrawling import DeepSentimentCrawling + +# 话题提取 +async def extract_topics(): + extractor = BroadTopicExtraction() + result = await extractor.run_daily_extraction() + return result + +# 内容爬取 +def crawl_content(): + crawler = DeepSentimentCrawling() + result = crawler.run_daily_crawling( + platforms=['xhs', 'dy'], + max_keywords_per_platform=50, + max_notes_per_platform=30 + ) + return result +``` + +## 许可证 + +本项目仅供学习研究使用,请勿用于商业用途。使用本项目时请遵守相关法律法规和平台服务条款。 + +--- + +**MindSpider** - 让AI助力舆情洞察,智能化内容分析的得力助手 diff --git a/MindSpider/config.py b/MindSpider/config.py new file mode 100644 index 0000000..7d36612 --- /dev/null +++ b/MindSpider/config.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +""" +存储数据库连接信息和API密钥 +""" + +# MySQL数据库配置 +DB_HOST = "your_host" +DB_PORT = 3306 +DB_USER = "your_username" +DB_PASSWORD = "your_password" +DB_NAME = "mindspider" +DB_CHARSET = "utf8mb4" + +# DeepSeek API密钥 +DEEPSEEK_API_KEY = "your_deepseek_api_key" diff --git a/MindSpider/main.py 
def check_config(self) -> bool:
    """Verify that every required setting is present and non-empty in ``config``.

    Returns:
        True when all required settings have a truthy value, False otherwise
        (the names of the offending settings are printed).
    """
    print("\n检查基础配置...")

    # Settings that must exist on the config module with a truthy value.
    required_names = (
        'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET',
        'DEEPSEEK_API_KEY',
    )

    # A missing attribute and an empty value are reported the same way.
    absent = [name for name in required_names if not getattr(config, name, None)]

    if not absent:
        print("基础配置检查通过")
        return True

    print(f"配置缺失: {', '.join(absent)}")
    print("请检查config.py文件中的配置信息")
    return False
def check_database_tables(self) -> bool:
    """Check that the core MindSpider tables exist in the configured database.

    Returns:
        True when both ``daily_news`` and ``daily_topics`` are present,
        False when a table is missing or the check itself fails.
    """
    print("\n检查数据库表...")

    required_tables = ['daily_news', 'daily_topics']
    connection = None

    try:
        connection = pymysql.connect(
            host=config.DB_HOST,
            port=config.DB_PORT,
            user=config.DB_USER,
            password=config.DB_PASSWORD,
            database=config.DB_NAME,
            charset=config.DB_CHARSET,
            cursorclass=DictCursor
        )

        cursor = connection.cursor()
        cursor.execute("SHOW TABLES")

        # SHOW TABLES returns a single column per row. Read that column's
        # value instead of looking it up by a hard-coded header text, so a
        # differently spelled header cannot raise KeyError.
        existing_tables = {next(iter(row.values())) for row in cursor.fetchall()}
        missing_tables = [name for name in required_tables if name not in existing_tables]

    except Exception as e:
        print(f"检查数据库表失败: {e}")
        return False

    finally:
        # Close on every path; previously a failure after connect() leaked
        # the connection.
        if connection is not None:
            connection.close()

    if missing_tables:
        print(f"缺少数据库表: {', '.join(missing_tables)}")
        return False

    print("数据库表检查通过")
    return True
def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
    """Run the BroadTopicExtraction module's ``main.py`` as a child process.

    Args:
        extract_date: Date passed to the child via ``--date``; today when falsy.
        keywords_count: Value passed to the child via ``--keywords``.

    Returns:
        True when the child exits with status 0; False on a non-zero status,
        on timeout, or on any other error while launching it.
    """
    print(f"\n运行BroadTopicExtraction模块...")

    day = extract_date or date.today()

    try:
        command = [sys.executable, "main.py"]
        command += ["--date", day.strftime("%Y-%m-%d")]
        command += ["--keywords", str(keywords_count)]

        print(f"执行命令: {' '.join(command)}")

        # The child runs inside the module directory and is limited to 30 minutes.
        completed = subprocess.run(
            command,
            cwd=self.broad_topic_path,
            timeout=1800
        )

        succeeded = completed.returncode == 0
        if succeeded:
            print("BroadTopicExtraction模块执行成功")
        else:
            print(f"BroadTopicExtraction模块执行失败,返回码: {completed.returncode}")
        return succeeded

    except subprocess.TimeoutExpired:
        print("BroadTopicExtraction模块执行超时")
        return False
    except Exception as e:
        print(f"BroadTopicExtraction模块执行异常: {e}")
        return False
def run_complete_workflow(self, target_date: date = None, platforms: list = None,
                          keywords_count: int = 100, max_keywords: int = 50,
                          max_notes: int = 50, test_mode: bool = False) -> bool:
    """Run topic extraction followed by sentiment crawling.

    Args:
        target_date: Date handed to both steps; today when falsy.
        platforms: Platform codes handed to the crawling step.
        keywords_count: Keyword count handed to the extraction step.
        max_keywords: Forwarded to the crawling step.
        max_notes: Forwarded to the crawling step.
        test_mode: Forwarded to the crawling step.

    Returns:
        True only when both steps report success. The crawling step is not
        started when topic extraction fails.
    """
    print(f"\n开始完整的MindSpider工作流程")

    target_date = target_date or date.today()
    platform_label = platforms if platforms else '所有支持的平台'
    mode_label = '是' if test_mode else '否'

    print(f"目标日期: {target_date}")
    print(f"平台列表: {platform_label}")
    print(f"测试模式: {mode_label}")

    # Step 1: topic extraction.
    print(f"\n=== 第一步:话题提取 ===")
    extracted = self.run_broad_topic_extraction(target_date, keywords_count)
    if not extracted:
        print("话题提取失败,终止流程")
        return False

    # Step 2: sentiment crawling.
    print(f"\n=== 第二步:情感爬取 ===")
    crawled = self.run_deep_sentiment_crawling(
        target_date, platforms, max_keywords, max_notes, test_mode
    )
    if not crawled:
        print("情感爬取失败,但话题提取已完成")
        return False

    print(f"\n完整工作流程执行成功!")
    return True
检查数据库连接 + if not self.check_database_connection(): + return False + + # 4. 检查并初始化数据库表 + if not self.check_database_tables(): + print("需要初始化数据库表...") + if not self.initialize_database(): + return False + + print(f"\nMindSpider项目初始化完成!") + return True + +def main(): + """命令行入口""" + parser = argparse.ArgumentParser(description="MindSpider - AI爬虫项目主程序") + + # 基本操作 + parser.add_argument("--setup", action="store_true", help="初始化项目设置") + parser.add_argument("--status", action="store_true", help="显示项目状态") + parser.add_argument("--init-db", action="store_true", help="初始化数据库") + + # 模块运行 + parser.add_argument("--broad-topic", action="store_true", help="只运行话题提取模块") + parser.add_argument("--deep-sentiment", action="store_true", help="只运行情感爬取模块") + parser.add_argument("--complete", action="store_true", help="运行完整工作流程") + + # 参数配置 + parser.add_argument("--date", type=str, help="目标日期 (YYYY-MM-DD),默认为今天") + parser.add_argument("--platforms", type=str, nargs='+', + choices=['xhs', 'dy', 'ks', 'bili', 'wb', 'tieba', 'zhihu'], + help="指定爬取平台") + parser.add_argument("--keywords-count", type=int, default=100, help="话题提取的关键词数量") + parser.add_argument("--max-keywords", type=int, default=50, help="每个平台最大关键词数量") + parser.add_argument("--max-notes", type=int, default=50, help="每个关键词最大爬取内容数量") + parser.add_argument("--test", action="store_true", help="测试模式(少量数据)") + + args = parser.parse_args() + + # 解析日期 + target_date = None + if args.date: + try: + target_date = datetime.strptime(args.date, "%Y-%m-%d").date() + except ValueError: + print("错误:日期格式不正确,请使用 YYYY-MM-DD 格式") + return + + # 创建MindSpider实例 + spider = MindSpider() + + try: + # 显示状态 + if args.status: + spider.show_status() + return + + # 项目设置 + if args.setup: + if spider.setup_project(): + print("项目设置完成,可以开始使用MindSpider!") + else: + print("项目设置失败,请检查配置和环境") + return + + # 初始化数据库 + if args.init_db: + if spider.initialize_database(): + print("数据库初始化成功") + else: + print("数据库初始化失败") + return + + # 运行模块 + if args.broad_topic: + 
spider.run_broad_topic_extraction(target_date, args.keywords_count) + elif args.deep_sentiment: + spider.run_deep_sentiment_crawling( + target_date, args.platforms, args.max_keywords, args.max_notes, args.test + ) + elif args.complete: + spider.run_complete_workflow( + target_date, args.platforms, args.keywords_count, + args.max_keywords, args.max_notes, args.test + ) + else: + # 默认运行完整工作流程 + print("运行完整MindSpider工作流程...") + spider.run_complete_workflow( + target_date, args.platforms, args.keywords_count, + args.max_keywords, args.max_notes, args.test + ) + + except KeyboardInterrupt: + print("\n用户中断操作") + except Exception as e: + print(f"\n执行出错: {e}") + +if __name__ == "__main__": + main() diff --git a/MindSpider/requirements.txt b/MindSpider/requirements.txt new file mode 100644 index 0000000..8f1b411 --- /dev/null +++ b/MindSpider/requirements.txt @@ -0,0 +1,51 @@ +# MindSpider AI爬虫项目依赖包 +# 整合BroadTopicExtraction和DeepSentimentCrawling模块的核心依赖 + +# =============================== +# 数据库相关 +# =============================== +pymysql==1.1.0 +aiomysql==0.2.0 +aiosqlite==0.21.0 + +# =============================== +# HTTP请求和网络 +# =============================== +httpx==0.28.1 +requests==2.32.3 +aiofiles~=23.2.1 + +# =============================== +# 话题提取模块依赖 +# =============================== +numpy +pandas==2.2.3 +regex +tqdm +python-dateutil +pytz + +# =============================== +# MediaCrawler爬虫依赖 +# =============================== +Pillow==9.5.0 +playwright==1.45.0 +tenacity==8.2.2 +opencv-python +redis~=4.6.0 +pydantic==2.5.2 +fastapi==0.110.2 +uvicorn==0.29.0 +python-dotenv==1.0.1 +jieba==0.42.1 +wordcloud==1.9.3 +matplotlib==3.9.0 +parsel==1.9.1 +pyexecjs==1.5.1 + +# =============================== +# 工具包 +# =============================== +beautifulsoup4 +lxml +loguru diff --git a/MindSpider/schema/db_manager.py b/MindSpider/schema/db_manager.py new file mode 100644 index 0000000..76303ee --- /dev/null +++ b/MindSpider/schema/db_manager.py @@ -0,0 +1,265 
def show_tables(self):
    """Print every table in the database with its row count.

    Tables are listed in two groups: the MindSpider core tables and all
    remaining tables (labelled as MediaCrawler platform tables). A table
    whose count query fails is reported as failed and the listing continues.
    """
    print("\n" + "=" * 60)
    print("数据库表列表")
    print("=" * 60)

    cursor = self.connection.cursor()
    cursor.execute("SHOW TABLES")
    tables = cursor.fetchall()

    if not tables:
        print("数据库中没有表")
        return

    core_names = {'daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks'}
    mindspider_tables = []
    mediacrawler_tables = []

    for row in tables:
        table_name = row[0]
        if table_name in core_names:
            mindspider_tables.append(table_name)
        else:
            mediacrawler_tables.append(table_name)

    def print_row_counts(names):
        """Print one line per table: its row count, or a failure marker."""
        for name in names:
            # Identifiers cannot be bound as query parameters, so quote them
            # with backticks (doubling any embedded backtick). This keeps
            # reserved words and unusual names from breaking the statement.
            quoted = "`" + name.replace("`", "``") + "`"
            try:
                cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
                count = cursor.fetchone()[0]
            except Exception:
                # Exception, not a bare except: Ctrl-C must still interrupt.
                print(f" - {name:<25} (查询失败)")
            else:
                print(f" - {name:<25} ({count:>6} 条记录)")

    print("MindSpider核心表:")
    print_row_counts(mindspider_tables)

    print("\nMediaCrawler平台表:")
    print_row_counts(mediacrawler_tables)
"=" * 60) + print("数据统计") + print("=" * 60) + + cursor = self.connection.cursor() + + try: + # 新闻统计 + cursor.execute("SELECT COUNT(*) FROM daily_news") + news_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news") + news_days = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news") + platforms = cursor.fetchone()[0] + + print(f"新闻数据:") + print(f" - 总新闻数: {news_count}") + print(f" - 覆盖天数: {news_days}") + print(f" - 新闻平台: {platforms}") + + # 话题统计 + cursor.execute("SELECT COUNT(*) FROM daily_topics") + topic_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics") + topic_days = cursor.fetchone()[0] + + print(f"\n话题数据:") + print(f" - 总话题数: {topic_count}") + print(f" - 提取天数: {topic_days}") + + # 爬取任务统计 + cursor.execute("SELECT COUNT(*) FROM crawling_tasks") + task_count = cursor.fetchone()[0] + + cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status") + task_status = cursor.fetchall() + + print(f"\n爬取任务:") + print(f" - 总任务数: {task_count}") + for status, count in task_status: + print(f" - {status}: {count}") + + # 爬取内容统计 + print(f"\n平台内容统计:") + platform_tables = { + 'xhs_note': '小红书', + 'douyin_aweme': '抖音', + 'kuaishou_video': '快手', + 'bilibili_video': 'B站', + 'weibo_note': '微博', + 'tieba_note': '贴吧', + 'zhihu_content': '知乎' + } + + for table, platform in platform_tables.items(): + try: + cursor.execute(f"SELECT COUNT(*) FROM {table}") + count = cursor.fetchone()[0] + print(f" - {platform}: {count}") + except: + print(f" - {platform}: 表不存在") + + except Exception as e: + print(f"统计查询失败: {e}") + + def show_recent_data(self, days=7): + """显示最近几天的数据""" + print(f"\n" + "=" * 60) + print(f"最近{days}天的数据") + print("=" * 60) + + cursor = self.connection.cursor() + + # 最近的新闻 + cursor.execute(""" + SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms + FROM daily_news + WHERE 
def cleanup_old_data(self, days=90, dry_run=True):
    """Delete (or preview deleting) rows older than ``days`` days.

    Args:
        days: Age threshold in days. Rows whose date column is earlier than
            today minus ``days`` are selected. Negative values are rejected,
            because they would put the cutoff in the future and select
            every row.
        dry_run: When True only the number of affected rows is printed;
            when False the rows are deleted.
    """
    print(f"\n" + "=" * 60)
    print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})")
    print("=" * 60)

    if days < 0:
        print("错误: 天数不能为负数")
        return

    cursor = self.connection.cursor()
    cutoff = (datetime.now() - timedelta(days=days)).date()

    # (table, date column) pairs. These are fixed identifiers from this list,
    # so formatting them into the statement is safe; the cutoff value itself
    # is always passed as a bound parameter.
    targets = [
        ("daily_news", "crawl_date"),
        ("daily_topics", "extract_date"),
        ("crawling_tasks", "scheduled_date"),
    ]

    for table, column in targets:
        condition = f"FROM {table} WHERE {column} < %s"

        cursor.execute(f"SELECT COUNT(*) {condition}", (cutoff,))
        count = cursor.fetchone()[0]

        if count > 0:
            print(f" {table}: {count} 条记录将被删除")
            if not dry_run:
                # Report what the DELETE actually affected rather than the
                # earlier SELECT count.
                deleted = cursor.execute(f"DELETE {condition}", (cutoff,))
                print(f" 已删除 {deleted} 条记录")
        else:
            print(f" {table}: 无需清理")

    if dry_run:
        print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。")
def create_database_connection():
    """Open a server-level MySQL connection using the values in ``config``.

    No database is selected here; the caller creates/selects it afterwards.

    Returns:
        The open pymysql connection, or None when connecting fails
        (the error is printed).
    """
    try:
        connect_args = {
            "host": config.DB_HOST,
            "port": config.DB_PORT,
            "user": config.DB_USER,
            "password": config.DB_PASSWORD,
            "charset": config.DB_CHARSET,
            "autocommit": True,
        }
        conn = pymysql.connect(**connect_args)
        print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}")
        return conn
    except Exception as e:
        print(f"连接数据库失败: {e}")
        return None
def _split_sql_statements(sql_content):
    """Return the executable statements of a SQL script, without comment lines.

    Lines whose first non-blank characters are ``--`` are dropped before the
    text is split on ``;``. Dropping them first matters: a statement that is
    merely preceded by a comment line must still be executed, and a ``;``
    inside a comment must not create a bogus statement.

    Limitation (unchanged from the simple splitter this replaces): a ``;``
    inside a string literal still splits the statement.
    """
    code_lines = [
        line for line in sql_content.splitlines()
        if not line.lstrip().startswith('--')
    ]
    pieces = '\n'.join(code_lines).split(';')
    return [piece.strip() for piece in pieces if piece.strip()]


def execute_sql_file(connection, sql_file_path, description=""):
    """Execute every statement of a SQL script on ``connection``.

    Statements are run one by one; a failing statement is counted and
    reported, and execution continues with the next one.

    Note: statements that follow a comment line (in the project schema these
    include DROP TABLE, ALTER TABLE and CREATE VIEW) are executed as written
    in the script; they are no longer skipped together with the comment.

    Args:
        connection: Object exposing ``cursor()`` whose cursor has ``execute``.
        sql_file_path: Path of the UTF-8 encoded SQL script.
        description: Label used in the summary line.

    Returns:
        True when the file exists and every statement succeeded,
        False otherwise.
    """
    if not os.path.exists(sql_file_path):
        print(f"警告: SQL文件不存在: {sql_file_path}")
        return False

    try:
        cursor = connection.cursor()
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            sql_content = f.read()

        sql_statements = _split_sql_statements(sql_content)

        success_count = 0
        error_count = 0

        for stmt in sql_statements:
            try:
                cursor.execute(stmt)
                success_count += 1
            except Exception as e:
                error_count += 1
                print(f"执行SQL语句失败: {str(e)[:100]}...")

        print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
        return error_count == 0

    except Exception as e:
        print(f"执行SQL文件失败 {sql_file_path}: {e}")
        return False
创建MindSpider扩展表...") + if mindspider_sql.exists(): + execute_sql_file(connection, str(mindspider_sql), "MindSpider扩展表") + else: + print("错误: MindSpider SQL文件不存在") + return False + + print() + print("=" * 60) + print("数据库初始化完成!") + print("=" * 60) + + # 显示创建的表 + cursor = connection.cursor() + cursor.execute("SHOW TABLES") + tables = cursor.fetchall() + + print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:") + for table in tables: + print(f" - {table[0]}") + + print() + print("数据库初始化成功完成!您现在可以开始使用MindSpider了。") + return True + + except Exception as e: + print(f"数据库初始化过程中发生错误: {e}") + return False + + finally: + if connection: + connection.close() + print("数据库连接已关闭") + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/MindSpider/schema/mindspider_tables.sql b/MindSpider/schema/mindspider_tables.sql new file mode 100644 index 0000000..1b05489 --- /dev/null +++ b/MindSpider/schema/mindspider_tables.sql @@ -0,0 +1,201 @@ +-- MindSpider AI爬虫项目 - 数据库表结构 +-- 基于MediaCrawler表结构扩展,添加BroadTopicExtraction模块所需表 + +-- =============================== +-- BroadTopicExtraction 模块表结构 +-- =============================== + +-- ---------------------------- +-- Table structure for daily_news +-- 每日新闻表:存储get_today_news.py获取的热点新闻 +-- ---------------------------- +DROP TABLE IF EXISTS `daily_news`; +CREATE TABLE `daily_news` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `news_id` varchar(128) NOT NULL COMMENT '新闻唯一ID', + `source_platform` varchar(32) NOT NULL COMMENT '新闻源平台(weibo|zhihu|bilibili|toutiao|douyin等)', + `title` varchar(500) NOT NULL COMMENT '新闻标题', + `url` varchar(512) DEFAULT NULL COMMENT '新闻链接', + `description` text COMMENT '新闻描述或摘要', + `extra_info` text COMMENT '额外信息(JSON格式存储)', + `crawl_date` date NOT NULL COMMENT '爬取日期', + `rank_position` int DEFAULT NULL COMMENT '在热榜中的排名位置', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY 
`idx_daily_news_unique` (`news_id`, `source_platform`, `crawl_date`), + KEY `idx_daily_news_date` (`crawl_date`), + KEY `idx_daily_news_platform` (`source_platform`), + KEY `idx_daily_news_rank` (`rank_position`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日热点新闻表'; + +-- ---------------------------- +-- Table structure for daily_topics +-- 每日话题表:存储TopicGPT提取的话题信息 +-- ---------------------------- +DROP TABLE IF EXISTS `daily_topics`; +CREATE TABLE `daily_topics` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `topic_id` varchar(64) NOT NULL COMMENT '话题唯一ID', + `topic_name` varchar(255) NOT NULL COMMENT '话题名称', + `topic_description` text COMMENT '话题描述', + `keywords` text COMMENT '话题关键词(JSON格式存储)', + `extract_date` date NOT NULL COMMENT '话题提取日期', + `relevance_score` float DEFAULT NULL COMMENT '话题相关性得分', + `news_count` int DEFAULT 0 COMMENT '关联的新闻数量', + `processing_status` varchar(16) DEFAULT 'pending' COMMENT '处理状态(pending|processing|completed|failed)', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY `idx_daily_topics_unique` (`topic_id`, `extract_date`), + KEY `idx_daily_topics_date` (`extract_date`), + KEY `idx_daily_topics_status` (`processing_status`), + KEY `idx_daily_topics_score` (`relevance_score`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日提取话题表'; + +-- ---------------------------- +-- Table structure for topic_news_relation +-- 话题新闻关联表:记录话题和新闻的关联关系 +-- ---------------------------- +DROP TABLE IF EXISTS `topic_news_relation`; +CREATE TABLE `topic_news_relation` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `topic_id` varchar(64) NOT NULL COMMENT '话题ID', + `news_id` varchar(128) NOT NULL COMMENT '新闻ID', + `relation_score` float DEFAULT NULL COMMENT '关联度得分', + `extract_date` date NOT NULL COMMENT '关联提取日期', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY 
`idx_topic_news_unique` (`topic_id`, `news_id`, `extract_date`), + KEY `idx_topic_news_topic` (`topic_id`), + KEY `idx_topic_news_news` (`news_id`), + KEY `idx_topic_news_date` (`extract_date`), + FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE, + FOREIGN KEY (`news_id`) REFERENCES `daily_news`(`news_id`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='话题新闻关联表'; + +-- ---------------------------- +-- Table structure for crawling_tasks +-- 爬取任务表:记录基于话题的平台爬取任务 +-- ---------------------------- +DROP TABLE IF EXISTS `crawling_tasks`; +CREATE TABLE `crawling_tasks` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `task_id` varchar(64) NOT NULL COMMENT '任务唯一ID', + `topic_id` varchar(64) NOT NULL COMMENT '关联的话题ID', + `platform` varchar(32) NOT NULL COMMENT '目标平台(xhs|dy|ks|bili|wb|tieba|zhihu)', + `search_keywords` text NOT NULL COMMENT '搜索关键词(JSON格式存储)', + `task_status` varchar(16) DEFAULT 'pending' COMMENT '任务状态(pending|running|completed|failed|paused)', + `start_time` bigint DEFAULT NULL COMMENT '任务开始时间戳', + `end_time` bigint DEFAULT NULL COMMENT '任务结束时间戳', + `total_crawled` int DEFAULT 0 COMMENT '已爬取内容数量', + `success_count` int DEFAULT 0 COMMENT '成功爬取数量', + `error_count` int DEFAULT 0 COMMENT '错误数量', + `error_message` text COMMENT '错误信息', + `config_params` text COMMENT '爬取配置参数(JSON格式)', + `scheduled_date` date NOT NULL COMMENT '计划执行日期', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY `idx_crawling_tasks_unique` (`task_id`), + KEY `idx_crawling_tasks_topic` (`topic_id`), + KEY `idx_crawling_tasks_platform` (`platform`), + KEY `idx_crawling_tasks_status` (`task_status`), + KEY `idx_crawling_tasks_date` (`scheduled_date`), + FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='爬取任务表'; + +-- 
=============================== +-- MediaCrawler表结构扩展字段 +-- =============================== + +-- 为MediaCrawler现有表添加话题关联字段,支持MindSpider功能 +-- 注意:这些字段是可选的,不影响MediaCrawler原有功能 + +-- 为小红书笔记表添加话题关联字段 +ALTER TABLE `xhs_note` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为抖音视频表添加话题关联字段 +ALTER TABLE `douyin_aweme` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为快手视频表添加话题关联字段 +ALTER TABLE `kuaishou_video` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为B站视频表添加话题关联字段 +ALTER TABLE `bilibili_video` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为微博帖子表添加话题关联字段 +ALTER TABLE `weibo_note` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为贴吧帖子表添加话题关联字段 +ALTER TABLE `tieba_note` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- 为知乎内容表添加话题关联字段 +ALTER TABLE `zhihu_content` +ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID', +ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID'; + +-- =============================== +-- 创建视图用于数据分析 +-- =============================== + +-- 话题爬取统计视图 +CREATE OR REPLACE VIEW `v_topic_crawling_stats` AS +SELECT + dt.topic_id, + dt.topic_name, + dt.extract_date, + dt.processing_status, + COUNT(DISTINCT ct.task_id) as total_tasks, + SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) as completed_tasks, + SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) as failed_tasks, + SUM(ct.total_crawled) as 
# -*- coding: utf-8 -*-
"""
Configuration for the public-opinion analysis platform.

Holds database connection settings and API keys. Real credentials must never
be committed: every sensitive value is read from an environment variable of
the same name and falls back to a harmless placeholder.

SECURITY NOTE(review): a previous revision of this file contained live
database credentials and API keys. Those values must be treated as leaked
and rotated at their providers.
"""

import os

# MySQL database settings
DB_HOST = os.environ.get("DB_HOST", "your_database_host")  # e.g. "localhost" or "127.0.0.1"
DB_PORT = int(os.environ.get("DB_PORT", "3306"))
DB_USER = os.environ.get("DB_USER", "root")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_database_password")
DB_NAME = os.environ.get("DB_NAME", "media_crawler")
DB_CHARSET = os.environ.get("DB_CHARSET", "utf8mb4")

# DeepSeek API key (apply at https://www.deepseek.com/)
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "your_deepseek_api_key")

# Tavily search API key (apply at https://www.tavily.com/)
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "your_tavily_api_key")

# Kimi API key (apply at https://www.kimi.com/)
KIMI_API_KEY = os.environ.get("KIMI_API_KEY", "your_kimi_api_key")

# Gemini API key (relay service, OpenAI-compatible call format;
# apply at https://api.chataiapi.com/)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "your_gemini_api_key")

# Bocha web search API key (apply at https://open.bochaai.com/)
BOCHA_Web_Search_API_KEY = os.environ.get(
    "BOCHA_Web_Search_API_KEY", "your_bocha_web_search_api_key"
)

# SiliconFlow (Guiji) API key (apply at https://siliconflow.cn/)
GUIJI_QWEN3_API_KEY = os.environ.get("GUIJI_QWEN3_API_KEY", "your_guiji_qwen3_api_key")