diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..6046351
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,67 @@
+# ====================== 数据库配置 ======================
+# 数据库主机,例如localhost 或 127.0.0.1
+DB_HOST=your_db_host
+# 数据库端口号,MySQL默认为3306,PostgreSQL默认为5432
+DB_PORT=3306
+# 数据库用户名
+DB_USER=your_db_user
+# 数据库密码
+DB_PASSWORD=your_db_password
+# 数据库名称
+DB_NAME=your_db_name
+# 数据库字符集(仅MySQL连接使用),推荐utf8mb4,兼容emoji
+DB_CHARSET=utf8mb4
+# 数据库类型,可选mysql或postgresql(未设置时按mysql处理)
+DB_DIALECT=postgresql
+
+# ======================= LLM 相关 =======================
+# Insight Agent(推荐Kimi,https://platform.moonshot.cn/)API密钥,用于主LLM
+INSIGHT_ENGINE_API_KEY=
+# Insight Agent LLM接口BaseUrl,可自定义厂商API
+INSIGHT_ENGINE_BASE_URL=
+# Insight Agent LLM模型名称,如kimi-k2-0711-preview
+INSIGHT_ENGINE_MODEL_NAME=
+# Media Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥
+MEDIA_ENGINE_API_KEY=
+# Media Agent LLM接口BaseUrl
+MEDIA_ENGINE_BASE_URL=
+# Media Agent LLM模型名称,如gemini-2.5-pro
+MEDIA_ENGINE_MODEL_NAME=
+
+# MindSpider API密钥(推荐Deepseek)
+MINDSPIDER_API_KEY=
+# MindSpider LLM接口BaseUrl
+MINDSPIDER_BASE_URL=
+# MindSpider LLM模型名称,如deepseek-chat
+MINDSPIDER_MODEL_NAME=
+
+# Query Agent(推荐DeepSeek,https://www.deepseek.com/)API密钥
+QUERY_ENGINE_API_KEY=
+# Query Agent LLM接口BaseUrl
+QUERY_ENGINE_BASE_URL=
+# Query Agent LLM模型,如deepseek-reasoner
+QUERY_ENGINE_MODEL_NAME=
+# Report Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥
+REPORT_ENGINE_API_KEY=
+# Report Agent LLM接口BaseUrl
+REPORT_ENGINE_BASE_URL=
+# Report Agent LLM模型,如gemini-2.5-pro
+REPORT_ENGINE_MODEL_NAME=
+# Forum Host LLM API密钥,Qwen3最新模型,推荐 https://cloud.siliconflow.cn/
+FORUM_HOST_API_KEY=
+# Forum Host LLM BaseUrl
+FORUM_HOST_BASE_URL=
+# Forum Host LLM模型名,如Qwen/Qwen3-235B-A22B-Instruct-2507
+FORUM_HOST_MODEL_NAME=
+# SQL Keyword Optimizer LLM密钥,小参数Qwen3模型 https://cloud.siliconflow.cn/
+KEYWORD_OPTIMIZER_API_KEY=
+# Keyword Optimizer BaseUrl
+KEYWORD_OPTIMIZER_BASE_URL=
+# Keyword Optimizer LLM模型名称,如deepseek-chat
+KEYWORD_OPTIMIZER_MODEL_NAME=
+
+# ================== 网络工具配置 ====================
+# Tavily API密钥,用于Tavily网络搜索。注册地址:https://www.tavily.com/
+TAVILY_API_KEY=
+# Bocha Web Search API密钥,用于Bocha搜索。注册地址:https://open.bochaai.com/
+BOCHA_WEB_SEARCH_API_KEY=
\ No newline at end of file
diff --git a/MindSpider/BroadTopicExtraction/database_manager.py b/MindSpider/BroadTopicExtraction/database_manager.py
index 2e2bf21..8f1ede7 100644
--- a/MindSpider/BroadTopicExtraction/database_manager.py
+++ b/MindSpider/BroadTopicExtraction/database_manager.py
@@ -7,11 +7,12 @@ BroadTopicExtraction模块 - 数据库管理器
import sys
import json
-from datetime import datetime, date
+from datetime import datetime, date, timedelta
from pathlib import Path
from typing import List, Dict, Optional
-import pymysql
-from pymysql.cursors import DictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.engine import Engine
+from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -22,37 +23,44 @@ try:
except ImportError:
raise ImportError("无法导入config.py配置文件")
+from config import settings
+
class DatabaseManager:
"""数据库管理器"""
def __init__(self):
"""初始化数据库管理器"""
- self.connection = None
+ self.engine: Engine = None
self.connect()
def connect(self):
"""连接数据库"""
try:
- self.connection = pymysql.connect(
- host=config.DB_HOST,
- port=config.DB_PORT,
- user=config.DB_USER,
- password=config.DB_PASSWORD,
- database=config.DB_NAME,
- charset=config.DB_CHARSET,
- autocommit=True,
- cursorclass=DictCursor
- )
- print(f"成功连接到数据库: {config.DB_NAME}")
+ dialect = (settings.DB_DIALECT or "mysql").lower()
+ if dialect in ("postgresql", "postgres"):
+ url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+ else:
+ url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
+ self.engine = create_engine(url, future=True)
+ logger.info(f"成功连接到数据库: {settings.DB_NAME}")
+ except ModuleNotFoundError as e:
+ missing: str = str(e)
+ if "psycopg" in missing:
+ logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
+ elif "pymysql" in missing:
+ logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
+ else:
+ logger.error(f"数据库连接失败(缺少驱动): {e}")
+ raise
except Exception as e:
- print(f"数据库连接失败: {e}")
+ logger.error(f"数据库连接失败: {e}")
raise
def close(self):
"""关闭数据库连接"""
- if self.connection:
- self.connection.close()
- print("数据库连接已关闭")
+ if self.engine:
+ self.engine.dispose()
+ logger.info("数据库连接已关闭")
def __enter__(self):
return self
@@ -79,48 +87,49 @@ class DatabaseManager:
current_timestamp = int(datetime.now().timestamp())
try:
- cursor = self.connection.cursor()
-
- # 先删除当天所有的新闻记录(覆盖模式)
- delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
- deleted_count = cursor.execute(delete_query, (crawl_date,))
- if deleted_count > 0:
- print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
-
- # 批量插入新记录
saved_count = 0
+ # 先独立事务执行删除,防止后续插入失败导致无法清理
+ with self.engine.begin() as conn:
+ deleted = conn.execute(text("DELETE FROM daily_news WHERE crawl_date = :d"), {"d": crawl_date}).rowcount
+ if deleted and deleted > 0:
+ logger.info(f"覆盖模式:删除了当天已有的 {deleted} 条新闻记录")
+
+ # 逐条插入,单条失败不影响后续(每条独立事务)
for news_item in news_data:
try:
- # 简化的新闻ID生成
news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
-
- # 插入新记录
- insert_query = """
- INSERT INTO daily_news (
- news_id, source_platform, title, url, crawl_date,
- rank_position, add_ts
- ) VALUES (%s, %s, %s, %s, %s, %s, %s)
- """
- cursor.execute(insert_query, (
- news_id,
- news_item.get('source', 'unknown'),
- news_item.get('title', ''),
- news_item.get('url', ''),
- crawl_date,
- news_item.get('rank', None),
- current_timestamp
- ))
+ title_val = (news_item.get("title", "") or "")
+ if len(title_val) > 500:
+ title_val = title_val[:500]
+ with self.engine.begin() as conn:
+ conn.execute(
+ text(
+ """
+ INSERT INTO daily_news (
+ news_id, source_platform, title, url, crawl_date,
+ rank_position, add_ts, last_modify_ts
+ ) VALUES (:news_id, :source_platform, :title, :url, :crawl_date, :rank_position, :add_ts, :last_modify_ts)
+ """
+ ),
+ {
+ "news_id": news_id,
+ "source_platform": news_item.get("source", "unknown"),
+ "title": title_val,
+ "url": news_item.get("url", ""),
+ "crawl_date": crawl_date,
+ "rank_position": news_item.get("rank", None),
+ "add_ts": current_timestamp,
+ "last_modify_ts": current_timestamp,
+ },
+ )
saved_count += 1
-
except Exception as e:
- print(f"保存单条新闻失败: {e}")
+ logger.warning(f"保存单条新闻失败: {e}")
continue
-
- print(f"成功保存 {saved_count} 条新闻记录")
+ logger.info(f"成功保存 {saved_count} 条新闻记录")
return saved_count
-
except Exception as e:
- print(f"保存新闻数据失败: {e}")
+ logger.exception(f"保存新闻数据失败: {e}")
return 0
def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
@@ -136,15 +145,13 @@ class DatabaseManager:
if not crawl_date:
crawl_date = date.today()
- query = """
- SELECT * FROM daily_news
- WHERE crawl_date = %s
- ORDER BY rank_position ASC
- """
-
- cursor = self.connection.cursor()
- cursor.execute(query, (crawl_date,))
- return cursor.fetchall()
+ query = (
+ "SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
+ )
+ with self.engine.connect() as conn:
+ result = conn.execute(text(query), {"d": crawl_date})
+ rows = result.mappings().all()
+ return rows
# ==================== 话题数据操作 ====================
@@ -166,37 +173,31 @@ class DatabaseManager:
current_timestamp = int(datetime.now().timestamp())
try:
- cursor = self.connection.cursor()
-
- # 检查今天是否已有记录
- check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
- cursor.execute(check_query, (extract_date,))
- existing = cursor.fetchone()
-
keywords_json = json.dumps(keywords, ensure_ascii=False)
-
- if existing:
- # 更新现有记录
- update_query = """
- UPDATE daily_topics
- SET keywords = %s, summary = %s, add_ts = %s
- WHERE extract_date = %s
- """
- cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
- print(f"更新了 {extract_date} 的话题分析")
- else:
- # 插入新记录
- insert_query = """
- INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
- VALUES (%s, %s, %s, %s)
- """
- cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
- print(f"保存了 {extract_date} 的话题分析")
-
+ with self.engine.begin() as conn:
+ check = conn.execute(
+ text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
+ {"d": extract_date, "tid": "summary"},
+ ).first()
+ if check:
+ conn.execute(
+ text(
+ "UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
+ ),
+ {"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"},
+ )
+ logger.info(f"更新了 {extract_date} 的话题分析")
+ else:
+ conn.execute(
+ text(
+ "INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
+ ),
+ {"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp},
+ )
+ logger.info(f"保存了 {extract_date} 的话题分析")
return True
-
except Exception as e:
- print(f"保存话题分析失败: {e}")
+ logger.exception(f"保存话题分析失败: {e}")
return False
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
@@ -213,20 +214,15 @@ class DatabaseManager:
extract_date = date.today()
try:
- cursor = self.connection.cursor()
- query = "SELECT * FROM daily_topics WHERE extract_date = %s"
- cursor.execute(query, (extract_date,))
- result = cursor.fetchone()
-
- if result:
- # 解析关键词JSON
- result['keywords'] = json.loads(result['keywords'])
- return result
- else:
+ with self.engine.connect() as conn:
+ result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first()
+ if result:
+ result = dict(result) # 转为可变dict以支持item赋值
+ result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
+ return result
return None
-
except Exception as e:
- print(f"获取话题分析失败: {e}")
+ logger.exception(f"获取话题分析失败: {e}")
return None
def get_recent_topics(self, days: int = 7) -> List[Dict]:
@@ -240,23 +236,23 @@ class DatabaseManager:
话题分析列表
"""
try:
- cursor = self.connection.cursor()
- query = """
- SELECT * FROM daily_topics
- WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
- ORDER BY extract_date DESC
- """
- cursor.execute(query, (days,))
- results = cursor.fetchall()
-
- # 解析每个结果的关键词JSON
- for result in results:
- result['keywords'] = json.loads(result['keywords'])
-
- return results
-
+ start_date = date.today() - timedelta(days=days)
+ with self.engine.connect() as conn:
+ results = conn.execute(
+ text(
+ """
+ SELECT * FROM daily_topics
+ WHERE extract_date >= :start_date
+ ORDER BY extract_date DESC
+ """
+ ),
+ {"start_date": start_date},
+ ).mappings().all()
+ # RowMapping rows are read-only: build new mutable dicts with the parsed keywords instead of assigning into the rows
+ results = [{**r, "keywords": json.loads(r["keywords"]) if r.get("keywords") else []} for r in results]
+ return results
except Exception as e:
- print(f"获取最近话题分析失败: {e}")
+ logger.exception(f"获取最近话题分析失败: {e}")
return []
# ==================== 统计查询 ====================
@@ -264,56 +260,48 @@ class DatabaseManager:
def get_summary_stats(self, days: int = 7) -> Dict:
"""获取统计摘要"""
try:
- cursor = self.connection.cursor()
-
- # 新闻统计
- news_query = """
- SELECT
- crawl_date,
- COUNT(*) as news_count,
- COUNT(DISTINCT source_platform) as platforms_count
- FROM daily_news
- WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
- GROUP BY crawl_date
- ORDER BY crawl_date DESC
- """
- cursor.execute(news_query, (days,))
- news_stats = cursor.fetchall()
-
- # 话题统计
- topics_query = """
- SELECT
- extract_date,
- keywords,
- CHAR_LENGTH(summary) as summary_length
- FROM daily_topics
- WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
- ORDER BY extract_date DESC
- """
- cursor.execute(topics_query, (days,))
- topics_stats = cursor.fetchall()
-
- return {
- 'news_stats': news_stats,
- 'topics_stats': topics_stats
- }
-
+ start_date = date.today() - timedelta(days=days)
+ with self.engine.connect() as conn:
+ news_stats = conn.execute(
+ text(
+ """
+ SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms_count
+ FROM daily_news
+ WHERE crawl_date >= :start_date
+ GROUP BY crawl_date
+ ORDER BY crawl_date DESC
+ """
+ ),
+ {"start_date": start_date},
+ ).mappings().all()  # key-addressable rows, as the previous DictCursor implementation returned
+ topics_stats = conn.execute(
+ text(
+ """
+ SELECT extract_date, keywords, CHAR_LENGTH(topic_description) as summary_length
+ FROM daily_topics
+ WHERE extract_date >= :start_date
+ ORDER BY extract_date DESC
+ """
+ ),
+ {"start_date": start_date},
+ ).mappings().all()  # key-addressable rows, as the previous DictCursor implementation returned
+ return {"news_stats": news_stats, "topics_stats": topics_stats}
except Exception as e:
- print(f"获取统计摘要失败: {e}")
- return {'news_stats': [], 'topics_stats': []}
+ logger.exception(f"获取统计摘要失败: {e}")
+ return {"news_stats": [], "topics_stats": []}
if __name__ == "__main__":
# 测试数据库管理器
with DatabaseManager() as db:
# 测试获取新闻
news = db.get_daily_news()
- print(f"今日新闻数量: {len(news)}")
+ logger.info(f"今日新闻数量: {len(news)}")
# 测试获取话题
topics = db.get_daily_topics()
if topics:
- print(f"今日话题关键词: {topics['keywords']}")
+ logger.info(f"今日话题关键词: {topics['keywords']}")
else:
- print("今日暂无话题分析")
+ logger.info("今日暂无话题分析")
- print("简化数据库管理器测试完成!")
+ logger.info("简化数据库管理器测试完成!")
diff --git a/MindSpider/BroadTopicExtraction/main.py b/MindSpider/BroadTopicExtraction/main.py
index 7160ed4..438db73 100644
--- a/MindSpider/BroadTopicExtraction/main.py
+++ b/MindSpider/BroadTopicExtraction/main.py
@@ -11,6 +11,7 @@ import argparse
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
+from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -21,8 +22,8 @@ try:
from BroadTopicExtraction.topic_extractor import TopicExtractor
from BroadTopicExtraction.database_manager import DatabaseManager
except ImportError as e:
- print(f"导入模块失败: {e}")
- print("请确保在项目根目录运行,并且已安装所有依赖")
+ logger.exception(f"导入模块失败: {e}")
+ logger.error("请确保在项目根目录运行,并且已安装所有依赖")
sys.exit(1)
class BroadTopicExtraction:
@@ -34,7 +35,7 @@ class BroadTopicExtraction:
self.topic_extractor = TopicExtractor()
self.db_manager = DatabaseManager()
- print("BroadTopicExtraction 初始化完成")
+ logger.info("BroadTopicExtraction 初始化完成")
def close(self):
"""关闭资源"""
@@ -68,21 +69,22 @@ class BroadTopicExtraction:
Returns:
包含完整提取结果的字典
"""
- print("\n" + "=" * 80)
- print("MindSpider AI爬虫 - 每日话题提取")
- print("=" * 80)
- print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
- print(f"目标日期: {date.today()}")
+ extraction_result_message = ""
+ extraction_result_message += "\nMindSpider AI爬虫 - 每日话题提取\n"
+ extraction_result_message += f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
+ extraction_result_message += f"目标日期: {date.today()}\n"
if news_sources:
- print(f"指定平台: {len(news_sources)} 个")
+ extraction_result_message += f"指定平台: {len(news_sources)} 个\n"
for source in news_sources:
source_name = SOURCE_NAMES.get(source, source)
- print(f" - {source_name}")
+ extraction_result_message += f" - {source_name}\n"
else:
- print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
+ extraction_result_message += f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台\n"
- print(f"关键词数: 最多 {max_keywords} 个")
+ extraction_result_message += f"关键词数: 最多 {max_keywords} 个\n"
+
+ logger.info(extraction_result_message)
extraction_result = {
'success': False,
@@ -96,7 +98,7 @@ class BroadTopicExtraction:
try:
# 步骤1: 收集新闻
- print("\n【步骤1】收集热点新闻...")
+ logger.info("【步骤1】收集热点新闻...")
news_result = await self.news_collector.collect_and_save_news(
sources=news_sources
)
@@ -112,7 +114,7 @@ class BroadTopicExtraction:
raise Exception("新闻收集失败或没有获取到新闻")
# 步骤2: 提取关键词和生成总结
- print("\n【步骤2】提取关键词和生成总结...")
+ logger.info("【步骤2】提取关键词和生成总结...")
keywords, summary = self.topic_extractor.extract_keywords_and_summary(
news_result['news_list'],
max_keywords=max_keywords
@@ -126,10 +128,10 @@ class BroadTopicExtraction:
}
if not keywords:
- print("警告: 没有提取到有效关键词")
+ logger.warning("警告: 没有提取到有效关键词")
# 步骤3: 保存到数据库
- print("\n【步骤3】保存分析结果到数据库...")
+ logger.info("【步骤3】保存分析结果到数据库...")
save_success = self.db_manager.save_daily_topics(
keywords, summary, date.today()
)
@@ -141,56 +143,47 @@ class BroadTopicExtraction:
extraction_result['success'] = True
extraction_result['end_time'] = datetime.now().isoformat()
- print("\n" + "=" * 80)
- print("每日话题提取流程完成!")
- print("=" * 80)
+ logger.info("每日话题提取流程完成!")
return extraction_result
except Exception as e:
- print(f"\n话题提取流程失败: {e}")
+ logger.exception(f"话题提取流程失败: {e}")
extraction_result['error'] = str(e)
extraction_result['end_time'] = datetime.now().isoformat()
return extraction_result
def print_extraction_results(self, extraction_result: Dict):
"""打印提取结果"""
- print("\n" + "=" * 80)
- print("话题提取结果报告")
- print("=" * 80)
-
- if not extraction_result['success']:
- print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
- return
+ extraction_result_message = ""
# 新闻收集结果
news_data = extraction_result.get('news_collection', {})
- print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
- print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
+ extraction_result_message += f"\n📰 新闻收集: {news_data.get('total_news', 0)} 条新闻\n"
+ extraction_result_message += f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}\n"
# 话题提取结果
topic_data = extraction_result.get('topic_extraction', {})
keywords = topic_data.get('keywords', [])
summary = topic_data.get('summary', '')
- print(f"\n🔑 提取关键词: {len(keywords)} 个")
+ extraction_result_message += f"\n🔑 提取关键词: {len(keywords)} 个\n"
if keywords:
# 每行显示5个关键词
for i in range(0, len(keywords), 5):
keyword_group = keywords[i:i+5]
- print(f" {', '.join(keyword_group)}")
+ extraction_result_message += f" {', '.join(keyword_group)}\n"
- print(f"\n📝 新闻总结:")
- print(f" {summary}")
+ extraction_result_message += f"\n📝 新闻总结:\n {summary}\n"
# 数据库保存结果
db_data = extraction_result.get('database_save', {})
if db_data.get('success'):
- print(f"\n💾 数据库保存: 成功")
+ extraction_result_message += f"\n💾 数据库保存: 成功\n"
else:
- print(f"\n💾 数据库保存: 失败")
+ extraction_result_message += f"\n💾 数据库保存: 失败\n"
- print("\n" + "=" * 80)
+ logger.info(extraction_result_message)
def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
"""
@@ -207,7 +200,7 @@ class BroadTopicExtraction:
topics_data = self.db_manager.get_daily_topics(extract_date)
if not topics_data:
- print(f"没有找到 {extract_date or date.today()} 的话题数据")
+ logger.info(f"没有找到 {extract_date or date.today()} 的话题数据")
return []
keywords = topics_data['keywords']
@@ -215,11 +208,11 @@ class BroadTopicExtraction:
# 生成搜索关键词
search_keywords = self.topic_extractor.get_search_keywords(keywords)
- print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
+ logger.info(f"准备了 {len(search_keywords)} 个关键词用于爬取")
return search_keywords
except Exception as e:
- print(f"获取爬取关键词失败: {e}")
+ logger.error(f"获取爬取关键词失败: {e}")
return []
def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
@@ -227,7 +220,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_daily_topics(target_date)
except Exception as e:
- print(f"获取每日分析失败: {e}")
+ logger.error(f"获取每日分析失败: {e}")
return None
def get_recent_analysis(self, days: int = 7) -> List[Dict]:
@@ -235,7 +228,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_recent_topics(days)
except Exception as e:
- print(f"获取最近分析失败: {e}")
+ logger.error(f"获取最近分析失败: {e}")
return []
# ==================== 命令行工具 ====================
@@ -260,17 +253,17 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
news_data = result.get('news_collection', {})
topic_data = result.get('topic_extraction', {})
- print(f"✅ 话题提取成功完成!")
- print(f" 收集新闻: {news_data.get('total_news', 0)} 条")
- print(f" 提取关键词: {len(topic_data.get('keywords', []))} 个")
- print(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
+ logger.info(f"✅ 话题提取成功完成!")
+ logger.info(f" 收集新闻: {news_data.get('total_news', 0)} 条")
+ logger.info(f" 提取关键词: {len(topic_data.get('keywords', []))} 个")
+ logger.info(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
# 获取爬取关键词
crawling_keywords = extractor.get_keywords_for_crawling()
if crawling_keywords:
- print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
- print(f" {', '.join(crawling_keywords)}")
+ logger.info(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
+ logger.info(f" {', '.join(crawling_keywords)}")
# 保存关键词到文件
keywords_file = project_root / "data" / "daily_keywords.txt"
@@ -279,16 +272,16 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
with open(keywords_file, 'w', encoding='utf-8') as f:
f.write('\n'.join(crawling_keywords))
- print(f" 关键词已保存到: {keywords_file}")
+ logger.info(f" 关键词已保存到: {keywords_file}")
return True
else:
- print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
+ logger.error(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
return False
except Exception as e:
- print(f"❌ 执行过程中发生错误: {e}")
+ logger.error(f"❌ 执行过程中发生错误: {e}")
return False
def main():
@@ -304,14 +297,14 @@ def main():
# 显示支持的新闻源
if args.list_sources:
- print("支持的新闻源平台:")
+ logger.info("支持的新闻源平台:")
for source, name in SOURCE_NAMES.items():
- print(f" {source:<25} {name}")
+ logger.info(f" {source:<25} {name}")
return
# 验证参数
if args.keywords < 1 or args.keywords > 200:
- print("关键词数量应在1-200之间")
+ logger.error("关键词数量应在1-200之间")
sys.exit(1)
# 运行提取
@@ -325,7 +318,7 @@ def main():
sys.exit(0 if success else 1)
except KeyboardInterrupt:
- print("\n用户中断操作")
+ logger.info("用户中断操作")
sys.exit(1)
if __name__ == "__main__":
diff --git a/MindSpider/BroadTopicExtraction/topic_extractor.py b/MindSpider/BroadTopicExtraction/topic_extractor.py
index d8329c2..1173e4b 100644
--- a/MindSpider/BroadTopicExtraction/topic_extractor.py
+++ b/MindSpider/BroadTopicExtraction/topic_extractor.py
@@ -18,19 +18,20 @@ sys.path.append(str(project_root))
try:
import config
+ from config import settings
except ImportError:
- raise ImportError("无法导入config.py配置文件")
+ raise ImportError("无法导入settings.py配置文件")
class TopicExtractor:
"""话题提取器"""
-
+
def __init__(self):
"""初始化话题提取器"""
self.client = OpenAI(
- api_key=config.DEEPSEEK_API_KEY,
- base_url="https://api.deepseek.com"
+ api_key=settings.MINDSPIDER_API_KEY,
+ base_url=settings.MINDSPIDER_BASE_URL
)
- self.model = "deepseek-chat"
+ self.model = settings.MINDSPIDER_MODEL_NAME
def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
"""
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..cee4270
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,39 @@
+---
+name: MediaCrawler Bug反馈
+about: 创建一个问题Bug以帮助MediaCrawler开源项目改进
+title: '[BUG] '
+labels: bug
+assignees: ''
+---
+
+## 🔍 问题检查清单
+
+
+- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html)
+- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed)
+- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题
+
+## 🐛 问题描述
+
+
+
+## 📝 复现步骤
+1.
+2.
+3.
+
+## 💻 运行环境
+- 操作系统:
+- Python版本:
+- 是否使用IP代理:
+- 是否使用VPN翻墙软件:
+- 目标平台(抖音/小红书/微博等):
+
+## 📋 错误日志
+
+```shell
+在此粘贴错误日志
+```
+
+## 📷 错误截图
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md
new file mode 100644
index 0000000..649c263
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md
@@ -0,0 +1,36 @@
+---
+name: MediaCrawler使用问题咨询
+about: 提交使用过程中遇到的问题
+title: '[问题] '
+labels: question
+assignees: ''
+---
+
+## ⚠️ 提交前确认
+
+- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html)
+- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed)
+- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题
+
+## ❓ 问题描述
+
+
+## 🔍 使用场景
+
+- 目标平台: (如:小红书/抖音/微博等)
+- 使用功能: (如:关键词搜索/用户主页爬取等)
+
+## 💻 环境信息
+- 操作系统:
+- Python版本:
+- 是否使用IP代理:
+- 是否使用VPN翻墙软件:
+- 目标平台(抖音/小红书/微博等):
+
+## 📋 错误日志
+```shell
+在此粘贴完整的错误日志
+```
+
+## 📷 错误截图
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml
new file mode 100644
index 0000000..eece8af
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml
@@ -0,0 +1,64 @@
+# 构建 VitePress 站点并将其部署到 GitHub Pages 的示例工作流程
+#
+name: Deploy VitePress site to Pages
+
+on:
+ # 在针对 `main` 分支的推送上运行。如果你
+ # 使用 `master` 分支作为默认分支,请将其更改为 `master`
+ push:
+ branches: [main]
+
+ # 允许你从 Actions 选项卡手动运行此工作流程
+ workflow_dispatch:
+
+# 设置 GITHUB_TOKEN 的权限,以允许部署到 GitHub Pages
+permissions:
+ contents: read
+ pages: write
+ id-token: write
+
+# 只允许同时进行一次部署,跳过正在运行和最新队列之间的运行队列
+# 但是,不要取消正在进行的运行,因为我们希望允许这些生产部署完成
+concurrency:
+ group: pages
+ cancel-in-progress: false
+
+jobs:
+ # 构建工作
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # 如果未启用 lastUpdated,则不需要
+ # - uses: pnpm/action-setup@v3 # 如果使用 pnpm,请取消注释
+ # - uses: oven-sh/setup-bun@v1 # 如果使用 Bun,请取消注释
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 20
+ cache: npm # 或 pnpm / yarn
+ - name: Setup Pages
+ uses: actions/configure-pages@v4
+ - name: Install dependencies
+ run: npm ci # 或 pnpm install / yarn install / bun install
+ - name: Build with VitePress
+ run: npm run docs:build # 或 pnpm docs:build / yarn docs:build / bun run docs:build
+ - name: Upload artifact
+ uses: actions/upload-pages-artifact@v3
+ with:
+ path: docs/.vitepress/dist
+
+ # 部署工作
+ deploy:
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ needs: build
+ runs-on: ubuntu-latest
+ name: Deploy
+ steps:
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore
index c9a9ac8..dedbdfa 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore
@@ -173,4 +173,9 @@ docs/.vitepress/cache
# other gitignore
.venv
-.refer
\ No newline at end of file
+.refer
+
+agent_zone
+debug_tools
+
+database/*.db
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version b/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version
index bd28b9c..2c07333 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version
@@ -1 +1 @@
-3.9
+3.11
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE b/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE
new file mode 100644
index 0000000..78408f6
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE
@@ -0,0 +1,58 @@
+NON-COMMERCIAL LEARNING LICENSE 1.1
+
+Copyright (c) [2024] [relakkes@gmail.com]
+
+WHEREAS:
+1. The copyright owner owns and controls the copyright of this software and related documentation files (hereinafter referred to as the "Software");
+2. The user wishes to use the Software for learning purposes;
+3. The copyright owner is willing to authorize the user to use the Software under the conditions stated in this license;
+
+NOW, THEREFORE, the parties, in compliance with relevant laws and regulations, agree to the following terms:
+
+SCOPE OF AUTHORIZATION:
+1. The copyright owner hereby grants any natural person or legal entity (hereinafter referred to as the "User") accepting this license a free, non-exclusive, non-transferable right to use, copy, modify, and merge the Software for non-commercial learning purposes, subject to the following conditions.
+
+CONDITIONS:
+1. The User must include the above copyright notice and this license statement in all reasonably prominent locations of the Software and its copies.
+2. The Software is limited to learning and research purposes only, and may not be used for large-scale crawling or activities that disrupt platform operations.
+3. Without the written consent of the copyright owner, the Software may not be used for any commercial purposes or to cause improper influence on third parties.
+
+DISCLAIMER:
+1. The Software is provided "AS IS," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement.
+2. In no event shall the copyright owner be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this Software, even if advised of the possibility of such damage.
+
+APPLICABLE LAW:
+1. The interpretation and enforcement of this license shall comply with local laws and regulations.
+2. Any disputes arising from or related to this license shall be resolved through friendly negotiation between the parties; if negotiation fails, either party may submit the dispute to the people's court where the copyright owner is located for resolution.
+
+This license constitutes the entire agreement between the parties regarding the Software, superseding and merging all prior discussions, communications, and agreements, whether oral or written.
+
+
+非商业学习使用许可证 1.1
+
+版权所有 (c) [2024] [relakkes@gmail.com]
+
+鉴于:
+1. 版权所有者拥有和控制本软件和相关文档文件(以下简称“软件”)的版权;
+2. 使用者希望使用该软件进行学习;
+3. 版权所有者愿意在本许可证所述的条件下授权使用者使用该软件;
+
+现因此,双方遵循相关法律法规,同意如下条款:
+
+授权范围:
+1. 版权所有者特此免费授予接受本许可证的任何自然人或法人(以下简称“使用者”)非独占的、不可转让的权利,在非商业学习目的下使用、复制、修改、合并本软件,前提是遵守以下条件。
+
+条件:
+1. 使用者必须在软件及其副本的所有合理显著位置包含上述版权声明和本许可证声明。
+2. 本软件仅限用于学习和研究目的,不得用于大规模爬虫或对平台造成运营干扰的行为。
+3. 未经版权所有者书面同意,不得将本软件用于任何商业用途或对第三方造成不当影响。
+
+免责声明:
+1. 本软件按“现状”提供,不提供任何形式的明示或暗示保证,包括但不限于对适销性、特定用途的适用性和非侵权的保证。
+2. 在任何情况下,版权所有者均不对因使用本软件而产生的,或在任何方式上与本软件有关的任何直接、间接、偶然、特殊、示例性或后果性损害负责(包括但不限于采购替代品或服务;使用、数据或利润的损失;或业务中断),无论这些损害是如何引起的,以及无论是通过合同、严格责任还是侵权行为(包括疏忽或其他方式)产生的,即使已被告知此类损害的可能性。
+
+适用法律:
+1. 本许可证的解释和执行应遵循当地法律法规。
+2. 因本许可证引起的或与之相关的任何争议,双方应友好协商解决;协商不成时,任何一方可将争议提交至版权所有者所在地的人民法院诉讼解决。
+
+本许可证构成双方之间关于本软件的完整协议,取代并合并以前的讨论、交流和协议,无论是口头还是书面的。
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md
new file mode 100644
index 0000000..ef01d7c
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md
@@ -0,0 +1,342 @@
+# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
+
+
+
+Special thanks to:
+
+
+
+
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+
+
+
+
+
+
+
+[](https://github.com/NanmiCoder/MediaCrawler/stargazers)
+[](https://github.com/NanmiCoder/MediaCrawler/network/members)
+[](https://github.com/NanmiCoder/MediaCrawler/issues)
+[](https://github.com/NanmiCoder/MediaCrawler/pulls)
+[](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
+[](README.md)
+[](README_en.md)
+[](README_es.md)
+
+
+
+
+> **免责声明:**
+>
+> 大家请以学习为目的使用本仓库⚠️⚠️⚠️⚠️,[爬虫违法违规的案件](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China)
+>
+>本仓库的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。
+>
+> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
+
+
+
+
+## 📖 项目简介
+
+一个功能强大的**多平台自媒体数据采集工具**,支持小红书、抖音、快手、B站、微博、贴吧、知乎等主流平台的公开信息抓取。
+
+### 🔧 技术原理
+
+- **核心技术**:基于 [Playwright](https://playwright.dev/) 浏览器自动化框架登录保存登录态
+- **无需JS逆向**:利用保留登录态的浏览器上下文环境,通过 JS 表达式获取签名参数
+- **优势特点**:无需逆向复杂的加密算法,大幅降低技术门槛
+
+## ✨ 功能特性
+| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
+| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
+| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 知乎 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+
+
+### 🚀 MediaCrawlerPro 重磅发布!
+
+> 专注于学习成熟项目的架构设计,不仅仅是爬虫技术,Pro 版本的代码设计思路同样值得深入学习!
+
+[MediaCrawlerPro](https://github.com/MediaCrawlerPro) 相较于开源版本的核心优势:
+
+#### 🎯 核心功能升级
+- ✅ **断点续爬功能**(重点特性)
+- ✅ **多账号 + IP代理池支持**(重点特性)
+- ✅ **去除 Playwright 依赖**,使用更简单
+- ✅ **完整 Linux 环境支持**
+
+#### 🏗️ 架构设计优化
+- ✅ **代码重构优化**,更易读易维护(解耦 JS 签名逻辑)
+- ✅ **企业级代码质量**,适合构建大型爬虫项目
+- ✅ **完美架构设计**,高扩展性,源码学习价值更大
+
+#### 🎁 额外功能
+- ✅ **自媒体视频下载器桌面端**(适合学习全栈开发)
+- ✅ **多平台首页信息流推荐**(HomeFeed)
+- [ ] **基于自媒体平台的AI Agent正在开发中 🚀🚀**
+
+点击查看:[MediaCrawlerPro 项目主页](https://github.com/MediaCrawlerPro) 更多介绍
+
+
+## 🚀 快速开始
+
+> 💡 **开源不易,如果这个项目对您有帮助,请给个 ⭐ Star 支持一下!**
+
+## 📋 前置依赖
+
+### 🚀 uv 安装(推荐)
+
+在进行下一步操作之前,请确保电脑上已经安装了 uv:
+
+- **安装地址**:[uv 官方安装指南](https://docs.astral.sh/uv/getting-started/installation)
+- **验证安装**:终端输入命令 `uv --version`,如果正常显示版本号,证明已经安装成功
+- **推荐理由**:uv 是目前最强的 Python 包管理工具,速度快、依赖解析准确
+
+### 🟢 Node.js 安装
+
+项目依赖 Node.js,请前往官网下载安装:
+
+- **下载地址**:https://nodejs.org/en/download/
+- **版本要求**:>= 16.0.0
+
+### 📦 Python 包安装
+
+```shell
+# 进入项目目录
+cd MediaCrawler
+
+# 使用 uv sync 命令来保证 python 版本和相关依赖包的一致性
+uv sync
+```
+
+### 🌐 浏览器驱动安装
+
+```shell
+# 安装浏览器驱动
+uv run playwright install
+```
+
+> **💡 提示**:MediaCrawler 目前已经支持使用 playwright 连接你本地的 Chrome 浏览器了,一些因为 Webdriver 导致的问题迎刃而解了。
+>
+> 目前开放了 `xhs` 和 `dy` 这两个使用 CDP 的方式连接本地浏览器,如有需要,查看 `config/base_config.py` 中的配置项。
+
+## 🚀 运行爬虫程序
+
+```shell
+# 项目默认是没有开启评论爬取模式,如需评论请在 config/base_config.py 中的 ENABLE_GET_COMMENTS 变量修改
+# 一些其他支持项,也可以在 config/base_config.py 查看功能,写的有中文注释
+
+# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
+uv run main.py --platform xhs --lt qrcode --type search
+
+# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
+uv run main.py --platform xhs --lt qrcode --type detail
+
+# 打开对应APP扫二维码登录
+
+# 其他平台爬虫使用示例,执行下面的命令查看
+uv run main.py --help
+```
+
+
+🔗 使用 Python 原生 venv 管理环境(不推荐)
+
+#### 创建并激活 Python 虚拟环境
+
+> 如果是爬取抖音和知乎,需要提前安装 nodejs 环境,版本大于等于:`16` 即可
+
+```shell
+# 进入项目根目录
+cd MediaCrawler
+
+# 创建虚拟环境
+# 我的 python 版本是:3.9.6,requirements.txt 中的库是基于这个版本的
+# 如果是其他 python 版本,可能 requirements.txt 中的库不兼容,需自行解决
+python -m venv venv
+
+# macOS & Linux 激活虚拟环境
+source venv/bin/activate
+
+# Windows 激活虚拟环境
+venv\Scripts\activate
+```
+
+#### 安装依赖库
+
+```shell
+pip install -r requirements.txt
+```
+
+#### 安装 playwright 浏览器驱动
+
+```shell
+playwright install
+```
+
+#### 运行爬虫程序(原生环境)
+
+```shell
+# 项目默认是没有开启评论爬取模式,如需评论请在 config/base_config.py 中的 ENABLE_GET_COMMENTS 变量修改
+# 一些其他支持项,也可以在 config/base_config.py 查看功能,写的有中文注释
+
+# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
+python main.py --platform xhs --lt qrcode --type search
+
+# 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
+python main.py --platform xhs --lt qrcode --type detail
+
+# 打开对应APP扫二维码登录
+
+# 其他平台爬虫使用示例,执行下面的命令查看
+python main.py --help
+```
+
+
+
+
+## 💾 数据保存
+
+支持多种数据存储方式:
+- **CSV 文件**:支持保存到 CSV 中(`data/` 目录下)
+- **JSON 文件**:支持保存到 JSON 中(`data/` 目录下)
+- **数据库存储**
+ - 使用参数 `--init_db` 进行数据库初始化(使用`--init_db`时不需要携带其他optional)
+ - **SQLite 数据库**:轻量级数据库,无需服务器,适合个人使用(推荐)
+ 1. 初始化:`--init_db sqlite`
+ 2. 数据存储:`--save_data_option sqlite`
+ - **MySQL 数据库**:支持关系型数据库 MySQL 中保存(需要提前创建数据库)
+ 1. 初始化:`--init_db mysql`
+ 2. 数据存储:`--save_data_option db`(db 参数为兼容历史更新保留)
+
+
+### 使用示例:
+```shell
+# 初始化 SQLite 数据库(使用'--init_db'时不需要携带其他可选参数)
+uv run main.py --init_db sqlite
+# 使用 SQLite 存储数据(推荐个人用户使用)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
+```
+```shell
+# 初始化 MySQL 数据库
+uv run main.py --init_db mysql
+# 使用 MySQL 存储数据(为适配历史更新,db参数进行沿用)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
+```
+
+
+[🚀 MediaCrawlerPro 重磅发布 🚀!更多的功能,更好的架构设计!](https://github.com/MediaCrawlerPro)
+
+
+### 💬 交流群组
+- **微信交流群**:[点击加入](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
+
+### 📚 其他
+- **常见问题**:[MediaCrawler 完整文档](https://nanmicoder.github.io/MediaCrawler/)
+- **爬虫入门教程**:[CrawlerTutorial 免费教程](https://github.com/NanmiCoder/CrawlerTutorial)
+- **新闻爬虫开源项目**:[NewsCrawlerCollection](https://github.com/NanmiCoder/NewsCrawlerCollection)
+---
+
+### 💰 赞助商展示
+
+
+
+
+豌豆HTTP自营千万级IP资源池,IP纯净度≥99.8%,每日保持IP高频更新,快速响应,稳定连接,满足多种业务场景,支持按需定制,注册免费提取10000ip。
+
+
+---
+
+
+
+
+
+
+
+[TikHub](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 提供超过 **700 个端点**,可用于从 **14+ 个社交媒体平台** 获取与分析数据 —— 包括视频、用户、评论、商店、商品与趋势等,一站式完成所有数据访问与分析。
+
+通过每日签到,可以获取免费额度。可以使用我的注册链接:[https://user.tikhub.io/users/signup?referral_code=cfzyejV9](https://user.tikhub.io/users/signup?referral_code=cfzyejV9&utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 或使用邀请码:`cfzyejV9`,注册并充值即可获得 **$2 免费额度**。
+
+[TikHub](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad) 提供以下服务:
+
+- 🚀 丰富的社交媒体数据接口(TikTok、Douyin、XHS、YouTube、Instagram等)
+- 💎 每日签到免费领取额度
+- ⚡ 高成功率与高并发支持
+- 🌐 官网:[https://tikhub.io/](https://tikhub.io/?utm_source=github.com/NanmiCoder/MediaCrawler&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad)
+- 💻 GitHub地址:[https://github.com/TikHubIO/](https://github.com/TikHubIO/)
+
+---
+
+
+
+
+
+
+Nstbrowser 指纹浏览器 — 多账号运营&自动化管理的最佳解决方案
+
+多账号安全管理与会话隔离;指纹定制结合反检测浏览器环境,兼顾真实度与稳定性;覆盖店铺管理、电商监控、社媒营销、广告验证、Web3、投放监控与联盟营销等业务线;提供生产级并发与定制化企业服务;提供可一键部署的云端浏览器方案,配套全球高质量 IP 池,为您构建长期行业竞争力
+
+[点击此处即刻开始免费使用](https://app.nstbrowser.io/account/register?utm_source=official&utm_term=mediacrawler)
+
+使用 NSTBROWSER 可获得 10% 充值赠礼
+
+
+
+### 🤝 成为赞助者
+
+成为赞助者,可以将您的产品展示在这里,每天获得大量曝光!
+
+**联系方式**:
+- 微信:`relakkes`
+- 邮箱:`relakkes@gmail.com`
+
+---
+
+## ⭐ Star 趋势图
+
+如果这个项目对您有帮助,请给个 ⭐ Star 支持一下,让更多的人看到 MediaCrawler!
+
+[](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
+
+
+
+## 📚 参考
+
+- **小红书客户端**:[ReaJason 的 xhs 仓库](https://github.com/ReaJason/xhs)
+- **短信转发**:[SmsForwarder 参考仓库](https://github.com/pppscn/SmsForwarder)
+- **内网穿透工具**:[ngrok 官方文档](https://ngrok.com/docs/)
+
+
+# 免责声明
+
+
+## 1. 项目目的与性质
+本项目(以下简称“本项目”)是作为一个技术研究与学习工具而创建的,旨在探索和学习网络数据采集技术。本项目专注于自媒体平台的数据爬取技术研究,旨在提供给学习者和研究者作为技术交流之用。
+
+## 2. 法律合规性声明
+本项目开发者(以下简称“开发者”)郑重提醒用户在下载、安装和使用本项目时,严格遵守中华人民共和国相关法律法规,包括但不限于《中华人民共和国网络安全法》、《中华人民共和国反间谍法》等所有适用的国家法律和政策。用户应自行承担一切因使用本项目而可能引起的法律责任。
+
+## 3. 使用目的限制
+本项目严禁用于任何非法目的或非学习、非研究的商业行为。本项目不得用于任何形式的非法侵入他人计算机系统,不得用于任何侵犯他人知识产权或其他合法权益的行为。用户应保证其使用本项目的目的纯属个人学习和技术研究,不得用于任何形式的非法活动。
+
+## 4. 免责声明
+开发者已尽最大努力确保本项目的正当性及安全性,但不对用户使用本项目可能引起的任何形式的直接或间接损失承担责任,包括但不限于由于使用本项目而导致的任何数据丢失、设备损坏、法律诉讼等。
+
+## 5. 知识产权声明
+本项目的知识产权归开发者所有。本项目受到著作权法和国际著作权条约以及其他知识产权法律和条约的保护。用户在遵守本声明及相关法律法规的前提下,可以下载和使用本项目。
+
+## 6. 最终解释权
+关于本项目的最终解释权归开发者所有。开发者保留随时更改或更新本免责声明的权利,恕不另行通知。
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/README_en.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_en.md
new file mode 100644
index 0000000..38b23e3
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_en.md
@@ -0,0 +1,327 @@
+
+
+Special thanks to:
+
+
+
+
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+# 🔥 MediaCrawler - Social Media Platform Crawler 🕷️
+
+
+
+
+
+
+
+[](https://github.com/NanmiCoder/MediaCrawler/stargazers)
+[](https://github.com/NanmiCoder/MediaCrawler/network/members)
+[](https://github.com/NanmiCoder/MediaCrawler/issues)
+[](https://github.com/NanmiCoder/MediaCrawler/pulls)
+[](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
+[](README.md)
+[](README_en.md)
+[](README_es.md)
+
+
+
+> **Disclaimer:**
+>
+> Please use this repository for learning purposes only ⚠️⚠️⚠️⚠️, [Web scraping illegal cases](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China)
+>
+> All content in this repository is for learning and reference purposes only, and commercial use is prohibited. No person or organization may use the content of this repository for illegal purposes or to infringe upon the legitimate rights and interests of others. The web scraping technology involved in this repository is only for learning and research, and may not be used for large-scale crawling of other platforms or other illegal activities. This repository assumes no responsibility for any legal liability arising from the use of its content. By using the content of this repository, you agree to all terms and conditions of this disclaimer.
+>
+> Click to view a more detailed disclaimer. [Click to jump](#disclaimer)
+
+## 📖 Project Introduction
+
+A powerful **multi-platform social media data collection tool** that supports crawling public information from mainstream platforms including Xiaohongshu, Douyin, Kuaishou, Bilibili, Weibo, Tieba, Zhihu, and more.
+
+### 🔧 Technical Principles
+
+- **Core Technology**: Based on [Playwright](https://playwright.dev/) browser automation framework for login and maintaining login state
+- **No JS Reverse Engineering Required**: Uses browser context environment with preserved login state to obtain signature parameters through JS expressions
+- **Advantages**: No need to reverse complex encryption algorithms, significantly lowering the technical barrier
+
+## ✨ Features
+| Platform | Keyword Search | Specific Post ID Crawling | Secondary Comments | Specific Creator Homepage | Login State Cache | IP Proxy Pool | Generate Comment Word Cloud |
+| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
+| Xiaohongshu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Douyin | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+
+
+🔗 🚀 MediaCrawlerPro Major Release! More features, better architectural design!
+
+### 🚀 MediaCrawlerPro Major Release!
+
+> Focus on learning mature project architectural design, not just crawling technology. The code design philosophy of the Pro version is equally worth in-depth study!
+
+[MediaCrawlerPro](https://github.com/MediaCrawlerPro) core advantages over the open-source version:
+
+#### 🎯 Core Feature Upgrades
+- ✅ **Resume crawling functionality** (Key feature)
+- ✅ **Multi-account + IP proxy pool support** (Key feature)
+- ✅ **Remove Playwright dependency**, easier to use
+- ✅ **Complete Linux environment support**
+
+#### 🏗️ Architectural Design Optimization
+- ✅ **Code refactoring optimization**, more readable and maintainable (decoupled JS signature logic)
+- ✅ **Enterprise-level code quality**, suitable for building large-scale crawler projects
+- ✅ **Perfect architectural design**, high scalability, greater source code learning value
+
+#### 🎁 Additional Features
+- ✅ **Social media video downloader desktop app** (suitable for learning full-stack development)
+- ✅ **Multi-platform homepage feed recommendations** (HomeFeed)
+- [ ] **AI Agent based on social media platforms is under development 🚀🚀**
+
+Click to view: [MediaCrawlerPro Project Homepage](https://github.com/MediaCrawlerPro) for more information
+
+
+## 🚀 Quick Start
+
+> 💡 **Open source is not easy. If this project helps you, please give it a ⭐ Star to show your support!**
+
+## 📋 Prerequisites
+
+### 🚀 uv Installation (Recommended)
+
+Before proceeding with the next steps, please ensure that uv is installed on your computer:
+
+- **Installation Guide**: [uv Official Installation Guide](https://docs.astral.sh/uv/getting-started/installation)
+- **Verify Installation**: Enter the command `uv --version` in the terminal. If the version number is displayed normally, the installation was successful
+- **Recommendation Reason**: uv is currently the most powerful Python package management tool, with fast speed and accurate dependency resolution
+
+### 🟢 Node.js Installation
+
+The project depends on Node.js, please download and install from the official website:
+
+- **Download Link**: https://nodejs.org/en/download/
+- **Version Requirement**: >= 16.0.0
+
+### 📦 Python Package Installation
+
+```shell
+# Enter project directory
+cd MediaCrawler
+
+# Use uv sync command to ensure consistency of python version and related dependency packages
+uv sync
+```
+
+### 🌐 Browser Driver Installation
+
+```shell
+# Install browser driver
+uv run playwright install
+```
+
+> **💡 Tip**: MediaCrawler now supports using playwright to connect to your local Chrome browser, solving some issues caused by Webdriver.
+>
+> Currently, `xhs` and `dy` are available using CDP mode to connect to local browsers. If needed, check the configuration items in `config/base_config.py`.
+
+## 🚀 Run Crawler Program
+
+```shell
+# The project does not enable comment crawling mode by default. If you need comments, please modify the ENABLE_GET_COMMENTS variable in config/base_config.py
+# Other supported options are described in config/base_config.py (the comments there are written in Chinese)
+
+# Read keywords from configuration file to search related posts and crawl post information and comments
+uv run main.py --platform xhs --lt qrcode --type search
+
+# Read the list of specified post IDs from the configuration file and fetch the details and comments of those posts
+uv run main.py --platform xhs --lt qrcode --type detail
+
+# Open corresponding APP to scan QR code for login
+
+# For other platform crawler usage examples, execute the following command to view
+uv run main.py --help
+```
+
+
+🔗 Using Python native venv environment management (Not recommended)
+
+#### Create and activate Python virtual environment
+
+> To crawl Douyin or Zhihu, you need to install Node.js in advance (version `16` or later).
+
+```shell
+# Enter project root directory
+cd MediaCrawler
+
+# Create virtual environment
+# My python version is: 3.9.6, the libraries in requirements.txt are based on this version
+# If using other python versions, the libraries in requirements.txt may not be compatible, please resolve on your own
+python -m venv venv
+
+# macOS & Linux activate virtual environment
+source venv/bin/activate
+
+# Windows activate virtual environment
+venv\Scripts\activate
+```
+
+#### Install dependency libraries
+
+```shell
+pip install -r requirements.txt
+```
+
+#### Install playwright browser driver
+
+```shell
+playwright install
+```
+
+#### Run crawler program (native environment)
+
+```shell
+# The project does not enable comment crawling mode by default. If you need comments, please modify the ENABLE_GET_COMMENTS variable in config/base_config.py
+# Other supported options are described in config/base_config.py (the comments there are written in Chinese)
+
+# Read keywords from configuration file to search related posts and crawl post information and comments
+python main.py --platform xhs --lt qrcode --type search
+
+# Read the list of specified post IDs from the configuration file and fetch the details and comments of those posts
+python main.py --platform xhs --lt qrcode --type detail
+
+# Open corresponding APP to scan QR code for login
+
+# For other platform crawler usage examples, execute the following command to view
+python main.py --help
+```
+
+
+
+
+## 💾 Data Storage
+
+Supports multiple data storage methods:
+- **CSV Files**: Supports saving to CSV (under `data/` directory)
+- **JSON Files**: Supports saving to JSON (under `data/` directory)
+- **Database Storage**
+ - Use the `--init_db` parameter for database initialization (when using `--init_db`, no other optional arguments are needed)
+ - **SQLite Database**: Lightweight database, no server required, suitable for personal use (recommended)
+ 1. Initialization: `--init_db sqlite`
+ 2. Data Storage: `--save_data_option sqlite`
+ - **MySQL Database**: Supports saving to relational database MySQL (database needs to be created in advance)
+ 1. Initialization: `--init_db mysql`
+ 2. Data Storage: `--save_data_option db` (the db parameter is retained for compatibility with historical updates)
+
+
+### Usage Examples:
+```shell
+# Initialize SQLite database (when using '--init_db', no other optional arguments are needed)
+uv run main.py --init_db sqlite
+# Use SQLite to store data (recommended for personal users)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
+```
+```shell
+# Initialize MySQL database
+uv run main.py --init_db mysql
+# Use MySQL to store data (the db parameter is retained for compatibility with historical updates)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
+```
+
+---
+
+[🚀 MediaCrawlerPro Major Release 🚀! More features, better architectural design!](https://github.com/MediaCrawlerPro)
+
+## 🤝 Community & Support
+
+### 💬 Discussion Groups
+- **WeChat Discussion Group**: [Click to join](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
+
+### 📚 Documentation & Tutorials
+- **Online Documentation**: [MediaCrawler Complete Documentation](https://nanmicoder.github.io/MediaCrawler/)
+- **Crawler Tutorial**: [CrawlerTutorial Free Tutorial](https://github.com/NanmiCoder/CrawlerTutorial)
+
+
+# Other common questions can be viewed in the online documentation
+>
+> The online documentation includes usage methods, common questions, joining project discussion groups, etc.
+> [MediaCrawler Online Documentation](https://nanmicoder.github.io/MediaCrawler/)
+>
+
+# Author's Knowledge Services
+> If you want to quickly get started and learn the usage of this project, source code architectural design, learn programming technology, or want to understand the source code design of MediaCrawlerPro, you can check out my paid knowledge column.
+
+[Author's Paid Knowledge Column Introduction](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
+
+
+---
+
+## ⭐ Star Trend Chart
+
+If this project helps you, please give a ⭐ Star to support and let more people see MediaCrawler!
+
+[](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
+
+### 💰 Sponsor Display
+
+
+
+
+**Swiftproxy** - 90M+ global high-quality pure residential IPs, register to get free 500MB test traffic, dynamic traffic never expires!
+> Exclusive discount code: **GHB5** Get 10% off instantly!
+
+
+
+### 🤝 Become a Sponsor
+
+Become a sponsor and showcase your product here, getting massive exposure daily!
+
+**Contact Information**:
+- WeChat: `relakkes`
+- Email: `relakkes@gmail.com`
+
+
+## 📚 References
+
+- **Xiaohongshu Client**: [ReaJason's xhs repository](https://github.com/ReaJason/xhs)
+- **SMS Forwarding**: [SmsForwarder reference repository](https://github.com/pppscn/SmsForwarder)
+- **Intranet Penetration Tool**: [ngrok official documentation](https://ngrok.com/docs/)
+
+
+# Disclaimer
+
+
+## 1. Project Purpose and Nature
+This project (hereinafter referred to as "this project") was created as a technical research and learning tool, aimed at exploring and learning network data collection technologies. This project focuses on research into data crawling technologies for social media platforms and is intended to serve learners and researchers as a resource for technical exchange.
+
+## 2. Legal Compliance Statement
+The project developer (hereinafter referred to as "developer") solemnly reminds users to strictly comply with relevant laws and regulations of the People's Republic of China when downloading, installing and using this project, including but not limited to the "Cybersecurity Law of the People's Republic of China", "Counter-Espionage Law of the People's Republic of China" and all applicable national laws and policies. Users shall bear all legal responsibilities that may arise from using this project.
+
+## 3. Usage Purpose Restrictions
+This project is strictly prohibited from being used for any illegal purposes or non-learning, non-research commercial activities. This project may not be used for any form of illegal intrusion into other people's computer systems, nor may it be used for any activities that infringe upon others' intellectual property rights or other legitimate rights and interests. Users should ensure that their use of this project is purely for personal learning and technical research, and may not be used for any form of illegal activities.
+
+## 4. Disclaimer
+The developer has made every effort to ensure the legitimacy and security of this project, but assumes no responsibility for any form of direct or indirect losses that may arise from users' use of this project, including but not limited to any data loss, equipment damage, or legal litigation caused by using this project.
+
+## 5. Intellectual Property Statement
+The intellectual property rights of this project belong to the developer. This project is protected by copyright law and international copyright treaties as well as other intellectual property laws and treaties. Users may download and use this project under the premise of complying with this statement and relevant laws and regulations.
+
+## 6. Final Interpretation Rights
+The developer has the final interpretation rights regarding this project. The developer reserves the right to change or update this disclaimer at any time without further notice.
+
+
+
+## 🙏 Acknowledgments
+
+### JetBrains Open Source License Support
+
+Thanks to JetBrains for providing free open source license support for this project!
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md
new file mode 100644
index 0000000..61e7783
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md
@@ -0,0 +1,327 @@
+
+
+Special thanks to:
+
+
+
+
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+
+# 🔥 MediaCrawler - Rastreador de Plataformas de Redes Sociales 🕷️
+
+
+
+
+
+
+
+[](https://github.com/NanmiCoder/MediaCrawler/stargazers)
+[](https://github.com/NanmiCoder/MediaCrawler/network/members)
+[](https://github.com/NanmiCoder/MediaCrawler/issues)
+[](https://github.com/NanmiCoder/MediaCrawler/pulls)
+[](https://github.com/NanmiCoder/MediaCrawler/blob/main/LICENSE)
+[](README.md)
+[](README_en.md)
+[](README_es.md)
+
+
+
+> **Descargo de responsabilidad:**
+>
+> Por favor, utilice este repositorio únicamente con fines de aprendizaje ⚠️⚠️⚠️⚠️, [Casos ilegales de web scraping](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China)
+>
+> Todo el contenido de este repositorio es únicamente para fines de aprendizaje y referencia, y está prohibido el uso comercial. Ninguna persona u organización puede usar el contenido de este repositorio para propósitos ilegales o para infringir los derechos e intereses legítimos de otros. La tecnología de web scraping involucrada en este repositorio es solo para aprendizaje e investigación, y no puede ser utilizada para rastreo a gran escala de otras plataformas u otras actividades ilegales. Este repositorio no asume ninguna responsabilidad por cualquier responsabilidad legal que surja del uso de su contenido. Al usar el contenido de este repositorio, usted acepta todos los términos y condiciones de este descargo de responsabilidad.
+>
+> Haga clic para ver un descargo de responsabilidad más detallado. [Haga clic para saltar](#disclaimer)
+
+## 📖 Introducción del Proyecto
+
+Una poderosa **herramienta de recolección de datos de redes sociales multiplataforma** que soporta el rastreo de información pública de plataformas principales incluyendo Xiaohongshu, Douyin, Kuaishou, Bilibili, Weibo, Tieba, Zhihu, y más.
+
+### 🔧 Principios Técnicos
+
+- **Tecnología Central**: Basado en el framework de automatización de navegador [Playwright](https://playwright.dev/) para login y mantenimiento del estado de login
+- **No Requiere Ingeniería Inversa de JS**: Utiliza el entorno de contexto del navegador con estado de login preservado para obtener parámetros de firma a través de expresiones JS
+- **Ventajas**: No necesita hacer ingeniería inversa de algoritmos de encriptación complejos, reduciendo significativamente la barrera técnica
+
+## ✨ Características
+| Plataforma | Búsqueda por Palabras Clave | Rastreo de ID de Publicación Específica | Comentarios Secundarios | Página de Inicio de Creador Específico | Caché de Estado de Login | Pool de Proxy IP | Generar Nube de Palabras de Comentarios |
+| ------ | ---------- | -------------- | -------- | -------------- | ---------- | -------- | -------------- |
+| Xiaohongshu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Douyin | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Kuaishou | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+
+
+
+🔗 🚀 ¡Lanzamiento Mayor de MediaCrawlerPro! ¡Más características, mejor diseño arquitectónico!
+
+### 🚀 ¡Lanzamiento Mayor de MediaCrawlerPro!
+
+> Enfócate en aprender el diseño arquitectónico de proyectos maduros, no solo tecnología de rastreo. ¡La filosofía de diseño de código de la versión Pro también vale la pena estudiar en profundidad!
+
+[MediaCrawlerPro](https://github.com/MediaCrawlerPro) ventajas principales sobre la versión de código abierto:
+
+#### 🎯 Actualizaciones de Características Principales
+- ✅ **Funcionalidad de reanudación de rastreo** (Característica clave)
+- ✅ **Soporte de múltiples cuentas + pool de proxy IP** (Característica clave)
+- ✅ **Eliminar dependencia de Playwright**, más fácil de usar
+- ✅ **Soporte completo de entorno Linux**
+
+#### 🏗️ Optimización de Diseño Arquitectónico
+- ✅ **Optimización de refactorización de código**, más legible y mantenible (lógica de firma JS desacoplada)
+- ✅ **Calidad de código de nivel empresarial**, adecuado para construir proyectos de rastreo a gran escala
+- ✅ **Diseño arquitectónico perfecto**, alta escalabilidad, mayor valor de aprendizaje del código fuente
+
+#### 🎁 Características Adicionales
+- ✅ **Aplicación de escritorio descargadora de videos de redes sociales** (adecuada para aprender desarrollo full-stack)
+- ✅ **Recomendaciones de feed de página de inicio multiplataforma** (HomeFeed)
+- [ ] **Agente AI basado en plataformas de redes sociales está en desarrollo 🚀🚀**
+
+Haga clic para ver: [Página de Inicio del Proyecto MediaCrawlerPro](https://github.com/MediaCrawlerPro) para más información
+
+
+## 🚀 Inicio Rápido
+
+> 💡 **¡El código abierto no es fácil, si este proyecto te ayuda, por favor da una ⭐ Estrella para apoyar!**
+
+## 📋 Prerrequisitos
+
+### 🚀 Instalación de uv (Recomendado)
+
+Antes de proceder con los siguientes pasos, por favor asegúrese de que uv esté instalado en su computadora:
+
+- **Guía de Instalación**: [Guía Oficial de Instalación de uv](https://docs.astral.sh/uv/getting-started/installation)
+- **Verificar Instalación**: Ingrese el comando `uv --version` en la terminal. Si el número de versión se muestra normalmente, la instalación fue exitosa
+- **Razón de Recomendación**: uv es actualmente la herramienta de gestión de paquetes Python más poderosa, con velocidad rápida y resolución de dependencias precisa
+
+### 🟢 Instalación de Node.js
+
+El proyecto depende de Node.js, por favor descargue e instale desde el sitio web oficial:
+
+- **Enlace de Descarga**: https://nodejs.org/en/download/
+- **Requisito de Versión**: >= 16.0.0
+
+### 📦 Instalación de Paquetes Python
+
+```shell
+# Entrar al directorio del proyecto
+cd MediaCrawler
+
+# Usar el comando uv sync para asegurar la consistencia de la versión de python y paquetes de dependencias relacionados
+uv sync
+```
+
+### 🌐 Instalación de Controlador de Navegador
+
+```shell
+# Instalar controlador de navegador
+uv run playwright install
+```
+
+> **💡 Consejo**: MediaCrawler ahora soporta usar playwright para conectarse a su navegador Chrome local, resolviendo algunos problemas causados por Webdriver.
+>
+> Actualmente, `xhs` y `dy` están disponibles usando el modo CDP para conectarse a navegadores locales. Si es necesario, verifique los elementos de configuración en `config/base_config.py`.
+
+## 🚀 Ejecutar Programa Rastreador
+
+```shell
+# El proyecto no habilita el modo de rastreo de comentarios por defecto. Si necesita comentarios, por favor modifique la variable ENABLE_GET_COMMENTS en config/base_config.py
+# Otras opciones soportadas también pueden verse en config/base_config.py con comentarios en chino
+
+# Leer palabras clave del archivo de configuración para buscar publicaciones relacionadas y rastrear información de publicaciones y comentarios
+uv run main.py --platform xhs --lt qrcode --type search
+
+# Leer lista de ID de publicaciones específicas del archivo de configuración para obtener información e información de comentarios de publicaciones específicas
+uv run main.py --platform xhs --lt qrcode --type detail
+
+# Abrir la APP correspondiente para escanear código QR para login
+
+# Para ejemplos de uso de rastreador de otras plataformas, ejecute el siguiente comando para ver
+uv run main.py --help
+```
+
+
+🔗 Usando gestión de entorno venv nativo de Python (No recomendado)
+
+#### Crear y activar entorno virtual de Python
+
+> Si rastrea Douyin y Zhihu, necesita instalar el entorno nodejs con anticipación, versión mayor o igual a: `16`
+
+```shell
+# Entrar al directorio raíz del proyecto
+cd MediaCrawler
+
+# Crear entorno virtual
+# Mi versión de python es: 3.9.6, las librerías en requirements.txt están basadas en esta versión
+# Si usa otras versiones de python, las librerías en requirements.txt pueden no ser compatibles, por favor resuelva por su cuenta
+python -m venv venv
+
+# macOS & Linux activar entorno virtual
+source venv/bin/activate
+
+# Windows activar entorno virtual
+venv\Scripts\activate
+```
+
+#### Instalar librerías de dependencias
+
+```shell
+pip install -r requirements.txt
+```
+
+#### Instalar controlador de navegador playwright
+
+```shell
+playwright install
+```
+
+#### Ejecutar programa rastreador (entorno nativo)
+
+```shell
+# El proyecto no habilita el modo de rastreo de comentarios por defecto. Si necesita comentarios, por favor modifique la variable ENABLE_GET_COMMENTS en config/base_config.py
+# Otras opciones soportadas también pueden verse en config/base_config.py con comentarios en chino
+
+# Leer palabras clave del archivo de configuración para buscar publicaciones relacionadas y rastrear información de publicaciones y comentarios
+python main.py --platform xhs --lt qrcode --type search
+
+# Leer lista de ID de publicaciones específicas del archivo de configuración para obtener información e información de comentarios de publicaciones específicas
+python main.py --platform xhs --lt qrcode --type detail
+
+# Abrir la APP correspondiente para escanear código QR para login
+
+# Para ejemplos de uso de rastreador de otras plataformas, ejecute el siguiente comando para ver
+python main.py --help
+```
+
+
+
+
+## 💾 Almacenamiento de Datos
+
+Soporta múltiples métodos de almacenamiento de datos:
+- **Archivos CSV**: Soporta guardar en CSV (bajo el directorio `data/`)
+- **Archivos JSON**: Soporta guardar en JSON (bajo el directorio `data/`)
+- **Almacenamiento en Base de Datos**
+ - Use el parámetro `--init_db` para la inicialización de la base de datos (cuando use `--init_db`, no se necesitan otros argumentos opcionales)
+ - **Base de Datos SQLite**: Base de datos ligera, no requiere servidor, adecuada para uso personal (recomendado)
+ 1. Inicialización: `--init_db sqlite`
+ 2. Almacenamiento de Datos: `--save_data_option sqlite`
+ - **Base de Datos MySQL**: Soporta guardar en la base de datos relacional MySQL (la base de datos debe crearse con anticipación)
+ 1. Inicialización: `--init_db mysql`
+ 2. Almacenamiento de Datos: `--save_data_option db` (el parámetro db se mantiene por compatibilidad con actualizaciones históricas)
+
+
+### Ejemplos de Uso:
+```shell
+# Inicializar la base de datos SQLite (cuando use '--init_db', no se necesitan otros argumentos opcionales)
+uv run main.py --init_db sqlite
+# Usar SQLite para almacenar datos (recomendado para usuarios personales)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
+```
+```shell
+# Inicializar la base de datos MySQL
+uv run main.py --init_db mysql
+# Usar MySQL para almacenar datos (el parámetro db se mantiene por compatibilidad con actualizaciones históricas)
+uv run main.py --platform xhs --lt qrcode --type search --save_data_option db
+```
+
+---
+
+[🚀 ¡Lanzamiento Mayor de MediaCrawlerPro 🚀! ¡Más características, mejor diseño arquitectónico!](https://github.com/MediaCrawlerPro)
+
+## 🤝 Comunidad y Soporte
+
+### 💬 Grupos de Discusión
+- **Grupo de Discusión WeChat**: [Haga clic para unirse](https://nanmicoder.github.io/MediaCrawler/%E5%BE%AE%E4%BF%A1%E4%BA%A4%E6%B5%81%E7%BE%A4.html)
+
+### 📚 Documentación y Tutoriales
+- **Documentación en Línea**: [Documentación Completa de MediaCrawler](https://nanmicoder.github.io/MediaCrawler/)
+- **Tutorial de Rastreador**: [Tutorial Gratuito CrawlerTutorial](https://github.com/NanmiCoder/CrawlerTutorial)
+
+
+# Otras preguntas comunes pueden verse en la documentación en línea
+>
+> La documentación en línea incluye métodos de uso, preguntas comunes, unirse a grupos de discusión del proyecto, etc.
+> [Documentación en Línea de MediaCrawler](https://nanmicoder.github.io/MediaCrawler/)
+>
+
+# Servicios de Conocimiento del Autor
+> Si quiere comenzar rápidamente y aprender el uso de este proyecto, diseño arquitectónico del código fuente, aprender tecnología de programación, o quiere entender el diseño del código fuente de MediaCrawlerPro, puede revisar mi columna de conocimiento pagado.
+
+[Introducción de la Columna de Conocimiento Pagado del Autor](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
+
+
+---
+
+## ⭐ Gráfico de Tendencia de Estrellas
+
+¡Si este proyecto le resulta útil, por favor dele una ⭐ Estrella para apoyarlo y para que más personas conozcan MediaCrawler!
+
+[](https://star-history.com/#NanmiCoder/MediaCrawler&Date)
+
+### 💰 Exhibición de Patrocinadores
+
+
+
+
+**Swiftproxy** - ¡90M+ IPs residenciales puras de alta calidad globales, regístrese para obtener 500MB de tráfico de prueba gratuito, el tráfico dinámico nunca expira!
+> Código de descuento exclusivo: **GHB5** ¡Obtenga 10% de descuento instantáneamente!
+
+
+### 🤝 Conviértase en Patrocinador
+
+¡Conviértase en patrocinador y muestre su producto aquí, obteniendo exposición masiva diariamente!
+
+**Información de Contacto**:
+- WeChat: `relakkes`
+- Email: `relakkes@gmail.com`
+
+
+## 📚 Referencias
+
+- **Cliente Xiaohongshu**: [Repositorio xhs de ReaJason](https://github.com/ReaJason/xhs)
+- **Reenvío de SMS**: [Repositorio de referencia SmsForwarder](https://github.com/pppscn/SmsForwarder)
+- **Herramienta de Penetración de Intranet**: [Documentación oficial de ngrok](https://ngrok.com/docs/)
+
+
+# Descargo de Responsabilidad
+
+
+## 1. Propósito y Naturaleza del Proyecto
+Este proyecto (en adelante, "este proyecto") fue creado como una herramienta de investigación técnica y aprendizaje, con el objetivo de explorar y aprender tecnologías de recolección de datos de red. Se centra en la investigación de tecnologías de rastreo de datos en plataformas de redes sociales y está destinado al intercambio técnico entre estudiantes e investigadores.
+
+## 2. Declaración de Cumplimiento Legal
+El desarrollador del proyecto (en adelante denominado "desarrollador") recuerda solemnemente a los usuarios que cumplan estrictamente con las leyes y regulaciones relevantes de la República Popular China al descargar, instalar y usar este proyecto, incluyendo pero no limitado a la "Ley de Ciberseguridad de la República Popular China", "Ley de Contraespionaje de la República Popular China" y todas las leyes y políticas nacionales aplicables. Los usuarios deberán asumir todas las responsabilidades legales que puedan surgir del uso de este proyecto.
+
+## 3. Restricciones de Propósito de Uso
+Queda estrictamente prohibido utilizar este proyecto para cualquier propósito ilegal o para actividades comerciales ajenas al aprendizaje o la investigación. Este proyecto no puede utilizarse para ninguna forma de intrusión ilegal en sistemas informáticos ajenos, ni para actividades que infrinjan los derechos de propiedad intelectual u otros derechos e intereses legítimos de terceros. Los usuarios deben asegurarse de que su uso de este proyecto sea exclusivamente para aprendizaje personal e investigación técnica, y no para ninguna forma de actividad ilegal.
+
+## 4. Descargo de Responsabilidad
+El desarrollador ha hecho todos los esfuerzos posibles para asegurar la legitimidad y seguridad de este proyecto, pero no asume responsabilidad por ninguna pérdida directa o indirecta que pueda surgir de su uso por parte de los usuarios, incluyendo, entre otras, cualquier pérdida de datos, daño de equipos o litigio derivado del uso de este proyecto.
+
+## 5. Declaración de Propiedad Intelectual
+Los derechos de propiedad intelectual de este proyecto pertenecen al desarrollador. Este proyecto está protegido por la ley de derechos de autor y tratados internacionales de derechos de autor, así como otras leyes y tratados de propiedad intelectual. Los usuarios pueden descargar y usar este proyecto bajo la premisa de cumplir con esta declaración y las leyes y regulaciones relevantes.
+
+## 6. Derechos de Interpretación Final
+El desarrollador tiene los derechos de interpretación final con respecto a este proyecto. El desarrollador se reserva el derecho de cambiar o actualizar este descargo de responsabilidad en cualquier momento sin previo aviso.
+
+
+
+## 🙏 Agradecimientos
+
+### Soporte de Licencia de Código Abierto de JetBrains
+
+¡Gracias a JetBrains por proporcionar soporte de licencia de código abierto gratuito para este proyecto!
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py
deleted file mode 100644
index 33859fa..0000000
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-
-# -*- coding: utf-8 -*-
-# @Author : relakkes@gmail.com
-# @Time : 2024/4/6 14:21
-# @Desc : 异步Aiomysql的增删改查封装
-from typing import Any, Dict, List, Union
-
-import aiomysql
-
-
-class AsyncMysqlDB:
- def __init__(self, pool: aiomysql.Pool) -> None:
- self.__pool = pool
-
- async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
- """
- 从给定的 SQL 中查询记录,返回的是一个列表
- :param sql: 查询的sql
- :param args: sql中传递动态参数列表
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, args)
- data = await cur.fetchall()
- return data or []
-
- async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
- """
- 从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
- :param sql: 查询的sql
- :param args:sql中传递动态参数列表
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, args)
- data = await cur.fetchone()
- return data
-
- async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
- """
- 表中插入数据
- :param table_name: 表名
- :param item: 一条记录的字典信息
- :return:
- """
- fields = list(item.keys())
- values = list(item.values())
- fields = [f'`{field}`' for field in fields]
- fieldstr = ','.join(fields)
- valstr = ','.join(['%s'] * len(item))
- sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, values)
- lastrowid = cur.lastrowid
- return lastrowid
-
- async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
- value_where: Union[str, int, float]) -> int:
- """
- 更新指定表的记录
- :param table_name: 表名
- :param updates: 需要更新的字段和值的 key - value 映射
- :param field_where: update 语句 where 条件中的字段名
- :param value_where: update 语句 where 条件中的字段值
- :return:
- """
- upsets = []
- values = []
- for k, v in updates.items():
- s = '`%s`=%%s' % k
- upsets.append(s)
- values.append(v)
- upsets = ','.join(upsets)
- sql = 'UPDATE %s SET %s WHERE %s="%s"' % (
- table_name,
- upsets,
- field_where, value_where,
- )
- async with self.__pool.acquire() as conn:
- async with conn.cursor() as cur:
- rows = await cur.execute(sql, values)
- return rows
-
- async def execute(self, sql: str, *args: Union[str, int]) -> int:
- """
- 需要更新、写入等操作的 excute 执行语句
- :param sql:
- :param args:
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor() as cur:
- rows = await cur.execute(sql, args)
- return rows
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py
deleted file mode 100644
index d9409bd..0000000
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-
-# -*- coding: utf-8 -*-
-# @Author : relakkes@gmail.com
-# @Time : 2024/4/6 14:21
-# @Desc : 异步SQLite的增删改查封装
-from typing import Any, Dict, List, Union
-
-import aiosqlite
-
-
-class AsyncSqliteDB:
- def __init__(self, db_path: str) -> None:
- self.__db_path = db_path
-
- async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
- """
- 从给定的 SQL 中查询记录,返回的是一个列表
- :param sql: 查询的sql
- :param args: sql中传递动态参数列表
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- conn.row_factory = aiosqlite.Row
- async with conn.execute(sql, args) as cursor:
- rows = await cursor.fetchall()
- return [dict(row) for row in rows] if rows else []
-
- async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
- """
- 从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
- :param sql: 查询的sql
- :param args:sql中传递动态参数列表
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- conn.row_factory = aiosqlite.Row
- async with conn.execute(sql, args) as cursor:
- row = await cursor.fetchone()
- return dict(row) if row else None
-
- async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
- """
- 表中插入数据
- :param table_name: 表名
- :param item: 一条记录的字典信息
- :return:
- """
- fields = list(item.keys())
- values = list(item.values())
- fieldstr = ','.join(fields)
- valstr = ','.join(['?'] * len(item))
- sql = f"INSERT INTO {table_name} ({fieldstr}) VALUES({valstr})"
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, values) as cursor:
- await conn.commit()
- return cursor.lastrowid
-
- async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
- value_where: Union[str, int, float]) -> int:
- """
- 更新指定表的记录
- :param table_name: 表名
- :param updates: 需要更新的字段和值的 key - value 映射
- :param field_where: update 语句 where 条件中的字段名
- :param value_where: update 语句 where 条件中的字段值
- :return:
- """
- upsets = []
- values = []
- for k, v in updates.items():
- upsets.append(f'{k}=?')
- values.append(v)
- upsets_str = ','.join(upsets)
- values.append(value_where)
- sql = f'UPDATE {table_name} SET {upsets_str} WHERE {field_where}=?'
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, values) as cursor:
- await conn.commit()
- return cursor.rowcount
-
- async def execute(self, sql: str, *args: Union[str, int]) -> int:
- """
- 需要更新、写入等操作的 excute 执行语句
- :param sql:
- :param args:
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, args) as cursor:
- await conn.commit()
- return cursor.rowcount
-
- async def executescript(self, sql_script: str) -> None:
- """
- 执行SQL脚本,用于初始化数据库表结构
- :param sql_script: SQL脚本内容
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- await conn.executescript(sql_script)
- await conn.commit()
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
index 12643ee..0fa375b 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
@@ -1,55 +1,259 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-import argparse
+from __future__ import annotations
+
+
+import sys
+from enum import Enum
+from types import SimpleNamespace
+from typing import Iterable, Optional, Sequence, Type, TypeVar
+
+import typer
+from typing_extensions import Annotated
import config
from tools.utils import str2bool
-async def parse_cmd():
- # 读取command arg
- parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
- parser.add_argument('--platform', type=str,
- help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
- choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
- parser.add_argument('--lt', type=str,
- help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
- choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
- parser.add_argument('--type', type=str,
- help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
- choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
- parser.add_argument('--start', type=int,
- help='Number of start page / 起始页码', default=config.START_PAGE)
- parser.add_argument('--keywords', type=str,
- help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
- parser.add_argument('--get_comment', type=str2bool,
- help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
- parser.add_argument('--get_sub_comment', type=str2bool,
- help=''''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
- parser.add_argument('--save_data_option', type=str,
- help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
- choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
- parser.add_argument('--cookies', type=str,
- help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)
+EnumT = TypeVar("EnumT", bound=Enum)
- args = parser.parse_args()
- # override config
- config.PLATFORM = args.platform
- config.LOGIN_TYPE = args.lt
- config.CRAWLER_TYPE = args.type
- config.START_PAGE = args.start
- config.KEYWORDS = args.keywords
- config.ENABLE_GET_COMMENTS = args.get_comment
- config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
- config.SAVE_DATA_OPTION = args.save_data_option
- config.COOKIES = args.cookies
+class PlatformEnum(str, Enum):
+ """支持的媒体平台枚举"""
+
+ XHS = "xhs"
+ DOUYIN = "dy"
+ KUAISHOU = "ks"
+ BILIBILI = "bili"
+ WEIBO = "wb"
+ TIEBA = "tieba"
+ ZHIHU = "zhihu"
+
+
+class LoginTypeEnum(str, Enum):
+ """登录方式枚举"""
+
+ QRCODE = "qrcode"
+ PHONE = "phone"
+ COOKIE = "cookie"
+
+
+class CrawlerTypeEnum(str, Enum):
+ """爬虫类型枚举"""
+
+ SEARCH = "search"
+ DETAIL = "detail"
+ CREATOR = "creator"
+
+
+class SaveDataOptionEnum(str, Enum):
+ """数据保存方式枚举"""
+
+ CSV = "csv"
+ DB = "db"
+ JSON = "json"
+ SQLITE = "sqlite"
+ POSTGRESQL = "postgresql"
+
+
+class InitDbOptionEnum(str, Enum):
+ """数据库初始化选项"""
+
+ SQLITE = "sqlite"
+ MYSQL = "mysql"
+ POSTGRESQL = "postgresql"
+
+
+def _to_bool(value: bool | str) -> bool:
+ if isinstance(value, bool):
+ return value
+ return str2bool(value)
+
+
+def _coerce_enum(
+ enum_cls: Type[EnumT],
+ value: EnumT | str,
+ default: EnumT,
+) -> EnumT:
+ """Safely convert a raw config value to an enum member."""
+
+ if isinstance(value, enum_cls):
+ return value
+
+ try:
+ return enum_cls(value)
+ except ValueError:
+ typer.secho(
+ f"⚠️ 配置值 '{value}' 不在 {enum_cls.__name__} 支持的范围内,已回退到默认值 '{default.value}'.",
+ fg=typer.colors.YELLOW,
+ )
+ return default
+
+
+def _normalize_argv(argv: Optional[Sequence[str]]) -> Iterable[str]:
+ if argv is None:
+ return list(sys.argv[1:])
+ return list(argv)
+
+
+def _inject_init_db_default(args: Sequence[str]) -> list[str]:
+ """Ensure bare --init_db defaults to sqlite for backward compatibility."""
+
+ normalized: list[str] = []
+ i = 0
+ while i < len(args):
+ arg = args[i]
+ normalized.append(arg)
+
+ if arg == "--init_db":
+ next_arg = args[i + 1] if i + 1 < len(args) else None
+ if not next_arg or next_arg.startswith("-"):
+ normalized.append(InitDbOptionEnum.SQLITE.value)
+ i += 1
+
+ return normalized
+
+
+async def parse_cmd(argv: Optional[Sequence[str]] = None):
+ """使用 Typer 解析命令行参数。"""
+
+ app = typer.Typer(add_completion=False)
+
+ @app.callback(invoke_without_command=True)
+ def main(
+ platform: Annotated[
+ PlatformEnum,
+ typer.Option(
+ "--platform",
+ help="媒体平台选择 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)",
+ rich_help_panel="基础配置",
+ ),
+ ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
+ lt: Annotated[
+ LoginTypeEnum,
+ typer.Option(
+ "--lt",
+ help="登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)",
+ rich_help_panel="账号配置",
+ ),
+ ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
+ crawler_type: Annotated[
+ CrawlerTypeEnum,
+ typer.Option(
+ "--type",
+ help="爬取类型 (search=搜索 | detail=详情 | creator=创作者)",
+ rich_help_panel="基础配置",
+ ),
+ ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
+ start: Annotated[
+ int,
+ typer.Option(
+ "--start",
+ help="起始页码",
+ rich_help_panel="基础配置",
+ ),
+ ] = config.START_PAGE,
+ keywords: Annotated[
+ str,
+ typer.Option(
+ "--keywords",
+ help="请输入关键词,多个关键词用逗号分隔",
+ rich_help_panel="基础配置",
+ ),
+ ] = config.KEYWORDS,
+ get_comment: Annotated[
+ str,
+ typer.Option(
+ "--get_comment",
+ help="是否爬取一级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
+ rich_help_panel="评论配置",
+ show_default=True,
+ ),
+ ] = str(config.ENABLE_GET_COMMENTS),
+ get_sub_comment: Annotated[
+ str,
+ typer.Option(
+ "--get_sub_comment",
+ help="是否爬取二级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
+ rich_help_panel="评论配置",
+ show_default=True,
+ ),
+ ] = str(config.ENABLE_GET_SUB_COMMENTS),
+ save_data_option: Annotated[
+ SaveDataOptionEnum,
+ typer.Option(
+ "--save_data_option",
+ help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | postgresql=PostgreSQL数据库)",
+ rich_help_panel="存储配置",
+ ),
+ ] = _coerce_enum(
+ SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
+ ),
+ init_db: Annotated[
+ Optional[InitDbOptionEnum],
+ typer.Option(
+ "--init_db",
+ help="初始化数据库表结构 (sqlite | mysql | postgresql)",
+ rich_help_panel="存储配置",
+ ),
+ ] = None,
+ cookies: Annotated[
+ str,
+ typer.Option(
+ "--cookies",
+ help="Cookie 登录方式使用的 Cookie 值",
+ rich_help_panel="账号配置",
+ ),
+ ] = config.COOKIES,
+ ) -> SimpleNamespace:
+ """MediaCrawler 命令行入口"""
+
+ enable_comment = _to_bool(get_comment)
+ enable_sub_comment = _to_bool(get_sub_comment)
+ init_db_value = init_db.value if init_db else None
+
+ # override global config
+ config.PLATFORM = platform.value
+ config.LOGIN_TYPE = lt.value
+ config.CRAWLER_TYPE = crawler_type.value
+ config.START_PAGE = start
+ config.KEYWORDS = keywords
+ config.ENABLE_GET_COMMENTS = enable_comment
+ config.ENABLE_GET_SUB_COMMENTS = enable_sub_comment
+ config.SAVE_DATA_OPTION = save_data_option.value
+ config.COOKIES = cookies
+
+ return SimpleNamespace(
+ platform=config.PLATFORM,
+ lt=config.LOGIN_TYPE,
+ type=config.CRAWLER_TYPE,
+ start=config.START_PAGE,
+ keywords=config.KEYWORDS,
+ get_comment=config.ENABLE_GET_COMMENTS,
+ get_sub_comment=config.ENABLE_GET_SUB_COMMENTS,
+ save_data_option=config.SAVE_DATA_OPTION,
+ init_db=init_db_value,
+ cookies=config.COOKIES,
+ )
+
+ command = typer.main.get_command(app)
+
+ cli_args = _normalize_argv(argv)
+ cli_args = _inject_init_db_default(cli_args)
+
+ try:
+ result = command.main(args=cli_args, standalone_mode=False)
+ if isinstance(result, int): # help/options handled by Typer; propagate exit code
+ raise SystemExit(result)
+ return result
+ except typer.Exit as exc: # pragma: no cover - CLI exit paths
+ raise SystemExit(exc.exit_code) from exc
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
index eb3f161..1c1e97c 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
@@ -10,5 +10,4 @@
from .base_config import *
-from .db_config import *
-from .tieba_config import *
\ No newline at end of file
+from .db_config import *
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
index 70665b4..dbea153 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
@@ -9,11 +9,12 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置
-PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
-KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪" # 关键词搜索配置,以英文逗号分隔
+PLATFORM = "bili" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
+KEYWORDS = "电影鬼灭之刃,亲属想侵吞3姐妹亡父赔偿款,网警斩断侵害未成年人网络黑色产业链,2007年后出生的人不能在马尔代夫吸烟,沈月,是公主也是自己的骑士,以军虐囚视频,唐朝诡事录,广州地铁回应APP乘车码频繁弹窗广告,全红婵的减肥计划精确到克" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
+
# 是否开启 IP 代理
ENABLE_IP_PROXY = False
@@ -36,7 +37,7 @@ SAVE_LOGIN_STATE = True
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True
# CDP调试端口,用于与浏览器通信
# 如果端口被占用,系统会自动尝试下一个可用端口
@@ -59,8 +60,8 @@ BROWSER_LAUNCH_TIMEOUT = 30
# 设置为False可以保持浏览器运行,便于调试
AUTO_CLOSE_BROWSER = True
-# 数据保存类型选项配置,支持四种类型:csv、db、json、sqlite, 最好保存到DB,有排重的功能。
-SAVE_DATA_OPTION = "db" # csv or db or json or sqlite
+# 数据保存类型选项配置,支持五种类型:csv、db、json、sqlite、postgresql, 最好保存到DB,有排重的功能。
+SAVE_DATA_OPTION = "postgresql" # csv or db or json or sqlite or postgresql
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@@ -69,7 +70,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
START_PAGE = 1
# 爬取视频/帖子的数量控制
-CRAWLER_MAX_NOTES_COUNT = 10
+CRAWLER_MAX_NOTES_COUNT = 5
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
index 2b516b4..779ab75 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
@@ -13,16 +13,23 @@
# 每天爬取视频/帖子的数量控制
MAX_NOTES_PER_DAY = 1
-# 指定B站视频ID列表
+# 指定B站视频URL列表 (支持完整URL或BV号)
+# 示例:
+# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
+# - BV号: "BV1d54y1g7db"
BILI_SPECIFIED_ID_LIST = [
- "BV1d54y1g7db",
+ "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
-# 指定B站用户ID列表
+# 指定B站创作者URL列表 (支持完整URL或UID)
+# 示例:
+# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
+# - UID: "20813884"
BILI_CREATOR_ID_LIST = [
+ "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
"20813884",
# ........................
]
@@ -34,6 +41,11 @@ END_DAY = "2024-01-01"
# 搜索模式
BILI_SEARCH_MODE = "normal"
+# 视频清晰度(qn)配置,常见取值:
+# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p高码率, 116=1080p60, 120=4K
+# 注意:更高清晰度需要账号/视频本身支持
+BILI_QN = 80
+
# 是否爬取用户信息
CREATOR_MODE = True
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
index fd85c35..0b6d45b 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
@@ -12,11 +12,19 @@
import os
# mysql config - 使用MindSpider的数据库配置
-MYSQL_DB_PWD = "mneDccc7sHHANtFk"
-MYSQL_DB_USER = "root"
-MYSQL_DB_HOST = "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com"
-MYSQL_DB_PORT = 3306
-MYSQL_DB_NAME = "mindspider"
+MYSQL_DB_PWD = "bettafish"
+MYSQL_DB_USER = "bettafish"
+MYSQL_DB_HOST = "127.0.0.1"
+MYSQL_DB_PORT = 5444
+MYSQL_DB_NAME = "bettafish"
+
+mysql_db_config = {
+ "user": MYSQL_DB_USER,
+ "password": MYSQL_DB_PWD,
+ "host": MYSQL_DB_HOST,
+ "port": MYSQL_DB_PORT,
+ "db_name": MYSQL_DB_NAME,
+}
# redis config
@@ -30,4 +38,24 @@ CACHE_TYPE_REDIS = "redis"
CACHE_TYPE_MEMORY = "memory"
# sqlite config
-SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
\ No newline at end of file
+SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
+
+sqlite_db_config = {
+ "db_path": SQLITE_DB_PATH
+}
+
+# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
+POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "bettafish")
+POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "bettafish")
+POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "127.0.0.1")
+POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5444")
+POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "bettafish")
+
+postgresql_db_config = {
+ "user": POSTGRESQL_DB_USER,
+ "password": POSTGRESQL_DB_PWD,
+ "host": POSTGRESQL_DB_HOST,
+ "port": POSTGRESQL_DB_PORT,
+ "db_name": POSTGRESQL_DB_NAME,
+}
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
index b974dca..cd36065 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
@@ -11,15 +11,27 @@
# 抖音平台配置
PUBLISH_TIME_TYPE = 0
-# 指定DY视频ID列表
+# 指定DY视频URL列表 (支持多种格式)
+# 支持格式:
+# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
+# 2. 带modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
+# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
+# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
+# 5. 纯视频ID: "7280854932641664319"
DY_SPECIFIED_ID_LIST = [
- "7280854932641664319",
- "7202432992642387233",
+ "https://www.douyin.com/video/7525538910311632128",
+ "https://v.douyin.com/drIPtQ_WPWY/",
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525538910311632128",
+ "7202432992642387233",
# ........................
]
-# 指定DY用户ID列表
+# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
+# 支持格式:
+# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
+# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
DY_CREATOR_ID_LIST = [
- "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
+ "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
index 962b457..d84d4a7 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
@@ -10,11 +10,22 @@
# 快手平台配置
-# 指定快手视频ID列表
-KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
+# 指定快手视频URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
+# 2. 纯视频ID: "3xf8enb8dbj6uig"
+KS_SPECIFIED_ID_LIST = [
+ "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
+ "3xf8enb8dbj6uig",
+ # ........................
+]
-# 指定快手用户ID列表
+# 指定快手创作者URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
+# 2. 纯user_id: "3x4sm73aye7jq7i"
KS_CREATOR_ID_LIST = [
+ "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
"3x4sm73aye7jq7i",
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
index 485277a..2359b96 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
@@ -17,12 +17,16 @@ SORT_TYPE = "popularity_descending"
# 指定笔记URL列表, 必须要携带xsec_token参数
XHS_SPECIFIED_NOTE_URL_LIST = [
- "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+ "https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed",  # trailing comma: a URL added after this one must not be implicitly concatenated onto it
# ........................
]
-# 指定用户ID列表
+# 指定创作者URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
+# 2. 纯user_id: "63e36c9a000000002703502b"
XHS_CREATOR_ID_LIST = [
- "63e36c9a000000002703502b",
+ "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",  # full profile-URL form carrying xsec_token and xsec_source
+ "63e36c9a000000002703502b",  # bare user_id form
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py
new file mode 100644
index 0000000..68a651b
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py
@@ -0,0 +1,35 @@
+# persist-1
+# 原因:将 db.py 改造为模块,移除直接执行入口,修复相对导入问题。
+# 副作用:无
+# 回滚策略:还原此文件。
+import asyncio
+import sys
+from pathlib import Path
+
+# Add project root to sys.path
+project_root = Path(__file__).resolve().parents[1]  # parents[1] is the directory one level above this file's own directory
+if str(project_root) not in sys.path:  # skip when the entry is already present
+ sys.path.append(str(project_root))  # appended (not prepended), so earlier sys.path entries keep priority for the bare `tools` / `database` imports that follow
+
+from tools import utils
+from database.db_session import create_tables
+
+async def init_table_schema(db_type: str):
+ """
+ Initializes the database table schema.
+ This will create tables based on the ORM models.
+ Args:
+ db_type: The type of database, 'sqlite', 'mysql', or 'postgresql'.
+ """
+ utils.logger.info(f"[init_table_schema] begin init {db_type} table schema ...")
+ await create_tables(db_type)
+ utils.logger.info(f"[init_table_schema] {db_type} table schema init successful")
+
+async def init_db(db_type: str = None):
+ await init_table_schema(db_type)
+
+async def close():
+ """
+ Placeholder for closing database connections if needed in the future.
+ """
+ pass
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db_session.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db_session.py
new file mode 100644
index 0000000..64e2647
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db_session.py
@@ -0,0 +1,87 @@
+from contextlib import asynccontextmanager
+from typing import AsyncIterator, Optional
+
+from sqlalchemy import text
+from sqlalchemy.engine import URL
+from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
+from sqlalchemy.orm import sessionmaker
+
+import config
+from config.db_config import mysql_db_config, sqlite_db_config, postgresql_db_config
+from .models import Base
+
+# Keep a cache of engines
+_engines = {}  # maps a db_type string to the engine get_async_engine created for it
+
+
+async def create_database_if_not_exists(db_type: str):
+ if db_type == "mysql" or db_type == "db":
+ # Connect to the server without a database
+ server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}"
+ engine = create_async_engine(server_url, echo=False)
+ async with engine.connect() as conn:
+ await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
+ await engine.dispose()
+ elif db_type == "postgresql":
+ # Connect to PostgreSQL default database (postgres) to create target database
+ server_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/postgres"
+ engine = create_async_engine(server_url, echo=False, isolation_level="AUTOCOMMIT")
+ async with engine.connect() as conn:
+ # PostgreSQL uses different syntax - check if database exists first
+ result = await conn.execute(
+ text(f"SELECT 1 FROM pg_database WHERE datname = '{postgresql_db_config['db_name']}'")
+ )
+ exists = result.scalar() is not None
+ if not exists:
+ # Set autocommit for CREATE DATABASE
+ await conn.commit()
+ await conn.execute(text(f"CREATE DATABASE {postgresql_db_config['db_name']}"))
+ await engine.dispose()
+
+
+def get_async_engine(db_type: str = None):
+ if db_type is None:
+ db_type = config.SAVE_DATA_OPTION
+
+ if db_type in _engines:
+ return _engines[db_type]
+
+ if db_type in ["json", "csv"]:
+ return None
+
+ if db_type == "sqlite":
+ db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
+ elif db_type == "mysql" or db_type == "db":
+ db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
+ elif db_type == "postgresql":
+ db_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
+ else:
+ raise ValueError(f"Unsupported database type: {db_type}")
+
+ engine = create_async_engine(db_url, echo=False)
+ _engines[db_type] = engine
+ return engine
+
+
+async def create_tables(db_type: str = None):
+ if db_type is None:
+ db_type = config.SAVE_DATA_OPTION
+ await create_database_if_not_exists(db_type)
+ engine = get_async_engine(db_type)
+ if engine:
+ async with engine.begin() as conn:
+ await conn.run_sync(Base.metadata.create_all)
+
+
+@asynccontextmanager
+async def get_session() -> AsyncSession:
+ engine = get_async_engine(config.SAVE_DATA_OPTION)
+ if not engine:
+ yield None
+ return
+ AsyncSessionFactory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
+ session = AsyncSessionFactory()
+ try:
+ yield session
+ await session.commit()
+ except Exception as e:
+ await session.rollback()
+ raise e
+ finally:
+ await session.close()
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py
new file mode 100644
index 0000000..a22c7be
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py
@@ -0,0 +1,434 @@
+from sqlalchemy import create_engine, Column, Integer, Text, String, BigInteger
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+
+Base = declarative_base()  # shared declarative base; every model class in this module subclasses it
+
+class BilibiliVideo(Base):  # ORM model mapped to the 'bilibili_video' table
+ __tablename__ = 'bilibili_video'
+ id = Column(Integer, primary_key=True)  # integer primary key, separate from video_id
+ video_id = Column(BigInteger, nullable=False, index=True, unique=True)  # required, indexed, and unique
+ video_url = Column(Text, nullable=False)
+ user_id = Column(BigInteger, index=True)
+ nickname = Column(Text)
+ avatar = Column(Text)
+ liked_count = Column(Integer)  # Integer here, whereas the other *_count columns of this table are Text
+ add_ts = Column(BigInteger)  # presumably the row-insert timestamp — TODO confirm unit (s vs ms)
+ last_modify_ts = Column(BigInteger)  # presumably the last-update timestamp — TODO confirm
+ video_type = Column(Text)
+ title = Column(Text)
+ desc = Column(Text)
+ create_time = Column(BigInteger, index=True)
+ disliked_count = Column(Text)
+ video_play_count = Column(Text)
+ video_favorite_count = Column(Text)
+ video_share_count = Column(Text)
+ video_coin_count = Column(Text)
+ video_danmaku = Column(Text)
+ video_comment = Column(Text)
+ video_cover_url = Column(Text)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+
+class BilibiliVideoComment(Base):  # ORM model mapped to the 'bilibili_video_comment' table
+ __tablename__ = 'bilibili_video_comment'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))  # NOTE(review): String here, but BilibiliVideo.user_id is BigInteger — confirm intended
+ nickname = Column(Text)
+ sex = Column(Text)
+ sign = Column(Text)
+ avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ comment_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ video_id = Column(BigInteger, index=True)  # same type as BilibiliVideo.video_id; no ForeignKey is declared
+ content = Column(Text)
+ create_time = Column(BigInteger)
+ sub_comment_count = Column(Text)
+ parent_comment_id = Column(String(255))  # String, unlike the BigInteger comment_id column
+ like_count = Column(Text, default='0')  # Text column; insert-time default is the string '0'
+
+class BilibiliUpInfo(Base):  # ORM model mapped to the 'bilibili_up_info' table
+ __tablename__ = 'bilibili_up_info'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ nickname = Column(Text)
+ sex = Column(Text)
+ sign = Column(Text)
+ avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ total_fans = Column(Integer)
+ total_liked = Column(Integer)
+ user_rank = Column(Integer)
+ is_official = Column(Integer)  # stored as Integer, not Boolean — presumably a 0/1 flag, TODO confirm
+
+class BilibiliContactInfo(Base):  # ORM model mapped to the 'bilibili_contact_info' table
+ __tablename__ = 'bilibili_contact_info'
+ id = Column(Integer, primary_key=True)
+ up_id = Column(BigInteger, index=True)  # indexed; paired with fan_id — presumably one row per (uploader, fan) relation, TODO confirm
+ fan_id = Column(BigInteger, index=True)  # indexed; no unique constraint on the (up_id, fan_id) pair is declared
+ up_name = Column(Text)
+ fan_name = Column(Text)
+ up_sign = Column(Text)
+ fan_sign = Column(Text)
+ up_avatar = Column(Text)
+ fan_avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+
+class BilibiliUpDynamic(Base):  # ORM model mapped to the 'bilibili_up_dynamic' table
+ __tablename__ = 'bilibili_up_dynamic'
+ id = Column(Integer, primary_key=True)
+ dynamic_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ user_id = Column(String(255))  # NOTE(review): String here, but BilibiliUpInfo.user_id is BigInteger — confirm intended
+ user_name = Column(Text)
+ text = Column(Text)
+ type = Column(Text)  # attribute name shadows the builtin `type` inside the class body only
+ pub_ts = Column(BigInteger)
+ total_comments = Column(Integer)
+ total_forwards = Column(Integer)
+ total_liked = Column(Integer)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+
+class DouyinAweme(Base):  # ORM model mapped to the 'douyin_aweme' table
+ __tablename__ = 'douyin_aweme'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ sec_uid = Column(String(255))
+ short_user_id = Column(String(255))
+ user_unique_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ user_signature = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ aweme_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ aweme_type = Column(Text)
+ title = Column(Text)
+ desc = Column(Text)
+ create_time = Column(BigInteger, index=True)
+ liked_count = Column(Text)  # all four *_count columns of this table are Text, not numeric
+ comment_count = Column(Text)
+ share_count = Column(Text)
+ collected_count = Column(Text)
+ aweme_url = Column(Text)
+ cover_url = Column(Text)
+ video_download_url = Column(Text)
+ music_download_url = Column(Text)
+ note_download_url = Column(Text)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+
+class DouyinAwemeComment(Base):  # ORM model mapped to the 'douyin_aweme_comment' table
+ __tablename__ = 'douyin_aweme_comment'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ sec_uid = Column(String(255))
+ short_user_id = Column(String(255))
+ user_unique_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ user_signature = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ comment_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ aweme_id = Column(BigInteger, index=True)  # same type as DouyinAweme.aweme_id; no ForeignKey is declared
+ content = Column(Text)
+ create_time = Column(BigInteger)
+ sub_comment_count = Column(Text)
+ parent_comment_id = Column(String(255))  # String, unlike the BigInteger comment_id column
+ like_count = Column(Text, default='0')  # Text column; insert-time default is the string '0'
+ pictures = Column(Text, default='')
+
+class DyCreator(Base):  # ORM model mapped to the 'dy_creator' table
+ __tablename__ = 'dy_creator'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))  # neither indexed nor unique in this declaration
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ desc = Column(Text)
+ gender = Column(Text)
+ follows = Column(Text)  # follows / fans / interaction are Text, not numeric
+ fans = Column(Text)
+ interaction = Column(Text)
+ videos_count = Column(String(255))
+
+class KuaishouVideo(Base):  # ORM model mapped to the 'kuaishou_video' table
+ __tablename__ = 'kuaishou_video'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(64))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ video_id = Column(String(255), index=True)  # string ID; indexed but not declared unique
+ video_type = Column(Text)
+ title = Column(Text)
+ desc = Column(Text)
+ create_time = Column(BigInteger, index=True)
+ liked_count = Column(Text)
+ viewd_count = Column(Text)  # NOTE(review): spelled 'viewd' (sic); renaming would change the column name, so do not rename without a migration
+ video_url = Column(Text)
+ video_cover_url = Column(Text)
+ video_play_url = Column(Text)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+
+class KuaishouVideoComment(Base):  # ORM model mapped to the 'kuaishou_video_comment' table
+ __tablename__ = 'kuaishou_video_comment'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(Text)  # NOTE(review): Text here, but KuaishouVideo.user_id is String(64) — confirm intended
+ nickname = Column(Text)
+ avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ comment_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ video_id = Column(String(255), index=True)  # same type as KuaishouVideo.video_id; no ForeignKey is declared
+ content = Column(Text)
+ create_time = Column(BigInteger)
+ sub_comment_count = Column(Text)
+
+class WeiboNote(Base):  # ORM model mapped to the 'weibo_note' table
+ __tablename__ = 'weibo_note'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ gender = Column(Text)
+ profile_url = Column(Text)
+ ip_location = Column(Text, default='')
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ note_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ content = Column(Text)
+ create_time = Column(BigInteger, index=True)  # numeric creation time; create_date_time holds a string form as well
+ create_date_time = Column(String(255), index=True)  # presumably a formatted date string — TODO confirm format
+ liked_count = Column(Text)
+ comments_count = Column(Text)
+ shared_count = Column(Text)
+ note_url = Column(Text)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+
+class WeiboNoteComment(Base):  # ORM model mapped to the 'weibo_note_comment' table
+ __tablename__ = 'weibo_note_comment'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ gender = Column(Text)
+ profile_url = Column(Text)
+ ip_location = Column(Text, default='')
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ comment_id = Column(BigInteger, index=True)  # indexed but not declared unique
+ note_id = Column(BigInteger, index=True)  # same type as WeiboNote.note_id; no ForeignKey is declared
+ content = Column(Text)
+ create_time = Column(BigInteger)
+ create_date_time = Column(String(255), index=True)
+ comment_like_count = Column(Text)
+ sub_comment_count = Column(Text)
+ parent_comment_id = Column(String(255))  # String, unlike the BigInteger comment_id column
+
+class WeiboCreator(Base):  # ORM model mapped to the 'weibo_creator' table
+ __tablename__ = 'weibo_creator'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))  # neither indexed nor unique in this declaration
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ desc = Column(Text)
+ gender = Column(Text)
+ follows = Column(Text)  # follows / fans are Text, not numeric
+ fans = Column(Text)
+ tag_list = Column(Text)  # presumably a serialized list — TODO confirm encoding
+
+class XhsCreator(Base):  # ORM model mapped to the 'xhs_creator' table
+ __tablename__ = 'xhs_creator'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))  # neither indexed nor unique in this declaration
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ desc = Column(Text)
+ gender = Column(Text)
+ follows = Column(Text)  # follows / fans / interaction are Text, not numeric
+ fans = Column(Text)
+ interaction = Column(Text)
+ tag_list = Column(Text)  # presumably a serialized list — TODO confirm encoding
+
+class XhsNote(Base):  # ORM model mapped to the 'xhs_note' table
+ __tablename__ = 'xhs_note'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ note_id = Column(String(255), index=True)  # string ID; indexed but not declared unique
+ type = Column(Text)  # attribute name shadows the builtin `type` inside the class body only
+ title = Column(Text)
+ desc = Column(Text)
+ video_url = Column(Text)
+ time = Column(BigInteger, index=True)  # indexed; presumably the publish timestamp — TODO confirm
+ last_update_time = Column(BigInteger)
+ liked_count = Column(Text)  # all four *_count columns of this table are Text, not numeric
+ collected_count = Column(Text)
+ comment_count = Column(Text)
+ share_count = Column(Text)
+ image_list = Column(Text)  # presumably a serialized list — TODO confirm encoding
+ tag_list = Column(Text)
+ note_url = Column(Text)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+ xsec_token = Column(Text)
+
+class XhsNoteComment(Base):  # ORM model mapped to the 'xhs_note_comment' table
+ __tablename__ = 'xhs_note_comment'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(255))
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ comment_id = Column(String(255), index=True)  # indexed but not declared unique
+ create_time = Column(BigInteger, index=True)
+ note_id = Column(String(255))  # NOTE(review): not indexed here, although XhsNote.note_id is — confirm intended
+ content = Column(Text)
+ sub_comment_count = Column(Integer)  # Integer here, while like_count in this table is Text
+ pictures = Column(Text)
+ parent_comment_id = Column(String(255))
+ like_count = Column(Text)
+
+class TiebaNote(Base):  # ORM model mapped to the 'tieba_note' table
+ __tablename__ = 'tieba_note'
+ id = Column(Integer, primary_key=True)
+ note_id = Column(String(644), index=True)  # NOTE(review): length 644 differs from TiebaComment.note_id (255) — confirm intended
+ title = Column(Text)
+ desc = Column(Text)
+ note_url = Column(Text)
+ publish_time = Column(String(255), index=True)  # stored as a string, not a numeric timestamp
+ user_link = Column(Text, default='')
+ user_nickname = Column(Text, default='')
+ user_avatar = Column(Text, default='')
+ tieba_id = Column(String(255), default='')
+ tieba_name = Column(Text)
+ tieba_link = Column(Text)
+ total_replay_num = Column(Integer, default=0)  # NOTE(review): spelled 'replay' (sic); renaming would change the column name
+ total_replay_page = Column(Integer, default=0)
+ ip_location = Column(Text, default='')
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ source_keyword = Column(Text, default='')  # insert-time default is the empty string
+
+class TiebaComment(Base):  # ORM model mapped to the 'tieba_comment' table
+ __tablename__ = 'tieba_comment'
+ id = Column(Integer, primary_key=True)
+ comment_id = Column(String(255), index=True)  # indexed but not declared unique
+ parent_comment_id = Column(String(255), default='')  # insert-time default is the empty string
+ content = Column(Text)
+ user_link = Column(Text, default='')
+ user_nickname = Column(Text, default='')
+ user_avatar = Column(Text, default='')
+ tieba_id = Column(String(255), default='')
+ tieba_name = Column(Text)
+ tieba_link = Column(Text)
+ publish_time = Column(String(255), index=True)  # stored as a string, not a numeric timestamp
+ ip_location = Column(Text, default='')
+ sub_comment_count = Column(Integer, default=0)
+ note_id = Column(String(255), index=True)  # indexed; no ForeignKey to TiebaNote is declared
+ note_url = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+
+class TiebaCreator(Base):  # ORM model mapped to the 'tieba_creator' table
+ __tablename__ = 'tieba_creator'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(64))  # neither indexed nor unique in this declaration
+ user_name = Column(Text)
+ nickname = Column(Text)
+ avatar = Column(Text)
+ ip_location = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+ gender = Column(Text)
+ follows = Column(Text)  # follows / fans are Text, not numeric
+ fans = Column(Text)
+ registration_duration = Column(Text)
+
+class ZhihuContent(Base):  # ORM model mapped to the 'zhihu_content' table
+ __tablename__ = 'zhihu_content'
+ id = Column(Integer, primary_key=True)
+ content_id = Column(String(64), index=True)  # indexed but not declared unique
+ content_type = Column(Text)
+ content_text = Column(Text)
+ content_url = Column(Text)
+ question_id = Column(String(255))
+ title = Column(Text)
+ desc = Column(Text)
+ created_time = Column(String(32), index=True)  # stored as a string, not a numeric timestamp
+ updated_time = Column(Text)
+ voteup_count = Column(Integer, default=0)
+ comment_count = Column(Integer, default=0)
+ source_keyword = Column(Text)  # no default here, unlike source_keyword in the other content tables
+ user_id = Column(String(255))
+ user_link = Column(Text)
+ user_nickname = Column(Text)
+ user_avatar = Column(Text)
+ user_url_token = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+
+ # persist-1
+ # Reason: fix ORM model definition errors so the models match the database table structure.
+ # Side effects: none
+ # Rollback strategy: revert this line
+
+class ZhihuComment(Base):  # ORM model mapped to the 'zhihu_comment' table
+ __tablename__ = 'zhihu_comment'
+ id = Column(Integer, primary_key=True)
+ comment_id = Column(String(64), index=True)  # indexed but not declared unique
+ parent_comment_id = Column(String(64))
+ content = Column(Text)
+ publish_time = Column(String(32), index=True)  # stored as a string, not a numeric timestamp
+ ip_location = Column(Text)
+ sub_comment_count = Column(Integer, default=0)
+ like_count = Column(Integer, default=0)
+ dislike_count = Column(Integer, default=0)
+ content_id = Column(String(64), index=True)  # same type as ZhihuContent.content_id; no ForeignKey is declared
+ content_type = Column(Text)
+ user_id = Column(String(64))  # NOTE(review): String(64) here, but ZhihuContent.user_id is String(255) — confirm intended
+ user_link = Column(Text)
+ user_nickname = Column(Text)
+ user_avatar = Column(Text)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
+
+class ZhihuCreator(Base):  # ORM model mapped to the 'zhihu_creator' table
+ __tablename__ = 'zhihu_creator'
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(64), unique=True, index=True)  # unique and indexed
+ user_link = Column(Text)
+ user_nickname = Column(Text)
+ user_avatar = Column(Text)
+ url_token = Column(Text)
+ gender = Column(Text)
+ ip_location = Column(Text)
+ follows = Column(Integer, default=0)  # every counter in this table is Integer with an insert-time default of 0
+ fans = Column(Integer, default=0)
+ anwser_count = Column(Integer, default=0)  # NOTE(review): spelled 'anwser' (sic); renaming would change the column name
+ video_count = Column(Integer, default=0)
+ question_count = Column(Integer, default=0)
+ article_count = Column(Integer, default=0)
+ column_count = Column(Integer, default=0)
+ get_voteup_count = Column(Integer, default=0)
+ add_ts = Column(BigInteger)
+ last_modify_ts = Column(BigInteger)
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py
deleted file mode 100644
index eb9c4ce..0000000
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/db.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-
-# -*- coding: utf-8 -*-
-# @Author : relakkes@gmail.com
-# @Time : 2024/4/6 14:54
-# @Desc : mediacrawler db 管理
-import asyncio
-from typing import Dict
-from urllib.parse import urlparse
-
-import aiofiles
-import aiomysql
-
-import config
-from async_db import AsyncMysqlDB
-from async_sqlite_db import AsyncSqliteDB
-from tools import utils
-from var import db_conn_pool_var, media_crawler_db_var
-
-
-async def init_mediacrawler_db():
- """
- 初始化数据库链接池对象,并将该对象塞给media_crawler_db_var上下文变量
- Returns:
-
- """
- pool = await aiomysql.create_pool(
- host=config.MYSQL_DB_HOST,
- port=config.MYSQL_DB_PORT,
- user=config.MYSQL_DB_USER,
- password=config.MYSQL_DB_PWD,
- db=config.MYSQL_DB_NAME,
- autocommit=True,
- )
- async_db_obj = AsyncMysqlDB(pool)
-
- # 将连接池对象和封装的CRUD sql接口对象放到上下文变量中
- db_conn_pool_var.set(pool)
- media_crawler_db_var.set(async_db_obj)
-
-
-async def init_sqlite_db():
- """
- 初始化SQLite数据库对象,并将该对象塞给media_crawler_db_var上下文变量
- Returns:
-
- """
- async_db_obj = AsyncSqliteDB(config.SQLITE_DB_PATH)
-
- # 将SQLite数据库对象放到上下文变量中
- media_crawler_db_var.set(async_db_obj)
-
-
-async def init_db():
- """
- 初始化db连接池
- Returns:
-
- """
- utils.logger.info("[init_db] start init mediacrawler db connect object")
- if config.SAVE_DATA_OPTION == "sqlite":
- await init_sqlite_db()
- utils.logger.info("[init_db] end init sqlite db connect object")
- else:
- await init_mediacrawler_db()
- utils.logger.info("[init_db] end init mysql db connect object")
-
-
-async def close():
- """
- 关闭数据库连接
- Returns:
-
- """
- utils.logger.info("[close] close mediacrawler db connection")
- if config.SAVE_DATA_OPTION == "sqlite":
- # SQLite数据库连接会在AsyncSqliteDB对象销毁时自动关闭
- utils.logger.info("[close] sqlite db connection will be closed automatically")
- else:
- # MySQL连接池关闭
- db_pool: aiomysql.Pool = db_conn_pool_var.get()
- if db_pool is not None:
- db_pool.close()
- utils.logger.info("[close] mysql db pool closed")
-
-
-async def init_table_schema(db_type: str = None):
- """
- 用来初始化数据库表结构,请在第一次需要创建表结构的时候使用,多次执行该函数会将已有的表以及数据全部删除
- Args:
- db_type: 数据库类型,可选值为 'sqlite' 或 'mysql',如果不指定则使用配置文件中的设置
- Returns:
-
- """
- # 如果没有指定数据库类型,则使用配置文件中的设置
- if db_type is None:
- db_type = config.SAVE_DATA_OPTION
-
- if db_type == "sqlite":
- utils.logger.info("[init_table_schema] begin init sqlite table schema ...")
-
- # 检查并删除可能存在的损坏数据库文件
- import os
- if os.path.exists(config.SQLITE_DB_PATH):
- try:
- # 尝试删除现有的数据库文件
- os.remove(config.SQLITE_DB_PATH)
- utils.logger.info(f"[init_table_schema] removed existing sqlite db file: {config.SQLITE_DB_PATH}")
- except Exception as e:
- utils.logger.warning(f"[init_table_schema] failed to remove existing sqlite db file: {e}")
- # 如果删除失败,尝试重命名文件
- try:
- backup_path = f"{config.SQLITE_DB_PATH}.backup_{utils.get_current_timestamp()}"
- os.rename(config.SQLITE_DB_PATH, backup_path)
- utils.logger.info(f"[init_table_schema] renamed existing sqlite db file to: {backup_path}")
- except Exception as rename_e:
- utils.logger.error(f"[init_table_schema] failed to rename existing sqlite db file: {rename_e}")
- raise rename_e
-
- await init_sqlite_db()
- async_db_obj: AsyncSqliteDB = media_crawler_db_var.get()
- async with aiofiles.open("schema/sqlite_tables.sql", mode="r", encoding="utf-8") as f:
- schema_sql = await f.read()
- await async_db_obj.executescript(schema_sql)
- utils.logger.info("[init_table_schema] sqlite table schema init successful")
- elif db_type == "mysql":
- utils.logger.info("[init_table_schema] begin init mysql table schema ...")
- await init_mediacrawler_db()
- async_db_obj: AsyncMysqlDB = media_crawler_db_var.get()
- async with aiofiles.open("schema/tables.sql", mode="r", encoding="utf-8") as f:
- schema_sql = await f.read()
- await async_db_obj.execute(schema_sql)
- utils.logger.info("[init_table_schema] mysql table schema init successful")
- await close()
- else:
- utils.logger.error(f"[init_table_schema] 不支持的数据库类型: {db_type}")
- raise ValueError(f"不支持的数据库类型: {db_type},支持的类型: sqlite, mysql")
-
-
-def show_database_options():
- """
- 显示支持的数据库选项
- """
- print("\n=== MediaCrawler 数据库初始化工具 ===")
- print("支持的数据库类型:")
- print("1. sqlite - SQLite 数据库 (轻量级,无需额外配置)")
- print("2. mysql - MySQL 数据库 (需要配置数据库连接信息)")
- print("3. config - 使用配置文件中的设置")
- print("4. exit - 退出程序")
- print("="*50)
-
-
-def get_user_choice():
- """
- 获取用户选择的数据库类型
- Returns:
- str: 用户选择的数据库类型
- """
- while True:
- choice = input("请输入数据库类型 (sqlite/mysql/config/exit): ").strip().lower()
-
- if choice in ['sqlite', 'mysql', 'config', 'exit']:
- return choice
- else:
- print("❌ 无效的选择,请输入: sqlite, mysql, config 或 exit")
-
-
-async def main():
- """
- 主函数,处理用户交互和数据库初始化
- """
- try:
- show_database_options()
-
- while True:
- choice = get_user_choice()
-
- if choice == 'exit':
- print("👋 程序已退出")
- break
- elif choice == 'config':
- print(f"📋 使用配置文件中的设置: {config.SAVE_DATA_OPTION}")
- await init_table_schema()
- print("✅ 数据库表结构初始化完成!")
- break
- else:
- print(f"🚀 开始初始化 {choice.upper()} 数据库...")
- await init_table_schema(choice)
- print("✅ 数据库表结构初始化完成!")
- break
-
- except KeyboardInterrupt:
- print("\n\n⚠️ 用户中断操作")
- except Exception as e:
- print(f"\n❌ 初始化失败: {str(e)}")
- utils.logger.error(f"[main] 数据库初始化失败: {str(e)}")
-
-
-if __name__ == '__main__':
- asyncio.get_event_loop().run_until_complete(main())
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/config.mjs b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/config.mjs
new file mode 100644
index 0000000..0c8ea73
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/config.mjs
@@ -0,0 +1,89 @@
+import {defineConfig} from 'vitepress'
+
+// https://vitepress.dev/reference/site-config
+export default defineConfig({ // VitePress site configuration for the MediaCrawler docs
+ title: "MediaCrawler自媒体爬虫",
+ description: "小红书爬虫,抖音爬虫, 快手爬虫, B站爬虫, 微博爬虫,百度贴吧爬虫,知乎爬虫...。 ",
+ lastUpdated: true,
+ base: '/MediaCrawler/', // the site is served under this sub-path
+ head: [ // extra <head> entries: the two <script> tags load and then configure Google's gtag.js
+ [
+ 'script',
+ {async: '', src: 'https://www.googletagmanager.com/gtag/js?id=G-5TK7GF3KK1'}
+ ],
+ [
+ 'script',
+ {},
+ `window.dataLayer = window.dataLayer || [];
+ function gtag(){dataLayer.push(arguments);}
+ gtag('js', new Date());
+ gtag('config', 'G-5TK7GF3KK1');`
+ ]
+ ],
+ themeConfig: {
+ editLink: {
+ pattern: 'https://github.com/NanmiCoder/MediaCrawler/tree/main/docs/:path' // :path is a placeholder in the URL pattern
+ },
+ search: {
+ provider: 'local'
+ },
+ // https://vitepress.dev/reference/default-theme-config
+ nav: [ // top navigation entries
+ {text: '首页', link: '/'},
+ {text: '联系我', link: '/作者介绍'},
+ {text: '支持我', link: '/知识付费介绍'},
+ ],
+
+ sidebar: [ // sidebar groups: an entry has either a direct link or an items list
+ {
+ text: '作者介绍',
+ link: '/作者介绍',
+ },
+ {
+ text: 'MediaCrawler使用文档',
+ items: [
+ {text: '基本使用', link: '/'},
+ {text: '常见问题汇总', link: '/常见问题'},
+ {text: 'IP代理使用', link: '/代理使用'},
+ {text: '词云图使用', link: '/词云图使用配置'},
+ {text: '项目目录结构', link: '/项目代码结构'},
+ {text: '手机号登录说明', link: '/手机号登录说明'},
+ ]
+ },
+ {
+ text: '知识付费',
+ items: [
+ {text: '知识付费介绍', link: '/知识付费介绍'},
+ {text: 'MediaCrawlerPro订阅', link: '/mediacrawlerpro订阅'},
+ {
+ text: 'MediaCrawler源码剖析课',
+ link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh' // external link, unlike the site-relative links around it
+ },
+ {text: '知识星球文章专栏', link: '/知识星球介绍'},
+ {text: '开发者咨询服务', link: '/开发者咨询'},
+ ]
+ },
+ {
+ text: 'MediaCrawler项目交流群',
+ link: '/微信交流群',
+ },
+ {
+ text: '爬虫入门教程分享',
+ items: [
+ {text: "我写的爬虫入门教程", link: 'https://github.com/NanmiCoder/CrawlerTutorial'}
+ ]
+ },
+ {
+ text: 'MediaCrawler捐赠名单',
+ items: [
+ {text: "捐赠名单", link: '/捐赠名单'}
+ ]
+ },
+
+ ],
+
+ socialLinks: [
+ {icon: 'github', link: 'https://github.com/NanmiCoder/MediaCrawler'}
+ ]
+ }
+})
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/DynamicAds.vue b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/DynamicAds.vue
new file mode 100644
index 0000000..196356a
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/DynamicAds.vue
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/MyLayout.vue b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/MyLayout.vue
new file mode 100644
index 0000000..517f3b3
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/MyLayout.vue
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/custom.css b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/custom.css
new file mode 100644
index 0000000..fcc8e90
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/custom.css
@@ -0,0 +1,9 @@
+/* .vitepress/theme/custom.css */
+/**
+ * Component: Sidebar
+ * -------------------------------------------------------------------------- */
+
+:root {
+ --vp-sidebar-width: 285px; /* sets the sidebar width custom property to a fixed 285px */
+ --vp-sidebar-bg-color: var(--vp-c-bg-alt); /* sidebar background reuses the --vp-c-bg-alt custom property */
+}
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/index.js b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/index.js
new file mode 100644
index 0000000..d3eb3e7
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/.vitepress/theme/index.js
@@ -0,0 +1,9 @@
+// .vitepress/theme/index.js
+import DefaultTheme from 'vitepress/theme'
+import MyLayout from './MyLayout.vue'
+
+export default {
+ extends: DefaultTheme, // build on the default theme imported from 'vitepress/theme'
+ // Override Layout with the wrapper component that injects slots
+ Layout: MyLayout
+}
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/CDP模式使用指南.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/CDP模式使用指南.md
new file mode 100644
index 0000000..541cbc3
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/CDP模式使用指南.md
@@ -0,0 +1,246 @@
+# CDP模式使用指南
+
+## 概述
+
+CDP(Chrome DevTools Protocol)模式是一种高级的反检测爬虫技术,通过控制用户现有的Chrome/Edge浏览器来进行网页爬取。与传统的Playwright自动化相比,CDP模式具有以下优势:
+
+### 🎯 主要优势
+
+1. **真实浏览器环境**: 使用用户实际安装的浏览器,包含所有扩展、插件和个人设置
+2. **更好的反检测能力**: 浏览器指纹更加真实,难以被网站检测为自动化工具
+3. **保留用户状态**: 自动继承用户的登录状态、Cookie和浏览历史
+4. **扩展支持**: 可以利用用户安装的广告拦截器、代理扩展等工具
+5. **更自然的行为**: 浏览器行为模式更接近真实用户
+
+## 快速开始
+
+### 1. 启用CDP模式
+
+在 `config/base_config.py` 中设置:
+
+```python
+# 启用CDP模式
+ENABLE_CDP_MODE = True
+
+# CDP调试端口(可选,默认9222)
+CDP_DEBUG_PORT = 9222
+
+# 是否在无头模式下运行(建议设为False以获得最佳反检测效果)
+CDP_HEADLESS = False
+
+# 程序结束时是否自动关闭浏览器
+AUTO_CLOSE_BROWSER = True
+```
+
+### 2. 运行测试
+
+```bash
+# 运行CDP功能测试
+python examples/cdp_example.py
+
+# 运行小红书爬虫(CDP模式)
+python main.py
+```
+
+## 配置选项详解
+
+### 基础配置
+
+| 配置项 | 类型 | 默认值 | 说明 |
+|--------|------|--------|------|
+| `ENABLE_CDP_MODE` | bool | False | 是否启用CDP模式 |
+| `CDP_DEBUG_PORT` | int | 9222 | CDP调试端口 |
+| `CDP_HEADLESS` | bool | False | CDP模式下的无头模式 |
+| `AUTO_CLOSE_BROWSER` | bool | True | 程序结束时是否关闭浏览器 |
+
+### 高级配置
+
+| 配置项 | 类型 | 默认值 | 说明 |
+|--------|------|--------|------|
+| `CUSTOM_BROWSER_PATH` | str | "" | 自定义浏览器路径 |
+| `BROWSER_LAUNCH_TIMEOUT` | int | 30 | 浏览器启动超时时间(秒) |
+
+### 自定义浏览器路径
+
+如果系统自动检测失败,可以手动指定浏览器路径:
+
+```python
+# Windows示例
+CUSTOM_BROWSER_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
+
+# macOS示例
+CUSTOM_BROWSER_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+
+# Linux示例
+CUSTOM_BROWSER_PATH = "/usr/bin/google-chrome"
+```
+
+## 支持的浏览器
+
+### Windows
+- Google Chrome (稳定版、Beta、Dev、Canary)
+- Microsoft Edge (稳定版、Beta、Dev、Canary)
+
+### macOS
+- Google Chrome (稳定版、Beta、Dev、Canary)
+- Microsoft Edge (稳定版、Beta、Dev、Canary)
+
+### Linux
+- Google Chrome / Chromium
+- Microsoft Edge
+
+## 使用示例
+
+### 基本使用
+
+```python
+import asyncio
+from playwright.async_api import async_playwright
+from tools.cdp_browser import CDPBrowserManager
+
+async def main():
+ cdp_manager = CDPBrowserManager()
+
+ async with async_playwright() as playwright:
+ # 启动CDP浏览器
+ browser_context = await cdp_manager.launch_and_connect(
+ playwright=playwright,
+ user_agent="自定义User-Agent",
+ headless=False
+ )
+
+ # 创建页面并访问网站
+ page = await browser_context.new_page()
+ await page.goto("https://example.com")
+
+ # 执行爬取操作...
+
+ # 清理资源
+ await cdp_manager.cleanup()
+
+asyncio.run(main())
+```
+
+### 在爬虫中使用
+
+CDP模式已集成到所有平台爬虫中,只需启用配置即可:
+
+```python
+# 在config/base_config.py中
+ENABLE_CDP_MODE = True
+
+# 然后在终端中正常运行爬虫:
+#   python main.py
+```
+
+## 故障排除
+
+### 常见问题
+
+#### 1. 浏览器检测失败
+**错误**: `未找到可用的浏览器`
+
+**解决方案**:
+- 确保已安装Chrome或Edge浏览器
+- 检查浏览器是否在标准路径下
+- 使用`CUSTOM_BROWSER_PATH`指定浏览器路径
+
+#### 2. 端口被占用
+**错误**: `无法找到可用的端口`
+
+**解决方案**:
+- 关闭其他使用调试端口的程序
+- 修改`CDP_DEBUG_PORT`为其他端口
+- 系统会自动尝试下一个可用端口
+
+#### 3. 浏览器启动超时
+**错误**: `浏览器在30秒内未能启动`
+
+**解决方案**:
+- 增加`BROWSER_LAUNCH_TIMEOUT`值
+- 检查系统资源是否充足
+- 尝试关闭其他占用资源的程序
+
+#### 4. CDP连接失败
+**错误**: `CDP连接失败`
+
+**解决方案**:
+- 检查防火墙设置
+- 确保localhost访问正常
+- 尝试重启浏览器
+
+### 调试技巧
+
+#### 1. 启用详细日志
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+#### 2. 手动测试CDP连接
+```bash
+# 手动启动Chrome
+chrome --remote-debugging-port=9222
+
+# 访问调试页面
+curl http://localhost:9222/json
+```
+
+#### 3. 检查浏览器进程
+```bash
+# Windows
+tasklist | findstr chrome
+
+# macOS/Linux
+ps aux | grep chrome
+```
+
+## 最佳实践
+
+### 1. 反检测优化
+- 保持`CDP_HEADLESS = False`以获得最佳反检测效果
+- 使用真实的User-Agent字符串
+- 避免过于频繁的请求
+
+### 2. 性能优化
+- 合理设置`AUTO_CLOSE_BROWSER`
+- 复用浏览器实例而不是频繁重启
+- 监控内存使用情况
+
+### 3. 安全考虑
+- 不要在生产环境中保存敏感Cookie
+- 定期清理浏览器数据
+- 注意用户隐私保护
+
+### 4. 兼容性
+- 测试不同浏览器版本的兼容性
+- 准备回退方案(标准Playwright模式)
+- 监控目标网站的反爬策略变化
+
+## 技术原理
+
+CDP模式的工作原理:
+
+1. **浏览器检测**: 自动扫描系统中的Chrome/Edge安装路径
+2. **进程启动**: 使用`--remote-debugging-port`参数启动浏览器
+3. **CDP连接**: 通过WebSocket连接到浏览器的调试接口
+4. **Playwright集成**: 使用`connect_over_cdp`方法(对应 JS 版 API 的 `connectOverCDP`)接管浏览器控制
+5. **上下文管理**: 创建或复用浏览器上下文进行操作
+
+这种方式绕过了传统WebDriver的检测机制,提供了更加隐蔽的自动化能力。
+
+## 更新日志
+
+### v1.0.0
+- 初始版本发布
+- 支持Windows和macOS的Chrome/Edge检测
+- 集成到所有平台爬虫
+- 提供完整的配置选项和错误处理
+
+## 贡献
+
+欢迎提交Issue和Pull Request来改进CDP模式功能。
+
+## 许可证
+
+本功能遵循项目的整体许可证条款,仅供学习和研究使用。
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/hit_stopwords.txt b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/hit_stopwords.txt
new file mode 100644
index 0000000..1d1818e
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/hit_stopwords.txt
@@ -0,0 +1,768 @@
+\n
+———
+》),
+)÷(1-
+”,
+)、
+=(
+:
+→
+℃
+&
+*
+一一
+~~~~
+’
+.
+『
+.一
+./
+--
+』
+=″
+【
+[*]
+}>
+[⑤]]
+[①D]
+c]
+ng昉
+*
+//
+[
+]
+[②e]
+[②g]
+={
+}
+,也
+‘
+A
+[①⑥]
+[②B]
+[①a]
+[④a]
+[①③]
+[③h]
+③]
+1.
+--
+[②b]
+’‘
+×××
+[①⑧]
+0:2
+=[
+[⑤b]
+[②c]
+[④b]
+[②③]
+[③a]
+[④c]
+[①⑤]
+[①⑦]
+[①g]
+∈[
+[①⑨]
+[①④]
+[①c]
+[②f]
+[②⑧]
+[②①]
+[①C]
+[③c]
+[③g]
+[②⑤]
+[②②]
+一.
+[①h]
+.数
+[]
+[①B]
+数/
+[①i]
+[③e]
+[①①]
+[④d]
+[④e]
+[③b]
+[⑤a]
+[①A]
+[②⑧]
+[②⑦]
+[①d]
+[②j]
+〕〔
+][
+://
+′∈
+[②④
+[⑤e]
+12%
+b]
+...
+...................
+…………………………………………………③
+ZXFITL
+[③F]
+」
+[①o]
+]∧′=[
+∪φ∈
+′|
+{-
+②c
+}
+[③①]
+R.L.
+[①E]
+Ψ
+-[*]-
+↑
+.日
+[②d]
+[②
+[②⑦]
+[②②]
+[③e]
+[①i]
+[①B]
+[①h]
+[①d]
+[①g]
+[①②]
+[②a]
+f]
+[⑩]
+a]
+[①e]
+[②h]
+[②⑥]
+[③d]
+[②⑩]
+e]
+〉
+】
+元/吨
+[②⑩]
+2.3%
+5:0
+[①]
+::
+[②]
+[③]
+[④]
+[⑤]
+[⑥]
+[⑦]
+[⑧]
+[⑨]
+……
+——
+?
+、
+。
+“
+”
+《
+》
+!
+,
+:
+;
+?
+.
+,
+.
+'
+?
+·
+———
+──
+?
+—
+<
+>
+(
+)
+〔
+〕
+[
+]
+(
+)
+-
++
+~
+×
+/
+/
+①
+②
+③
+④
+⑤
+⑥
+⑦
+⑧
+⑨
+⑩
+Ⅲ
+В
+"
+;
+#
+@
+γ
+μ
+φ
+φ.
+×
+Δ
+■
+▲
+sub
+exp
+sup
+sub
+Lex
+#
+%
+&
+'
++
++ξ
+++
+-
+-β
+<
+<±
+<Δ
+<λ
+<φ
+<<
+=
+=
+=☆
+=-
+>
+>λ
+_
+~±
+~+
+[⑤f]
+[⑤d]
+[②i]
+≈
+[②G]
+[①f]
+LI
+㈧
+[-
+......
+〉
+[③⑩]
+第二
+一番
+一直
+一个
+一些
+许多
+种
+有的是
+也就是说
+末##末
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
+并且
+不比
+不成
+不单
+不但
+不独
+不管
+不光
+不过
+不仅
+不拘
+不论
+不怕
+不然
+不如
+不特
+不惟
+不问
+不只
+朝
+朝着
+趁
+趁着
+乘
+冲
+除
+除此之外
+除非
+除了
+此
+此间
+此外
+从
+从而
+打
+待
+但
+但是
+当
+当着
+到
+得
+的
+的话
+等
+等等
+地
+第
+叮咚
+对
+对于
+多
+多少
+而
+而况
+而且
+而是
+而外
+而言
+而已
+尔后
+反过来
+反过来说
+反之
+非但
+非徒
+否则
+嘎
+嘎登
+该
+赶
+个
+各
+各个
+各位
+各种
+各自
+给
+根据
+跟
+故
+故此
+固然
+关于
+管
+归
+果然
+果真
+过
+哈
+哈哈
+呵
+和
+何
+何处
+何况
+何时
+嘿
+哼
+哼唷
+呼哧
+乎
+哗
+还是
+还有
+换句话说
+换言之
+或
+或是
+或者
+极了
+及
+及其
+及至
+即
+即便
+即或
+即令
+即若
+即使
+几
+几时
+己
+既
+既然
+既是
+继而
+加之
+假如
+假若
+假使
+鉴于
+将
+较
+较之
+叫
+接着
+结果
+借
+紧接着
+进而
+尽
+尽管
+经
+经过
+就
+就是
+就是说
+据
+具体地说
+具体说来
+开始
+开外
+靠
+咳
+可
+可见
+可是
+可以
+况且
+啦
+来
+来着
+离
+例如
+哩
+连
+连同
+两者
+了
+临
+另
+另外
+另一方面
+论
+嘛
+吗
+慢说
+漫说
+冒
+么
+每
+每当
+们
+莫若
+某
+某个
+某些
+拿
+哪
+哪边
+哪儿
+哪个
+哪里
+哪年
+哪怕
+哪天
+哪些
+哪样
+那
+那边
+那儿
+那个
+那会儿
+那里
+那么
+那么些
+那么样
+那时
+那些
+那样
+乃
+乃至
+呢
+能
+你
+你们
+您
+宁
+宁可
+宁肯
+宁愿
+哦
+呕
+啪达
+旁人
+呸
+凭
+凭借
+其
+其次
+其二
+其他
+其它
+其一
+其余
+其中
+起
+起见
+起见
+岂但
+恰恰相反
+前后
+前者
+且
+然而
+然后
+然则
+让
+人家
+任
+任何
+任凭
+如
+如此
+如果
+如何
+如其
+如若
+如上所述
+若
+若非
+若是
+啥
+上下
+尚且
+设若
+设使
+甚而
+甚么
+甚至
+省得
+时候
+什么
+什么样
+使得
+是
+是的
+首先
+谁
+谁知
+顺
+顺着
+似的
+虽
+虽然
+虽说
+虽则
+随
+随着
+所
+所以
+他
+他们
+他人
+它
+它们
+她
+她们
+倘
+倘或
+倘然
+倘若
+倘使
+腾
+替
+通过
+同
+同时
+哇
+万一
+往
+望
+为
+为何
+为了
+为什么
+为着
+喂
+嗡嗡
+我
+我们
+呜
+呜呼
+乌乎
+无论
+无宁
+毋宁
+嘻
+吓
+相对而言
+像
+向
+向着
+嘘
+呀
+焉
+沿
+沿着
+要
+要不
+要不然
+要不是
+要么
+要是
+也
+也罢
+也好
+一
+一般
+一旦
+一方面
+一来
+一切
+一样
+一则
+依
+依照
+矣
+以
+以便
+以及
+以免
+以至
+以至于
+以致
+抑或
+因
+因此
+因而
+因为
+哟
+用
+由
+由此可见
+由于
+有
+有的
+有关
+有些
+又
+于
+于是
+于是乎
+与
+与此同时
+与否
+与其
+越是
+云云
+哉
+再说
+再者
+在
+在下
+咱
+咱们
+则
+怎
+怎么
+怎么办
+怎么样
+怎样
+咋
+照
+照着
+者
+这
+这边
+这儿
+这个
+这会儿
+这就是说
+这里
+这么
+这么点儿
+这么些
+这么样
+这时
+这些
+这样
+正如
+吱
+之
+之类
+之所以
+之一
+只是
+只限
+只要
+只有
+至
+至于
+诸位
+着
+着呢
+自
+自从
+自个儿
+自各儿
+自己
+自家
+自身
+综上所述
+总的来看
+总的来说
+总的说来
+总而言之
+总之
+纵
+纵令
+纵然
+纵使
+遵照
+作为
+兮
+呃
+呗
+咚
+咦
+喏
+啐
+喔唷
+嗬
+嗯
+嗳
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/index.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/index.md
new file mode 100644
index 0000000..9c55dd2
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/index.md
@@ -0,0 +1,77 @@
+# MediaCrawler使用方法
+
+## 创建并激活 python 虚拟环境
+> 如果是爬取抖音和知乎,需要提前安装nodejs环境,版本大于等于:`16`即可
+ ```shell
+ # 进入项目根目录
+ cd MediaCrawler
+
+ # 创建虚拟环境
+ # 我的python版本是:3.9.6,requirements.txt中的库是基于这个版本的,如果是其他python版本,可能requirements.txt中的库不兼容,自行解决一下。
+ python -m venv venv
+
+ # macos & linux 激活虚拟环境
+ source venv/bin/activate
+
+ # windows 激活虚拟环境
+ venv\Scripts\activate
+
+ ```
+
+## 安装依赖库
+
+ ```shell
+ pip install -r requirements.txt
+ ```
+
+## 安装 playwright浏览器驱动
+
+ ```shell
+ playwright install
+ ```
+
+## 运行爬虫程序
+
+ ```shell
+ ### 项目默认是没有开启评论爬取模式,如需评论请在config/base_config.py中的 ENABLE_GET_COMMENTS 变量修改
+ ### 一些其他支持项,也可以在config/base_config.py查看功能,写的有中文注释
+
+ # 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
+ python main.py --platform xhs --lt qrcode --type search
+
+ # 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
+ python main.py --platform xhs --lt qrcode --type detail
+
+ # 使用SQLite数据库存储数据(推荐个人用户使用)
+ python main.py --platform xhs --lt qrcode --type search --save_data_option sqlite
+
+ # 使用MySQL数据库存储数据
+ python main.py --platform xhs --lt qrcode --type search --save_data_option db
+
+ # 打开对应APP扫二维码登录
+
+ # 其他平台爬虫使用示例,执行下面的命令查看
+ python main.py --help
+ ```
+
+## 💾 数据存储
+
+支持多种数据存储方式:
+- **CSV 文件**: 支持保存至 CSV (位于 `data/` 目录下)
+- **JSON 文件**: 支持保存至 JSON (位于 `data/` 目录下)
+- **数据库存储**
+ - 使用 `--init_db` 参数进行数据库初始化 (使用 `--init_db` 时,无需其他可选参数)
+ - **SQLite 数据库**: 轻量级数据库,无需服务器,适合个人使用 (推荐)
+ 1. 初始化: `--init_db sqlite`
+ 2. 数据存储: `--save_data_option sqlite`
+ - **MySQL 数据库**: 支持保存至关系型数据库 MySQL (需提前创建数据库)
+ 1. 初始化: `--init_db mysql`
+ 2. 数据存储: `--save_data_option db` (db 参数为兼容历史更新保留)
+
+## 免责声明
+> **免责声明:**
+>
+> 大家请以学习为目的使用本仓库,爬虫违法违规的案件:https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China
+>
+>本项目的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/mediacrawlerpro订阅.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/mediacrawlerpro订阅.md
new file mode 100644
index 0000000..9e2b611
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/mediacrawlerpro订阅.md
@@ -0,0 +1,45 @@
+# 订阅MediaCrawlerPro版本源码访问权限
+
+## 获取Pro版本的访问权限
+> MediaCrawler开源超过一年了,相信该仓库帮过不少朋友低门槛的学习和了解爬虫。维护真的耗费了大量精力和人力
+>
+> 所以Pro版本不会开源,可以订阅Pro版本让我更加有动力去更新。
+>
+> 如果感兴趣可以加我微信,订阅Pro版本访问权限哦,有门槛💰。
+>
+> 仅针对想学习Pro版本源码实现的用户,如果是公司或者商业化盈利性质的就不要加我了,谢谢🙏
+>
+> 代码设计拓展性强,可以自己扩展更多的爬虫平台,更多的数据存储方式,相信对你架构这种爬虫代码有所帮助。
+>
+>
+> **MediaCrawlerPro项目主页地址**
+> [MediaCrawlerPro Github主页地址](https://github.com/MediaCrawlerPro)
+
+
+
+扫描下方我的个人微信,备注:pro版本(如果图片展示不出来,可以直接添加我的微信号:relakkes)
+
+
+
+
+## Pro版本诞生的背景
+[MediaCrawler](https://github.com/NanmiCoder/MediaCrawler)这个项目开源至今获得了大量的关注,同时也暴露出来了一系列问题,比如:
+- 能否支持多账号?
+- 能否在linux部署?
+- 能否去掉playwright的依赖?
+- 有没有更简单的部署方法?
+- 有没有针对新手上手门槛更低的方法?
+
+诸如上面的此类问题,想要在原有项目上去动刀,无疑是增加了复杂度,可能导致后续的维护更加困难。
+出于可持续维护、简便易用、部署简单等目的,对MediaCrawler进行彻底重构。
+
+## 项目介绍
+### [MediaCrawler](https://github.com/NanmiCoder/MediaCrawler)的Pro版本python实现
+**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**,**知乎爬虫**...。
+
+支持多种平台的爬虫,支持多种数据的爬取,支持多种数据的存储,最重要的**完美支持多账号+IP代理池,让你的爬虫更加稳定**。
+相较于MediaCrawler,Pro版本最大的变化:
+- 去掉了playwright的依赖,不再将Playwright集成到爬虫主干中,依赖过重。
+- 增加了Docker,Docker-compose的方式部署,让部署更加简单。
+- 多账号+IP代理池的支持,让爬虫更加稳定。
+- 新增签名服务,解耦签名逻辑,让爬虫更加灵活。
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/11群二维码.JPG b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/11群二维码.JPG
new file mode 100644
index 0000000..4726ad1
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/11群二维码.JPG differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/12群二维码.JPG b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/12群二维码.JPG
new file mode 100644
index 0000000..321d3ee
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/12群二维码.JPG differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/13群二维码.JPG b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/13群二维码.JPG
new file mode 100644
index 0000000..06d34ab
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/13群二维码.JPG differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/14群二维码.jpeg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/14群二维码.jpeg
new file mode 100644
index 0000000..415536d
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/14群二维码.jpeg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/IP_提取图.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/IP_提取图.png
new file mode 100644
index 0000000..e15de57
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/IP_提取图.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/auto_test.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/auto_test.png
new file mode 100644
index 0000000..8729527
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/auto_test.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img.png
new file mode 100644
index 0000000..b20e8d1
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_1.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_1.png
new file mode 100644
index 0000000..4a52f93
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_1.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_2.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_2.png
new file mode 100644
index 0000000..790aec8
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_2.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_3.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_3.png
new file mode 100644
index 0000000..acf4041
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_3.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_4.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_4.png
new file mode 100644
index 0000000..4b33c95
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_4.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_5.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_5.png
new file mode 100644
index 0000000..c5e3bfa
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_5.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_6.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_6.png
new file mode 100644
index 0000000..a5172b3
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_6.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_7.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_7.png
new file mode 100644
index 0000000..cbe9355
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_7.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_8.jpg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_8.jpg
new file mode 100644
index 0000000..f087f9f
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/img_8.jpg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/nstbrowser.jpg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/nstbrowser.jpg
new file mode 100644
index 0000000..b74cc65
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/nstbrowser.jpg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/relakkes_weichat.jpg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/relakkes_weichat.jpg
new file mode 100644
index 0000000..f72e2e3
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/relakkes_weichat.jpg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner.png
new file mode 100644
index 0000000..9e42f52
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner_zh.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner_zh.png
new file mode 100644
index 0000000..3df91b7
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/tikhub_banner_zh.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img.png
new file mode 100644
index 0000000..26fa2d4
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_1.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_1.png
new file mode 100644
index 0000000..23aedbb
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_1.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_2.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_2.png
new file mode 100644
index 0000000..a65828e
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_2.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_4.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_4.png
new file mode 100644
index 0000000..8d329fa
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wd_http_img_4.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wechat_pay.jpeg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wechat_pay.jpeg
new file mode 100644
index 0000000..97aba85
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/wechat_pay.jpeg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/xingqiu.jpg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/xingqiu.jpg
new file mode 100644
index 0000000..7cf0eb9
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/xingqiu.jpg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/zfb_pay.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/zfb_pay.png
new file mode 100644
index 0000000..be98c4c
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/zfb_pay.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/代理IP 流程图.drawio.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/代理IP 流程图.drawio.png
new file mode 100644
index 0000000..fefa563
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/代理IP 流程图.drawio.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/修改代理密钥.png b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/修改代理密钥.png
new file mode 100644
index 0000000..d8b70d2
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/修改代理密钥.png differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/星球qrcode.jpg b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/星球qrcode.jpg
new file mode 100644
index 0000000..53bc497
Binary files /dev/null and b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/static/images/星球qrcode.jpg differ
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/代理使用.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/代理使用.md
new file mode 100644
index 0000000..681a0dd
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/代理使用.md
@@ -0,0 +1,15 @@
+# 代理 IP 使用说明
+> 还是得跟大家再次强调下,不要对一些自媒体平台进行大规模爬虫或其他非法行为,要踩缝纫机的哦🤣
+
+## 简易的流程图
+
+
+
+
+## 选择一个代理IP提供商
+
+### 快代理
+[快代理使用文档](快代理使用文档.md)
+
+### 豌豆HTTP文档查看
+[豌豆HTTP使用文档](豌豆HTTP使用文档.md)
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/作者介绍.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/作者介绍.md
new file mode 100644
index 0000000..2e64305
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/作者介绍.md
@@ -0,0 +1,21 @@
+# 关于作者
+> 大家都叫我阿江,网名:程序员阿江-Relakkes,目前裸辞正探索自由职业,希望能靠自己的技术能力和努力,实现自己理想的生活方式。
+>
+> 我身边有大量的技术人脉资源,如果大家有一些爬虫咨询或者编程单子可以向我丢过来
+
+- [Github万星开源自媒体爬虫仓库MediaCrawler作者](https://github.com/NanmiCoder/MediaCrawler)
+- 全栈程序员,熟悉Python、Golang、JavaScript,工作中主要用Golang。
+- 曾经主导并参与过百万级爬虫采集系统架构设计与编码
+- 爬虫是一种技术兴趣爱好,参与爬虫有一种对抗的感觉,越难越兴奋。
+
+## 微信联系方式
+
+
+## B站主页地址
+https://space.bilibili.com/434377496
+
+## 抖音主页地址
+https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?previous_page=app_code_link
+
+## 小红书主页地址
+https://www.xiaohongshu.com/user/profile/5f58bd990000000001003753?xhsshare=CopyLink&appuid=5f58bd990000000001003753&apptime=1724737153
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/原生环境管理文档.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/原生环境管理文档.md
new file mode 100644
index 0000000..08b981a
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/原生环境管理文档.md
@@ -0,0 +1,52 @@
+## 使用python原生venv管理依赖(不推荐了)
+
+## 创建并激活 python 虚拟环境
+> 如果是爬取抖音和知乎,需要提前安装nodejs环境,版本大于等于:`16`即可
+> 新增 [uv](https://github.com/astral-sh/uv) 来管理项目依赖,使用uv来替代python版本管理、pip进行依赖安装,更加方便快捷
+ ```shell
+ # 进入项目根目录
+ cd MediaCrawler
+
+ # 创建虚拟环境
+ # 我的python版本是:3.9.6,requirements.txt中的库是基于这个版本的,如果是其他python版本,可能requirements.txt中的库不兼容,自行解决一下。
+ python -m venv venv
+
+ # macos & linux 激活虚拟环境
+ source venv/bin/activate
+
+ # windows 激活虚拟环境
+ venv\Scripts\activate
+
+ ```
+
+## 安装依赖库
+
+ ```shell
+ pip install -r requirements.txt
+ ```
+
+## 查看配置文件
+
+## 安装 playwright浏览器驱动 (非必需)
+
+ ```shell
+ playwright install
+ ```
+
+## 运行爬虫程序
+
+ ```shell
+ ### 项目默认是没有开启评论爬取模式,如需评论请在config/base_config.py中的 ENABLE_GET_COMMENTS 变量修改
+ ### 一些其他支持项,也可以在config/base_config.py查看功能,写的有中文注释
+
+ # 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
+ python main.py --platform xhs --lt qrcode --type search
+
+ # 从配置文件中读取指定的帖子ID列表获取指定帖子的信息与评论信息
+ python main.py --platform xhs --lt qrcode --type detail
+
+ # 打开对应APP扫二维码登录
+
+ # 其他平台爬虫使用示例,执行下面的命令查看
+ python main.py --help
+ ```
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/常见问题.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/常见问题.md
new file mode 100644
index 0000000..e2d294e
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/常见问题.md
@@ -0,0 +1,45 @@
+# 常见程序运行出错问题
+
+## 缺少node环境导致的问题
+Q: 爬取抖音和知乎报错: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
+A: 该错误是缺少 nodejs 环境导致的,可以通过安装 nodejs 环境来解决,版本大于等于:`v16`
+
+Q: 使用Cookie爬取抖音报错: execjs._exceptions.ProgramError: TypeError: Cannot read property 'JS_MD5_NO_COMMON_JS' of null
+A: windows电脑去网站下载`https://nodejs.org/en/blog/release/v16.8.0` Windows 64-bit Installer 版本,一直下一步即可。
+
+## xhs登录出现滑块一直验证不通过问题
+
+Q: 小红书扫码登录成功后,浏览器一直在验证滑块,无法登录?
+A: 这种情况一般是因为使用playwright浏览器驱动被识别出来的问题,可以尝试删除项目目录下的`browser_data`文件夹,重新走登录流程。
+
+## 如何指定关键词
+Q: 可以指定关键词爬取吗?
+A: 在config/base_config.py 中 KEYWORDS 参数用于控制需要爬取的关键词
+
+## 如何指定帖子
+Q: 可以指定帖子爬取吗?
+A:在config/base_config.py 中 XHS_SPECIFIED_ID_LIST 参数用于控制需要指定爬取的帖子ID列表
+
+## 爬取失效
+Q: 刚开始能爬取数据,过一段时间就失效了?
+A:出现这种情况多半是由于你的账号触发了平台风控机制了,❗️❗️请勿大规模对平台进行爬虫,影响平台。
+
+## 如何更换另一个账号
+Q: 如何更换登录账号?
+A:删除项目根目录下的 browser_data/ 文件夹即可
+
+## playwright超时问题
+Q: 报错 `playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.`
+A: 出现这种情况检查下开梯子没有
+
+## 如何配置playwright浏览器驱动过滑块验证
+Q: 小红书扫码登录成功后如何手动验证?
+A: 打开 config/base_config.py 文件, 找到 HEADLESS 配置项, 将其设置为 False, 此时重启项目, 在浏览器中手动通过验证码
+
+## 词云图生成
+Q: 如何配置词云图的生成?
+A: 打开 config/base_config.py 文件, 找到`ENABLE_GET_WORDCLOUD` 以及`ENABLE_GET_COMMENTS` 两个配置项,将其都设为True即可使用该功能。
+
+## 词云图添加禁用词和自定义词组
+Q: 如何给词云图添加禁用词和自定义词组?
+A: 打开 `docs/hit_stopwords.txt` 输入禁用词(注意一个词语一行)。打开 config/base_config.py 文件找到 `CUSTOM_WORDS` 按格式添加自定义词组即可。
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/开发者咨询.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/开发者咨询.md
new file mode 100644
index 0000000..8b910a2
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/开发者咨询.md
@@ -0,0 +1,17 @@
+# 开发者咨询
+
+## 咨询价格
+
+提供200/小时的咨询服务,最低收费为1小时,帮你快速解决项目中遇到的问题
+
+##### 支持的提问类别
+- MediaCrawler项目源码解读、安装、部署、使用问题
+- 爬虫项目开发问题
+- Python、Golang、JavaScript等编程问题
+- JS逆向问题
+- 其他问题(职业规划、工作经验等)
+
+## 加我微信
+> 备注:咨询服务
+>
+
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/微信交流群.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/微信交流群.md
new file mode 100644
index 0000000..c6dffd7
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/微信交流群.md
@@ -0,0 +1,12 @@
+# MediaCrawler项目微信交流群
+
+👏👏👏 汇聚爬虫技术爱好者,共同学习,共同进步。
+
+❗️❗️❗️群内禁止广告,禁止发各类违规和MediaCrawler不相关的问题
+
+## 加群方式
+> 备注:github,会有拉群小助手自动拉你进群。
+>
+> 如果图片展示不出来或过期,可以直接添加我的微信号:relakkes,并备注github,会有拉群小助手自动拉你进群
+
+
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/快代理使用文档.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/快代理使用文档.md
new file mode 100644
index 0000000..acba3bd
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/快代理使用文档.md
@@ -0,0 +1,41 @@
+## 快代理使用文档(支持个人和企业用户)
+
+## 准备代理 IP 信息
+点击 快代理 官网注册并实名认证(国内使用代理 IP 必须要实名,懂的都懂)
+
+## 获取 IP 代理的密钥信息
+从 快代理 官网获取免费试用,如下图所示
+
+
+注意:选择私密代理
+
+
+选择开通试用
+
+
+初始化一个快代理的实例,如下代码所示,需要4个参数
+
+```python
+# 文件地址: proxy/providers/kuai_daili_proxy.py
+# -*- coding: utf-8 -*-
+def new_kuai_daili_proxy() -> KuaiDaiLiProxy:
+ """
+ 构造快代理HTTP实例
+ Returns:
+
+ """
+ return KuaiDaiLiProxy(
+ kdl_secret_id=os.getenv("kdl_secret_id", "你的快代理secert_id"),
+ kdl_signature=os.getenv("kdl_signature", "你的快代理签名"),
+ kdl_user_name=os.getenv("kdl_user_name", "你的快代理用户名"),
+ kdl_user_pwd=os.getenv("kdl_user_pwd", "你的快代理密码"),
+ )
+
+```
+在试用的订单中可以看到这四个参数,如下图所示
+
+`kdl_user_name`、`kdl_user_pwd`
+
+
+`kdl_secret_id`、`kdl_signature`
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/手机号登录说明.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/手机号登录说明.md
new file mode 100644
index 0000000..63c1dd5
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/手机号登录说明.md
@@ -0,0 +1,20 @@
+# 关于手机号+验证码登录的说明
+> 配置过程相当复杂,不建议采用该种方式
+
+当在浏览器模拟人为发起手机号登录请求时,使用短信转发软件将验证码发送至爬虫端回填,完成自动登录
+
+准备工作:
+
+- 安卓机1台(IOS没去研究,理论上监控短信也是可行的)
+- 安装短信转发软件 [参考仓库](https://github.com/pppscn/SmsForwarder)
+- 转发软件中配置WEBHOOK相关的信息,主要分为 消息模板(请查看本项目中的recv_sms_notification.py)、一个能push短信通知的API地址
+- push的API地址一般是需要绑定一个域名的(当然也可以是内网的IP地址),我用的是内网穿透方式,会有一个免费的域名绑定到内网的web
+ server,内网穿透工具 [ngrok](https://ngrok.com/docs/)
+- 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html)
+- 执行 `python recv_sms_notification.py` 等待短信转发器发送HTTP通知
+- 执行手机号登录的爬虫程序 `python main.py --platform xhs --lt phone`
+
+备注:
+
+- 短信转发软件会不会监控自己手机上其他短信内容?(理论上应该不会,因为[短信转发仓库](https://github.com/pppscn/SmsForwarder)
+star还是蛮多的)
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/捐赠名单.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/捐赠名单.md
new file mode 100644
index 0000000..d724f88
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/捐赠名单.md
@@ -0,0 +1,71 @@
+## 捐赠MediaCrawler开源项目
+> 捐赠时请务必备注您的昵称,我会在捐赠名单中表达对您的感谢
+
+## 赞赏二维码
+
+
+
+
+ 微信赞赏
+
+ |
+
+ 支付宝赞赏
+
+ |
+
+
+
+# MediaCrawler捐赠名单
+
+> 再次感谢下面的捐赠者们对MediaCrawler的鼎力支持,是你们的支持让MediaCrawler的更新有了动力。
+
+PS:打赏时请备注捐赠者昵称,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉)
+
+| 捐赠者 | 捐赠金额 | 捐赠日期 |
+| ----------- | -------- | ---------- |
+| RichardYU | 99 元 | 2025-06-19 |
+| Z.FB | 20 元 | 2025-04-10 |
+| 若成 | 20 元 | 2025-04-01 |
+| Puple_twirl | 20 元 | 2025-03-30 |
+| N--F | 20 元 | 2025-03-13 |
+| 财* | 20 元 | 2025-03-06 |
+| 布莱** | 1 元 | 2025-01-27 |
+| xldmilktea | 20 元 | 2025-01-25 |
+| ChenWenLon | 20 元 | 2025-01-07 |
+| steam | 20 元 | 2024-12-20 |
+| mike | 20 元 | 2024-12-17 |
+| thechnolog | 5 元 | 2024-11-05 |
+| yinzhou | 100 元 | 2024-10-21 |
+| Tnk_se | 50 元 | 2024-10-21 |
+| 望、7 | 66 元 | 2024-09-26 |
+| 凌凌7 | 200 元 | 2024-09-19 |
+| yutao | 20 元 | 2024-09-19 |
+| Urtb* | 100 元 | 2024-09-07 |
+| Tornado | 66 元 | 2024-09-04 |
+| srhedbj | 50 元 | 2024-08-20 |
+| *嘉 | 20 元 | 2024-08-15 |
+| *良 | 50 元 | 2024-08-13 |
+| *皓 | 50 元 | 2024-03-18 |
+| *刚 | 50 元 | 2024-03-18 |
+| *乐 | 20 元 | 2024-03-17 |
+| *木 | 20 元 | 2024-03-17 |
+| *诚 | 20 元 | 2024-03-17 |
+| Strem Gamer | 20 元 | 2024-03-16 |
+| *鑫 | 20 元 | 2024-03-14 |
+| Yuzu | 20 元 | 2024-03-07 |
+| **宁 | 100 元 | 2024-03-03 |
+| **媛 | 20 元 | 2024-03-03 |
+| Scarlett | 20 元 | 2024-02-16 |
+| Asun | 20 元 | 2024-01-30 |
+| 何* | 100 元 | 2024-01-21 |
+| allen | 20 元 | 2024-01-10 |
+| llllll | 20 元 | 2024-01-07 |
+| 邝*元 | 20 元 | 2023-12-29 |
+| 50chen | 50 元 | 2023-12-22 |
+| xiongot | 20 元 | 2023-12-17 |
+| atom.hu | 20 元 | 2023-12-16 |
+| 一呆 | 20 元 | 2023-12-01 |
+| 坠落 | 50 元 | 2023-11-08 |
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识付费介绍.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识付费介绍.md
new file mode 100644
index 0000000..dfa98c1
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识付费介绍.md
@@ -0,0 +1,19 @@
+# 知识付费介绍
+开源是一种无私奉献,从MediaCrawler开源到现在有一年多,它并没有带给我多少实质性的东西,就拿收入来说,赞助费、赞赏等等全部加起来还没有之前一个月的薪水。
+
+后面搞了MediaCrawler源码剖析课程之后,收入稍微好一点,但也是群里兄弟对我开源的支持,在此也非常感谢你们~
+
+但是我依然坚持持续开源,从开始的xhs、dy 2个平台支持,到现在已经有**7个平台**支持,每一次增加一个平台其实都会耗费很大的时间去写代码和调试代码。。。。
+
+在今天跟一个群里好朋友聊天,他说:开源开发者也要活下去。你不要不好意思做知识付费,你的劳动是有价值的。
+
+他点醒我了,因此我把我所提供的知识付费内容放在下面,有需要的朋友可以看看~
+
+## MediaCrawlerPro项目源码订阅服务
+[mediacrawlerpro订阅文档说明](mediacrawlerpro订阅.md)
+
+## MediaCrawler源码剖析视频课程
+[mediacrawler源码课程介绍](https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh)
+
+## 知识星球爬虫逆向、编程专栏
+[知识星球专栏介绍](知识星球介绍.md)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识星球介绍.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识星球介绍.md
new file mode 100644
index 0000000..f97ad7a
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/知识星球介绍.md
@@ -0,0 +1,31 @@
+# 知识星球专栏
+
+## 基本介绍
+
+文章:
+- 1.爬虫JS逆向案例分享
+- 2.MediaCrawler技术实现分享。
+- 3.沉淀python开发经验和技巧
+- ......................
+
+提问:
+- 4.在星球内向我提问关于MediaCrawler、爬虫、编程任何问题
+
+## 章节内容
+ - [逆向案例 - 某16x8平台商品列表接口逆向参数分析](https://articles.zsxq.com/id_x1qmtg8pzld9.html)
+ - [逆向案例 - Product Hunt月度最佳产品榜单接口加密参数分析](https://articles.zsxq.com/id_au4eich3x2sg.html)
+ - [逆向案例 - 某zhi乎x-zse-96参数分析过程](https://articles.zsxq.com/id_dui2vil0ag1l.html)
+ - [逆向案例 - 某x识星球X-Signature加密参数分析过程](https://articles.zsxq.com/id_pp4madwcwcg8.html)
+ - [【独创】使用Playwright获取某音a_bogus参数流程(包含加密参数分析)](https://articles.zsxq.com/id_u89al50jk9x0.html)
+ - [【独创】使用Playwright低成本获取某书X-s参数流程分析(当年的回忆录)](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
+ - [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
+ - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
+ - [一次Mysql数据库中混用collation排序规则带来的bug](https://articles.zsxq.com/id_pibwr1wnst2p.html)
+ - [错误使用 Python 可变类型带来的隐藏 Bug](https://articles.zsxq.com/id_f7vn89l1d303.html)
+ - [【MediaCrawler】微博帖子评论爬虫教程](https://articles.zsxq.com/id_vrmuhw0ovj3t.html)
+ - [Python协程在并发场景下的幂等性问题](https://articles.zsxq.com/id_wocdwsfmfcmp.html)
+ - ........................................
+
+## 加入星球
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/词云图使用配置.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/词云图使用配置.md
new file mode 100644
index 0000000..a0f84b0
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/词云图使用配置.md
@@ -0,0 +1,57 @@
+# 关于词云图相关操作
+
+## 1.如何正确调用词云图
+> ps:目前只有保存格式为json文件时,才会生成词云图。其他存储方式的词云图支持将在近期添加。
+
+需要修改的配置项(./config/base_config.py):
+
+```python
+# 数据保存类型选项配置,支持三种类型:csv、db、json
+#此处需要为json格式保存,原因如上
+SAVE_DATA_OPTION = "json" # csv or db or json
+```
+
+```python
+# 是否开启爬评论模式, 默认不开启爬评论
+#此处为True,需要爬取评论才可以生成评论的词云图。
+ENABLE_GET_COMMENTS = True
+```
+
+```python
+#词云相关
+#是否开启生成评论词云图
+#打开词云图功能
+ENABLE_GET_WORDCLOUD = True
+```
+
+```python
+# 添加自定义词语及其分组
+#添加规则:xx:yy 其中xx为自定义添加的词组,yy为xx所属的组名。
+CUSTOM_WORDS = {
+ '零几': '年份', # 将“零几”识别为一个整体
+ '高频词': '专业术语' # 示例自定义词
+}
+```
+
+```python
+#停用(禁用)词文件路径
+STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
+```
+
+```python
+#中文字体文件路径
+FONT_PATH= "./docs/STZHONGS.TTF"
+```
+
+**相关解释**
+
+- 自定义词组的添加,`xx:yy` 中`xx`为自定义词语,`yy`为`xx`所属的组别。`yy`可以取任意值。
+
+- 如果需要添加禁用词,请在./docs/hit_stopwords.txt添加禁用词(保证格式正确,一个词语一行)
+- `FONT_PATH`为生成词云图所用的中文字体文件路径,默认为宋体。可以自行添加字体文件,修改路径。
+
+## 2.生成词云图的位置
+
+
+
+如图,在data文件夹下的`words文件夹`中,其中json为词频统计文件,png为词云图。原本的评论内容在`json文件夹`下。
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/豌豆HTTP使用文档.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/豌豆HTTP使用文档.md
new file mode 100644
index 0000000..6bd77fa
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/豌豆HTTP使用文档.md
@@ -0,0 +1,38 @@
+# 豌豆HTTP代理使用文档 (只支持企业用户)
+
+## 准备代理 IP 信息
+点击 豌豆HTTP代理 官网注册并实名认证(国内使用代理 IP 必须要实名,懂的都懂)
+
+## 获取 IP 代理的密钥信息 appkey
+从 豌豆HTTP代理 官网获取免费试用,如下图所示
+
+
+选择自己需要的套餐
+
+
+
+初始化一个豌豆HTTP代理的实例,如下代码所示,需要1个参数: app_key
+
+```python
+# 文件地址: proxy/providers/wandou_http_proxy.py
+# -*- coding: utf-8 -*-
+
+def new_wandou_http_proxy() -> WanDouHttpProxy:
+ """
+ 构造豌豆HTTP实例
+ Returns:
+
+ """
+ return WanDouHttpProxy(
+ app_key=os.getenv(
+ "wandou_app_key", "你的豌豆HTTP app_key"
+ ), # 通过环境变量的方式获取豌豆HTTP app_key
+ )
+
+```
+
+在个人中心的`开放接口`找到 `app_key`,如下图所示
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/项目代码结构.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/项目代码结构.md
new file mode 100644
index 0000000..6a5e2ed
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/docs/项目代码结构.md
@@ -0,0 +1,71 @@
+# 项目代码结构
+
+```
+MediaCrawler
+├── base
+│ └── base_crawler.py # 项目的抽象基类
+├── cache
+│ ├── abs_cache.py # 缓存抽象基类
+│ ├── cache_factory.py # 缓存工厂
+│ ├── local_cache.py # 本地缓存实现
+│ └── redis_cache.py # Redis缓存实现
+├── cmd_arg
+│ └── arg.py # 命令行参数定义
+├── config
+│ ├── base_config.py # 基础配置
+│ ├── db_config.py # 数据库配置
+│ └── ... # 各平台配置文件
+├── constant
+│ └── ... # 各平台常量定义
+├── database
+│ ├── db.py # 数据库ORM,封装增删改查
+│ ├── db_session.py # 数据库会话管理
+│ └── models.py # 数据库模型定义
+├── docs
+│ └── ... # 项目文档
+├── libs
+│ ├── douyin.js # 抖音Sign函数
+│ ├── stealth.min.js # 去除浏览器自动化特征的JS
+│ └── zhihu.js # 知乎Sign函数
+├── media_platform
+│ ├── bilibili # B站采集实现
+│ ├── douyin # 抖音采集实现
+│ ├── kuaishou # 快手采集实现
+│ ├── tieba # 百度贴吧采集实现
+│ ├── weibo # 微博采集实现
+│ ├── xhs # 小红书采集实现
+│ └── zhihu # 知乎采集实现
+├── model
+│ ├── m_baidu_tieba.py # 百度贴吧数据模型
+│ ├── m_douyin.py # 抖音数据模型
+│ ├── m_kuaishou.py # 快手数据模型
+│ ├── m_weibo.py # 微博数据模型
+│ ├── m_xiaohongshu.py # 小红书数据模型
+│ └── m_zhihu.py # 知乎数据模型
+├── proxy
+│ ├── base_proxy.py # 代理基类
+│ ├── providers # 代理提供商实现
+│ ├── proxy_ip_pool.py # 代理IP池
+│ └── types.py # 代理类型定义
+├── store
+│ ├── bilibili # B站数据存储实现
+│ ├── douyin # 抖音数据存储实现
+│ ├── kuaishou # 快手数据存储实现
+│ ├── tieba # 贴吧数据存储实现
+│ ├── weibo # 微博数据存储实现
+│ ├── xhs # 小红书数据存储实现
+│ └── zhihu # 知乎数据存储实现
+├── test
+│ ├── test_db_sync.py # 数据库同步测试
+│ ├── test_proxy_ip_pool.py # 代理IP池测试
+│ └── ... # 其他测试用例
+├── tools
+│ ├── browser_launcher.py # 浏览器启动器
+│ ├── cdp_browser.py # CDP浏览器控制
+│ ├── crawler_util.py # 爬虫工具函数
+│ ├── utils.py # 通用工具函数
+│ └── ...
+├── main.py # 程序入口, 支持 --init_db 参数来初始化数据库
+├── recv_sms.py # 短信转发HTTP SERVER接口
+└── var.py # 全局上下文变量定义
+```
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py
index c074c7d..b4c55a1 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/main.py
@@ -15,7 +15,7 @@ from typing import Optional
import cmd_arg
import config
-import db
+from database import db
from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var
class CrawlerFactory:
@@ -50,20 +52,40 @@ class CrawlerFactory:
crawler: Optional[AbstractCrawler] = None
+# persist-1
+# 原因:增加 --init_db 功能,用于数据库初始化。
+# 副作用:无
+# 回滚策略:还原此文件。
async def main():
# Init crawler
global crawler
# parse cmd
- await cmd_arg.parse_cmd()
+ args = await cmd_arg.parse_cmd()
# init db
- if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
- await db.init_db()
+ if args.init_db:
+ await db.init_db(args.init_db)
+ print(f"Database {args.init_db} initialized successfully.")
+ return # Exit the main function cleanly
+
+
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
+ # Generate wordcloud after crawling is complete
+ # Only for JSON save mode
+ if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+ try:
+ file_writer = AsyncFileWriter(
+ platform=config.PLATFORM,
+ crawler_type=crawler_type_var.get()
+ )
+ await file_writer.generate_wordcloud_from_comments()
+ except Exception as e:
+ print(f"Error generating wordcloud: {e}")
+
def cleanup():
if crawler:
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py
index 0abf872..7019c10 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/client.py
@@ -189,10 +189,11 @@ class BilibiliClient(AbstractApiClient):
if not aid or not cid or aid <= 0 or cid <= 0:
raise ValueError("aid 和 cid 必须存在")
uri = "/x/player/wbi/playurl"
+ qn_value = getattr(config, "BILI_QN", 80)
params = {
"avid": aid,
"cid": cid,
- "qn": 80,
+ "qn": qn_value,
"fourk": 1,
"fnval": 1,
"platform": "pc",
@@ -201,15 +202,17 @@ class BilibiliClient(AbstractApiClient):
return await self.get(uri, params, enable_params_sign=True)
async def get_video_media(self, url: str) -> Union[bytes, None]:
- async with httpx.AsyncClient(proxy=self.proxy) as client:
+ # Follow CDN 302 redirects and treat any 2xx as success (some endpoints return 206)
+ async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=True) as client:
try:
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
response.raise_for_status()
- if not response.reason_phrase == "OK":
- utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
- return None
- else:
+ if 200 <= response.status_code < 300:
return response.content
+ utils.logger.error(
+ f"[BilibiliClient.get_video_media] Unexpected status {response.status_code} for {url}"
+ )
+ return None
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
return None
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/core.py
index e63d31a..5f1f42c 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/core.py
@@ -15,7 +15,7 @@
import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime, timedelta
@@ -41,6 +41,7 @@ from var import crawler_type_var, source_keyword_var
from .client import BilibiliClient
from .exception import DataFetchError
from .field import SearchOrderType
+from .help import parse_video_info_from_url, parse_creator_info_from_url
from .login import BilibiliLogin
@@ -77,8 +78,9 @@ class BilibiliCrawler(AbstractCrawler):
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -103,8 +105,14 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
if config.CREATOR_MODE:
- for creator_id in config.BILI_CREATOR_ID_LIST:
- await self.get_creator_videos(int(creator_id))
+ for creator_url in config.BILI_CREATOR_ID_LIST:
+ try:
+ creator_info = parse_creator_info_from_url(creator_url)
+ utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
+ await self.get_creator_videos(int(creator_info.creator_id))
+ except ValueError as e:
+ utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}")
+ continue
else:
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
else:
@@ -208,6 +216,11 @@ class BilibiliCrawler(AbstractCrawler):
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
await self.batch_get_video_comments(video_id_list)
async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +297,11 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_bilibili_video(video_item, semaphore)
page += 1
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
await self.batch_get_video_comments(video_id_list)
except Exception as e:
@@ -318,10 +336,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
- await asyncio.sleep(random.uniform(0.5, 1.5))
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
await self.bili_client.get_video_all_comments(
video_id=video_id,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=bilibili_store.batch_update_bilibili_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,14 +366,27 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_specified_videos(video_bvids_list)
if int(result["page"]["count"]) <= pn * ps:
break
- await asyncio.sleep(random.random())
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
pn += 1
- async def get_specified_videos(self, bvids_list: List[str]):
+ async def get_specified_videos(self, video_url_list: List[str]):
"""
- get specified videos info
+ get specified videos info from URLs or BV IDs
+ :param video_url_list: List of video URLs or BV IDs
:return:
"""
+ utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...")
+ bvids_list = []
+ for video_url in video_url_list:
+ try:
+ video_info = parse_video_info_from_url(video_url)
+ bvids_list.append(video_info.video_id)
+ utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}")
+ except ValueError as e:
+ utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}")
+ continue
+
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
video_details = await asyncio.gather(*task_list)
@@ -381,6 +413,11 @@ class BilibiliCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
+
+ # Sleep after fetching video details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
+
return result
except DataFetchError as ex:
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,24 +581,37 @@ class BilibiliCrawler(AbstractCrawler):
return
content = await self.bili_client.get_video_media(video_url)
- await asyncio.sleep(random.random())
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
if content is None:
return
extension_file_name = f"video.mp4"
await bilibili_store.store_video(aid, content, extension_file_name)
- async def get_all_creator_details(self, creator_id_list: List[int]):
+ async def get_all_creator_details(self, creator_url_list: List[str]):
"""
- creator_id_list: get details for creator from creator_id_list
+ creator_url_list: get details for creator from creator URL list
"""
- utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
- utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
+ utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators")
+ utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...")
+
+ creator_id_list = []
+ for creator_url in creator_url_list:
+ try:
+ creator_info = parse_creator_info_from_url(creator_url)
+ creator_id_list.append(int(creator_info.creator_id))
+ utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
+ except ValueError as e:
+ utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}")
+ continue
+
+ utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
try:
for creator_id in creator_id_list:
- task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
+ task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id))
task_list.append(task)
except Exception as e:
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
@@ -600,7 +650,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
await self.bili_client.get_creator_all_fans(
creator_info=creator_info,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_fans,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
@@ -623,7 +673,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
await self.bili_client.get_creator_all_followings(
creator_info=creator_info,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_followings,
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
@@ -646,7 +696,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=bilibili_store.batch_update_bilibili_creator_dynamics,
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/help.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/help.py
index b4e6221..614117a 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/help.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/bilibili/help.py
@@ -9,15 +9,17 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
- # -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 23:26
# @Desc : bilibili 请求参数签名
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
+import re
import urllib.parse
from hashlib import md5
from typing import Dict
+from model.m_bilibili import VideoUrlInfo, CreatorUrlInfo
from tools import utils
@@ -66,16 +68,71 @@ class BilibiliSign:
return req_data
+def parse_video_info_from_url(url: str) -> VideoUrlInfo:
+ """
+ Parse the BV id out of a Bilibili video URL (a bare BV id is also accepted).
+ Args:
+ url: Bilibili video link or BV id, e.g.
+ - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
+ - https://www.bilibili.com/video/BV1d54y1g7db
+ - BV1d54y1g7db (a bare BV id)
+ Returns:
+ VideoUrlInfo whose video_id is the BV id; raises ValueError when no BV id is found.
+ """
+ # Input that already starts with "BV" is treated as the id and returned unchanged
+ if url.startswith("BV"):
+ return VideoUrlInfo(video_id=url)
+
+ # Otherwise extract the BV id from the URL path with a regex
+ # Only /video/BV... matches; /video/av... links are not handled and end in ValueError
+ bv_pattern = r'/video/(BV[a-zA-Z0-9]+)'
+ match = re.search(bv_pattern, url)
+
+ if match:
+ video_id = match.group(1)
+ return VideoUrlInfo(video_id=video_id)
+
+ raise ValueError(f"无法从URL中解析出视频ID: {url}")
+
+
+def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
+ """
+ Parse the creator UID out of a Bilibili space URL (a bare numeric UID is also accepted).
+ Args:
+ url: Bilibili creator space link or UID, e.g.
+ - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
+ - https://space.bilibili.com/20813884
+ - 434377496 (a bare UID)
+ Returns:
+ CreatorUrlInfo whose creator_id is the UID string; raises ValueError when none is found.
+ """
+ # An all-digit input is treated as the UID and returned unchanged (still a str)
+ if url.isdigit():
+ return CreatorUrlInfo(creator_id=url)
+
+ # Otherwise extract the UID with a regex
+ # Matches the digits that directly follow "space.bilibili.com/"
+ uid_pattern = r'space\.bilibili\.com/(\d+)'
+ match = re.search(uid_pattern, url)
+
+ if match:
+ creator_id = match.group(1)
+ return CreatorUrlInfo(creator_id=creator_id)
+
+ raise ValueError(f"无法从URL中解析出创作者ID: {url}")
+
+
if __name__ == '__main__':
- _img_key = "7cd084941338484aae1ad9425b84077c"
- _sub_key = "4932caff0ff746eab6f01bf08b70ac45"
- _search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"
- _req_data = dict()
- for params in _search_url.split("&"):
- kvalues = params.split("=")
- key = kvalues[0]
- value = kvalues[1]
- _req_data[key] = value
- print("pre req_data", _req_data)
- _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
- print(_req_data)
+ # 测试视频URL解析
+ video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
+ video_url2 = "BV1d54y1g7db"
+ print("视频URL解析测试:")
+ print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
+ print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")
+
+ # 测试创作者URL解析
+ creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
+ creator_url2 = "20813884"
+ print("\n创作者URL解析测试:")
+ print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
+ print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/client.py
index 46a3e8f..5d980ec 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/client.py
@@ -324,3 +324,28 @@ class DouYinClient(AbstractApiClient):
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
return None
+
+ async def resolve_short_url(self, short_url: str) -> str:
+ """
+ Resolve a Douyin short link by reading the redirect target of a single GET request.
+ Args:
+ short_url: short link, e.g. https://v.douyin.com/iF12345ABC/
+ Returns:
+ The Location header value of the redirect; "" on a non-redirect status or any exception.
+ """
+ async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False) as client:
+ try:
+ utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}")
+ response = await client.get(short_url, timeout=10)
+
+ # Short links usually answer 302; redirects are not followed so Location can be read here
+ if response.status_code in [301, 302, 303, 307, 308]:
+ redirect_url = response.headers.get("Location", "")
+ utils.logger.info(f"[DouYinClient.resolve_short_url] Resolved to: {redirect_url}")
+ return redirect_url
+ else:
+ utils.logger.warning(f"[DouYinClient.resolve_short_url] Unexpected status code: {response.status_code}")
+ return ""
+ except Exception as e:
+ utils.logger.error(f"[DouYinClient.resolve_short_url] Failed to resolve short URL: {e}")
+ return ""
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/core.py
index 1d7ce4d..c002155 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/core.py
@@ -33,6 +33,7 @@ from var import crawler_type_var, source_keyword_var
from .client import DouYinClient
from .exception import DataFetchError
from .field import PublishTimeType
+from .help import parse_video_info_from_url, parse_creator_info_from_url
from .login import DouYinLogin
@@ -73,8 +74,9 @@ class DouYinCrawler(AbstractCrawler):
user_agent=None,
headless=config.HEADLESS,
)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -147,25 +149,56 @@ class DouYinCrawler(AbstractCrawler):
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
await self.get_aweme_media(aweme_item=aweme_info)
+ # Sleep after each page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
- """Get the information and comments of the specified post"""
+ """Get the information and comments of the specified post from URLs or IDs"""
+ utils.logger.info("[DouYinCrawler.get_specified_awemes] Parsing video URLs...")
+ aweme_id_list = []
+ for video_url in config.DY_SPECIFIED_ID_LIST:
+ try:
+ video_info = parse_video_info_from_url(video_url)
+
+ # 处理短链接
+ if video_info.url_type == "short":
+ utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Resolving short link: {video_url}")
+ resolved_url = await self.dy_client.resolve_short_url(video_url)
+ if resolved_url:
+ # 从解析后的URL中提取视频ID
+ video_info = parse_video_info_from_url(resolved_url)
+ utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Short link resolved to aweme ID: {video_info.aweme_id}")
+ else:
+ utils.logger.error(f"[DouYinCrawler.get_specified_awemes] Failed to resolve short link: {video_url}")
+ continue
+
+ aweme_id_list.append(video_info.aweme_id)
+ utils.logger.info(f"[DouYinCrawler.get_specified_awemes] Parsed aweme ID: {video_info.aweme_id} from {video_url}")
+ except ValueError as e:
+ utils.logger.error(f"[DouYinCrawler.get_specified_awemes] Failed to parse video URL: {e}")
+ continue
+
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
- task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
+ task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in aweme_id_list]
aweme_details = await asyncio.gather(*task_list)
for aweme_detail in aweme_details:
if aweme_detail is not None:
await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
await self.get_aweme_media(aweme_item=aweme_detail)
- await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
+ await self.batch_get_note_comments(aweme_id_list)
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
"""Get note detail"""
async with semaphore:
try:
- return await self.dy_client.get_video_by_id(aweme_id)
+ result = await self.dy_client.get_video_by_id(aweme_id)
+ # Sleep after fetching aweme detail
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
+ return result
except DataFetchError as ex:
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None
@@ -193,23 +226,38 @@ class DouYinCrawler(AbstractCrawler):
async with semaphore:
try:
# 将关键词列表传递给 get_aweme_all_comments 方法
+ # Use fixed crawling interval
+ crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
- crawl_interval=random.random(),
+ crawl_interval=crawl_interval,
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
+ # Sleep after fetching comments
+ await asyncio.sleep(crawl_interval)
+ utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
async def get_creators_and_videos(self) -> None:
"""
- Get the information and videos of the specified creator
+ Get the information and videos of the specified creator from URLs or IDs
"""
utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
- for user_id in config.DY_CREATOR_ID_LIST:
+ utils.logger.info("[DouYinCrawler.get_creators_and_videos] Parsing creator URLs...")
+
+ for creator_url in config.DY_CREATOR_ID_LIST:
+ try:
+ creator_info_parsed = parse_creator_info_from_url(creator_url)
+ user_id = creator_info_parsed.sec_user_id
+ utils.logger.info(f"[DouYinCrawler.get_creators_and_videos] Parsed sec_user_id: {user_id} from {creator_url}")
+ except ValueError as e:
+ utils.logger.error(f"[DouYinCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
+ continue
+
creator_info: Dict = await self.dy_client.get_user_info(user_id)
if creator_info:
await douyin_store.save_creator(user_id, creator=creator_info)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py
index 1ed3111..d4e245d 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/douyin/help.py
@@ -16,10 +16,15 @@
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
import random
+import re
+from typing import Optional
import execjs
from playwright.async_api import Page
+from model.m_douyin import VideoUrlInfo, CreatorUrlInfo
+from tools.crawler_util import extract_url_params_to_dict
+
douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').read())
def get_web_id():
@@ -83,3 +88,103 @@ async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: s
return a_bogus
+
+def parse_video_info_from_url(url: str) -> VideoUrlInfo:
+ """
+ Parse the video id (aweme_id) out of a Douyin video URL.
+ Supported inputs:
+ 1. Plain video link: https://www.douyin.com/video/7525082444551310602
+ 2. Links carrying a modal_id query parameter:
+ - https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602
+ - https://www.douyin.com/root/search/python?modal_id=7471165520058862848
+ 3. Short link: https://v.douyin.com/iF12345ABC/ (must be resolved by the client)
+ 4. Bare id: 7525082444551310602
+
+ Args:
+ url: Douyin video link or id
+ Returns:
+ VideoUrlInfo with aweme_id and url_type ("normal", "modal", or "short" with an empty aweme_id); raises ValueError if nothing matches.
+ """
+ # An all-digit input is treated as the id and returned unchanged
+ if url.isdigit():
+ return VideoUrlInfo(aweme_id=url, url_type="normal")
+
+ # Short link: contains v.douyin.com, or (and binds tighter than or) is an http URL under 50 chars without "video"
+ if "v.douyin.com" in url or url.startswith("http") and len(url) < 50 and "video" not in url:
+ return VideoUrlInfo(aweme_id="", url_type="short") # id unknown here; the caller resolves it through the client
+
+ # Try the modal_id query parameter first
+ params = extract_url_params_to_dict(url)
+ modal_id = params.get("modal_id")
+ if modal_id:
+ return VideoUrlInfo(aweme_id=modal_id, url_type="modal")
+
+ # Fall back to the standard video path: /video/<digits>
+ video_pattern = r'/video/(\d+)'
+ match = re.search(video_pattern, url)
+ if match:
+ aweme_id = match.group(1)
+ return VideoUrlInfo(aweme_id=aweme_id, url_type="normal")
+
+ raise ValueError(f"无法从URL中解析出视频ID: {url}")
+
+
+def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
+ """
+ Parse the creator id (sec_user_id) out of a Douyin creator profile URL.
+ Supported inputs:
+ 1. Creator profile: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main
+ 2. Bare id: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE
+
+ Args:
+ url: Douyin creator profile link or sec_user_id
+ Returns:
+ CreatorUrlInfo whose sec_user_id is the parsed id; raises ValueError if the URL has no /user/ segment.
+ """
+ # Bare id: starts with MS4wLjABAAAA, or is neither an http URL nor mentions douyin.com; returned unchanged
+ if url.startswith("MS4wLjABAAAA") or (not url.startswith("http") and "douyin.com" not in url):
+ return CreatorUrlInfo(sec_user_id=url)
+
+ # Extract sec_user_id from the profile path: /user/xxx (stops at the next "/" or "?")
+ user_pattern = r'/user/([^/?]+)'
+ match = re.search(user_pattern, url)
+ if match:
+ sec_user_id = match.group(1)
+ return CreatorUrlInfo(sec_user_id=sec_user_id)
+
+ raise ValueError(f"无法从URL中解析出创作者ID: {url}")
+
+
+if __name__ == '__main__':
+ # 测试视频URL解析
+ print("=== 视频URL解析测试 ===")
+ test_urls = [
+ "https://www.douyin.com/video/7525082444551310602",
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602",
+ "https://www.douyin.com/root/search/python?aid=b733a3b0-4662-4639-9a72-c2318fba9f3f&modal_id=7471165520058862848&type=general",
+ "7525082444551310602",
+ ]
+ for url in test_urls:
+ try:
+ result = parse_video_info_from_url(url)
+ print(f"✓ URL: {url[:80]}...")
+ print(f" 结果: {result}\n")
+ except Exception as e:
+ print(f"✗ URL: {url}")
+ print(f" 错误: {e}\n")
+
+ # 测试创作者URL解析
+ print("=== 创作者URL解析测试 ===")
+ test_creator_urls = [
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
+ "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
+ ]
+ for url in test_creator_urls:
+ try:
+ result = parse_creator_info_from_url(url)
+ print(f"✓ URL: {url[:80]}...")
+ print(f" 结果: {result}\n")
+ except Exception as e:
+ print(f"✗ URL: {url}")
+ print(f" 错误: {e}\n")
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py
index 4ae1d63..4cd2eb8 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/core.py
@@ -11,7 +11,7 @@
import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -26,6 +26,7 @@ from playwright.async_api import (
import config
from base.base_crawler import AbstractCrawler
+from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import kuaishou as kuaishou_store
from tools import utils
@@ -34,6 +35,7 @@ from var import comment_tasks_var, crawler_type_var, source_keyword_var
from .client import KuaiShouClient
from .exception import DataFetchError
+from .help import parse_video_info_from_url, parse_creator_info_from_url
from .login import KuaishouLogin
@@ -76,8 +78,10 @@ class KuaishouCrawler(AbstractCrawler):
self.browser_context = await self.launch_browser(
chromium, None, self.user_agent, headless=config.HEADLESS
)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(f"{self.index_url}?isHome=1")
@@ -159,20 +163,36 @@ class KuaishouCrawler(AbstractCrawler):
# batch fetch video comments
page += 1
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
await self.batch_get_video_comments(video_id_list)
async def get_specified_videos(self):
"""Get the information and comments of the specified post"""
+ utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
+ video_ids = []
+ for video_url in config.KS_SPECIFIED_ID_LIST:
+ try:
+ video_info = parse_video_info_from_url(video_url)
+ video_ids.append(video_info.video_id)
+ utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")
+ except ValueError as e:
+ utils.logger.error(f"Failed to parse video URL: {e}")
+ continue
+
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(video_id=video_id, semaphore=semaphore)
- for video_id in config.KS_SPECIFIED_ID_LIST
+ for video_id in video_ids
]
video_details = await asyncio.gather(*task_list)
for video_detail in video_details:
if video_detail is not None:
await kuaishou_store.update_kuaishou_video(video_detail)
- await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
+ await self.batch_get_video_comments(video_ids)
async def get_video_info_task(
self, video_id: str, semaphore: asyncio.Semaphore
@@ -181,6 +201,11 @@ class KuaishouCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.ks_client.get_video_info(video_id)
+
+ # Sleep after fetching video details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+
utils.logger.info(
f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
)
@@ -234,9 +259,14 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info(
f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
)
+
+ # Sleep before fetching comments
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
+
await self.ks_client.get_video_all_comments(
photo_id=video_id,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=kuaishou_store.batch_update_ks_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -352,16 +382,25 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info(
"[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
)
- for user_id in config.KS_CREATOR_ID_LIST:
- # get creator detail info from web html content
- createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
- if createor_info:
- await kuaishou_store.save_creator(user_id, creator=createor_info)
+ for creator_url in config.KS_CREATOR_ID_LIST:
+ try:
+ # Parse creator URL to get user_id
+ creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
+ utils.logger.info(f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_info}")
+ user_id = creator_info.user_id
+
+ # get creator detail info from web html content
+ createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
+ if createor_info:
+ await kuaishou_store.save_creator(user_id, creator=createor_info)
+ except ValueError as e:
+ utils.logger.error(f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
+ continue
# Get all video information of the creator
all_video_list = await self.ks_client.get_all_videos_by_creator(
user_id=user_id,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=self.fetch_creator_video_detail,
)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/help.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/help.py
new file mode 100644
index 0000000..5015f2d
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/kuaishou/help.py
@@ -0,0 +1,99 @@
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+
+# -*- coding: utf-8 -*-
+
+import re
+from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
+
+
+def parse_video_info_from_url(url: str) -> VideoUrlInfo:
+    """Parse a Kuaishou video ID from a video URL or a bare video ID.
+
+    Supported inputs (surrounding whitespace is ignored):
+    1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
+    2. Bare video ID: "3x3zxz4mjrsc8ke"
+    Args:
+        url: Kuaishou video link or bare video ID.
+    Returns:
+        VideoUrlInfo: carries the extracted video ID with url_type "normal".
+    Raises:
+        ValueError: if the input is blank (so an empty ID is never passed on) or has no /short-video/<id> segment.
+    """
+    url = url.strip()
+    if not url:
+        raise ValueError(f"无法从URL中解析出视频ID: {url!r}")
+    # No http prefix and no kuaishou.com host: treat the input as a bare ID.
+    if not url.startswith("http") and "kuaishou.com" not in url:
+        return VideoUrlInfo(video_id=url, url_type="normal")
+    # Standard video URL: the ID is the path segment after /short-video/.
+    match = re.search(r'/short-video/([a-zA-Z0-9_-]+)', url)
+    if match:
+        return VideoUrlInfo(video_id=match.group(1), url_type="normal")
+    raise ValueError(f"无法从URL中解析出视频ID: {url}")
+
+
+def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
+    """Parse a Kuaishou creator ID from a profile URL or a bare user ID.
+
+    Supported inputs (surrounding whitespace is ignored):
+    1. Creator profile URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
+    2. Bare user ID: "3x4sm73aye7jq7i"
+    Args:
+        url: Kuaishou creator profile link or bare user_id.
+    Returns:
+        CreatorUrlInfo: carries the extracted user_id.
+    Raises:
+        ValueError: if the input is blank (so an empty ID is never passed on) or has no /profile/<id> segment.
+    """
+    url = url.strip()
+    if not url:
+        raise ValueError(f"无法从URL中解析出创作者ID: {url!r}")
+    # No http prefix and no kuaishou.com host: treat the input as a bare ID.
+    if not url.startswith("http") and "kuaishou.com" not in url:
+        return CreatorUrlInfo(user_id=url)
+    # Profile URL: the user_id is the path segment after /profile/.
+    match = re.search(r'/profile/([a-zA-Z0-9_-]+)', url)
+    if match:
+        return CreatorUrlInfo(user_id=match.group(1))
+    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
+
+
+if __name__ == '__main__':
+    # Manual smoke test: each suite is (banner, parser function, sample inputs).
+    suites = [
+        (
+            "=== 视频URL解析测试 ===",
+            parse_video_info_from_url,
+            [
+                "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
+                "3xf8enb8dbj6uig",
+            ],
+        ),
+        (
+            "=== 创作者URL解析测试 ===",
+            parse_creator_info_from_url,
+            [
+                "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
+                "3x4sm73aye7jq7i",
+            ],
+        ),
+    ]
+    for banner, parser, samples in suites:
+        print(banner)
+        for sample in samples:
+            try:
+                parsed = parser(sample)
+                print(f"✓ URL: {sample[:80]}...")
+                print(f" 结果: {parsed}\n")
+            except Exception as e:
+                print(f"✗ URL: {sample}")
+                print(f" 错误: {e}\n")
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/client.py
index 1b8c463..5de7458 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/client.py
@@ -11,10 +11,10 @@
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Union
-from urllib.parse import urlencode
+from urllib.parse import urlencode, quote
-import httpx
-from playwright.async_api import BrowserContext
+import requests
+from playwright.async_api import BrowserContext, Page
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
import config
@@ -34,34 +34,76 @@ class BaiduTieBaClient(AbstractApiClient):
timeout=10,
ip_pool=None,
default_ip_proxy=None,
+ headers: Dict[str, str] = None,
+ playwright_page: Optional[Page] = None,
):
self.ip_pool: Optional[ProxyIpPool] = ip_pool
self.timeout = timeout
- self.headers = {
+ # 使用传入的headers(包含真实浏览器UA)或默认headers
+ self.headers = headers or {
"User-Agent": utils.get_user_agent(),
- "Cookies": "",
+ "Cookie": "",
}
self._host = "https://tieba.baidu.com"
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
+ self.playwright_page = playwright_page # Playwright页面对象
+
+    def _sync_request(self, method, url, proxy=None, **kwargs):
+        """Send one blocking HTTP request through ``requests``.
+
+        Args:
+            method: HTTP method name.
+            url: Target URL.
+            proxy: Proxy URL used for both http and https; None sends directly.
+            **kwargs: Extra ``requests.request`` arguments. A ``headers`` dict
+                is merged over ``self.headers``; a ``timeout`` overrides
+                ``self.timeout``.
+
+        Returns:
+            The ``requests`` response object.
+        """
+        # Build the proxies mapping only when a proxy was supplied.
+        proxies = {"http": proxy, "https": proxy} if proxy else None
+
+        # Merge instead of passing headers/timeout a second time: a duplicated
+        # keyword argument would make requests.request() raise TypeError.
+        headers = {**self.headers, **(kwargs.pop("headers", None) or {})}
+        kwargs.setdefault("timeout", self.timeout)
+
+        # Blocking call; the async request() wrapper runs it in a worker thread.
+        return requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            proxies=proxies,
+            **kwargs
+        )
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
"""
- 封装httpx的公共请求方法,对请求响应做一些处理
+ 封装requests的公共请求方法,对请求响应做一些处理
Args:
method: 请求方法
url: 请求的URL
return_ori_content: 是否返回原始内容
- proxies: 代理IP
+ proxy: 代理IP
**kwargs: 其他请求参数,例如请求头、请求体等
Returns:
"""
actual_proxy = proxy if proxy else self.default_ip_proxy
- async with httpx.AsyncClient(proxy=actual_proxy) as client:
- response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)
+
+ # 在线程池中执行同步的requests请求
+ response = await asyncio.to_thread(
+ self._sync_request,
+ method,
+ url,
+ actual_proxy,
+ **kwargs
+ )
if response.status_code != 200:
utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
@@ -69,7 +111,7 @@ class BaiduTieBaClient(AbstractApiClient):
raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
if response.text == "" or response.text == "blocked":
- utils.logger.error(f"request params incrr, response.text: {response.text}")
+ utils.logger.error(f"request params incorrect, response.text: {response.text}")
raise Exception("account blocked")
if return_ori_content:
@@ -119,26 +161,41 @@ class BaiduTieBaClient(AbstractApiClient):
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)
- async def pong(self) -> bool:
+ async def pong(self, browser_context: BrowserContext = None) -> bool:
"""
用于检查登录态是否失效了
- Returns:
+ 使用Cookie检测而非API调用,避免被检测
+ Args:
+ browser_context: 浏览器上下文对象
+ Returns:
+ bool: True表示已登录,False表示未登录
"""
- utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
+ utils.logger.info("[BaiduTieBaClient.pong] Begin to check tieba login state by cookies...")
+
+ if not browser_context:
+ utils.logger.warning("[BaiduTieBaClient.pong] browser_context is None, assume not logged in")
+ return False
+
try:
- uri = "/mo/q/sync"
- res: Dict = await self.get(uri)
- utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
- if res and res.get("no") == 0:
- ping_flag = True
+ # 从浏览器获取cookies并检查关键登录cookie
+ _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+
+ # 百度贴吧的登录标识: STOKEN 或 PTOKEN
+ stoken = cookie_dict.get("STOKEN")
+ ptoken = cookie_dict.get("PTOKEN")
+ bduss = cookie_dict.get("BDUSS") # 百度通用登录cookie
+
+ if stoken or ptoken or bduss:
+ utils.logger.info(f"[BaiduTieBaClient.pong] Login state verified by cookies (STOKEN: {bool(stoken)}, PTOKEN: {bool(ptoken)}, BDUSS: {bool(bduss)})")
+ return True
else:
- utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
- ping_flag = False
+ utils.logger.info("[BaiduTieBaClient.pong] No valid login cookies found, need to login")
+ return False
+
except Exception as e:
- utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
- ping_flag = False
- return ping_flag
+ utils.logger.error(f"[BaiduTieBaClient.pong] Check login state failed: {e}, assume not logged in")
+ return False
async def update_cookies(self, browser_context: BrowserContext):
"""
@@ -149,7 +206,9 @@ class BaiduTieBaClient(AbstractApiClient):
Returns:
"""
- pass
+ cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+ self.headers["Cookie"] = cookie_str
+ utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
async def get_notes_by_keyword(
self,
@@ -160,7 +219,7 @@ class BaiduTieBaClient(AbstractApiClient):
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
) -> List[TiebaNote]:
"""
- 根据关键词搜索贴吧帖子
+ 根据关键词搜索贴吧帖子 (使用Playwright访问页面,避免API检测)
Args:
keyword: 关键词
page: 分页第几页
@@ -170,30 +229,81 @@ class BaiduTieBaClient(AbstractApiClient):
Returns:
"""
- uri = "/f/search/res"
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based search")
+
+ # 构造搜索URL
+ # 示例: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=编程
+ search_url = f"{self._host}/f/search/res"
params = {
- "isnew": 1,
+ "ie": "utf-8",
"qw": keyword,
"rn": page_size,
"pn": page,
"sm": sort.value,
"only_thread": note_type.value,
}
- page_content = await self.get(uri, params=params, return_ori_content=True)
- return self._page_extractor.extract_search_note_list(page_content)
+
+ # 拼接完整URL
+ full_url = f"{search_url}?{urlencode(params)}"
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 访问搜索页面: {full_url}")
+
+ try:
+ # 使用Playwright访问搜索页面
+ await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 成功获取搜索页面HTML,长度: {len(page_content)}")
+
+ # 提取搜索结果
+ notes = self._page_extractor.extract_search_note_list(page_content)
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 提取到 {len(notes)} 条帖子")
+ return notes
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] 搜索失败: {e}")
+ raise
async def get_note_by_id(self, note_id: str) -> TiebaNote:
"""
- 根据帖子ID获取帖子详情
+ 根据帖子ID获取帖子详情 (使用Playwright访问页面,避免API检测)
Args:
- note_id:
+ note_id: 帖子ID
Returns:
-
+ TiebaNote: 帖子详情对象
"""
- uri = f"/p/{note_id}"
- page_content = await self.get(uri, return_ori_content=True)
- return self._page_extractor.extract_note_detail(page_content)
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based note detail fetching")
+
+ # 构造帖子详情URL
+ note_url = f"{self._host}/p/{note_id}"
+ utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 访问帖子详情页面: {note_url}")
+
+ try:
+ # 使用Playwright访问帖子详情页面
+ await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+ utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 成功获取帖子详情HTML,长度: {len(page_content)}")
+
+ # 提取帖子详情
+ note_detail = self._page_extractor.extract_note_detail(page_content)
+ return note_detail
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] 获取帖子详情失败: {e}")
+ raise
async def get_note_all_comments(
self,
@@ -203,35 +313,68 @@ class BaiduTieBaClient(AbstractApiClient):
max_count: int = 10,
) -> List[TiebaComment]:
"""
- 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
+ 获取指定帖子下的所有一级评论 (使用Playwright访问页面,避免API检测)
Args:
note_detail: 帖子详情对象
crawl_interval: 爬取一次笔记的延迟单位(秒)
- callback: 一次笔记爬取结束后
+ callback: 一次笔记爬取结束后的回调函数
max_count: 一次帖子爬取的最大评论数量
Returns:
-
+ List[TiebaComment]: 评论列表
"""
- uri = f"/p/{note_detail.note_id}"
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_note_all_comments] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based comment fetching")
+
result: List[TiebaComment] = []
current_page = 1
+
while note_detail.total_replay_page >= current_page and len(result) < max_count:
- params = {
- "pn": current_page,
- }
- page_content = await self.get(uri, params=params, return_ori_content=True)
- comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
- if not comments:
+ # 构造评论页URL
+ comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
+ utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 访问评论页面: {comment_url}")
+
+ try:
+ # 使用Playwright访问评论页面
+ await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+
+ # 提取评论
+ comments = self._page_extractor.extract_tieba_note_parment_comments(
+ page_content, note_id=note_detail.note_id
+ )
+
+ if not comments:
+ utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
+ break
+
+ # 限制评论数量
+ if len(result) + len(comments) > max_count:
+ comments = comments[:max_count - len(result)]
+
+ if callback:
+ await callback(note_detail.note_id, comments)
+
+ result.extend(comments)
+
+ # 获取所有子评论
+ await self.get_comments_all_sub_comments(
+ comments, crawl_interval=crawl_interval, callback=callback
+ )
+
+ await asyncio.sleep(crawl_interval)
+ current_page += 1
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] 获取第{current_page}页评论失败: {e}")
break
- if len(result) + len(comments) > max_count:
- comments = comments[:max_count - len(result)]
- if callback:
- await callback(note_detail.note_id, comments)
- result.extend(comments)
- # 获取所有子评论
- await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
- await asyncio.sleep(crawl_interval)
- current_page += 1
+
+ utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 共获取 {len(result)} 条一级评论")
return result
async def get_comments_all_sub_comments(
@@ -241,93 +384,194 @@ class BaiduTieBaClient(AbstractApiClient):
callback: Optional[Callable] = None,
) -> List[TiebaComment]:
"""
- 获取指定评论下的所有子评论
+ 获取指定评论下的所有子评论 (使用Playwright访问页面,避免API检测)
Args:
comments: 评论列表
crawl_interval: 爬取一次笔记的延迟单位(秒)
- callback: 一次笔记爬取结束后
+ callback: 一次笔记爬取结束后的回调函数
Returns:
-
+ List[TiebaComment]: 子评论列表
"""
- uri = "/p/comment"
if not config.ENABLE_GET_SUB_COMMENTS:
return []
- # # 贴吧获取所有子评论需要登录态
- # if self.headers.get("Cookies") == "" or not self.pong():
- # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_comments_all_sub_comments] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based sub-comment fetching")
all_sub_comments: List[TiebaComment] = []
+
for parment_comment in comments:
if parment_comment.sub_comment_count == 0:
continue
current_page = 1
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
- while max_sub_page_num >= current_page:
- params = {
- "tid": parment_comment.note_id, # 帖子ID
- "pid": parment_comment.comment_id, # 父级评论ID
- "fid": parment_comment.tieba_id, # 贴吧ID
- "pn": current_page # 页码
- }
- page_content = await self.get(uri, params=params, return_ori_content=True)
- sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)
- if not sub_comments:
+ while max_sub_page_num >= current_page:
+ # 构造子评论URL
+ sub_comment_url = (
+ f"{self._host}/p/comment?"
+ f"tid={parment_comment.note_id}&"
+ f"pid={parment_comment.comment_id}&"
+ f"fid={parment_comment.tieba_id}&"
+ f"pn={current_page}"
+ )
+ utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 访问子评论页面: {sub_comment_url}")
+
+ try:
+ # 使用Playwright访问子评论页面
+ await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+
+ # 提取子评论
+ sub_comments = self._page_extractor.extract_tieba_note_sub_comments(
+ page_content, parent_comment=parment_comment
+ )
+
+ if not sub_comments:
+ utils.logger.info(
+ f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+ f"评论{parment_comment.comment_id}第{current_page}页没有子评论,停止爬取"
+ )
+ break
+
+ if callback:
+ await callback(parment_comment.note_id, sub_comments)
+
+ all_sub_comments.extend(sub_comments)
+ await asyncio.sleep(crawl_interval)
+ current_page += 1
+
+ except Exception as e:
+ utils.logger.error(
+ f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+ f"获取评论{parment_comment.comment_id}第{current_page}页子评论失败: {e}"
+ )
break
- if callback:
- await callback(parment_comment.note_id, sub_comments)
- all_sub_comments.extend(sub_comments)
- await asyncio.sleep(crawl_interval)
- current_page += 1
+
+ utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 共获取 {len(all_sub_comments)} 条子评论")
return all_sub_comments
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
"""
- 根据贴吧名称获取帖子列表
+ 根据贴吧名称获取帖子列表 (使用Playwright访问页面,避免API检测)
Args:
tieba_name: 贴吧名称
- page_num: 分页数量
+ page_num: 分页页码
Returns:
-
+ List[TiebaNote]: 帖子列表
"""
- uri = f"/f?kw={tieba_name}&pn={page_num}"
- page_content = await self.get(uri, return_ori_content=True)
- return self._page_extractor.extract_tieba_note_list(page_content)
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based tieba note fetching")
+
+ # 构造贴吧帖子列表URL
+ tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 访问贴吧页面: {tieba_url}")
+
+ try:
+ # 使用Playwright访问贴吧页面
+ await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 成功获取贴吧页面HTML,长度: {len(page_content)}")
+
+ # 提取帖子列表
+ notes = self._page_extractor.extract_tieba_note_list(page_content)
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 提取到 {len(notes)} 条帖子")
+ return notes
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] 获取贴吧帖子列表失败: {e}")
+ raise
async def get_creator_info_by_url(self, creator_url: str) -> str:
"""
- 根据创作者ID获取创作者信息
+ 根据创作者URL获取创作者信息 (使用Playwright访问页面,避免API检测)
Args:
creator_url: 创作者主页URL
Returns:
-
+ str: 页面HTML内容
"""
- page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
- return page_content
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based creator info fetching")
+
+ utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 访问创作者主页: {creator_url}")
+
+ try:
+ # 使用Playwright访问创作者主页
+ await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面HTML内容
+ page_content = await self.playwright_page.content()
+ utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 成功获取创作者主页HTML,长度: {len(page_content)}")
+
+ return page_content
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] 获取创作者主页失败: {e}")
+ raise
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
"""
- 根据创作者获取创作者的所有帖子
+ 根据创作者获取创作者的帖子 (使用Playwright访问页面,避免API检测)
Args:
- user_name:
- page_number:
+ user_name: 创作者用户名
+ page_number: 页码
Returns:
-
+ Dict: 包含帖子数据的字典
"""
- uri = f"/home/get/getthread"
- params = {
- "un": user_name,
- "pn": page_number,
- "id": "utf-8",
- "_": utils.get_current_timestamp(),
- }
- return await self.get(uri, params=params)
+ if not self.playwright_page:
+ utils.logger.error("[BaiduTieBaClient.get_notes_by_creator] playwright_page is None, cannot use browser mode")
+ raise Exception("playwright_page is required for browser-based creator notes fetching")
+
+ # 构造创作者帖子列表URL
+ creator_url = f"{self._host}/home/get/getthread?un={quote(user_name)}&pn={page_number}&id=utf-8&_={utils.get_current_timestamp()}"
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 访问创作者帖子列表: {creator_url}")
+
+ try:
+ # 使用Playwright访问创作者帖子列表页面
+ await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+
+ # 等待页面加载,使用配置文件中的延时设置
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # 获取页面内容(这个接口返回JSON)
+ page_content = await self.playwright_page.content()
+
+ # 提取JSON数据(页面会包含标签或直接是JSON)
+ try:
+ # 尝试从页面中提取JSON
+ json_text = await self.playwright_page.evaluate("() => document.body.innerText")
+ result = json.loads(json_text)
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 成功获取创作者帖子数据")
+ return result
+ except json.JSONDecodeError as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON解析失败: {e}")
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 页面内容: {page_content[:500]}")
+ raise Exception(f"Failed to parse JSON from creator notes page: {e}")
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 获取创作者帖子列表失败: {e}")
+ raise
async def get_all_notes_by_creator_user_name(
self,
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
index 8635104..268cf26 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
@@ -11,7 +11,6 @@
import asyncio
import os
-import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -26,7 +25,7 @@ from playwright.async_api import (
import config
from base.base_crawler import AbstractCrawler
from model.m_baidu_tieba import TiebaCreator, TiebaNote
-from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from proxy.proxy_ip_pool import IpInfoModel, ProxyIpPool, create_ip_pool
from store import tieba as tieba_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
@@ -56,7 +55,7 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
- ip_proxy_pool, httpx_proxy_format = None, None
+ playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
utils.logger.info(
"[BaiduTieBaCrawler.start] Begin create ip proxy pool ..."
@@ -65,31 +64,73 @@ class TieBaCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
- _, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
+ playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
utils.logger.info(
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
)
- # Create a client to interact with the baidutieba website.
- self.tieba_client = BaiduTieBaClient(
- ip_pool=ip_proxy_pool,
- default_ip_proxy=httpx_proxy_format,
- )
- crawler_type_var.set(config.CRAWLER_TYPE)
- if config.CRAWLER_TYPE == "search":
- # Search for notes and retrieve their comment information.
- await self.search()
- await self.get_specified_tieba_notes()
- elif config.CRAWLER_TYPE == "detail":
- # Get the information and comments of the specified post
- await self.get_specified_notes()
- elif config.CRAWLER_TYPE == "creator":
- # Get creator's information and their notes and comments
- await self.get_creators_and_notes()
- else:
- pass
+ async with async_playwright() as playwright:
+ # 根据配置选择启动模式
+ if config.ENABLE_CDP_MODE:
+ utils.logger.info("[BaiduTieBaCrawler] 使用CDP模式启动浏览器")
+ self.browser_context = await self.launch_browser_with_cdp(
+ playwright,
+ playwright_proxy_format,
+ self.user_agent,
+ headless=config.CDP_HEADLESS,
+ )
+ else:
+ utils.logger.info("[BaiduTieBaCrawler] 使用标准模式启动浏览器")
+ # Launch a browser context.
+ chromium = playwright.chromium
+ self.browser_context = await self.launch_browser(
+ chromium,
+ playwright_proxy_format,
+ self.user_agent,
+ headless=config.HEADLESS,
+ )
- utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
+ # 注入反检测脚本 - 针对百度的特殊检测
+ await self._inject_anti_detection_scripts()
+
+ self.context_page = await self.browser_context.new_page()
+
+ # 先访问百度首页,再点击贴吧链接,避免触发安全验证
+ await self._navigate_to_tieba_via_baidu()
+
+ # Create a client to interact with the baidutieba website.
+ self.tieba_client = await self.create_tieba_client(
+ httpx_proxy_format,
+ ip_proxy_pool if config.ENABLE_IP_PROXY else None
+ )
+
+ # Check login status and perform login if necessary
+ if not await self.tieba_client.pong(browser_context=self.browser_context):
+ login_obj = BaiduTieBaLogin(
+ login_type=config.LOGIN_TYPE,
+ login_phone="", # your phone number
+ browser_context=self.browser_context,
+ context_page=self.context_page,
+ cookie_str=config.COOKIES,
+ )
+ await login_obj.begin()
+ await self.tieba_client.update_cookies(browser_context=self.browser_context)
+
+ crawler_type_var.set(config.CRAWLER_TYPE)
+ if config.CRAWLER_TYPE == "search":
+ # Search for notes and retrieve their comment information.
+ await self.search()
+ await self.get_specified_tieba_notes()
+ elif config.CRAWLER_TYPE == "detail":
+ # Get the information and comments of the specified post
+ await self.get_specified_notes()
+ elif config.CRAWLER_TYPE == "creator":
+ # Get creator's information and their notes and comments
+ await self.get_creators_and_notes()
+ else:
+ pass
+
+ utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
async def search(self) -> None:
"""
@@ -141,6 +182,11 @@ class TieBaCrawler(AbstractCrawler):
await self.get_specified_notes(
note_id_list=[note_detail.note_id for note_detail in notes_list]
)
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
+
page += 1
except Exception as ex:
utils.logger.error(
@@ -178,6 +224,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
)
await self.get_specified_notes([note.note_id for note in note_list])
+
+ # Sleep after processing notes
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
+
page_number += tieba_limit_count
async def get_specified_notes(
@@ -222,6 +273,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
)
note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
+
+ # Sleep after fetching note details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
if not note_detail:
utils.logger.error(
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +333,14 @@ class TieBaCrawler(AbstractCrawler):
utils.logger.info(
f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
)
+
+ # Sleep before fetching comments
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
+
await self.tieba_client.get_note_all_comments(
note_detail=note_detail,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=tieba_store.batch_update_tieba_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -327,6 +388,198 @@ class TieBaCrawler(AbstractCrawler):
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
)
+    async def _navigate_to_tieba_via_baidu(self):
+        """
+        Reach Tieba the way a person would: through the Baidu home page.
+
+        Flow:
+        1. Open https://www.baidu.com/ and pause for config.CRAWLER_MAX_SLEEP_SEC.
+        2. Look for the "贴吧" (Tieba) link, trying several selectors in turn
+           with a timeout of 5000 on each.
+        3. Click it. When the link has target="_blank" the click is followed
+           into the new tab, the old tab is closed and self.context_page is
+           re-pointed; otherwise same-tab navigation is awaited.
+        4. Pause again and log the resulting URL.
+
+        If no selector yields an element, or any step raises, the page is sent
+        directly to self.index_url instead. Per the original author's note,
+        the detour is meant to avoid triggering Baidu's security verification.
+        """
+        utils.logger.info("[TieBaCrawler] 模拟真实用户访问路径...")
+
+        try:
+            # Step 1: land on the Baidu home page.
+            utils.logger.info("[TieBaCrawler] Step 1: 访问百度首页 https://www.baidu.com/")
+            await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")
+
+            # Step 2: linger for the configured delay.
+            utils.logger.info(f"[TieBaCrawler] Step 2: 等待 {config.CRAWLER_MAX_SLEEP_SEC}秒 模拟用户浏览...")
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Step 3: locate the link; the first selector that yields an element wins.
+            utils.logger.info("[TieBaCrawler] Step 3: 查找并点击'贴吧'链接...")
+            link_handle = None
+            for selector in (
+                'a[href="http://tieba.baidu.com/"]',
+                'a[href="https://tieba.baidu.com/"]',
+                'a.mnav:has-text("贴吧")',
+                'text=贴吧',
+            ):
+                try:
+                    link_handle = await self.context_page.wait_for_selector(selector, timeout=5000)
+                except Exception:
+                    # This selector did not work out; move on to the next one.
+                    continue
+                if link_handle:
+                    utils.logger.info(f"[TieBaCrawler] 找到贴吧链接 (selector: {selector})")
+                    break
+
+            if not link_handle:
+                utils.logger.warning("[TieBaCrawler] 未找到贴吧链接,直接访问贴吧首页")
+                await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+                return
+
+            # Step 4: click, treating same-tab and new-tab links differently.
+            utils.logger.info("[TieBaCrawler] Step 4: 点击贴吧链接...")
+            target_attr = await link_handle.get_attribute("target")
+            utils.logger.info(f"[TieBaCrawler] 链接target属性: {target_attr}")
+
+            if target_attr != "_blank":
+                # Same-tab link: wait for the navigation the click causes.
+                utils.logger.info("[TieBaCrawler] 链接在当前标签页跳转...")
+                async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
+                    await link_handle.click()
+            else:
+                # New-tab link: capture the page the click opens.
+                utils.logger.info("[TieBaCrawler] 链接会在新标签页打开,等待新页面...")
+                async with self.browser_context.expect_page() as opened_page_info:
+                    await link_handle.click()
+
+                tieba_page = await opened_page_info.value
+                await tieba_page.wait_for_load_state("domcontentloaded")
+
+                # Drop the Baidu tab and carry on with the Tieba tab.
+                await self.context_page.close()
+                self.context_page = tieba_page
+                utils.logger.info("[TieBaCrawler] ✅ 已切换到新标签页 (贴吧页面)")
+
+            # Step 5: let the page settle for the configured delay.
+            utils.logger.info(f"[TieBaCrawler] Step 5: 页面加载完成,等待 {config.CRAWLER_MAX_SLEEP_SEC}秒...")
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            utils.logger.info(f"[TieBaCrawler] ✅ 成功通过百度首页进入贴吧! 当前URL: {self.context_page.url}")
+
+        except Exception as e:
+            # Any failure along the way falls back to opening Tieba directly.
+            utils.logger.error(f"[TieBaCrawler] 通过百度首页访问贴吧失败: {e}")
+            utils.logger.info("[TieBaCrawler] 回退:直接访问贴吧首页")
+            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+
+    async def _inject_anti_detection_scripts(self):
+        """
+        Register an init script on the browser context that masks common
+        automation fingerprints.
+
+        The script is handed to self.browser_context.add_init_script and
+        patches, in order: navigator.webdriver, navigator.chrome (only when
+        absent), navigator.permissions.query for the 'notifications'
+        permission, navigator.plugins, navigator.languages, and three
+        ChromeDriver "cdc_" window globals.
+
+        The original query method is bound to navigator.permissions before it
+        is wrapped: calling the detached native function makes every
+        non-notification query fail with "Illegal invocation". The patch is
+        skipped when the Permissions API is missing, so the overrides that
+        follow it still run.
+        """
+        utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")
+
+        # Lightweight script that only touches the key detection points.
+        anti_detection_js = """
+        // 覆盖 navigator.webdriver
+        Object.defineProperty(navigator, 'webdriver', {
+            get: () => undefined,
+            configurable: true
+        });
+
+        // 覆盖 window.navigator.chrome
+        if (!window.navigator.chrome) {
+            window.navigator.chrome = {
+                runtime: {},
+                loadTimes: function() {},
+                csi: function() {},
+                app: {}
+            };
+        }
+
+        // 覆盖 Permissions API
+        // query must stay bound to navigator.permissions; a detached call fails with "Illegal invocation"
+        if (window.navigator.permissions && window.navigator.permissions.query) {
+            const permissions = window.navigator.permissions;
+            const originalQuery = permissions.query.bind(permissions);
+            permissions.query = (parameters) => (
+                parameters && parameters.name === 'notifications' ?
+                    Promise.resolve({ state: Notification.permission }) :
+                    originalQuery(parameters)
+            );
+        }
+
+        // 覆盖 plugins 长度(让它看起来有插件)
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4, 5],
+            configurable: true
+        });
+
+        // 覆盖 languages
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['zh-CN', 'zh', 'en'],
+            configurable: true
+        });
+
+        // 移除 window.cdc_ 等 ChromeDriver 残留
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
+
+        console.log('[Anti-Detection] Scripts injected successfully');
+        """
+
+        await self.browser_context.add_init_script(anti_detection_js)
+        utils.logger.info("[TieBaCrawler] Anti-detection scripts injected")
+
+    async def create_tieba_client(
+        self, httpx_proxy: Optional[str], ip_pool: Optional[ProxyIpPool] = None
+    ) -> BaiduTieBaClient:
+        """
+        Build the Tieba API client from the current browser session.
+
+        The User-Agent is read from the live page (navigator.userAgent) and
+        the cookies from the browser context; both go into the request
+        headers handed to the client, alongside a fixed set of browser-like
+        headers.
+
+        Args:
+            httpx_proxy: Proxy passed to the client as default_ip_proxy.
+            ip_pool: Optional proxy IP pool passed through to the client.
+
+        Returns:
+            The configured BaiduTieBaClient.
+        """
+        utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")
+
+        # Ask the page for its User-Agent so the headers report what the browser reports.
+        user_agent = await self.context_page.evaluate("() => navigator.userAgent")
+        utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
+
+        # Only the header-string form of the cookies is used here.
+        cookie_str, _ = utils.convert_cookies(await self.browser_context.cookies())
+
+        # Full browser-style header set for the client's requests.
+        # NOTE(review): the sec-ch-ua* values below are fixed literals and are not
+        # derived from user_agent; confirm they match the browser actually launched.
+        request_headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "User-Agent": user_agent,
+            "Cookie": cookie_str,
+            "Host": "tieba.baidu.com",
+            "Referer": "https://tieba.baidu.com/",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "same-origin",
+            "Sec-Fetch-User": "?1",
+            "Upgrade-Insecure-Requests": "1",
+            "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": '"macOS"',
+        }
+
+        return BaiduTieBaClient(
+            timeout=10,
+            ip_pool=ip_pool,
+            default_ip_proxy=httpx_proxy,
+            headers=request_headers,
+            # The client also receives the Playwright page object.
+            playwright_page=self.context_page,
+        )
+
async def launch_browser(
self,
chromium: BrowserType,
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
index 552801f..e78a212 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
@@ -15,7 +15,7 @@
import asyncio
import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -77,8 +77,11 @@ class WeiboCrawler(AbstractCrawler):
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.mobile_index_url)
@@ -160,6 +163,11 @@ class WeiboCrawler(AbstractCrawler):
await self.get_note_images(mblog)
page += 1
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
await self.batch_get_notes_comments(note_id_list)
async def get_specified_notes(self):
@@ -185,6 +193,11 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.wb_client.get_note_info_by_id(note_id)
+
+ # Sleep after fetching note details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
return result
except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +234,14 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+
+ # Sleep before fetching comments
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+
await self.wb_client.get_note_all_comments(
note_id=note_id,
- crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重,所以延时提高一些
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC, # Use fixed interval instead of random
callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -250,7 +268,8 @@ class WeiboCrawler(AbstractCrawler):
if not url:
continue
content = await self.wb_client.get_note_image(url)
- await asyncio.sleep(random.random())
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
if content != None:
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
index 982373a..652667f 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
@@ -10,22 +10,24 @@
import asyncio
import json
-import re
+import time
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
+from tenacity import retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from tools import utils
-from html import unescape
+
from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
+from .extractor import XiaoHongShuExtractor
+from .secsign import seccore_signv2_playwright
class XiaoHongShuClient(AbstractApiClient):
@@ -50,6 +52,7 @@ class XiaoHongShuClient(AbstractApiClient):
self.NOTE_ABNORMAL_CODE = -510001
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
+ self._extractor = XiaoHongShuExtractor()
async def _pre_headers(self, url: str, data=None) -> Dict:
"""
@@ -61,13 +64,13 @@ class XiaoHongShuClient(AbstractApiClient):
Returns:
"""
- encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
+ x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign(
a1=self.cookie_dict.get("a1", ""),
b1=local_storage.get("b1", ""),
- x_s=encrypt_params.get("X-s", ""),
- x_t=str(encrypt_params.get("X-t", "")),
+ x_s=x_s,
+ x_t=str(int(time.time())),
)
headers = {
@@ -128,7 +131,9 @@ class XiaoHongShuClient(AbstractApiClient):
if isinstance(params, dict):
final_uri = f"{uri}?" f"{urlencode(params)}"
headers = await self._pre_headers(final_uri)
- return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
+ return await self.request(
+ method="GET", url=f"{self._host}{final_uri}", headers=headers
+ )
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
@@ -156,12 +161,18 @@ class XiaoHongShuClient(AbstractApiClient):
response = await client.request("GET", url, timeout=self.timeout)
response.raise_for_status()
if not response.reason_phrase == "OK":
- utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}"
+ )
return None
else:
return response.content
- except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
- utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
+ except (
+ httpx.HTTPError
+ ) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}"
+ ) # 保留原始异常类型名称,以便开发者调试
return None
async def pong(self) -> bool:
@@ -178,7 +189,9 @@ class XiaoHongShuClient(AbstractApiClient):
if note_card.get("items"):
ping_flag = True
except Exception as e:
- utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
+ utils.logger.error(
+ f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..."
+ )
ping_flag = False
return ping_flag
@@ -249,9 +262,7 @@ class XiaoHongShuClient(AbstractApiClient):
data = {
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
- "extra": {
- "need_body_topic": 1
- },
+ "extra": {"need_body_topic": 1},
"xsec_source": xsec_source,
"xsec_token": xsec_token,
}
@@ -261,7 +272,9 @@ class XiaoHongShuClient(AbstractApiClient):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
# 爬取频繁了可能会出现有的笔记能有结果有的没有
- utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
+ )
return dict()
async def get_note_comments(
@@ -345,15 +358,19 @@ class XiaoHongShuClient(AbstractApiClient):
comments_has_more = True
comments_cursor = ""
while comments_has_more and len(result) < max_count:
- comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
+ comments_res = await self.get_note_comments(
+ note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor
+ )
comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
if len(result) + len(comments) > max_count:
- comments = comments[:max_count - len(result)]
+ comments = comments[: max_count - len(result)]
if callback:
await callback(note_id, comments)
await asyncio.sleep(crawl_interval)
@@ -386,7 +403,9 @@ class XiaoHongShuClient(AbstractApiClient):
"""
if not config.ENABLE_GET_SUB_COMMENTS:
- utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+ utils.logger.info(
+ f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+ )
return []
result = []
@@ -413,12 +432,16 @@ class XiaoHongShuClient(AbstractApiClient):
)
if comments_res is None:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}"
+ )
continue
sub_comment_has_more = comments_res.get("has_more", False)
sub_comment_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
if callback:
@@ -427,23 +450,30 @@ class XiaoHongShuClient(AbstractApiClient):
result.extend(comments)
return result
- async def get_creator_info(self, user_id: str) -> Dict:
+ async def get_creator_info(
+ self, user_id: str, xsec_token: str = "", xsec_source: str = ""
+ ) -> Dict:
"""
通过解析网页版的用户主页HTML,获取用户个人简要信息
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
- eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
+
+ Args:
+ user_id: 用户ID
+ xsec_token: 验证token (可选,如果URL中包含此参数则传入)
+ xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入)
+
+ Returns:
+ Dict: 创作者信息
"""
+ # 构建URI,如果有xsec参数则添加到URL中
uri = f"/user/profile/{user_id}"
- html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
- match = re.search(r"", html)[0].replace("undefined", '""')
-
- if state != "{}":
- note_dict = transform_json_keys(state)
- return note_dict["note"]["note_detail_map"][note_id]["note"]
- return {}
-
- try:
- return get_note_dict(html)
- except:
- return None
+ return self._extractor.extract_note_detail_from_html(note_id, html)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
index 9c88f1c..68d2139 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
@@ -11,9 +11,8 @@
import asyncio
import os
import random
-import time
from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
from playwright.async_api import (
BrowserContext,
@@ -27,7 +26,7 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
-from model.m_xiaohongshu import NoteUrlInfo
+from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
@@ -37,7 +36,7 @@ from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
-from .help import parse_note_info_from_note_url, get_search_id
+from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
from .login import XiaoHongShuLogin
@@ -80,8 +79,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.user_agent,
headless=config.HEADLESS,
)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -164,6 +164,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_ids, xsec_tokens)
+
+ # Sleep after each page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[XiaoHongShuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
except DataFetchError:
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
break
@@ -171,17 +175,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_creators_and_notes(self) -> None:
"""Get creator's notes and retrieve their comment information."""
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
- for user_id in config.XHS_CREATOR_ID_LIST:
- # get creator detail info from web html content
- createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
- if createor_info:
- await xhs_store.save_creator(user_id, creator=createor_info)
+ for creator_url in config.XHS_CREATOR_ID_LIST:
+ try:
+ # Parse creator URL to get user_id and security tokens
+ creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
+ utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}")
+ user_id = creator_info.user_id
- # When proxy is not enabled, increase the crawling interval
- if config.ENABLE_IP_PROXY:
- crawl_interval = random.random()
- else:
- crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+ # get creator detail info from web html content
+ createor_info: Dict = await self.xhs_client.get_creator_info(
+ user_id=user_id,
+ xsec_token=creator_info.xsec_token,
+ xsec_source=creator_info.xsec_source
+ )
+ if createor_info:
+ await xhs_store.save_creator(user_id, creator=createor_info)
+ except ValueError as e:
+ utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}")
+ continue
+
+ # Use fixed crawling interval
+ crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
# Get all note information of the creator
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
user_id=user_id,
@@ -268,18 +282,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
-
- try:
- note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
- except RetryError as e:
- pass
-
+ note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
if not note_detail:
- note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
- if not note_detail:
- raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+ raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
+
+ # Sleep after fetching note detail
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note {note_id}")
+
return note_detail
except DataFetchError as ex:
@@ -310,11 +322,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note comments with keyword filtering and quantity limitation"""
async with semaphore:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
- # When proxy is not enabled, increase the crawling interval
- if config.ENABLE_IP_PROXY:
- crawl_interval = random.random()
- else:
- crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+ # Use fixed crawling interval
+ crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
await self.xhs_client.get_note_all_comments(
note_id=note_id,
xsec_token=xsec_token,
@@ -322,6 +331,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
callback=xhs_store.batch_update_xhs_note_comments,
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
+
+ # Sleep after fetching comments
+ await asyncio.sleep(crawl_interval)
+ utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client"""
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py
new file mode 100644
index 0000000..b8d7540
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py
@@ -0,0 +1,60 @@
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import json
+import re
+from typing import Dict, Optional
+
+import humps
+
+
+class XiaoHongShuExtractor:
+    def __init__(self):
+        """Create the extractor; no instance state is set up."""
+
+    def extract_note_detail_from_html(self, note_id: str, html: str) -> Optional[Dict]:
+        """Extract one note's detail from a note page's HTML.
+
+        The page embeds its state as ``window.__INITIAL_STATE__={...}``. Every
+        occurrence of the text ``undefined`` in that blob is replaced with
+        ``""``, the result is parsed as JSON, the keys are run through
+        ``humps.decamelize``, and the entry for ``note_id`` is returned.
+
+        Args:
+            note_id: ID of the note to look up in the page's note detail map.
+            html: Raw HTML of the note page.
+
+        Returns:
+            The note detail dict, or None when the page has no
+            ``noteDetailMap`` marker, no state blob can be located, or the
+            state blob is empty (``{}``).
+
+        Raises:
+            KeyError: If the parsed state has no entry for ``note_id``.
+            json.JSONDecodeError: If the state blob is not valid JSON.
+        """
+        if "noteDetailMap" not in html:
+            # Either a captcha page was served or the note does not exist.
+            return None
+
+        # re.search instead of findall(...)[0]: a page that carries the marker
+        # but no matching state blob must yield None, not an IndexError.
+        match = re.search(r"window.__INITIAL_STATE__=({.*})", html)
+        if match is None:
+            return None
+
+        # The blob is a JS object literal; bare `undefined` is not valid JSON.
+        state = match.group(1).replace("undefined", '""')
+        if state == "{}":
+            return None
+
+        note_dict = humps.decamelize(json.loads(state))
+        return note_dict["note"]["note_detail_map"][note_id]["note"]
+
+ def extract_creator_info_from_html(self, html: str) -> Optional[Dict]:
+ """从html中提取用户信息
+
+ Args:
+ html (str): html字符串
+
+ Returns:
+ Dict: 用户信息字典
+ """
+ match = re.search(
+ r"