1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
This commit is contained in:
@@ -11,8 +11,8 @@ from datetime import date, timedelta, datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
import random
|
||||
import pymysql
|
||||
from pymysql.cursors import DictCursor
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.engine import Engine
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -23,30 +23,38 @@ try:
|
||||
except ImportError:
|
||||
raise ImportError("无法导入config.py配置文件")
|
||||
|
||||
from config import settings
|
||||
from loguru import logger
|
||||
|
||||
class KeywordManager:
|
||||
"""关键词管理器"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化关键词管理器"""
|
||||
self.connection = None
|
||||
self.engine: Engine = None
|
||||
self.connect()
|
||||
|
||||
def connect(self):
|
||||
"""连接数据库"""
|
||||
try:
|
||||
self.connection = pymysql.connect(
|
||||
host=config.DB_HOST,
|
||||
port=config.DB_PORT,
|
||||
user=config.DB_USER,
|
||||
password=config.DB_PASSWORD,
|
||||
database=config.DB_NAME,
|
||||
charset=config.DB_CHARSET,
|
||||
autocommit=True,
|
||||
cursorclass=DictCursor
|
||||
)
|
||||
print(f"关键词管理器成功连接到数据库: {config.DB_NAME}")
|
||||
dialect = (settings.DB_DIALECT or "mysql").lower()
|
||||
if dialect in ("postgresql", "postgres"):
|
||||
url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
|
||||
else:
|
||||
url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
|
||||
self.engine = create_engine(url, future=True)
|
||||
logger.info(f"关键词管理器成功连接到数据库: {settings.DB_NAME}")
|
||||
except ModuleNotFoundError as e:
|
||||
missing: str = str(e)
|
||||
if "psycopg" in missing:
|
||||
logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
|
||||
elif "pymysql" in missing:
|
||||
logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
|
||||
else:
|
||||
logger.error(f"数据库连接失败(缺少驱动): {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"关键词管理器数据库连接失败: {e}")
|
||||
logger.exception(f"关键词管理器数据库连接失败: {e}")
|
||||
raise
|
||||
|
||||
def get_latest_keywords(self, target_date: date = None, max_keywords: int = 100) -> List[str]:
|
||||
@@ -63,24 +71,24 @@ class KeywordManager:
|
||||
if not target_date:
|
||||
target_date = date.today()
|
||||
|
||||
print(f"正在获取 {target_date} 的关键词...")
|
||||
logger.info(f"正在获取 {target_date} 的关键词...")
|
||||
|
||||
# 首先尝试获取指定日期的关键词
|
||||
topics_data = self.get_daily_topics(target_date)
|
||||
|
||||
if topics_data and topics_data.get('keywords'):
|
||||
keywords = topics_data['keywords']
|
||||
print(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
|
||||
logger.info(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
|
||||
|
||||
# 如果关键词太多,随机选择指定数量
|
||||
if len(keywords) > max_keywords:
|
||||
keywords = random.sample(keywords, max_keywords)
|
||||
print(f"随机选择了 {max_keywords} 个关键词")
|
||||
logger.info(f"随机选择了 {max_keywords} 个关键词")
|
||||
|
||||
return keywords
|
||||
|
||||
# 如果没有当天的关键词,尝试获取最近几天的
|
||||
print(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
|
||||
logger.info(f"{target_date} 没有关键词数据,尝试获取最近的关键词...")
|
||||
recent_topics = self.get_recent_topics(days=7)
|
||||
|
||||
if recent_topics:
|
||||
@@ -95,11 +103,11 @@ class KeywordManager:
|
||||
if len(unique_keywords) > max_keywords:
|
||||
unique_keywords = random.sample(unique_keywords, max_keywords)
|
||||
|
||||
print(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
|
||||
logger.info(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
|
||||
return unique_keywords
|
||||
|
||||
# 如果都没有,返回默认关键词
|
||||
print("没有找到任何关键词数据,使用默认关键词")
|
||||
logger.info("没有找到任何关键词数据,使用默认关键词")
|
||||
return self._get_default_keywords()
|
||||
|
||||
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
|
||||
@@ -116,20 +124,22 @@ class KeywordManager:
|
||||
extract_date = date.today()
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = "SELECT * FROM daily_topics WHERE extract_date = %s"
|
||||
cursor.execute(query, (extract_date,))
|
||||
result = cursor.fetchone()
|
||||
with self.engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text("SELECT * FROM daily_topics WHERE extract_date = :d"),
|
||||
{"d": extract_date},
|
||||
).mappings().first()
|
||||
|
||||
if result:
|
||||
# 解析关键词JSON
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
# 转为可变dict再赋值
|
||||
result = dict(result)
|
||||
result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
|
||||
return result
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取话题分析失败: {e}")
|
||||
logger.exception(f"获取话题分析失败: {e}")
|
||||
return None
|
||||
|
||||
def get_recent_topics(self, days: int = 7) -> List[Dict]:
|
||||
@@ -143,23 +153,28 @@ class KeywordManager:
|
||||
话题分析列表
|
||||
"""
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = """
|
||||
SELECT * FROM daily_topics
|
||||
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
cursor.execute(query, (days,))
|
||||
results = cursor.fetchall()
|
||||
start_date = date.today() - timedelta(days=days)
|
||||
with self.engine.connect() as conn:
|
||||
results = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT * FROM daily_topics
|
||||
WHERE extract_date >= :start_date
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
),
|
||||
{"start_date": start_date},
|
||||
).mappings().all()
|
||||
|
||||
# 解析每个结果的关键词JSON
|
||||
# 转为可变dict列表再处理
|
||||
results = [dict(r) for r in results]
|
||||
for result in results:
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
result['keywords'] = json.loads(result['keywords']) if result.get('keywords') else []
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取最近话题分析失败: {e}")
|
||||
logger.exception(f"获取最近话题分析失败: {e}")
|
||||
return []
|
||||
|
||||
def _get_default_keywords(self) -> List[str]:
|
||||
@@ -190,8 +205,8 @@ class KeywordManager:
|
||||
keywords = self.get_latest_keywords(target_date, max_keywords)
|
||||
|
||||
if keywords:
|
||||
print(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
|
||||
print(f"每个关键词将在所有平台上进行爬取")
|
||||
logger.info(f"为 {len(platforms)} 个平台准备了相同的 {len(keywords)} 个关键词")
|
||||
logger.info(f"每个关键词将在所有平台上进行爬取")
|
||||
|
||||
return keywords
|
||||
|
||||
@@ -210,7 +225,7 @@ class KeywordManager:
|
||||
"""
|
||||
keywords = self.get_latest_keywords(target_date, max_keywords)
|
||||
|
||||
print(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
|
||||
logger.info(f"为平台 {platform} 准备了 {len(keywords)} 个关键词(与其他平台相同)")
|
||||
return keywords
|
||||
|
||||
def _filter_keywords_by_platform(self, keywords: List[str], platform: str) -> List[str]:
|
||||
@@ -290,9 +305,9 @@ class KeywordManager:
|
||||
|
||||
def close(self):
|
||||
"""关闭数据库连接"""
|
||||
if self.connection:
|
||||
self.connection.close()
|
||||
print("关键词管理器数据库连接已关闭")
|
||||
if self.engine:
|
||||
self.engine.dispose()
|
||||
logger.info("关键词管理器数据库连接已关闭")
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
@@ -305,16 +320,16 @@ if __name__ == "__main__":
|
||||
with KeywordManager() as km:
|
||||
# 测试获取关键词
|
||||
keywords = km.get_latest_keywords(max_keywords=20)
|
||||
print(f"获取到的关键词: {keywords}")
|
||||
logger.info(f"获取到的关键词: {keywords}")
|
||||
|
||||
# 测试平台分配
|
||||
platforms = ['xhs', 'dy', 'bili']
|
||||
distribution = km.distribute_keywords_by_platform(keywords, platforms)
|
||||
for platform, kws in distribution.items():
|
||||
print(f"{platform}: {kws}")
|
||||
logger.info(f"{platform}: {kws}")
|
||||
|
||||
# 测试爬取摘要
|
||||
summary = km.get_crawling_summary()
|
||||
print(f"爬取摘要: {summary}")
|
||||
logger.info(f"爬取摘要: {summary}")
|
||||
|
||||
print("关键词管理器测试完成!")
|
||||
logger.info("关键词管理器测试完成!")
|
||||
|
||||
@@ -13,6 +13,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
import json
|
||||
from loguru import logger
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -36,11 +37,15 @@ class PlatformCrawler:
|
||||
if not self.mediacrawler_path.exists():
|
||||
raise FileNotFoundError(f"MediaCrawler目录不存在: {self.mediacrawler_path}")
|
||||
|
||||
print(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}")
|
||||
logger.info(f"初始化平台爬虫管理器,MediaCrawler路径: {self.mediacrawler_path}")
|
||||
|
||||
def configure_mediacrawler_db(self):
|
||||
"""配置MediaCrawler使用我们的MySQL数据库"""
|
||||
"""配置MediaCrawler使用我们的数据库(MySQL或PostgreSQL)"""
|
||||
try:
|
||||
# 判断数据库类型
|
||||
db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
|
||||
is_postgresql = db_dialect in ("postgresql", "postgres")
|
||||
|
||||
# 修改MediaCrawler的数据库配置
|
||||
db_config_path = self.mediacrawler_path / "config" / "db_config.py"
|
||||
|
||||
@@ -48,7 +53,14 @@ class PlatformCrawler:
|
||||
with open(db_config_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
# 替换数据库配置
|
||||
# PostgreSQL配置值:如果使用PostgreSQL则使用MindSpider配置,否则使用默认值或环境变量
|
||||
pg_password = config.settings.DB_PASSWORD if is_postgresql else "bettafish"
|
||||
pg_user = config.settings.DB_USER if is_postgresql else "bettafish"
|
||||
pg_host = config.settings.DB_HOST if is_postgresql else "127.0.0.1"
|
||||
pg_port = config.settings.DB_PORT if is_postgresql else 5432
|
||||
pg_db_name = config.settings.DB_NAME if is_postgresql else "bettafish"
|
||||
|
||||
# 替换数据库配置 - 使用MindSpider的数据库配置
|
||||
new_config = f'''# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
@@ -63,11 +75,19 @@ class PlatformCrawler:
|
||||
import os
|
||||
|
||||
# mysql config - 使用MindSpider的数据库配置
|
||||
MYSQL_DB_PWD = "{config.DB_PASSWORD}"
|
||||
MYSQL_DB_USER = "{config.DB_USER}"
|
||||
MYSQL_DB_HOST = "{config.DB_HOST}"
|
||||
MYSQL_DB_PORT = {config.DB_PORT}
|
||||
MYSQL_DB_NAME = "{config.DB_NAME}"
|
||||
MYSQL_DB_PWD = "{config.settings.DB_PASSWORD}"
|
||||
MYSQL_DB_USER = "{config.settings.DB_USER}"
|
||||
MYSQL_DB_HOST = "{config.settings.DB_HOST}"
|
||||
MYSQL_DB_PORT = {config.settings.DB_PORT}
|
||||
MYSQL_DB_NAME = "{config.settings.DB_NAME}"
|
||||
|
||||
mysql_db_config = {{
|
||||
"user": MYSQL_DB_USER,
|
||||
"password": MYSQL_DB_PWD,
|
||||
"host": MYSQL_DB_HOST,
|
||||
"port": MYSQL_DB_PORT,
|
||||
"db_name": MYSQL_DB_NAME,
|
||||
}}
|
||||
|
||||
|
||||
# redis config
|
||||
@@ -81,17 +101,39 @@ CACHE_TYPE_REDIS = "redis"
|
||||
CACHE_TYPE_MEMORY = "memory"
|
||||
|
||||
# sqlite config
|
||||
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")'''
|
||||
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
|
||||
|
||||
sqlite_db_config = {{
|
||||
"db_path": SQLITE_DB_PATH
|
||||
}}
|
||||
|
||||
# postgresql config - 使用MindSpider的数据库配置(如果DB_DIALECT是postgresql)或环境变量
|
||||
POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "{pg_password}")
|
||||
POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "{pg_user}")
|
||||
POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "{pg_host}")
|
||||
POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "{pg_port}")
|
||||
POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "{pg_db_name}")
|
||||
|
||||
postgresql_db_config = {{
|
||||
"user": POSTGRESQL_DB_USER,
|
||||
"password": POSTGRESQL_DB_PWD,
|
||||
"host": POSTGRESQL_DB_HOST,
|
||||
"port": POSTGRESQL_DB_PORT,
|
||||
"db_name": POSTGRESQL_DB_NAME,
|
||||
}}
|
||||
|
||||
'''
|
||||
|
||||
# 写入新配置
|
||||
with open(db_config_path, 'w', encoding='utf-8') as f:
|
||||
f.write(new_config)
|
||||
|
||||
print("已配置MediaCrawler使用MindSpider数据库")
|
||||
db_type = "PostgreSQL" if is_postgresql else "MySQL"
|
||||
logger.info(f"已配置MediaCrawler使用MindSpider {db_type}数据库")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"配置MediaCrawler数据库失败: {e}")
|
||||
logger.exception(f"配置MediaCrawler数据库失败: {e}")
|
||||
return False
|
||||
|
||||
def create_base_config(self, platform: str, keywords: List[str],
|
||||
@@ -109,6 +151,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
是否配置成功
|
||||
"""
|
||||
try:
|
||||
# 判断数据库类型,确定 SAVE_DATA_OPTION
|
||||
db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
|
||||
is_postgresql = db_dialect in ("postgresql", "postgres")
|
||||
save_data_option = "postgresql" if is_postgresql else "db"
|
||||
|
||||
base_config_path = self.mediacrawler_path / "config" / "base_config.py"
|
||||
|
||||
# 将关键词列表转换为逗号分隔的字符串
|
||||
@@ -130,7 +177,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
elif line.startswith('CRAWLER_TYPE = '):
|
||||
new_lines.append(f'CRAWLER_TYPE = "{crawler_type}" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)')
|
||||
elif line.startswith('SAVE_DATA_OPTION = '):
|
||||
new_lines.append('SAVE_DATA_OPTION = "db" # csv or db or json or sqlite')
|
||||
new_lines.append(f'SAVE_DATA_OPTION = "{save_data_option}" # csv or db or json or sqlite or postgresql')
|
||||
elif line.startswith('CRAWLER_MAX_NOTES_COUNT = '):
|
||||
new_lines.append(f'CRAWLER_MAX_NOTES_COUNT = {max_notes}')
|
||||
elif line.startswith('ENABLE_GET_COMMENTS = '):
|
||||
@@ -146,11 +193,11 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
with open(base_config_path, 'w', encoding='utf-8') as f:
|
||||
f.write('\n'.join(new_lines))
|
||||
|
||||
print(f"已配置 {platform} 平台,关键词数量: {len(keywords)}")
|
||||
logger.info(f"已配置 {platform} 平台,爬取类型: {crawler_type},关键词数量: {len(keywords)},最大爬取数量: {max_notes},保存数据方式: {save_data_option}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"创建基础配置失败: {e}")
|
||||
logger.exception(f"创建基础配置失败: {e}")
|
||||
return False
|
||||
|
||||
def run_crawler(self, platform: str, keywords: List[str],
|
||||
@@ -173,8 +220,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
if not keywords:
|
||||
raise ValueError("关键词列表不能为空")
|
||||
|
||||
print(f"\n开始爬取平台: {platform}")
|
||||
print(f"关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)")
|
||||
start_message = f"\n开始爬取平台: {platform}"
|
||||
start_message += f"\n关键词: {keywords[:5]}{'...' if len(keywords) > 5 else ''} (共{len(keywords)}个)"
|
||||
logger.info(start_message)
|
||||
|
||||
start_time = datetime.now()
|
||||
|
||||
@@ -187,22 +235,27 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
if not self.create_base_config(platform, keywords, "search", max_notes):
|
||||
return {"success": False, "error": "基础配置创建失败"}
|
||||
|
||||
# 判断数据库类型,确定 save_data_option
|
||||
db_dialect = (config.settings.DB_DIALECT or "mysql").lower()
|
||||
is_postgresql = db_dialect in ("postgresql", "postgres")
|
||||
save_data_option = "postgresql" if is_postgresql else "db"
|
||||
|
||||
# 构建命令
|
||||
cmd = [
|
||||
sys.executable, "main.py",
|
||||
"--platform", platform,
|
||||
"--lt", login_type,
|
||||
"--type", "search",
|
||||
"--save_data_option", "db"
|
||||
"--save_data_option", save_data_option
|
||||
]
|
||||
|
||||
print(f"执行命令: {' '.join(cmd)}")
|
||||
logger.info(f"执行命令: {' '.join(cmd)}")
|
||||
|
||||
# 切换到MediaCrawler目录并执行
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
cwd=self.mediacrawler_path,
|
||||
timeout=1800 # 30分钟超时
|
||||
timeout=3600 # 60分钟超时
|
||||
)
|
||||
|
||||
end_time = datetime.now()
|
||||
@@ -226,17 +279,17 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
self.crawl_stats[platform] = crawl_stats
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒")
|
||||
logger.info(f"✅ {platform} 爬取完成,耗时: {duration:.1f}秒")
|
||||
else:
|
||||
print(f"❌ {platform} 爬取失败,返回码: {result.returncode}")
|
||||
logger.error(f"❌ {platform} 爬取失败,返回码: {result.returncode}")
|
||||
|
||||
return crawl_stats
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
print(f"❌ {platform} 爬取超时")
|
||||
logger.exception(f"❌ {platform} 爬取超时")
|
||||
return {"success": False, "error": "爬取超时", "platform": platform}
|
||||
except Exception as e:
|
||||
print(f"❌ {platform} 爬取异常: {e}")
|
||||
logger.exception(f"❌ {platform} 爬取异常: {e}")
|
||||
return {"success": False, "error": str(e), "platform": platform}
|
||||
|
||||
def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
|
||||
@@ -291,10 +344,14 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
Returns:
|
||||
总体爬取统计
|
||||
"""
|
||||
print(f"\n🚀 开始全平台关键词爬取")
|
||||
print(f" 关键词数量: {len(keywords)}")
|
||||
print(f" 平台数量: {len(platforms)}")
|
||||
print(f" 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}")
|
||||
|
||||
start_message = f"\n🚀 开始全平台关键词爬取"
|
||||
start_message += f"\n 关键词数量: {len(keywords)}"
|
||||
start_message += f"\n 平台数量: {len(platforms)}"
|
||||
start_message += f"\n 登录方式: {login_type}"
|
||||
start_message += f"\n 每个关键词在每个平台的最大爬取数量: {max_notes_per_keyword}"
|
||||
start_message += f"\n 总爬取任务: {len(keywords)} × {len(platforms)} = {len(keywords) * len(platforms)}"
|
||||
logger.info(start_message)
|
||||
|
||||
total_stats = {
|
||||
"total_keywords": len(keywords),
|
||||
@@ -319,8 +376,8 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
|
||||
# 对每个平台一次性爬取所有关键词
|
||||
for platform in platforms:
|
||||
print(f"\n📝 在 {platform} 平台爬取所有关键词")
|
||||
print(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}")
|
||||
logger.info(f"\n📝 在 {platform} 平台爬取所有关键词")
|
||||
logger.info(f" 关键词: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}")
|
||||
|
||||
try:
|
||||
# 一次性传递所有关键词给平台
|
||||
@@ -344,7 +401,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
total_stats["keyword_results"][keyword] = {}
|
||||
total_stats["keyword_results"][keyword][platform] = result
|
||||
|
||||
print(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论")
|
||||
logger.info(f" ✅ 成功: {notes_count} 条内容, {comments_count} 条评论")
|
||||
else:
|
||||
total_stats["failed_tasks"] += len(keywords)
|
||||
total_stats["platform_summary"][platform]["failed_keywords"] = len(keywords)
|
||||
@@ -355,7 +412,7 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
total_stats["keyword_results"][keyword] = {}
|
||||
total_stats["keyword_results"][keyword][platform] = result
|
||||
|
||||
print(f" ❌ 失败: {result.get('error', '未知错误')}")
|
||||
logger.error(f" ❌ 失败: {result.get('error', '未知错误')}")
|
||||
|
||||
except Exception as e:
|
||||
total_stats["failed_tasks"] += len(keywords)
|
||||
@@ -368,22 +425,24 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
total_stats["keyword_results"][keyword] = {}
|
||||
total_stats["keyword_results"][keyword][platform] = error_result
|
||||
|
||||
print(f" ❌ 异常: {e}")
|
||||
logger.error(f" ❌ 异常: {e}")
|
||||
|
||||
# 打印详细统计
|
||||
print(f"\n📊 全平台关键词爬取完成!")
|
||||
print(f" 总任务: {total_stats['total_tasks']}")
|
||||
print(f" 成功: {total_stats['successful_tasks']}")
|
||||
print(f" 失败: {total_stats['failed_tasks']}")
|
||||
print(f" 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%")
|
||||
print(f" 总内容: {total_stats['total_notes']} 条")
|
||||
print(f" 总评论: {total_stats['total_comments']} 条")
|
||||
finish_message = f"\n📊 全平台关键词爬取完成!"
|
||||
finish_message += f"\n 总任务: {total_stats['total_tasks']}"
|
||||
finish_message += f"\n 成功: {total_stats['successful_tasks']}"
|
||||
finish_message += f"\n 失败: {total_stats['failed_tasks']}"
|
||||
finish_message += f"\n 成功率: {total_stats['successful_tasks']/total_stats['total_tasks']*100:.1f}%"
|
||||
finish_message += f"\n 总内容: {total_stats['total_notes']} 条"
|
||||
finish_message += f"\n 总评论: {total_stats['total_comments']} 条"
|
||||
logger.info(finish_message)
|
||||
|
||||
print(f"\n📈 各平台统计:")
|
||||
platform_summary_message = f"\n📈 各平台统计:"
|
||||
for platform, stats in total_stats["platform_summary"].items():
|
||||
success_rate = stats["successful_keywords"] / len(keywords) * 100 if keywords else 0
|
||||
print(f" {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "
|
||||
f"{stats['total_notes']} 条内容")
|
||||
platform_summary_message += f"\n {platform}: {stats['successful_keywords']}/{len(keywords)} 关键词成功 ({success_rate:.1f}%), "
|
||||
platform_summary_message += f"{stats['total_notes']} 条内容"
|
||||
logger.info(platform_summary_message)
|
||||
|
||||
return total_stats
|
||||
|
||||
@@ -403,9 +462,9 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schem
|
||||
try:
|
||||
with open(log_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(self.crawl_stats, f, ensure_ascii=False, indent=2)
|
||||
print(f"爬取日志已保存到: {log_path}")
|
||||
logger.info(f"爬取日志已保存到: {log_path}")
|
||||
except Exception as e:
|
||||
print(f"保存爬取日志失败: {e}")
|
||||
logger.exception(f"保存爬取日志失败: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试平台爬虫管理器
|
||||
@@ -415,5 +474,5 @@ if __name__ == "__main__":
|
||||
test_keywords = ["科技", "AI", "编程"]
|
||||
result = crawler.run_crawler("xhs", test_keywords, max_notes=5)
|
||||
|
||||
print(f"测试结果: {result}")
|
||||
print("平台爬虫管理器测试完成!")
|
||||
logger.info(f"测试结果: {result}")
|
||||
logger.info("平台爬虫管理器测试完成!")
|
||||
|
||||
Reference in New Issue
Block a user