1. 同步MediaCrawler为最新版本

3. 修复数据库 NOT NULL 约束错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 3ad807778c
commit 96c7d2d3b7
11 changed files with 737 additions and 574 deletions
+47 -54
View File
@@ -11,6 +11,7 @@ import argparse
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -21,8 +22,8 @@ try:
from BroadTopicExtraction.topic_extractor import TopicExtractor
from BroadTopicExtraction.database_manager import DatabaseManager
except ImportError as e:
print(f"导入模块失败: {e}")
print("请确保在项目根目录运行,并且已安装所有依赖")
logger.exception(f"导入模块失败: {e}")
logger.error("请确保在项目根目录运行,并且已安装所有依赖")
sys.exit(1)
class BroadTopicExtraction:
@@ -34,7 +35,7 @@ class BroadTopicExtraction:
self.topic_extractor = TopicExtractor()
self.db_manager = DatabaseManager()
print("BroadTopicExtraction 初始化完成")
logger.info("BroadTopicExtraction 初始化完成")
def close(self):
"""关闭资源"""
@@ -68,21 +69,22 @@ class BroadTopicExtraction:
Returns:
包含完整提取结果的字典
"""
print("\n" + "=" * 80)
print("MindSpider AI爬虫 - 每日话题提取")
print("=" * 80)
print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"目标日期: {date.today()}")
extraction_result_message = ""
extraction_result_message += "\nMindSpider AI爬虫 - 每日话题提取\n"
extraction_result_message += f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
extraction_result_message += f"目标日期: {date.today()}\n"
if news_sources:
print(f"指定平台: {len(news_sources)}")
extraction_result_message += f"指定平台: {len(news_sources)}\n"
for source in news_sources:
source_name = SOURCE_NAMES.get(source, source)
print(f" - {source_name}")
extraction_result_message += f" - {source_name}\n"
else:
print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
extraction_result_message += f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台\n"
print(f"关键词数: 最多 {max_keywords}")
extraction_result_message += f"关键词数: 最多 {max_keywords}\n"
logger.info(extraction_result_message)
extraction_result = {
'success': False,
@@ -96,7 +98,7 @@ class BroadTopicExtraction:
try:
# 步骤1: 收集新闻
print("\n【步骤1】收集热点新闻...")
logger.info("【步骤1】收集热点新闻...")
news_result = await self.news_collector.collect_and_save_news(
sources=news_sources
)
@@ -112,7 +114,7 @@ class BroadTopicExtraction:
raise Exception("新闻收集失败或没有获取到新闻")
# 步骤2: 提取关键词和生成总结
print("\n【步骤2】提取关键词和生成总结...")
logger.info("【步骤2】提取关键词和生成总结...")
keywords, summary = self.topic_extractor.extract_keywords_and_summary(
news_result['news_list'],
max_keywords=max_keywords
@@ -126,10 +128,10 @@ class BroadTopicExtraction:
}
if not keywords:
print("警告: 没有提取到有效关键词")
logger.warning("警告: 没有提取到有效关键词")
# 步骤3: 保存到数据库
print("\n【步骤3】保存分析结果到数据库...")
logger.info("【步骤3】保存分析结果到数据库...")
save_success = self.db_manager.save_daily_topics(
keywords, summary, date.today()
)
@@ -141,56 +143,47 @@ class BroadTopicExtraction:
extraction_result['success'] = True
extraction_result['end_time'] = datetime.now().isoformat()
print("\n" + "=" * 80)
print("每日话题提取流程完成!")
print("=" * 80)
logger.info("每日话题提取流程完成!")
return extraction_result
except Exception as e:
print(f"\n话题提取流程失败: {e}")
logger.exception(f"话题提取流程失败: {e}")
extraction_result['error'] = str(e)
extraction_result['end_time'] = datetime.now().isoformat()
return extraction_result
def print_extraction_results(self, extraction_result: Dict):
"""打印提取结果"""
print("\n" + "=" * 80)
print("话题提取结果报告")
print("=" * 80)
if not extraction_result['success']:
print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
return
extraction_result_message = ""
# 新闻收集结果
news_data = extraction_result.get('news_collection', {})
print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
extraction_result_message += f"\n📰 新闻收集: {news_data.get('total_news', 0)} 条新闻\n"
extraction_result_message += f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}\n"
# 话题提取结果
topic_data = extraction_result.get('topic_extraction', {})
keywords = topic_data.get('keywords', [])
summary = topic_data.get('summary', '')
print(f"\n🔑 提取关键词: {len(keywords)}")
extraction_result_message += f"\n🔑 提取关键词: {len(keywords)}\n"
if keywords:
# 每行显示5个关键词
for i in range(0, len(keywords), 5):
keyword_group = keywords[i:i+5]
print(f" {', '.join(keyword_group)}")
extraction_result_message += f" {', '.join(keyword_group)}\n"
print(f"\n📝 新闻总结:")
print(f" {summary}")
extraction_result_message += f"\n📝 新闻总结:\n {summary}\n"
# 数据库保存结果
db_data = extraction_result.get('database_save', {})
if db_data.get('success'):
print(f"\n💾 数据库保存: 成功")
extraction_result_message += f"\n💾 数据库保存: 成功\n"
else:
print(f"\n💾 数据库保存: 失败")
extraction_result_message += f"\n💾 数据库保存: 失败\n"
print("\n" + "=" * 80)
logger.info(extraction_result_message)
def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
"""
@@ -207,7 +200,7 @@ class BroadTopicExtraction:
topics_data = self.db_manager.get_daily_topics(extract_date)
if not topics_data:
print(f"没有找到 {extract_date or date.today()} 的话题数据")
logger.info(f"没有找到 {extract_date or date.today()} 的话题数据")
return []
keywords = topics_data['keywords']
@@ -215,11 +208,11 @@ class BroadTopicExtraction:
# 生成搜索关键词
search_keywords = self.topic_extractor.get_search_keywords(keywords)
print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
logger.info(f"准备了 {len(search_keywords)} 个关键词用于爬取")
return search_keywords
except Exception as e:
print(f"获取爬取关键词失败: {e}")
logger.error(f"获取爬取关键词失败: {e}")
return []
def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
@@ -227,7 +220,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_daily_topics(target_date)
except Exception as e:
print(f"获取每日分析失败: {e}")
logger.error(f"获取每日分析失败: {e}")
return None
def get_recent_analysis(self, days: int = 7) -> List[Dict]:
@@ -235,7 +228,7 @@ class BroadTopicExtraction:
try:
return self.db_manager.get_recent_topics(days)
except Exception as e:
print(f"获取最近分析失败: {e}")
logger.error(f"获取最近分析失败: {e}")
return []
# ==================== 命令行工具 ====================
@@ -260,17 +253,17 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
news_data = result.get('news_collection', {})
topic_data = result.get('topic_extraction', {})
print(f"✅ 话题提取成功完成!")
print(f" 收集新闻: {news_data.get('total_news', 0)}")
print(f" 提取关键词: {len(topic_data.get('keywords', []))}")
print(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
logger.info(f"✅ 话题提取成功完成!")
logger.info(f" 收集新闻: {news_data.get('total_news', 0)}")
logger.info(f" 提取关键词: {len(topic_data.get('keywords', []))}")
logger.info(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
# 获取爬取关键词
crawling_keywords = extractor.get_keywords_for_crawling()
if crawling_keywords:
print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
print(f" {', '.join(crawling_keywords)}")
logger.info(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
logger.info(f" {', '.join(crawling_keywords)}")
# 保存关键词到文件
keywords_file = project_root / "data" / "daily_keywords.txt"
@@ -279,16 +272,16 @@ async def run_extraction_command(sources=None, keywords_count=100, show_details=
with open(keywords_file, 'w', encoding='utf-8') as f:
f.write('\n'.join(crawling_keywords))
print(f" 关键词已保存到: {keywords_file}")
logger.info(f" 关键词已保存到: {keywords_file}")
return True
else:
print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
logger.error(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
return False
except Exception as e:
print(f"❌ 执行过程中发生错误: {e}")
logger.error(f"❌ 执行过程中发生错误: {e}")
return False
def main():
@@ -304,14 +297,14 @@ def main():
# 显示支持的新闻源
if args.list_sources:
print("支持的新闻源平台:")
logger.info("支持的新闻源平台:")
for source, name in SOURCE_NAMES.items():
print(f" {source:<25} {name}")
logger.info(f" {source:<25} {name}")
return
# 验证参数
if args.keywords < 1 or args.keywords > 200:
print("关键词数量应在1-200之间")
logger.error("关键词数量应在1-200之间")
sys.exit(1)
# 运行提取
@@ -325,7 +318,7 @@ def main():
sys.exit(0 if success else 1)
except KeyboardInterrupt:
print("\n用户中断操作")
logger.info("用户中断操作")
sys.exit(1)
if __name__ == "__main__":