Complete the part of the system crawler2.

This commit is contained in:
戒酒的李白
2025-08-20 22:01:15 +08:00
parent 047bbf8c26
commit 15b3a3343b
173 changed files with 34543 additions and 0 deletions
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
BroadTopicExtraction模块 - 数据库管理器
只负责新闻数据和话题分析的存储和查询
"""
import sys
import json
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
import pymysql
from pymysql.cursors import DictCursor
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
try:
import config
except ImportError:
raise ImportError("无法导入config.py配置文件")
class DatabaseManager:
"""数据库管理器"""
def __init__(self):
"""初始化数据库管理器"""
self.connection = None
self.connect()
def connect(self):
"""连接数据库"""
try:
self.connection = pymysql.connect(
host=config.DB_HOST,
port=config.DB_PORT,
user=config.DB_USER,
password=config.DB_PASSWORD,
database=config.DB_NAME,
charset=config.DB_CHARSET,
autocommit=True,
cursorclass=DictCursor
)
print(f"成功连接到数据库: {config.DB_NAME}")
except Exception as e:
print(f"数据库连接失败: {e}")
raise
def close(self):
"""关闭数据库连接"""
if self.connection:
self.connection.close()
print("数据库连接已关闭")
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
# ==================== 新闻数据操作 ====================
def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int:
"""
保存每日新闻数据,如果当天已有数据则覆盖
Args:
news_data: 新闻数据列表
crawl_date: 爬取日期,默认为今天
Returns:
保存的新闻数量
"""
if not crawl_date:
crawl_date = date.today()
current_timestamp = int(datetime.now().timestamp())
try:
cursor = self.connection.cursor()
# 先删除当天所有的新闻记录(覆盖模式)
delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
deleted_count = cursor.execute(delete_query, (crawl_date,))
if deleted_count > 0:
print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
# 批量插入新记录
saved_count = 0
for news_item in news_data:
try:
# 简化的新闻ID生成
news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
# 插入新记录
insert_query = """
INSERT INTO daily_news (
news_id, source_platform, title, url, crawl_date,
rank_position, add_ts
) VALUES (%s, %s, %s, %s, %s, %s, %s)
"""
cursor.execute(insert_query, (
news_id,
news_item.get('source', 'unknown'),
news_item.get('title', ''),
news_item.get('url', ''),
crawl_date,
news_item.get('rank', None),
current_timestamp
))
saved_count += 1
except Exception as e:
print(f"保存单条新闻失败: {e}")
continue
print(f"成功保存 {saved_count} 条新闻记录")
return saved_count
except Exception as e:
print(f"保存新闻数据失败: {e}")
return 0
def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
"""
获取每日新闻数据
Args:
crawl_date: 爬取日期,默认为今天
Returns:
新闻列表
"""
if not crawl_date:
crawl_date = date.today()
query = """
SELECT * FROM daily_news
WHERE crawl_date = %s
ORDER BY rank_position ASC
"""
cursor = self.connection.cursor()
cursor.execute(query, (crawl_date,))
return cursor.fetchall()
# ==================== 话题数据操作 ====================
def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool:
"""
保存每日话题分析
Args:
keywords: 话题关键词列表
summary: 新闻分析总结
extract_date: 提取日期,默认为今天
Returns:
是否保存成功
"""
if not extract_date:
extract_date = date.today()
current_timestamp = int(datetime.now().timestamp())
try:
cursor = self.connection.cursor()
# 检查今天是否已有记录
check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
cursor.execute(check_query, (extract_date,))
existing = cursor.fetchone()
keywords_json = json.dumps(keywords, ensure_ascii=False)
if existing:
# 更新现有记录
update_query = """
UPDATE daily_topics
SET keywords = %s, summary = %s, add_ts = %s
WHERE extract_date = %s
"""
cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
print(f"更新了 {extract_date} 的话题分析")
else:
# 插入新记录
insert_query = """
INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
VALUES (%s, %s, %s, %s)
"""
cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
print(f"保存了 {extract_date} 的话题分析")
return True
except Exception as e:
print(f"保存话题分析失败: {e}")
return False
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
"""
获取每日话题分析
Args:
extract_date: 提取日期,默认为今天
Returns:
话题分析数据,如果不存在返回None
"""
if not extract_date:
extract_date = date.today()
try:
cursor = self.connection.cursor()
query = "SELECT * FROM daily_topics WHERE extract_date = %s"
cursor.execute(query, (extract_date,))
result = cursor.fetchone()
if result:
# 解析关键词JSON
result['keywords'] = json.loads(result['keywords'])
return result
else:
return None
except Exception as e:
print(f"获取话题分析失败: {e}")
return None
def get_recent_topics(self, days: int = 7) -> List[Dict]:
"""
获取最近几天的话题分析
Args:
days: 天数
Returns:
话题分析列表
"""
try:
cursor = self.connection.cursor()
query = """
SELECT * FROM daily_topics
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
ORDER BY extract_date DESC
"""
cursor.execute(query, (days,))
results = cursor.fetchall()
# 解析每个结果的关键词JSON
for result in results:
result['keywords'] = json.loads(result['keywords'])
return results
except Exception as e:
print(f"获取最近话题分析失败: {e}")
return []
# ==================== 统计查询 ====================
def get_summary_stats(self, days: int = 7) -> Dict:
"""获取统计摘要"""
try:
cursor = self.connection.cursor()
# 新闻统计
news_query = """
SELECT
crawl_date,
COUNT(*) as news_count,
COUNT(DISTINCT source_platform) as platforms_count
FROM daily_news
WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
GROUP BY crawl_date
ORDER BY crawl_date DESC
"""
cursor.execute(news_query, (days,))
news_stats = cursor.fetchall()
# 话题统计
topics_query = """
SELECT
extract_date,
keywords,
CHAR_LENGTH(summary) as summary_length
FROM daily_topics
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
ORDER BY extract_date DESC
"""
cursor.execute(topics_query, (days,))
topics_stats = cursor.fetchall()
return {
'news_stats': news_stats,
'topics_stats': topics_stats
}
except Exception as e:
print(f"获取统计摘要失败: {e}")
return {'news_stats': [], 'topics_stats': []}
if __name__ == "__main__":
# 测试数据库管理器
with DatabaseManager() as db:
# 测试获取新闻
news = db.get_daily_news()
print(f"今日新闻数量: {len(news)}")
# 测试获取话题
topics = db.get_daily_topics()
if topics:
print(f"今日话题关键词: {topics['keywords']}")
else:
print("今日暂无话题分析")
print("简化数据库管理器测试完成!")
@@ -0,0 +1,300 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
BroadTopicExtraction模块 - 新闻获取和收集
整合新闻API调用和数据库存储功能
"""
import sys
import asyncio
import httpx
import json
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
try:
from BroadTopicExtraction.database_manager import DatabaseManager
except ImportError as e:
raise ImportError(f"导入模块失败: {e}")
# 新闻API基础URL
BASE_URL = "https://newsnow.busiyi.world"
# 新闻源中文名称映射
SOURCE_NAMES = {
"weibo": "微博热搜",
"zhihu": "知乎热榜",
"bilibili-hot-search": "B站热搜",
"toutiao": "今日头条",
"douyin": "抖音热榜",
"github-trending-today": "GitHub趋势",
"coolapk": "酷安热榜",
"tieba": "百度贴吧",
"wallstreetcn": "华尔街见闻",
"thepaper": "澎湃新闻",
"cls-hot": "财联社",
"xueqiu": "雪球热榜",
"kuaishou": "快手热榜"
}
class NewsCollector:
"""新闻收集器 - 整合API调用和数据库存储"""
def __init__(self):
"""初始化新闻收集器"""
self.db_manager = DatabaseManager()
self.supported_sources = list(SOURCE_NAMES.keys())
def close(self):
"""关闭资源"""
if self.db_manager:
self.db_manager.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
self.close()
# ==================== 新闻API调用 ====================
async def fetch_news(self, source: str) -> dict:
"""从指定源获取最新新闻"""
url = f"{BASE_URL}/api/s?id={source}&latest"
headers = {"Accept": "application/json"}
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.get(url, headers=headers)
response.raise_for_status()
# 解析JSON响应
data = json.loads(response.text)
return {
"source": source,
"status": "success",
"data": data,
"timestamp": datetime.now().isoformat()
}
except httpx.TimeoutException:
return {
"source": source,
"status": "timeout",
"error": "请求超时",
"timestamp": datetime.now().isoformat()
}
except httpx.HTTPStatusError as e:
return {
"source": source,
"status": "http_error",
"error": f"HTTP错误: {e.response.status_code}",
"timestamp": datetime.now().isoformat()
}
except Exception as e:
return {
"source": source,
"status": "error",
"error": f"未知错误: {str(e)}",
"timestamp": datetime.now().isoformat()
}
async def get_popular_news(self, sources: List[str] = None) -> List[dict]:
"""获取热门新闻"""
if sources is None:
sources = list(SOURCE_NAMES.keys())
print(f"正在获取 {len(sources)} 个新闻源的最新内容...")
print("=" * 80)
results = []
for source in sources:
source_name = SOURCE_NAMES.get(source, source)
print(f"正在获取 {source_name} 的新闻...")
result = await self.fetch_news(source)
results.append(result)
if result["status"] == "success":
data = result["data"]
if 'items' in data and isinstance(data['items'], list):
count = len(data['items'])
print(f"{source_name}: 获取成功,共 {count} 条新闻")
else:
print(f"{source_name}: 获取成功")
else:
print(f"{source_name}: {result.get('error', '获取失败')}")
# 避免请求过快
await asyncio.sleep(0.5)
return results
# ==================== 数据处理和存储 ====================
async def collect_and_save_news(self, sources: Optional[List[str]] = None) -> Dict:
"""
收集并保存每日热点新闻
Args:
sources: 指定的新闻源列表,None表示使用所有支持的源
Returns:
包含收集结果的字典
"""
print(f"开始收集每日热点新闻...")
print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# 选择新闻源
if sources is None:
# 使用所有支持的新闻源
sources = list(SOURCE_NAMES.keys())
print(f"将从 {len(sources)} 个新闻源收集数据:")
for source in sources:
source_name = SOURCE_NAMES.get(source, source)
print(f" - {source_name}")
try:
# 获取新闻数据
results = await self.get_popular_news(sources)
# 处理结果
processed_data = self._process_news_results(results)
# 保存到数据库(覆盖模式)
if processed_data['news_list']:
saved_count = self.db_manager.save_daily_news(
processed_data['news_list'],
date.today()
)
processed_data['saved_count'] = saved_count
# 打印统计信息
self._print_collection_summary(processed_data)
return processed_data
except Exception as e:
print(f"收集新闻失败: {e}")
return {
'success': False,
'error': str(e),
'news_list': [],
'total_news': 0
}
def _process_news_results(self, results: List[Dict]) -> Dict:
"""处理新闻获取结果"""
news_list = []
successful_sources = 0
total_news = 0
for result in results:
source = result['source']
status = result['status']
if status == 'success':
successful_sources += 1
data = result['data']
if 'items' in data and isinstance(data['items'], list):
source_news_count = len(data['items'])
total_news += source_news_count
# 处理该源的新闻
for i, item in enumerate(data['items'], 1):
processed_news = self._process_news_item(item, source, i)
if processed_news:
news_list.append(processed_news)
return {
'success': True,
'news_list': news_list,
'successful_sources': successful_sources,
'total_sources': len(results),
'total_news': total_news,
'collection_time': datetime.now().isoformat()
}
def _process_news_item(self, item: Dict, source: str, rank: int) -> Optional[Dict]:
"""处理单条新闻"""
try:
if isinstance(item, dict):
title = item.get('title', '无标题').strip()
url = item.get('url', '')
# 生成新闻ID
news_id = f"{source}_{item.get('id', f'rank_{rank}')}"
return {
'id': news_id,
'title': title,
'url': url,
'source': source,
'rank': rank
}
else:
# 处理字符串类型的新闻
title = str(item)[:100] if len(str(item)) > 100 else str(item)
return {
'id': f"{source}_rank_{rank}",
'title': title,
'url': '',
'source': source,
'rank': rank
}
except Exception as e:
print(f"处理新闻项失败: {e}")
return None
def _print_collection_summary(self, data: Dict):
"""打印收集摘要"""
print("\n" + "=" * 50)
print("新闻收集摘要")
print("=" * 50)
print(f"总新闻源: {data['total_sources']}")
print(f"成功源数: {data['successful_sources']}")
print(f"总新闻数: {data['total_news']}")
if 'saved_count' in data:
print(f"已保存数: {data['saved_count']}")
print("=" * 50)
def get_today_news(self) -> List[Dict]:
"""获取今天的新闻"""
try:
return self.db_manager.get_daily_news(date.today())
except Exception as e:
print(f"获取今日新闻失败: {e}")
return []
async def main():
"""测试新闻收集器"""
print("测试新闻收集器...")
async with NewsCollector() as collector:
# 收集新闻
result = await collector.collect_and_save_news(
sources=["weibo", "zhihu"] # 测试用,只使用两个源
)
if result['success']:
print(f"收集成功!共获取 {result['total_news']} 条新闻")
else:
print(f"收集失败: {result.get('error', '未知错误')}")
if __name__ == "__main__":
asyncio.run(main())
+332
View File
@@ -0,0 +1,332 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
BroadTopicExtraction模块 - 主程序
整合话题提取的完整工作流程和命令行工具
"""
import sys
import asyncio
import argparse
from datetime import datetime, date
from pathlib import Path
from typing import List, Dict, Optional
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
try:
from BroadTopicExtraction.get_today_news import NewsCollector, SOURCE_NAMES
from BroadTopicExtraction.topic_extractor import TopicExtractor
from BroadTopicExtraction.database_manager import DatabaseManager
except ImportError as e:
print(f"导入模块失败: {e}")
print("请确保在项目根目录运行,并且已安装所有依赖")
sys.exit(1)
class BroadTopicExtraction:
"""BroadTopicExtraction主要工作流程"""
def __init__(self):
"""初始化"""
self.news_collector = NewsCollector()
self.topic_extractor = TopicExtractor()
self.db_manager = DatabaseManager()
print("BroadTopicExtraction 初始化完成")
def close(self):
"""关闭资源"""
if self.news_collector:
self.news_collector.close()
if self.db_manager:
self.db_manager.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
self.close()
async def run_daily_extraction(self,
news_sources: Optional[List[str]] = None,
max_keywords: int = 100) -> Dict:
"""
运行每日话题提取流程
Args:
news_sources: 新闻源列表,None表示使用所有支持的源
max_keywords: 最大关键词数量
Returns:
包含完整提取结果的字典
"""
print("\n" + "=" * 80)
print("MindSpider AI爬虫 - 每日话题提取")
print("=" * 80)
print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"目标日期: {date.today()}")
if news_sources:
print(f"指定平台: {len(news_sources)}")
for source in news_sources:
source_name = SOURCE_NAMES.get(source, source)
print(f" - {source_name}")
else:
print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
print(f"关键词数: 最多 {max_keywords}")
extraction_result = {
'success': False,
'extraction_date': date.today().isoformat(),
'start_time': datetime.now().isoformat(),
'news_collection': {},
'topic_extraction': {},
'database_save': {},
'error': None
}
try:
# 步骤1: 收集新闻
print("\n【步骤1】收集热点新闻...")
news_result = await self.news_collector.collect_and_save_news(
sources=news_sources
)
extraction_result['news_collection'] = {
'success': news_result['success'],
'total_news': news_result.get('total_news', 0),
'successful_sources': news_result.get('successful_sources', 0),
'total_sources': news_result.get('total_sources', 0)
}
if not news_result['success'] or not news_result['news_list']:
raise Exception("新闻收集失败或没有获取到新闻")
# 步骤2: 提取关键词和生成总结
print("\n【步骤2】提取关键词和生成总结...")
keywords, summary = self.topic_extractor.extract_keywords_and_summary(
news_result['news_list'],
max_keywords=max_keywords
)
extraction_result['topic_extraction'] = {
'success': len(keywords) > 0,
'keywords_count': len(keywords),
'keywords': keywords,
'summary': summary
}
if not keywords:
print("警告: 没有提取到有效关键词")
# 步骤3: 保存到数据库
print("\n【步骤3】保存分析结果到数据库...")
save_success = self.db_manager.save_daily_topics(
keywords, summary, date.today()
)
extraction_result['database_save'] = {
'success': save_success
}
extraction_result['success'] = True
extraction_result['end_time'] = datetime.now().isoformat()
print("\n" + "=" * 80)
print("每日话题提取流程完成!")
print("=" * 80)
return extraction_result
except Exception as e:
print(f"\n话题提取流程失败: {e}")
extraction_result['error'] = str(e)
extraction_result['end_time'] = datetime.now().isoformat()
return extraction_result
def print_extraction_results(self, extraction_result: Dict):
"""打印提取结果"""
print("\n" + "=" * 80)
print("话题提取结果报告")
print("=" * 80)
if not extraction_result['success']:
print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
return
# 新闻收集结果
news_data = extraction_result.get('news_collection', {})
print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
# 话题提取结果
topic_data = extraction_result.get('topic_extraction', {})
keywords = topic_data.get('keywords', [])
summary = topic_data.get('summary', '')
print(f"\n🔑 提取关键词: {len(keywords)}")
if keywords:
# 每行显示5个关键词
for i in range(0, len(keywords), 5):
keyword_group = keywords[i:i+5]
print(f" {', '.join(keyword_group)}")
print(f"\n📝 新闻总结:")
print(f" {summary}")
# 数据库保存结果
db_data = extraction_result.get('database_save', {})
if db_data.get('success'):
print(f"\n💾 数据库保存: 成功")
else:
print(f"\n💾 数据库保存: 失败")
print("\n" + "=" * 80)
def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
"""
获取用于爬取的关键词列表
Args:
extract_date: 提取日期,默认为今天
Returns:
关键词列表
"""
try:
# 从数据库获取话题分析
topics_data = self.db_manager.get_daily_topics(extract_date)
if not topics_data:
print(f"没有找到 {extract_date or date.today()} 的话题数据")
return []
keywords = topics_data['keywords']
# 生成搜索关键词
search_keywords = self.topic_extractor.get_search_keywords(keywords)
print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
return search_keywords
except Exception as e:
print(f"获取爬取关键词失败: {e}")
return []
def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
"""获取指定日期的分析结果"""
try:
return self.db_manager.get_daily_topics(target_date)
except Exception as e:
print(f"获取每日分析失败: {e}")
return None
def get_recent_analysis(self, days: int = 7) -> List[Dict]:
"""获取最近几天的分析结果"""
try:
return self.db_manager.get_recent_topics(days)
except Exception as e:
print(f"获取最近分析失败: {e}")
return []
# ==================== 命令行工具 ====================
async def run_extraction_command(sources=None, keywords_count=100, show_details=True):
"""运行话题提取命令"""
try:
async with BroadTopicExtraction() as extractor:
# 运行话题提取
result = await extractor.run_daily_extraction(
news_sources=sources,
max_keywords=keywords_count
)
if result['success']:
if show_details:
# 显示详细结果
extractor.print_extraction_results(result)
else:
# 只显示简要结果
news_data = result.get('news_collection', {})
topic_data = result.get('topic_extraction', {})
print(f"✅ 话题提取成功完成!")
print(f" 收集新闻: {news_data.get('total_news', 0)}")
print(f" 提取关键词: {len(topic_data.get('keywords', []))}")
print(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
# 获取爬取关键词
crawling_keywords = extractor.get_keywords_for_crawling()
if crawling_keywords:
print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
print(f" {', '.join(crawling_keywords)}")
# 保存关键词到文件
keywords_file = project_root / "data" / "daily_keywords.txt"
keywords_file.parent.mkdir(exist_ok=True)
with open(keywords_file, 'w', encoding='utf-8') as f:
f.write('\n'.join(crawling_keywords))
print(f" 关键词已保存到: {keywords_file}")
return True
else:
print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
return False
except Exception as e:
print(f"❌ 执行过程中发生错误: {e}")
return False
def main():
"""主函数"""
parser = argparse.ArgumentParser(description="MindSpider每日话题提取工具")
parser.add_argument("--sources", nargs="+", help="指定新闻源平台",
choices=list(SOURCE_NAMES.keys()))
parser.add_argument("--keywords", type=int, default=100, help="最大关键词数量 (默认100)")
parser.add_argument("--quiet", action="store_true", help="简化输出模式")
parser.add_argument("--list-sources", action="store_true", help="显示支持的新闻源")
args = parser.parse_args()
# 显示支持的新闻源
if args.list_sources:
print("支持的新闻源平台:")
for source, name in SOURCE_NAMES.items():
print(f" {source:<25} {name}")
return
# 验证参数
if args.keywords < 1 or args.keywords > 200:
print("关键词数量应在1-200之间")
sys.exit(1)
# 运行提取
try:
success = asyncio.run(run_extraction_command(
sources=args.sources,
keywords_count=args.keywords,
show_details=not args.quiet
))
sys.exit(0 if success else 1)
except KeyboardInterrupt:
print("\n用户中断操作")
sys.exit(1)
if __name__ == "__main__":
main()
@@ -0,0 +1,288 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
BroadTopicExtraction模块 - 话题提取器
基于DeepSeek直接提取关键词和生成新闻总结
"""
import sys
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple
from openai import OpenAI
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
try:
import config
except ImportError:
raise ImportError("无法导入config.py配置文件")
class TopicExtractor:
"""话题提取器"""
def __init__(self):
"""初始化话题提取器"""
self.client = OpenAI(
api_key=config.DEEPSEEK_API_KEY,
base_url="https://api.deepseek.com"
)
self.model = "deepseek-chat"
def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
"""
从新闻列表中提取关键词和生成总结
Args:
news_list: 新闻列表
max_keywords: 最大关键词数量
Returns:
(关键词列表, 新闻分析总结)
"""
if not news_list:
return [], "今日暂无热点新闻"
# 构建新闻摘要文本
news_text = self._build_news_summary(news_list)
# 构建提示词
prompt = self._build_analysis_prompt(news_text, max_keywords)
try:
# 调用DeepSeek API
response = self.client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "你是一个专业的新闻分析师,擅长从热点新闻中提取关键词和撰写分析总结。"},
{"role": "user", "content": prompt}
],
max_tokens=1500,
temperature=0.3
)
# 解析返回结果
result_text = response.choices[0].message.content
keywords, summary = self._parse_analysis_result(result_text)
print(f"成功提取 {len(keywords)} 个关键词并生成新闻总结")
return keywords[:max_keywords], summary
except Exception as e:
print(f"话题提取失败: {e}")
# 返回简单的fallback结果
fallback_keywords = self._extract_simple_keywords(news_list)
fallback_summary = f"今日共收集到 {len(news_list)} 条热点新闻,涵盖多个平台的热门话题。"
return fallback_keywords[:max_keywords], fallback_summary
def _build_news_summary(self, news_list: List[Dict]) -> str:
"""构建新闻摘要文本"""
news_items = []
for i, news in enumerate(news_list, 1):
title = news.get('title', '无标题')
source = news.get('source_platform', news.get('source', '未知'))
# 清理标题中的特殊字符
title = re.sub(r'[#@]', '', title).strip()
news_items.append(f"{i}. 【{source}{title}")
return "\n".join(news_items)
def _build_analysis_prompt(self, news_text: str, max_keywords: int) -> str:
"""构建分析提示词"""
news_count = len(news_text.split('\n'))
prompt = f"""
请分析以下{news_count}条今日热点新闻,完成两个任务:
新闻列表:
{news_text}
任务1:提取关键词(最多{max_keywords}个)
- 提取能代表今日热点话题的关键词
- 关键词应该适合用于社交媒体平台搜索
- 优先选择热度高、讨论量大的话题
- 避免过于宽泛或过于具体的词汇
任务2:撰写新闻分析总结(150-300字)
- 简要概括今日热点新闻的主要内容
- 指出当前社会关注的重点话题方向
- 分析这些热点反映的社会现象或趋势
- 语言简洁明了,客观中性
请严格按照以下JSON格式输出:
```json
{{
"keywords": ["关键词1", "关键词2", "关键词3"],
"summary": "今日新闻分析总结内容..."
}}
```
请直接输出JSON格式的结果,不要包含其他文字说明。
"""
return prompt
def _parse_analysis_result(self, result_text: str) -> Tuple[List[str], str]:
"""解析分析结果"""
try:
# 尝试提取JSON部分
json_match = re.search(r'```json\s*(.*?)\s*```', result_text, re.DOTALL)
if json_match:
json_text = json_match.group(1)
else:
# 如果没有代码块,尝试直接解析
json_text = result_text.strip()
# 解析JSON
data = json.loads(json_text)
keywords = data.get('keywords', [])
summary = data.get('summary', '')
# 验证和清理关键词
clean_keywords = []
for keyword in keywords:
keyword = str(keyword).strip()
if keyword and len(keyword) > 1 and keyword not in clean_keywords:
clean_keywords.append(keyword)
# 验证总结
if not summary or len(summary.strip()) < 10:
summary = "今日热点新闻涵盖多个领域,反映了当前社会的多元化关注点。"
return clean_keywords, summary.strip()
except json.JSONDecodeError as e:
print(f"解析JSON失败: {e}")
print(f"原始返回: {result_text}")
# 尝试手动解析
return self._manual_parse_result(result_text)
except Exception as e:
print(f"处理分析结果失败: {e}")
return [], "分析结果处理失败,请稍后重试。"
def _manual_parse_result(self, text: str) -> Tuple[List[str], str]:
"""手动解析结果(当JSON解析失败时的后备方案)"""
print("尝试手动解析结果...")
keywords = []
summary = ""
lines = text.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
# 寻找关键词
if '关键词' in line or 'keywords' in line.lower():
# 提取关键词
keyword_match = re.findall(r'[""](.*?)["""]', line)
if keyword_match:
keywords.extend(keyword_match)
else:
# 尝试其他分隔符
parts = re.split(r'[,,、]', line)
for part in parts:
clean_part = re.sub(r'[关键词::keywords\[\]"]', '', part).strip()
if clean_part and len(clean_part) > 1:
keywords.append(clean_part)
# 寻找总结
elif '总结' in line or '分析' in line or 'summary' in line.lower():
if '' in line or ':' in line:
summary = line.split('')[-1].split(':')[-1].strip()
# 如果这一行看起来像总结内容
elif len(line) > 50 and ('今日' in line or '热点' in line or '新闻' in line):
if not summary:
summary = line
# 清理关键词
clean_keywords = []
for keyword in keywords:
keyword = keyword.strip()
if keyword and len(keyword) > 1 and keyword not in clean_keywords:
clean_keywords.append(keyword)
# 如果没有找到总结,生成一个简单的
if not summary:
summary = "今日热点新闻内容丰富,涵盖了社会各个层面的关注点。"
return clean_keywords[:max_keywords], summary
def _extract_simple_keywords(self, news_list: List[Dict]) -> List[str]:
"""简单关键词提取(fallback方案)"""
keywords = []
for news in news_list:
title = news.get('title', '')
# 简单的关键词提取
# 移除常见的无意义词汇
title_clean = re.sub(r'[#@【】\[\]()()]', ' ', title)
words = title_clean.split()
for word in words:
word = word.strip()
if (len(word) > 1 and
word not in ['', '', '', '', '', '', '', '', '', '', '', '', '正在'] and
word not in keywords):
keywords.append(word)
return keywords[:10]
def get_search_keywords(self, keywords: List[str], limit: int = 10) -> List[str]:
"""
获取用于搜索的关键词
Args:
keywords: 关键词列表
limit: 限制数量
Returns:
适合搜索的关键词列表
"""
# 过滤和优化关键词
search_keywords = []
for keyword in keywords:
keyword = str(keyword).strip()
# 过滤条件
if (len(keyword) > 1 and
len(keyword) < 20 and # 不能太长
keyword not in search_keywords and
not keyword.isdigit() and # 不是纯数字
not re.match(r'^[a-zA-Z]+$', keyword)): # 不是纯英文(除非是专有名词)
search_keywords.append(keyword)
return search_keywords[:limit]
if __name__ == "__main__":
# 测试话题提取器
extractor = TopicExtractor()
# 模拟新闻数据
test_news = [
{"title": "AI技术发展迅速", "source_platform": "科技新闻"},
{"title": "股市行情分析", "source_platform": "财经新闻"},
{"title": "明星最新动态", "source_platform": "娱乐新闻"}
]
keywords, summary = extractor.extract_keywords_and_summary(test_news)
print(f"提取的关键词: {keywords}")
print(f"新闻总结: {summary}")
search_keywords = extractor.get_search_keywords(keywords)
print(f"搜索关键词: {search_keywords}")