The framework has been restructured again, and the Flask framework has been abandoned.
This commit is contained in:
@@ -1,319 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
BroadTopicExtraction模块 - 数据库管理器
|
||||
只负责新闻数据和话题分析的存储和查询
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
import pymysql
|
||||
from pymysql.cursors import DictCursor
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(project_root))
|
||||
|
||||
try:
|
||||
import config
|
||||
except ImportError:
|
||||
raise ImportError("无法导入config.py配置文件")
|
||||
|
||||
class DatabaseManager:
|
||||
"""数据库管理器"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化数据库管理器"""
|
||||
self.connection = None
|
||||
self.connect()
|
||||
|
||||
def connect(self):
|
||||
"""连接数据库"""
|
||||
try:
|
||||
self.connection = pymysql.connect(
|
||||
host=config.DB_HOST,
|
||||
port=config.DB_PORT,
|
||||
user=config.DB_USER,
|
||||
password=config.DB_PASSWORD,
|
||||
database=config.DB_NAME,
|
||||
charset=config.DB_CHARSET,
|
||||
autocommit=True,
|
||||
cursorclass=DictCursor
|
||||
)
|
||||
print(f"成功连接到数据库: {config.DB_NAME}")
|
||||
except Exception as e:
|
||||
print(f"数据库连接失败: {e}")
|
||||
raise
|
||||
|
||||
def close(self):
|
||||
"""关闭数据库连接"""
|
||||
if self.connection:
|
||||
self.connection.close()
|
||||
print("数据库连接已关闭")
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
# ==================== 新闻数据操作 ====================
|
||||
|
||||
def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int:
|
||||
"""
|
||||
保存每日新闻数据,如果当天已有数据则覆盖
|
||||
|
||||
Args:
|
||||
news_data: 新闻数据列表
|
||||
crawl_date: 爬取日期,默认为今天
|
||||
|
||||
Returns:
|
||||
保存的新闻数量
|
||||
"""
|
||||
if not crawl_date:
|
||||
crawl_date = date.today()
|
||||
|
||||
current_timestamp = int(datetime.now().timestamp())
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 先删除当天所有的新闻记录(覆盖模式)
|
||||
delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
|
||||
deleted_count = cursor.execute(delete_query, (crawl_date,))
|
||||
if deleted_count > 0:
|
||||
print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
|
||||
|
||||
# 批量插入新记录
|
||||
saved_count = 0
|
||||
for news_item in news_data:
|
||||
try:
|
||||
# 简化的新闻ID生成
|
||||
news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
|
||||
|
||||
# 插入新记录
|
||||
insert_query = """
|
||||
INSERT INTO daily_news (
|
||||
news_id, source_platform, title, url, crawl_date,
|
||||
rank_position, add_ts
|
||||
) VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
"""
|
||||
cursor.execute(insert_query, (
|
||||
news_id,
|
||||
news_item.get('source', 'unknown'),
|
||||
news_item.get('title', ''),
|
||||
news_item.get('url', ''),
|
||||
crawl_date,
|
||||
news_item.get('rank', None),
|
||||
current_timestamp
|
||||
))
|
||||
saved_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存单条新闻失败: {e}")
|
||||
continue
|
||||
|
||||
print(f"成功保存 {saved_count} 条新闻记录")
|
||||
return saved_count
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存新闻数据失败: {e}")
|
||||
return 0
|
||||
|
||||
def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
|
||||
"""
|
||||
获取每日新闻数据
|
||||
|
||||
Args:
|
||||
crawl_date: 爬取日期,默认为今天
|
||||
|
||||
Returns:
|
||||
新闻列表
|
||||
"""
|
||||
if not crawl_date:
|
||||
crawl_date = date.today()
|
||||
|
||||
query = """
|
||||
SELECT * FROM daily_news
|
||||
WHERE crawl_date = %s
|
||||
ORDER BY rank_position ASC
|
||||
"""
|
||||
|
||||
cursor = self.connection.cursor()
|
||||
cursor.execute(query, (crawl_date,))
|
||||
return cursor.fetchall()
|
||||
|
||||
# ==================== 话题数据操作 ====================
|
||||
|
||||
def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool:
|
||||
"""
|
||||
保存每日话题分析
|
||||
|
||||
Args:
|
||||
keywords: 话题关键词列表
|
||||
summary: 新闻分析总结
|
||||
extract_date: 提取日期,默认为今天
|
||||
|
||||
Returns:
|
||||
是否保存成功
|
||||
"""
|
||||
if not extract_date:
|
||||
extract_date = date.today()
|
||||
|
||||
current_timestamp = int(datetime.now().timestamp())
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 检查今天是否已有记录
|
||||
check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
|
||||
cursor.execute(check_query, (extract_date,))
|
||||
existing = cursor.fetchone()
|
||||
|
||||
keywords_json = json.dumps(keywords, ensure_ascii=False)
|
||||
|
||||
if existing:
|
||||
# 更新现有记录
|
||||
update_query = """
|
||||
UPDATE daily_topics
|
||||
SET keywords = %s, summary = %s, add_ts = %s
|
||||
WHERE extract_date = %s
|
||||
"""
|
||||
cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
|
||||
print(f"更新了 {extract_date} 的话题分析")
|
||||
else:
|
||||
# 插入新记录
|
||||
insert_query = """
|
||||
INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
"""
|
||||
cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
|
||||
print(f"保存了 {extract_date} 的话题分析")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存话题分析失败: {e}")
|
||||
return False
|
||||
|
||||
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
|
||||
"""
|
||||
获取每日话题分析
|
||||
|
||||
Args:
|
||||
extract_date: 提取日期,默认为今天
|
||||
|
||||
Returns:
|
||||
话题分析数据,如果不存在返回None
|
||||
"""
|
||||
if not extract_date:
|
||||
extract_date = date.today()
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = "SELECT * FROM daily_topics WHERE extract_date = %s"
|
||||
cursor.execute(query, (extract_date,))
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
# 解析关键词JSON
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
return result
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取话题分析失败: {e}")
|
||||
return None
|
||||
|
||||
def get_recent_topics(self, days: int = 7) -> List[Dict]:
|
||||
"""
|
||||
获取最近几天的话题分析
|
||||
|
||||
Args:
|
||||
days: 天数
|
||||
|
||||
Returns:
|
||||
话题分析列表
|
||||
"""
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = """
|
||||
SELECT * FROM daily_topics
|
||||
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
cursor.execute(query, (days,))
|
||||
results = cursor.fetchall()
|
||||
|
||||
# 解析每个结果的关键词JSON
|
||||
for result in results:
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取最近话题分析失败: {e}")
|
||||
return []
|
||||
|
||||
# ==================== 统计查询 ====================
|
||||
|
||||
def get_summary_stats(self, days: int = 7) -> Dict:
|
||||
"""获取统计摘要"""
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 新闻统计
|
||||
news_query = """
|
||||
SELECT
|
||||
crawl_date,
|
||||
COUNT(*) as news_count,
|
||||
COUNT(DISTINCT source_platform) as platforms_count
|
||||
FROM daily_news
|
||||
WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
GROUP BY crawl_date
|
||||
ORDER BY crawl_date DESC
|
||||
"""
|
||||
cursor.execute(news_query, (days,))
|
||||
news_stats = cursor.fetchall()
|
||||
|
||||
# 话题统计
|
||||
topics_query = """
|
||||
SELECT
|
||||
extract_date,
|
||||
keywords,
|
||||
CHAR_LENGTH(summary) as summary_length
|
||||
FROM daily_topics
|
||||
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
cursor.execute(topics_query, (days,))
|
||||
topics_stats = cursor.fetchall()
|
||||
|
||||
return {
|
||||
'news_stats': news_stats,
|
||||
'topics_stats': topics_stats
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取统计摘要失败: {e}")
|
||||
return {'news_stats': [], 'topics_stats': []}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试数据库管理器
|
||||
with DatabaseManager() as db:
|
||||
# 测试获取新闻
|
||||
news = db.get_daily_news()
|
||||
print(f"今日新闻数量: {len(news)}")
|
||||
|
||||
# 测试获取话题
|
||||
topics = db.get_daily_topics()
|
||||
if topics:
|
||||
print(f"今日话题关键词: {topics['keywords']}")
|
||||
else:
|
||||
print("今日暂无话题分析")
|
||||
|
||||
print("简化数据库管理器测试完成!")
|
||||
@@ -1,300 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
BroadTopicExtraction模块 - 新闻获取和收集
|
||||
整合新闻API调用和数据库存储功能
|
||||
"""
|
||||
|
||||
import sys
|
||||
import asyncio
|
||||
import httpx
|
||||
import json
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(project_root))
|
||||
|
||||
try:
|
||||
from BroadTopicExtraction.database_manager import DatabaseManager
|
||||
except ImportError as e:
|
||||
raise ImportError(f"导入模块失败: {e}")
|
||||
|
||||
# 新闻API基础URL
|
||||
BASE_URL = "https://newsnow.busiyi.world"
|
||||
|
||||
# 新闻源中文名称映射
|
||||
SOURCE_NAMES = {
|
||||
"weibo": "微博热搜",
|
||||
"zhihu": "知乎热榜",
|
||||
"bilibili-hot-search": "B站热搜",
|
||||
"toutiao": "今日头条",
|
||||
"douyin": "抖音热榜",
|
||||
"github-trending-today": "GitHub趋势",
|
||||
"coolapk": "酷安热榜",
|
||||
"tieba": "百度贴吧",
|
||||
"wallstreetcn": "华尔街见闻",
|
||||
"thepaper": "澎湃新闻",
|
||||
"cls-hot": "财联社",
|
||||
"xueqiu": "雪球热榜",
|
||||
"kuaishou": "快手热榜"
|
||||
}
|
||||
|
||||
class NewsCollector:
|
||||
"""新闻收集器 - 整合API调用和数据库存储"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化新闻收集器"""
|
||||
self.db_manager = DatabaseManager()
|
||||
self.supported_sources = list(SOURCE_NAMES.keys())
|
||||
|
||||
def close(self):
|
||||
"""关闭资源"""
|
||||
if self.db_manager:
|
||||
self.db_manager.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
# ==================== 新闻API调用 ====================
|
||||
|
||||
async def fetch_news(self, source: str) -> dict:
|
||||
"""从指定源获取最新新闻"""
|
||||
url = f"{BASE_URL}/api/s?id={source}&latest"
|
||||
headers = {"Accept": "application/json"}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
response = await client.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
# 解析JSON响应
|
||||
data = json.loads(response.text)
|
||||
return {
|
||||
"source": source,
|
||||
"status": "success",
|
||||
"data": data,
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
except httpx.TimeoutException:
|
||||
return {
|
||||
"source": source,
|
||||
"status": "timeout",
|
||||
"error": "请求超时",
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
except httpx.HTTPStatusError as e:
|
||||
return {
|
||||
"source": source,
|
||||
"status": "http_error",
|
||||
"error": f"HTTP错误: {e.response.status_code}",
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"source": source,
|
||||
"status": "error",
|
||||
"error": f"未知错误: {str(e)}",
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
async def get_popular_news(self, sources: List[str] = None) -> List[dict]:
|
||||
"""获取热门新闻"""
|
||||
if sources is None:
|
||||
sources = list(SOURCE_NAMES.keys())
|
||||
|
||||
print(f"正在获取 {len(sources)} 个新闻源的最新内容...")
|
||||
print("=" * 80)
|
||||
|
||||
results = []
|
||||
for source in sources:
|
||||
source_name = SOURCE_NAMES.get(source, source)
|
||||
print(f"正在获取 {source_name} 的新闻...")
|
||||
result = await self.fetch_news(source)
|
||||
results.append(result)
|
||||
|
||||
if result["status"] == "success":
|
||||
data = result["data"]
|
||||
if 'items' in data and isinstance(data['items'], list):
|
||||
count = len(data['items'])
|
||||
print(f"✓ {source_name}: 获取成功,共 {count} 条新闻")
|
||||
else:
|
||||
print(f"✓ {source_name}: 获取成功")
|
||||
else:
|
||||
print(f"✗ {source_name}: {result.get('error', '获取失败')}")
|
||||
|
||||
# 避免请求过快
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
return results
|
||||
|
||||
# ==================== 数据处理和存储 ====================
|
||||
|
||||
async def collect_and_save_news(self, sources: Optional[List[str]] = None) -> Dict:
|
||||
"""
|
||||
收集并保存每日热点新闻
|
||||
|
||||
Args:
|
||||
sources: 指定的新闻源列表,None表示使用所有支持的源
|
||||
|
||||
Returns:
|
||||
包含收集结果的字典
|
||||
"""
|
||||
print(f"开始收集每日热点新闻...")
|
||||
print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 选择新闻源
|
||||
if sources is None:
|
||||
# 使用所有支持的新闻源
|
||||
sources = list(SOURCE_NAMES.keys())
|
||||
|
||||
print(f"将从 {len(sources)} 个新闻源收集数据:")
|
||||
for source in sources:
|
||||
source_name = SOURCE_NAMES.get(source, source)
|
||||
print(f" - {source_name}")
|
||||
|
||||
try:
|
||||
# 获取新闻数据
|
||||
results = await self.get_popular_news(sources)
|
||||
|
||||
# 处理结果
|
||||
processed_data = self._process_news_results(results)
|
||||
|
||||
# 保存到数据库(覆盖模式)
|
||||
if processed_data['news_list']:
|
||||
saved_count = self.db_manager.save_daily_news(
|
||||
processed_data['news_list'],
|
||||
date.today()
|
||||
)
|
||||
processed_data['saved_count'] = saved_count
|
||||
|
||||
# 打印统计信息
|
||||
self._print_collection_summary(processed_data)
|
||||
|
||||
return processed_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"收集新闻失败: {e}")
|
||||
return {
|
||||
'success': False,
|
||||
'error': str(e),
|
||||
'news_list': [],
|
||||
'total_news': 0
|
||||
}
|
||||
|
||||
def _process_news_results(self, results: List[Dict]) -> Dict:
|
||||
"""处理新闻获取结果"""
|
||||
news_list = []
|
||||
successful_sources = 0
|
||||
total_news = 0
|
||||
|
||||
for result in results:
|
||||
source = result['source']
|
||||
status = result['status']
|
||||
|
||||
if status == 'success':
|
||||
successful_sources += 1
|
||||
data = result['data']
|
||||
|
||||
if 'items' in data and isinstance(data['items'], list):
|
||||
source_news_count = len(data['items'])
|
||||
total_news += source_news_count
|
||||
|
||||
# 处理该源的新闻
|
||||
for i, item in enumerate(data['items'], 1):
|
||||
processed_news = self._process_news_item(item, source, i)
|
||||
if processed_news:
|
||||
news_list.append(processed_news)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'news_list': news_list,
|
||||
'successful_sources': successful_sources,
|
||||
'total_sources': len(results),
|
||||
'total_news': total_news,
|
||||
'collection_time': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
def _process_news_item(self, item: Dict, source: str, rank: int) -> Optional[Dict]:
|
||||
"""处理单条新闻"""
|
||||
try:
|
||||
if isinstance(item, dict):
|
||||
title = item.get('title', '无标题').strip()
|
||||
url = item.get('url', '')
|
||||
|
||||
# 生成新闻ID
|
||||
news_id = f"{source}_{item.get('id', f'rank_{rank}')}"
|
||||
|
||||
return {
|
||||
'id': news_id,
|
||||
'title': title,
|
||||
'url': url,
|
||||
'source': source,
|
||||
'rank': rank
|
||||
}
|
||||
else:
|
||||
# 处理字符串类型的新闻
|
||||
title = str(item)[:100] if len(str(item)) > 100 else str(item)
|
||||
return {
|
||||
'id': f"{source}_rank_{rank}",
|
||||
'title': title,
|
||||
'url': '',
|
||||
'source': source,
|
||||
'rank': rank
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"处理新闻项失败: {e}")
|
||||
return None
|
||||
|
||||
def _print_collection_summary(self, data: Dict):
|
||||
"""打印收集摘要"""
|
||||
print("\n" + "=" * 50)
|
||||
print("新闻收集摘要")
|
||||
print("=" * 50)
|
||||
|
||||
print(f"总新闻源: {data['total_sources']}")
|
||||
print(f"成功源数: {data['successful_sources']}")
|
||||
print(f"总新闻数: {data['total_news']}")
|
||||
|
||||
if 'saved_count' in data:
|
||||
print(f"已保存数: {data['saved_count']}")
|
||||
|
||||
print("=" * 50)
|
||||
|
||||
def get_today_news(self) -> List[Dict]:
|
||||
"""获取今天的新闻"""
|
||||
try:
|
||||
return self.db_manager.get_daily_news(date.today())
|
||||
except Exception as e:
|
||||
print(f"获取今日新闻失败: {e}")
|
||||
return []
|
||||
|
||||
async def main():
|
||||
"""测试新闻收集器"""
|
||||
print("测试新闻收集器...")
|
||||
|
||||
async with NewsCollector() as collector:
|
||||
# 收集新闻
|
||||
result = await collector.collect_and_save_news(
|
||||
sources=["weibo", "zhihu"] # 测试用,只使用两个源
|
||||
)
|
||||
|
||||
if result['success']:
|
||||
print(f"收集成功!共获取 {result['total_news']} 条新闻")
|
||||
else:
|
||||
print(f"收集失败: {result.get('error', '未知错误')}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,332 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
BroadTopicExtraction模块 - 主程序
|
||||
整合话题提取的完整工作流程和命令行工具
|
||||
"""
|
||||
|
||||
import sys
|
||||
import asyncio
|
||||
import argparse
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(project_root))
|
||||
|
||||
try:
|
||||
from BroadTopicExtraction.get_today_news import NewsCollector, SOURCE_NAMES
|
||||
from BroadTopicExtraction.topic_extractor import TopicExtractor
|
||||
from BroadTopicExtraction.database_manager import DatabaseManager
|
||||
except ImportError as e:
|
||||
print(f"导入模块失败: {e}")
|
||||
print("请确保在项目根目录运行,并且已安装所有依赖")
|
||||
sys.exit(1)
|
||||
|
||||
class BroadTopicExtraction:
|
||||
"""BroadTopicExtraction主要工作流程"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化"""
|
||||
self.news_collector = NewsCollector()
|
||||
self.topic_extractor = TopicExtractor()
|
||||
self.db_manager = DatabaseManager()
|
||||
|
||||
print("BroadTopicExtraction 初始化完成")
|
||||
|
||||
def close(self):
|
||||
"""关闭资源"""
|
||||
if self.news_collector:
|
||||
self.news_collector.close()
|
||||
if self.db_manager:
|
||||
self.db_manager.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
async def run_daily_extraction(self,
|
||||
news_sources: Optional[List[str]] = None,
|
||||
max_keywords: int = 100) -> Dict:
|
||||
"""
|
||||
运行每日话题提取流程
|
||||
|
||||
Args:
|
||||
news_sources: 新闻源列表,None表示使用所有支持的源
|
||||
max_keywords: 最大关键词数量
|
||||
|
||||
Returns:
|
||||
包含完整提取结果的字典
|
||||
"""
|
||||
print("\n" + "=" * 80)
|
||||
print("MindSpider AI爬虫 - 每日话题提取")
|
||||
print("=" * 80)
|
||||
print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f"目标日期: {date.today()}")
|
||||
|
||||
if news_sources:
|
||||
print(f"指定平台: {len(news_sources)} 个")
|
||||
for source in news_sources:
|
||||
source_name = SOURCE_NAMES.get(source, source)
|
||||
print(f" - {source_name}")
|
||||
else:
|
||||
print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
|
||||
|
||||
print(f"关键词数: 最多 {max_keywords} 个")
|
||||
|
||||
extraction_result = {
|
||||
'success': False,
|
||||
'extraction_date': date.today().isoformat(),
|
||||
'start_time': datetime.now().isoformat(),
|
||||
'news_collection': {},
|
||||
'topic_extraction': {},
|
||||
'database_save': {},
|
||||
'error': None
|
||||
}
|
||||
|
||||
try:
|
||||
# 步骤1: 收集新闻
|
||||
print("\n【步骤1】收集热点新闻...")
|
||||
news_result = await self.news_collector.collect_and_save_news(
|
||||
sources=news_sources
|
||||
)
|
||||
|
||||
extraction_result['news_collection'] = {
|
||||
'success': news_result['success'],
|
||||
'total_news': news_result.get('total_news', 0),
|
||||
'successful_sources': news_result.get('successful_sources', 0),
|
||||
'total_sources': news_result.get('total_sources', 0)
|
||||
}
|
||||
|
||||
if not news_result['success'] or not news_result['news_list']:
|
||||
raise Exception("新闻收集失败或没有获取到新闻")
|
||||
|
||||
# 步骤2: 提取关键词和生成总结
|
||||
print("\n【步骤2】提取关键词和生成总结...")
|
||||
keywords, summary = self.topic_extractor.extract_keywords_and_summary(
|
||||
news_result['news_list'],
|
||||
max_keywords=max_keywords
|
||||
)
|
||||
|
||||
extraction_result['topic_extraction'] = {
|
||||
'success': len(keywords) > 0,
|
||||
'keywords_count': len(keywords),
|
||||
'keywords': keywords,
|
||||
'summary': summary
|
||||
}
|
||||
|
||||
if not keywords:
|
||||
print("警告: 没有提取到有效关键词")
|
||||
|
||||
# 步骤3: 保存到数据库
|
||||
print("\n【步骤3】保存分析结果到数据库...")
|
||||
save_success = self.db_manager.save_daily_topics(
|
||||
keywords, summary, date.today()
|
||||
)
|
||||
|
||||
extraction_result['database_save'] = {
|
||||
'success': save_success
|
||||
}
|
||||
|
||||
extraction_result['success'] = True
|
||||
extraction_result['end_time'] = datetime.now().isoformat()
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("每日话题提取流程完成!")
|
||||
print("=" * 80)
|
||||
|
||||
return extraction_result
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n话题提取流程失败: {e}")
|
||||
extraction_result['error'] = str(e)
|
||||
extraction_result['end_time'] = datetime.now().isoformat()
|
||||
return extraction_result
|
||||
|
||||
def print_extraction_results(self, extraction_result: Dict):
|
||||
"""打印提取结果"""
|
||||
print("\n" + "=" * 80)
|
||||
print("话题提取结果报告")
|
||||
print("=" * 80)
|
||||
|
||||
if not extraction_result['success']:
|
||||
print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
|
||||
return
|
||||
|
||||
# 新闻收集结果
|
||||
news_data = extraction_result.get('news_collection', {})
|
||||
print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
|
||||
print(f" 成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
|
||||
|
||||
# 话题提取结果
|
||||
topic_data = extraction_result.get('topic_extraction', {})
|
||||
keywords = topic_data.get('keywords', [])
|
||||
summary = topic_data.get('summary', '')
|
||||
|
||||
print(f"\n🔑 提取关键词: {len(keywords)} 个")
|
||||
if keywords:
|
||||
# 每行显示5个关键词
|
||||
for i in range(0, len(keywords), 5):
|
||||
keyword_group = keywords[i:i+5]
|
||||
print(f" {', '.join(keyword_group)}")
|
||||
|
||||
print(f"\n📝 新闻总结:")
|
||||
print(f" {summary}")
|
||||
|
||||
# 数据库保存结果
|
||||
db_data = extraction_result.get('database_save', {})
|
||||
if db_data.get('success'):
|
||||
print(f"\n💾 数据库保存: 成功")
|
||||
else:
|
||||
print(f"\n💾 数据库保存: 失败")
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
|
||||
def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
|
||||
"""
|
||||
获取用于爬取的关键词列表
|
||||
|
||||
Args:
|
||||
extract_date: 提取日期,默认为今天
|
||||
|
||||
Returns:
|
||||
关键词列表
|
||||
"""
|
||||
try:
|
||||
# 从数据库获取话题分析
|
||||
topics_data = self.db_manager.get_daily_topics(extract_date)
|
||||
|
||||
if not topics_data:
|
||||
print(f"没有找到 {extract_date or date.today()} 的话题数据")
|
||||
return []
|
||||
|
||||
keywords = topics_data['keywords']
|
||||
|
||||
# 生成搜索关键词
|
||||
search_keywords = self.topic_extractor.get_search_keywords(keywords)
|
||||
|
||||
print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
|
||||
return search_keywords
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取爬取关键词失败: {e}")
|
||||
return []
|
||||
|
||||
def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
|
||||
"""获取指定日期的分析结果"""
|
||||
try:
|
||||
return self.db_manager.get_daily_topics(target_date)
|
||||
except Exception as e:
|
||||
print(f"获取每日分析失败: {e}")
|
||||
return None
|
||||
|
||||
def get_recent_analysis(self, days: int = 7) -> List[Dict]:
|
||||
"""获取最近几天的分析结果"""
|
||||
try:
|
||||
return self.db_manager.get_recent_topics(days)
|
||||
except Exception as e:
|
||||
print(f"获取最近分析失败: {e}")
|
||||
return []
|
||||
|
||||
# ==================== 命令行工具 ====================
|
||||
|
||||
async def run_extraction_command(sources=None, keywords_count=100, show_details=True):
|
||||
"""运行话题提取命令"""
|
||||
|
||||
try:
|
||||
async with BroadTopicExtraction() as extractor:
|
||||
# 运行话题提取
|
||||
result = await extractor.run_daily_extraction(
|
||||
news_sources=sources,
|
||||
max_keywords=keywords_count
|
||||
)
|
||||
|
||||
if result['success']:
|
||||
if show_details:
|
||||
# 显示详细结果
|
||||
extractor.print_extraction_results(result)
|
||||
else:
|
||||
# 只显示简要结果
|
||||
news_data = result.get('news_collection', {})
|
||||
topic_data = result.get('topic_extraction', {})
|
||||
|
||||
print(f"✅ 话题提取成功完成!")
|
||||
print(f" 收集新闻: {news_data.get('total_news', 0)} 条")
|
||||
print(f" 提取关键词: {len(topic_data.get('keywords', []))} 个")
|
||||
print(f" 生成总结: {len(topic_data.get('summary', ''))} 字符")
|
||||
|
||||
# 获取爬取关键词
|
||||
crawling_keywords = extractor.get_keywords_for_crawling()
|
||||
|
||||
if crawling_keywords:
|
||||
print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
|
||||
print(f" {', '.join(crawling_keywords)}")
|
||||
|
||||
# 保存关键词到文件
|
||||
keywords_file = project_root / "data" / "daily_keywords.txt"
|
||||
keywords_file.parent.mkdir(exist_ok=True)
|
||||
|
||||
with open(keywords_file, 'w', encoding='utf-8') as f:
|
||||
f.write('\n'.join(crawling_keywords))
|
||||
|
||||
print(f" 关键词已保存到: {keywords_file}")
|
||||
|
||||
return True
|
||||
|
||||
else:
|
||||
print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ 执行过程中发生错误: {e}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
"""主函数"""
|
||||
parser = argparse.ArgumentParser(description="MindSpider每日话题提取工具")
|
||||
parser.add_argument("--sources", nargs="+", help="指定新闻源平台",
|
||||
choices=list(SOURCE_NAMES.keys()))
|
||||
parser.add_argument("--keywords", type=int, default=100, help="最大关键词数量 (默认100)")
|
||||
parser.add_argument("--quiet", action="store_true", help="简化输出模式")
|
||||
parser.add_argument("--list-sources", action="store_true", help="显示支持的新闻源")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# 显示支持的新闻源
|
||||
if args.list_sources:
|
||||
print("支持的新闻源平台:")
|
||||
for source, name in SOURCE_NAMES.items():
|
||||
print(f" {source:<25} {name}")
|
||||
return
|
||||
|
||||
# 验证参数
|
||||
if args.keywords < 1 or args.keywords > 200:
|
||||
print("关键词数量应在1-200之间")
|
||||
sys.exit(1)
|
||||
|
||||
# 运行提取
|
||||
try:
|
||||
success = asyncio.run(run_extraction_command(
|
||||
sources=args.sources,
|
||||
keywords_count=args.keywords,
|
||||
show_details=not args.quiet
|
||||
))
|
||||
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n用户中断操作")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,288 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
BroadTopicExtraction模块 - 话题提取器
|
||||
基于DeepSeek直接提取关键词和生成新闻总结
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple
|
||||
from openai import OpenAI
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(project_root))
|
||||
|
||||
try:
|
||||
import config
|
||||
except ImportError:
|
||||
raise ImportError("无法导入config.py配置文件")
|
||||
|
||||
class TopicExtractor:
|
||||
"""话题提取器"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化话题提取器"""
|
||||
self.client = OpenAI(
|
||||
api_key=config.DEEPSEEK_API_KEY,
|
||||
base_url="https://api.deepseek.com"
|
||||
)
|
||||
self.model = "deepseek-chat"
|
||||
|
||||
def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
|
||||
"""
|
||||
从新闻列表中提取关键词和生成总结
|
||||
|
||||
Args:
|
||||
news_list: 新闻列表
|
||||
max_keywords: 最大关键词数量
|
||||
|
||||
Returns:
|
||||
(关键词列表, 新闻分析总结)
|
||||
"""
|
||||
if not news_list:
|
||||
return [], "今日暂无热点新闻"
|
||||
|
||||
# 构建新闻摘要文本
|
||||
news_text = self._build_news_summary(news_list)
|
||||
|
||||
# 构建提示词
|
||||
prompt = self._build_analysis_prompt(news_text, max_keywords)
|
||||
|
||||
try:
|
||||
# 调用DeepSeek API
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "你是一个专业的新闻分析师,擅长从热点新闻中提取关键词和撰写分析总结。"},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
max_tokens=1500,
|
||||
temperature=0.3
|
||||
)
|
||||
|
||||
# 解析返回结果
|
||||
result_text = response.choices[0].message.content
|
||||
keywords, summary = self._parse_analysis_result(result_text)
|
||||
|
||||
print(f"成功提取 {len(keywords)} 个关键词并生成新闻总结")
|
||||
return keywords[:max_keywords], summary
|
||||
|
||||
except Exception as e:
|
||||
print(f"话题提取失败: {e}")
|
||||
# 返回简单的fallback结果
|
||||
fallback_keywords = self._extract_simple_keywords(news_list)
|
||||
fallback_summary = f"今日共收集到 {len(news_list)} 条热点新闻,涵盖多个平台的热门话题。"
|
||||
return fallback_keywords[:max_keywords], fallback_summary
|
||||
|
||||
def _build_news_summary(self, news_list: List[Dict]) -> str:
|
||||
"""构建新闻摘要文本"""
|
||||
news_items = []
|
||||
|
||||
for i, news in enumerate(news_list, 1):
|
||||
title = news.get('title', '无标题')
|
||||
source = news.get('source_platform', news.get('source', '未知'))
|
||||
|
||||
# 清理标题中的特殊字符
|
||||
title = re.sub(r'[#@]', '', title).strip()
|
||||
|
||||
news_items.append(f"{i}. 【{source}】{title}")
|
||||
|
||||
return "\n".join(news_items)
|
||||
|
||||
def _build_analysis_prompt(self, news_text: str, max_keywords: int) -> str:
|
||||
"""构建分析提示词"""
|
||||
news_count = len(news_text.split('\n'))
|
||||
|
||||
prompt = f"""
|
||||
请分析以下{news_count}条今日热点新闻,完成两个任务:
|
||||
|
||||
新闻列表:
|
||||
{news_text}
|
||||
|
||||
任务1:提取关键词(最多{max_keywords}个)
|
||||
- 提取能代表今日热点话题的关键词
|
||||
- 关键词应该适合用于社交媒体平台搜索
|
||||
- 优先选择热度高、讨论量大的话题
|
||||
- 避免过于宽泛或过于具体的词汇
|
||||
|
||||
任务2:撰写新闻分析总结(150-300字)
|
||||
- 简要概括今日热点新闻的主要内容
|
||||
- 指出当前社会关注的重点话题方向
|
||||
- 分析这些热点反映的社会现象或趋势
|
||||
- 语言简洁明了,客观中性
|
||||
|
||||
请严格按照以下JSON格式输出:
|
||||
```json
|
||||
{{
|
||||
"keywords": ["关键词1", "关键词2", "关键词3"],
|
||||
"summary": "今日新闻分析总结内容..."
|
||||
}}
|
||||
```
|
||||
|
||||
请直接输出JSON格式的结果,不要包含其他文字说明。
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def _parse_analysis_result(self, result_text: str) -> Tuple[List[str], str]:
|
||||
"""解析分析结果"""
|
||||
try:
|
||||
# 尝试提取JSON部分
|
||||
json_match = re.search(r'```json\s*(.*?)\s*```', result_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_text = json_match.group(1)
|
||||
else:
|
||||
# 如果没有代码块,尝试直接解析
|
||||
json_text = result_text.strip()
|
||||
|
||||
# 解析JSON
|
||||
data = json.loads(json_text)
|
||||
|
||||
keywords = data.get('keywords', [])
|
||||
summary = data.get('summary', '')
|
||||
|
||||
# 验证和清理关键词
|
||||
clean_keywords = []
|
||||
for keyword in keywords:
|
||||
keyword = str(keyword).strip()
|
||||
if keyword and len(keyword) > 1 and keyword not in clean_keywords:
|
||||
clean_keywords.append(keyword)
|
||||
|
||||
# 验证总结
|
||||
if not summary or len(summary.strip()) < 10:
|
||||
summary = "今日热点新闻涵盖多个领域,反映了当前社会的多元化关注点。"
|
||||
|
||||
return clean_keywords, summary.strip()
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"解析JSON失败: {e}")
|
||||
print(f"原始返回: {result_text}")
|
||||
|
||||
# 尝试手动解析
|
||||
return self._manual_parse_result(result_text)
|
||||
|
||||
except Exception as e:
|
||||
print(f"处理分析结果失败: {e}")
|
||||
return [], "分析结果处理失败,请稍后重试。"
|
||||
|
||||
def _manual_parse_result(self, text: str) -> Tuple[List[str], str]:
|
||||
"""手动解析结果(当JSON解析失败时的后备方案)"""
|
||||
print("尝试手动解析结果...")
|
||||
|
||||
keywords = []
|
||||
summary = ""
|
||||
|
||||
lines = text.split('\n')
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# 寻找关键词
|
||||
if '关键词' in line or 'keywords' in line.lower():
|
||||
# 提取关键词
|
||||
keyword_match = re.findall(r'[""](.*?)["""]', line)
|
||||
if keyword_match:
|
||||
keywords.extend(keyword_match)
|
||||
else:
|
||||
# 尝试其他分隔符
|
||||
parts = re.split(r'[,,、]', line)
|
||||
for part in parts:
|
||||
clean_part = re.sub(r'[关键词::keywords\[\]"]', '', part).strip()
|
||||
if clean_part and len(clean_part) > 1:
|
||||
keywords.append(clean_part)
|
||||
|
||||
# 寻找总结
|
||||
elif '总结' in line or '分析' in line or 'summary' in line.lower():
|
||||
if ':' in line or ':' in line:
|
||||
summary = line.split(':')[-1].split(':')[-1].strip()
|
||||
|
||||
# 如果这一行看起来像总结内容
|
||||
elif len(line) > 50 and ('今日' in line or '热点' in line or '新闻' in line):
|
||||
if not summary:
|
||||
summary = line
|
||||
|
||||
# 清理关键词
|
||||
clean_keywords = []
|
||||
for keyword in keywords:
|
||||
keyword = keyword.strip()
|
||||
if keyword and len(keyword) > 1 and keyword not in clean_keywords:
|
||||
clean_keywords.append(keyword)
|
||||
|
||||
# 如果没有找到总结,生成一个简单的
|
||||
if not summary:
|
||||
summary = "今日热点新闻内容丰富,涵盖了社会各个层面的关注点。"
|
||||
|
||||
return clean_keywords[:max_keywords], summary
|
||||
|
||||
def _extract_simple_keywords(self, news_list: List[Dict]) -> List[str]:
|
||||
"""简单关键词提取(fallback方案)"""
|
||||
keywords = []
|
||||
|
||||
for news in news_list:
|
||||
title = news.get('title', '')
|
||||
|
||||
# 简单的关键词提取
|
||||
# 移除常见的无意义词汇
|
||||
title_clean = re.sub(r'[#@【】\[\]()()]', ' ', title)
|
||||
words = title_clean.split()
|
||||
|
||||
for word in words:
|
||||
word = word.strip()
|
||||
if (len(word) > 1 and
|
||||
word not in ['的', '了', '在', '和', '与', '或', '但', '是', '有', '被', '将', '已', '正在'] and
|
||||
word not in keywords):
|
||||
keywords.append(word)
|
||||
|
||||
return keywords[:10]
|
||||
|
||||
def get_search_keywords(self, keywords: List[str], limit: int = 10) -> List[str]:
|
||||
"""
|
||||
获取用于搜索的关键词
|
||||
|
||||
Args:
|
||||
keywords: 关键词列表
|
||||
limit: 限制数量
|
||||
|
||||
Returns:
|
||||
适合搜索的关键词列表
|
||||
"""
|
||||
# 过滤和优化关键词
|
||||
search_keywords = []
|
||||
|
||||
for keyword in keywords:
|
||||
keyword = str(keyword).strip()
|
||||
|
||||
# 过滤条件
|
||||
if (len(keyword) > 1 and
|
||||
len(keyword) < 20 and # 不能太长
|
||||
keyword not in search_keywords and
|
||||
not keyword.isdigit() and # 不是纯数字
|
||||
not re.match(r'^[a-zA-Z]+$', keyword)): # 不是纯英文(除非是专有名词)
|
||||
|
||||
search_keywords.append(keyword)
|
||||
|
||||
return search_keywords[:limit]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试话题提取器
|
||||
extractor = TopicExtractor()
|
||||
|
||||
# 模拟新闻数据
|
||||
test_news = [
|
||||
{"title": "AI技术发展迅速", "source_platform": "科技新闻"},
|
||||
{"title": "股市行情分析", "source_platform": "财经新闻"},
|
||||
{"title": "明星最新动态", "source_platform": "娱乐新闻"}
|
||||
]
|
||||
|
||||
keywords, summary = extractor.extract_keywords_and_summary(test_news)
|
||||
|
||||
print(f"提取的关键词: {keywords}")
|
||||
print(f"新闻总结: {summary}")
|
||||
|
||||
search_keywords = extractor.get_search_keywords(keywords)
|
||||
print(f"搜索关键词: {search_keywords}")
|
||||
Reference in New Issue
Block a user