The framework has been restructured again, and the Flask framework has been abandoned.

2025-08-22 13:52:05 +08:00
parent 15b3a3343b
commit 0c31be4287
279 changed files with 2725 additions and 1648837 deletions
@@ -1,319 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-BroadTopicExtraction模块 - 数据库管理器
-只负责新闻数据和话题分析的存储和查询
-"""
-
-import sys
-import json
-from datetime import datetime, date
-from pathlib import Path
-from typing import List, Dict, Optional
-import pymysql
-from pymysql.cursors import DictCursor
-
-# 添加项目根目录到路径
-project_root = Path(__file__).parent.parent
-sys.path.append(str(project_root))
-
-try:
-    import config
-except ImportError:
-    raise ImportError("无法导入config.py配置文件")
-
-class DatabaseManager:
-    """数据库管理器"""
-    
-    def __init__(self):
-        """初始化数据库管理器"""
-        self.connection = None
-        self.connect()
-    
-    def connect(self):
-        """连接数据库"""
-        try:
-            self.connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                autocommit=True,
-                cursorclass=DictCursor
-            )
-            print(f"成功连接到数据库: {config.DB_NAME}")
-        except Exception as e:
-            print(f"数据库连接失败: {e}")
-            raise
-    
-    def close(self):
-        """关闭数据库连接"""
-        if self.connection:
-            self.connection.close()
-            print("数据库连接已关闭")
-    
-    def __enter__(self):
-        return self
-    
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-    
-    # ==================== 新闻数据操作 ====================
-    
-    def save_daily_news(self, news_data: List[Dict], crawl_date: date = None) -> int:
-        """
-        保存每日新闻数据，如果当天已有数据则覆盖
-        
-        Args:
-            news_data: 新闻数据列表
-            crawl_date: 爬取日期，默认为今天
-        
-        Returns:
-            保存的新闻数量
-        """
-        if not crawl_date:
-            crawl_date = date.today()
-        
-        current_timestamp = int(datetime.now().timestamp())
-        
-        try:
-            cursor = self.connection.cursor()
-            
-            # 先删除当天所有的新闻记录（覆盖模式）
-            delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
-            deleted_count = cursor.execute(delete_query, (crawl_date,))
-            if deleted_count > 0:
-                print(f"覆盖模式：删除了当天已有的 {deleted_count} 条新闻记录")
-            
-            # 批量插入新记录
-            saved_count = 0
-            for news_item in news_data:
-                try:
-                    # 简化的新闻ID生成
-                    news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
-                    
-                    # 插入新记录
-                    insert_query = """
-                        INSERT INTO daily_news (
-                            news_id, source_platform, title, url, crawl_date, 
-                            rank_position, add_ts
-                        ) VALUES (%s, %s, %s, %s, %s, %s, %s)
-                    """
-                    cursor.execute(insert_query, (
-                        news_id,
-                        news_item.get('source', 'unknown'),
-                        news_item.get('title', ''),
-                        news_item.get('url', ''),
-                        crawl_date,
-                        news_item.get('rank', None),
-                        current_timestamp
-                    ))
-                    saved_count += 1
-                    
-                except Exception as e:
-                    print(f"保存单条新闻失败: {e}")
-                    continue
-            
-            print(f"成功保存 {saved_count} 条新闻记录")
-            return saved_count
-            
-        except Exception as e:
-            print(f"保存新闻数据失败: {e}")
-            return 0
-    
-    def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
-        """
-        获取每日新闻数据
-        
-        Args:
-            crawl_date: 爬取日期，默认为今天
-        
-        Returns:
-            新闻列表
-        """
-        if not crawl_date:
-            crawl_date = date.today()
-        
-        query = """
-            SELECT * FROM daily_news 
-            WHERE crawl_date = %s 
-            ORDER BY rank_position ASC
-        """
-        
-        cursor = self.connection.cursor()
-        cursor.execute(query, (crawl_date,))
-        return cursor.fetchall()
-    
-    # ==================== 话题数据操作 ====================
-    
-    def save_daily_topics(self, keywords: List[str], summary: str, extract_date: date = None) -> bool:
-        """
-        保存每日话题分析
-        
-        Args:
-            keywords: 话题关键词列表
-            summary: 新闻分析总结
-            extract_date: 提取日期，默认为今天
-        
-        Returns:
-            是否保存成功
-        """
-        if not extract_date:
-            extract_date = date.today()
-        
-        current_timestamp = int(datetime.now().timestamp())
-        
-        try:
-            cursor = self.connection.cursor()
-            
-            # 检查今天是否已有记录
-            check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(check_query, (extract_date,))
-            existing = cursor.fetchone()
-            
-            keywords_json = json.dumps(keywords, ensure_ascii=False)
-            
-            if existing:
-                # 更新现有记录
-                update_query = """
-                    UPDATE daily_topics 
-                    SET keywords = %s, summary = %s, add_ts = %s
-                    WHERE extract_date = %s
-                """
-                cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
-                print(f"更新了 {extract_date} 的话题分析")
-            else:
-                # 插入新记录
-                insert_query = """
-                    INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
-                    VALUES (%s, %s, %s, %s)
-                """
-                cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
-                print(f"保存了 {extract_date} 的话题分析")
-            
-            return True
-            
-        except Exception as e:
-            print(f"保存话题分析失败: {e}")
-            return False
-    
-    def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
-        """
-        获取每日话题分析
-        
-        Args:
-            extract_date: 提取日期，默认为今天
-        
-        Returns:
-            话题分析数据，如果不存在返回None
-        """
-        if not extract_date:
-            extract_date = date.today()
-        
-        try:
-            cursor = self.connection.cursor()
-            query = "SELECT * FROM daily_topics WHERE extract_date = %s"
-            cursor.execute(query, (extract_date,))
-            result = cursor.fetchone()
-            
-            if result:
-                # 解析关键词JSON
-                result['keywords'] = json.loads(result['keywords'])
-                return result
-            else:
-                return None
-                
-        except Exception as e:
-            print(f"获取话题分析失败: {e}")
-            return None
-    
-    def get_recent_topics(self, days: int = 7) -> List[Dict]:
-        """
-        获取最近几天的话题分析
-        
-        Args:
-            days: 天数
-        
-        Returns:
-            话题分析列表
-        """
-        try:
-            cursor = self.connection.cursor()
-            query = """
-                SELECT * FROM daily_topics 
-                WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-                ORDER BY extract_date DESC
-            """
-            cursor.execute(query, (days,))
-            results = cursor.fetchall()
-            
-            # 解析每个结果的关键词JSON
-            for result in results:
-                result['keywords'] = json.loads(result['keywords'])
-            
-            return results
-            
-        except Exception as e:
-            print(f"获取最近话题分析失败: {e}")
-            return []
-    
-    # ==================== 统计查询 ====================
-    
-    def get_summary_stats(self, days: int = 7) -> Dict:
-        """获取统计摘要"""
-        try:
-            cursor = self.connection.cursor()
-            
-            # 新闻统计
-            news_query = """
-                SELECT 
-                    crawl_date,
-                    COUNT(*) as news_count,
-                    COUNT(DISTINCT source_platform) as platforms_count
-                FROM daily_news 
-                WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-                GROUP BY crawl_date
-                ORDER BY crawl_date DESC
-            """
-            cursor.execute(news_query, (days,))
-            news_stats = cursor.fetchall()
-            
-            # 话题统计
-            topics_query = """
-                SELECT 
-                    extract_date,
-                    keywords,
-                    CHAR_LENGTH(summary) as summary_length
-                FROM daily_topics 
-                WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
-                ORDER BY extract_date DESC
-            """
-            cursor.execute(topics_query, (days,))
-            topics_stats = cursor.fetchall()
-            
-            return {
-                'news_stats': news_stats,
-                'topics_stats': topics_stats
-            }
-            
-        except Exception as e:
-            print(f"获取统计摘要失败: {e}")
-            return {'news_stats': [], 'topics_stats': []}
-
-if __name__ == "__main__":
-    # 测试数据库管理器
-    with DatabaseManager() as db:
-        # 测试获取新闻
-        news = db.get_daily_news()
-        print(f"今日新闻数量: {len(news)}")
-        
-        # 测试获取话题
-        topics = db.get_daily_topics()
-        if topics:
-            print(f"今日话题关键词: {topics['keywords']}")
-        else:
-            print("今日暂无话题分析")
-        
-        print("简化数据库管理器测试完成！")
@@ -1,300 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-BroadTopicExtraction模块 - 新闻获取和收集
-整合新闻API调用和数据库存储功能
-"""
-
-import sys
-import asyncio
-import httpx
-import json
-from datetime import datetime, date
-from pathlib import Path
-from typing import List, Dict, Optional
-
-# 添加项目根目录到路径
-project_root = Path(__file__).parent.parent
-sys.path.append(str(project_root))
-
-try:
-    from BroadTopicExtraction.database_manager import DatabaseManager
-except ImportError as e:
-    raise ImportError(f"导入模块失败: {e}")
-
-# 新闻API基础URL
-BASE_URL = "https://newsnow.busiyi.world"
-
-# 新闻源中文名称映射
-SOURCE_NAMES = {
-    "weibo": "微博热搜",
-    "zhihu": "知乎热榜",
-    "bilibili-hot-search": "B站热搜",
-    "toutiao": "今日头条",
-    "douyin": "抖音热榜",
-    "github-trending-today": "GitHub趋势",
-    "coolapk": "酷安热榜",
-    "tieba": "百度贴吧",
-    "wallstreetcn": "华尔街见闻",
-    "thepaper": "澎湃新闻",
-    "cls-hot": "财联社",
-    "xueqiu": "雪球热榜",
-    "kuaishou": "快手热榜"
-}
-
-class NewsCollector:
-    """新闻收集器 - 整合API调用和数据库存储"""
-    
-    def __init__(self):
-        """初始化新闻收集器"""
-        self.db_manager = DatabaseManager()
-        self.supported_sources = list(SOURCE_NAMES.keys())
-    
-    def close(self):
-        """关闭资源"""
-        if self.db_manager:
-            self.db_manager.close()
-    
-    def __enter__(self):
-        return self
-    
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-    
-    async def __aenter__(self):
-        return self
-    
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-    
-    # ==================== 新闻API调用 ====================
-    
-    async def fetch_news(self, source: str) -> dict:
-        """从指定源获取最新新闻"""
-        url = f"{BASE_URL}/api/s?id={source}&latest"
-        headers = {"Accept": "application/json"}
-        
-        try:
-            async with httpx.AsyncClient(timeout=30.0) as client:
-                response = await client.get(url, headers=headers)
-                response.raise_for_status()
-                
-                # 解析JSON响应
-                data = json.loads(response.text)
-                return {
-                    "source": source,
-                    "status": "success",
-                    "data": data,
-                    "timestamp": datetime.now().isoformat()
-                }
-        except httpx.TimeoutException:
-            return {
-                "source": source,
-                "status": "timeout",
-                "error": "请求超时",
-                "timestamp": datetime.now().isoformat()
-            }
-        except httpx.HTTPStatusError as e:
-            return {
-                "source": source,
-                "status": "http_error",
-                "error": f"HTTP错误: {e.response.status_code}",
-                "timestamp": datetime.now().isoformat()
-            }
-        except Exception as e:
-            return {
-                "source": source,
-                "status": "error",
-                "error": f"未知错误: {str(e)}",
-                "timestamp": datetime.now().isoformat()
-            }
-    
-    async def get_popular_news(self, sources: List[str] = None) -> List[dict]:
-        """获取热门新闻"""
-        if sources is None:
-            sources = list(SOURCE_NAMES.keys())
-        
-        print(f"正在获取 {len(sources)} 个新闻源的最新内容...")
-        print("=" * 80)
-        
-        results = []
-        for source in sources:
-            source_name = SOURCE_NAMES.get(source, source)
-            print(f"正在获取 {source_name} 的新闻...")
-            result = await self.fetch_news(source)
-            results.append(result)
-            
-            if result["status"] == "success":
-                data = result["data"]
-                if 'items' in data and isinstance(data['items'], list):
-                    count = len(data['items'])
-                    print(f"✓ {source_name}: 获取成功，共 {count} 条新闻")
-                else:
-                    print(f"✓ {source_name}: 获取成功")
-            else:
-                print(f"✗ {source_name}: {result.get('error', '获取失败')}")
-            
-            # 避免请求过快
-            await asyncio.sleep(0.5)
-        
-        return results
-    
-    # ==================== 数据处理和存储 ====================
-    
-    async def collect_and_save_news(self, sources: Optional[List[str]] = None) -> Dict:
-        """
-        收集并保存每日热点新闻
-        
-        Args:
-            sources: 指定的新闻源列表，None表示使用所有支持的源
-            
-        Returns:
-            包含收集结果的字典
-        """
-        print(f"开始收集每日热点新闻...")
-        print(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        
-        # 选择新闻源
-        if sources is None:
-            # 使用所有支持的新闻源
-            sources = list(SOURCE_NAMES.keys())
-        
-        print(f"将从 {len(sources)} 个新闻源收集数据:")
-        for source in sources:
-            source_name = SOURCE_NAMES.get(source, source)
-            print(f"  - {source_name}")
-        
-        try:
-            # 获取新闻数据
-            results = await self.get_popular_news(sources)
-            
-            # 处理结果
-            processed_data = self._process_news_results(results)
-            
-            # 保存到数据库（覆盖模式）
-            if processed_data['news_list']:
-                saved_count = self.db_manager.save_daily_news(
-                    processed_data['news_list'], 
-                    date.today()
-                )
-                processed_data['saved_count'] = saved_count
-            
-            # 打印统计信息
-            self._print_collection_summary(processed_data)
-            
-            return processed_data
-            
-        except Exception as e:
-            print(f"收集新闻失败: {e}")
-            return {
-                'success': False,
-                'error': str(e),
-                'news_list': [],
-                'total_news': 0
-            }
-    
-    def _process_news_results(self, results: List[Dict]) -> Dict:
-        """处理新闻获取结果"""
-        news_list = []
-        successful_sources = 0
-        total_news = 0
-        
-        for result in results:
-            source = result['source']
-            status = result['status']
-            
-            if status == 'success':
-                successful_sources += 1
-                data = result['data']
-                
-                if 'items' in data and isinstance(data['items'], list):
-                    source_news_count = len(data['items'])
-                    total_news += source_news_count
-                    
-                    # 处理该源的新闻
-                    for i, item in enumerate(data['items'], 1):
-                        processed_news = self._process_news_item(item, source, i)
-                        if processed_news:
-                            news_list.append(processed_news)
-        
-        return {
-            'success': True,
-            'news_list': news_list,
-            'successful_sources': successful_sources,
-            'total_sources': len(results),
-            'total_news': total_news,
-            'collection_time': datetime.now().isoformat()
-        }
-    
-    def _process_news_item(self, item: Dict, source: str, rank: int) -> Optional[Dict]:
-        """处理单条新闻"""
-        try:
-            if isinstance(item, dict):
-                title = item.get('title', '无标题').strip()
-                url = item.get('url', '')
-                
-                # 生成新闻ID
-                news_id = f"{source}_{item.get('id', f'rank_{rank}')}"
-                
-                return {
-                    'id': news_id,
-                    'title': title,
-                    'url': url,
-                    'source': source,
-                    'rank': rank
-                }
-            else:
-                # 处理字符串类型的新闻
-                title = str(item)[:100] if len(str(item)) > 100 else str(item)
-                return {
-                    'id': f"{source}_rank_{rank}",
-                    'title': title,
-                    'url': '',
-                    'source': source,
-                    'rank': rank
-                }
-                
-        except Exception as e:
-            print(f"处理新闻项失败: {e}")
-            return None
-    
-    def _print_collection_summary(self, data: Dict):
-        """打印收集摘要"""
-        print("\n" + "=" * 50)
-        print("新闻收集摘要")
-        print("=" * 50)
-        
-        print(f"总新闻源: {data['total_sources']}")
-        print(f"成功源数: {data['successful_sources']}")
-        print(f"总新闻数: {data['total_news']}")
-        
-        if 'saved_count' in data:
-            print(f"已保存数: {data['saved_count']}")
-        
-        print("=" * 50)
-    
-    def get_today_news(self) -> List[Dict]:
-        """获取今天的新闻"""
-        try:
-            return self.db_manager.get_daily_news(date.today())
-        except Exception as e:
-            print(f"获取今日新闻失败: {e}")
-            return []
-
-async def main():
-    """测试新闻收集器"""
-    print("测试新闻收集器...")
-    
-    async with NewsCollector() as collector:
-        # 收集新闻
-        result = await collector.collect_and_save_news(
-            sources=["weibo", "zhihu"]  # 测试用，只使用两个源
-        )
-        
-        if result['success']:
-            print(f"收集成功！共获取 {result['total_news']} 条新闻")
-        else:
-            print(f"收集失败: {result.get('error', '未知错误')}")
-
-if __name__ == "__main__":
-    asyncio.run(main())
@@ -1,332 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-BroadTopicExtraction模块 - 主程序
-整合话题提取的完整工作流程和命令行工具
-"""
-
-import sys
-import asyncio
-import argparse
-from datetime import datetime, date
-from pathlib import Path
-from typing import List, Dict, Optional
-
-# 添加项目根目录到路径
-project_root = Path(__file__).parent.parent
-sys.path.append(str(project_root))
-
-try:
-    from BroadTopicExtraction.get_today_news import NewsCollector, SOURCE_NAMES
-    from BroadTopicExtraction.topic_extractor import TopicExtractor
-    from BroadTopicExtraction.database_manager import DatabaseManager
-except ImportError as e:
-    print(f"导入模块失败: {e}")
-    print("请确保在项目根目录运行，并且已安装所有依赖")
-    sys.exit(1)
-
-class BroadTopicExtraction:
-    """BroadTopicExtraction主要工作流程"""
-    
-    def __init__(self):
-        """初始化"""
-        self.news_collector = NewsCollector()
-        self.topic_extractor = TopicExtractor()
-        self.db_manager = DatabaseManager()
-        
-        print("BroadTopicExtraction 初始化完成")
-    
-    def close(self):
-        """关闭资源"""
-        if self.news_collector:
-            self.news_collector.close()
-        if self.db_manager:
-            self.db_manager.close()
-    
-    def __enter__(self):
-        return self
-    
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-    
-    async def __aenter__(self):
-        return self
-    
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-    
-    async def run_daily_extraction(self, 
-                                  news_sources: Optional[List[str]] = None,
-                                  max_keywords: int = 100) -> Dict:
-        """
-        运行每日话题提取流程
-        
-        Args:
-            news_sources: 新闻源列表，None表示使用所有支持的源
-            max_keywords: 最大关键词数量
-            
-        Returns:
-            包含完整提取结果的字典
-        """
-        print("\n" + "=" * 80)
-        print("MindSpider AI爬虫 - 每日话题提取")
-        print("=" * 80)
-        print(f"执行时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        print(f"目标日期: {date.today()}")
-        
-        if news_sources:
-            print(f"指定平台: {len(news_sources)} 个")
-            for source in news_sources:
-                source_name = SOURCE_NAMES.get(source, source)
-                print(f"  - {source_name}")
-        else:
-            print(f"爬取平台: 全部 {len(SOURCE_NAMES)} 个平台")
-        
-        print(f"关键词数: 最多 {max_keywords} 个")
-        
-        extraction_result = {
-            'success': False,
-            'extraction_date': date.today().isoformat(),
-            'start_time': datetime.now().isoformat(),
-            'news_collection': {},
-            'topic_extraction': {},
-            'database_save': {},
-            'error': None
-        }
-        
-        try:
-            # 步骤1: 收集新闻
-            print("\n【步骤1】收集热点新闻...")
-            news_result = await self.news_collector.collect_and_save_news(
-                sources=news_sources
-            )
-            
-            extraction_result['news_collection'] = {
-                'success': news_result['success'],
-                'total_news': news_result.get('total_news', 0),
-                'successful_sources': news_result.get('successful_sources', 0),
-                'total_sources': news_result.get('total_sources', 0)
-            }
-            
-            if not news_result['success'] or not news_result['news_list']:
-                raise Exception("新闻收集失败或没有获取到新闻")
-            
-            # 步骤2: 提取关键词和生成总结
-            print("\n【步骤2】提取关键词和生成总结...")
-            keywords, summary = self.topic_extractor.extract_keywords_and_summary(
-                news_result['news_list'], 
-                max_keywords=max_keywords
-            )
-            
-            extraction_result['topic_extraction'] = {
-                'success': len(keywords) > 0,
-                'keywords_count': len(keywords),
-                'keywords': keywords,
-                'summary': summary
-            }
-            
-            if not keywords:
-                print("警告: 没有提取到有效关键词")
-            
-            # 步骤3: 保存到数据库
-            print("\n【步骤3】保存分析结果到数据库...")
-            save_success = self.db_manager.save_daily_topics(
-                keywords, summary, date.today()
-            )
-            
-            extraction_result['database_save'] = {
-                'success': save_success
-            }
-            
-            extraction_result['success'] = True
-            extraction_result['end_time'] = datetime.now().isoformat()
-            
-            print("\n" + "=" * 80)
-            print("每日话题提取流程完成!")
-            print("=" * 80)
-            
-            return extraction_result
-            
-        except Exception as e:
-            print(f"\n话题提取流程失败: {e}")
-            extraction_result['error'] = str(e)
-            extraction_result['end_time'] = datetime.now().isoformat()
-            return extraction_result
-    
-    def print_extraction_results(self, extraction_result: Dict):
-        """打印提取结果"""
-        print("\n" + "=" * 80)
-        print("话题提取结果报告")
-        print("=" * 80)
-        
-        if not extraction_result['success']:
-            print(f"❌ 提取失败: {extraction_result.get('error', '未知错误')}")
-            return
-        
-        # 新闻收集结果
-        news_data = extraction_result.get('news_collection', {})
-        print(f"📰 新闻收集: {news_data.get('total_news', 0)} 条新闻")
-        print(f"   成功源数: {news_data.get('successful_sources', 0)}/{news_data.get('total_sources', 0)}")
-        
-        # 话题提取结果
-        topic_data = extraction_result.get('topic_extraction', {})
-        keywords = topic_data.get('keywords', [])
-        summary = topic_data.get('summary', '')
-        
-        print(f"\n🔑 提取关键词: {len(keywords)} 个")
-        if keywords:
-            # 每行显示5个关键词
-            for i in range(0, len(keywords), 5):
-                keyword_group = keywords[i:i+5]
-                print(f"   {', '.join(keyword_group)}")
-        
-        print(f"\n📝 新闻总结:")
-        print(f"   {summary}")
-        
-        # 数据库保存结果
-        db_data = extraction_result.get('database_save', {})
-        if db_data.get('success'):
-            print(f"\n💾 数据库保存: 成功")
-        else:
-            print(f"\n💾 数据库保存: 失败")
-        
-        print("\n" + "=" * 80)
-    
-    def get_keywords_for_crawling(self, extract_date: date = None) -> List[str]:
-        """
-        获取用于爬取的关键词列表
-        
-        Args:
-            extract_date: 提取日期，默认为今天
-            
-        Returns:
-            关键词列表
-        """
-        try:
-            # 从数据库获取话题分析
-            topics_data = self.db_manager.get_daily_topics(extract_date)
-            
-            if not topics_data:
-                print(f"没有找到 {extract_date or date.today()} 的话题数据")
-                return []
-            
-            keywords = topics_data['keywords']
-            
-            # 生成搜索关键词
-            search_keywords = self.topic_extractor.get_search_keywords(keywords)
-            
-            print(f"准备了 {len(search_keywords)} 个关键词用于爬取")
-            return search_keywords
-            
-        except Exception as e:
-            print(f"获取爬取关键词失败: {e}")
-            return []
-    
-    def get_daily_analysis(self, target_date: date = None) -> Optional[Dict]:
-        """获取指定日期的分析结果"""
-        try:
-            return self.db_manager.get_daily_topics(target_date)
-        except Exception as e:
-            print(f"获取每日分析失败: {e}")
-            return None
-    
-    def get_recent_analysis(self, days: int = 7) -> List[Dict]:
-        """获取最近几天的分析结果"""
-        try:
-            return self.db_manager.get_recent_topics(days)
-        except Exception as e:
-            print(f"获取最近分析失败: {e}")
-            return []
-
-# ==================== 命令行工具 ====================
-
-async def run_extraction_command(sources=None, keywords_count=100, show_details=True):
-    """运行话题提取命令"""
-    
-    try:
-        async with BroadTopicExtraction() as extractor:
-            # 运行话题提取
-            result = await extractor.run_daily_extraction(
-                news_sources=sources,
-                max_keywords=keywords_count
-            )
-            
-            if result['success']:
-                if show_details:
-                    # 显示详细结果
-                    extractor.print_extraction_results(result)
-                else:
-                    # 只显示简要结果
-                    news_data = result.get('news_collection', {})
-                    topic_data = result.get('topic_extraction', {})
-                    
-                    print(f"✅ 话题提取成功完成!")
-                    print(f"   收集新闻: {news_data.get('total_news', 0)} 条")
-                    print(f"   提取关键词: {len(topic_data.get('keywords', []))} 个")
-                    print(f"   生成总结: {len(topic_data.get('summary', ''))} 字符")
-                
-                # 获取爬取关键词
-                crawling_keywords = extractor.get_keywords_for_crawling()
-                
-                if crawling_keywords:
-                    print(f"\n🔑 为DeepSentimentCrawling准备的搜索关键词:")
-                    print(f"   {', '.join(crawling_keywords)}")
-                    
-                    # 保存关键词到文件
-                    keywords_file = project_root / "data" / "daily_keywords.txt"
-                    keywords_file.parent.mkdir(exist_ok=True)
-                    
-                    with open(keywords_file, 'w', encoding='utf-8') as f:
-                        f.write('\n'.join(crawling_keywords))
-                    
-                    print(f"   关键词已保存到: {keywords_file}")
-                
-                return True
-                
-            else:
-                print(f"❌ 话题提取失败: {result.get('error', '未知错误')}")
-                return False
-                
-    except Exception as e:
-        print(f"❌ 执行过程中发生错误: {e}")
-        return False
-
-def main():
-    """主函数"""
-    parser = argparse.ArgumentParser(description="MindSpider每日话题提取工具")
-    parser.add_argument("--sources", nargs="+", help="指定新闻源平台", 
-                       choices=list(SOURCE_NAMES.keys()))
-    parser.add_argument("--keywords", type=int, default=100, help="最大关键词数量 (默认100)")
-    parser.add_argument("--quiet", action="store_true", help="简化输出模式")
-    parser.add_argument("--list-sources", action="store_true", help="显示支持的新闻源")
-    
-    args = parser.parse_args()
-    
-    # 显示支持的新闻源
-    if args.list_sources:
-        print("支持的新闻源平台:")
-        for source, name in SOURCE_NAMES.items():
-            print(f"  {source:<25} {name}")
-        return
-    
-    # 验证参数
-    if args.keywords < 1 or args.keywords > 200:
-        print("关键词数量应在1-200之间")
-        sys.exit(1)
-    
-    # 运行提取
-    try:
-        success = asyncio.run(run_extraction_command(
-            sources=args.sources,
-            keywords_count=args.keywords,
-            show_details=not args.quiet
-        ))
-        
-        sys.exit(0 if success else 1)
-        
-    except KeyboardInterrupt:
-        print("\n用户中断操作")
-        sys.exit(1)
-
-if __name__ == "__main__":
-    main()
@@ -1,288 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-BroadTopicExtraction模块 - 话题提取器
-基于DeepSeek直接提取关键词和生成新闻总结
-"""
-
-import sys
-import json
-import re
-from pathlib import Path
-from typing import List, Dict, Tuple
-from openai import OpenAI
-
-# 添加项目根目录到路径
-project_root = Path(__file__).parent.parent
-sys.path.append(str(project_root))
-
-try:
-    import config
-except ImportError:
-    raise ImportError("无法导入config.py配置文件")
-
-class TopicExtractor:
-    """话题提取器"""
-    
-    def __init__(self):
-        """初始化话题提取器"""
-        self.client = OpenAI(
-            api_key=config.DEEPSEEK_API_KEY,
-            base_url="https://api.deepseek.com"
-        )
-        self.model = "deepseek-chat"
-    
-    def extract_keywords_and_summary(self, news_list: List[Dict], max_keywords: int = 100) -> Tuple[List[str], str]:
-        """
-        从新闻列表中提取关键词和生成总结
-        
-        Args:
-            news_list: 新闻列表
-            max_keywords: 最大关键词数量
-            
-        Returns:
-            (关键词列表, 新闻分析总结)
-        """
-        if not news_list:
-            return [], "今日暂无热点新闻"
-        
-        # 构建新闻摘要文本
-        news_text = self._build_news_summary(news_list)
-        
-        # 构建提示词
-        prompt = self._build_analysis_prompt(news_text, max_keywords)
-        
-        try:
-            # 调用DeepSeek API
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[
-                    {"role": "system", "content": "你是一个专业的新闻分析师，擅长从热点新闻中提取关键词和撰写分析总结。"},
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=1500,
-                temperature=0.3
-            )
-            
-            # 解析返回结果
-            result_text = response.choices[0].message.content
-            keywords, summary = self._parse_analysis_result(result_text)
-            
-            print(f"成功提取 {len(keywords)} 个关键词并生成新闻总结")
-            return keywords[:max_keywords], summary
-            
-        except Exception as e:
-            print(f"话题提取失败: {e}")
-            # 返回简单的fallback结果
-            fallback_keywords = self._extract_simple_keywords(news_list)
-            fallback_summary = f"今日共收集到 {len(news_list)} 条热点新闻，涵盖多个平台的热门话题。"
-            return fallback_keywords[:max_keywords], fallback_summary
-    
-    def _build_news_summary(self, news_list: List[Dict]) -> str:
-        """构建新闻摘要文本"""
-        news_items = []
-        
-        for i, news in enumerate(news_list, 1):
-            title = news.get('title', '无标题')
-            source = news.get('source_platform', news.get('source', '未知'))
-            
-            # 清理标题中的特殊字符
-            title = re.sub(r'[#@]', '', title).strip()
-            
-            news_items.append(f"{i}. 【{source}】{title}")
-        
-        return "\n".join(news_items)
-    
-    def _build_analysis_prompt(self, news_text: str, max_keywords: int) -> str:
-        """构建分析提示词"""
-        news_count = len(news_text.split('\n'))
-        
-        prompt = f"""
-请分析以下{news_count}条今日热点新闻，完成两个任务：
-
-新闻列表：
-{news_text}
-
-任务1：提取关键词（最多{max_keywords}个）
- 提取能代表今日热点话题的关键词
- 关键词应该适合用于社交媒体平台搜索
- 优先选择热度高、讨论量大的话题
- 避免过于宽泛或过于具体的词汇
-
-任务2：撰写新闻分析总结（150-300字）
- 简要概括今日热点新闻的主要内容
- 指出当前社会关注的重点话题方向
- 分析这些热点反映的社会现象或趋势
- 语言简洁明了，客观中性
-
-请严格按照以下JSON格式输出：
-```json
-{{
-  "keywords": ["关键词1", "关键词2", "关键词3"],
-  "summary": "今日新闻分析总结内容..."
-}}
-```
-
-请直接输出JSON格式的结果，不要包含其他文字说明。
-"""
-        return prompt
-    
-    def _parse_analysis_result(self, result_text: str) -> Tuple[List[str], str]:
-        """解析分析结果"""
-        try:
-            # 尝试提取JSON部分
-            json_match = re.search(r'```json\s*(.*?)\s*```', result_text, re.DOTALL)
-            if json_match:
-                json_text = json_match.group(1)
-            else:
-                # 如果没有代码块，尝试直接解析
-                json_text = result_text.strip()
-            
-            # 解析JSON
-            data = json.loads(json_text)
-            
-            keywords = data.get('keywords', [])
-            summary = data.get('summary', '')
-            
-            # 验证和清理关键词
-            clean_keywords = []
-            for keyword in keywords:
-                keyword = str(keyword).strip()
-                if keyword and len(keyword) > 1 and keyword not in clean_keywords:
-                    clean_keywords.append(keyword)
-            
-            # 验证总结
-            if not summary or len(summary.strip()) < 10:
-                summary = "今日热点新闻涵盖多个领域，反映了当前社会的多元化关注点。"
-            
-            return clean_keywords, summary.strip()
-            
-        except json.JSONDecodeError as e:
-            print(f"解析JSON失败: {e}")
-            print(f"原始返回: {result_text}")
-            
-            # 尝试手动解析
-            return self._manual_parse_result(result_text)
-        
-        except Exception as e:
-            print(f"处理分析结果失败: {e}")
-            return [], "分析结果处理失败，请稍后重试。"
-    
-    def _manual_parse_result(self, text: str) -> Tuple[List[str], str]:
-        """手动解析结果（当JSON解析失败时的后备方案）"""
-        print("尝试手动解析结果...")
-        
-        keywords = []
-        summary = ""
-        
-        lines = text.split('\n')
-        
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            
-            # 寻找关键词
-            if '关键词' in line or 'keywords' in line.lower():
-                # 提取关键词
-                keyword_match = re.findall(r'[""](.*?)["""]', line)
-                if keyword_match:
-                    keywords.extend(keyword_match)
-                else:
-                    # 尝试其他分隔符
-                    parts = re.split(r'[,，、]', line)
-                    for part in parts:
-                        clean_part = re.sub(r'[关键词：:keywords\[\]"]', '', part).strip()
-                        if clean_part and len(clean_part) > 1:
-                            keywords.append(clean_part)
-            
-            # 寻找总结
-            elif '总结' in line or '分析' in line or 'summary' in line.lower():
-                if '：' in line or ':' in line:
-                    summary = line.split('：')[-1].split(':')[-1].strip()
-            
-            # 如果这一行看起来像总结内容
-            elif len(line) > 50 and ('今日' in line or '热点' in line or '新闻' in line):
-                if not summary:
-                    summary = line
-        
-        # 清理关键词
-        clean_keywords = []
-        for keyword in keywords:
-            keyword = keyword.strip()
-            if keyword and len(keyword) > 1 and keyword not in clean_keywords:
-                clean_keywords.append(keyword)
-        
-        # 如果没有找到总结，生成一个简单的
-        if not summary:
-            summary = "今日热点新闻内容丰富，涵盖了社会各个层面的关注点。"
-        
-        return clean_keywords[:max_keywords], summary
-    
-    def _extract_simple_keywords(self, news_list: List[Dict]) -> List[str]:
-        """简单关键词提取（fallback方案）"""
-        keywords = []
-        
-        for news in news_list:
-            title = news.get('title', '')
-            
-            # 简单的关键词提取
-            # 移除常见的无意义词汇
-            title_clean = re.sub(r'[#@【】\[\]()（）]', ' ', title)
-            words = title_clean.split()
-            
-            for word in words:
-                word = word.strip()
-                if (len(word) > 1 and 
-                    word not in ['的', '了', '在', '和', '与', '或', '但', '是', '有', '被', '将', '已', '正在'] and
-                    word not in keywords):
-                    keywords.append(word)
-        
-        return keywords[:10]
-    
-    def get_search_keywords(self, keywords: List[str], limit: int = 10) -> List[str]:
-        """
-        获取用于搜索的关键词
-        
-        Args:
-            keywords: 关键词列表
-            limit: 限制数量
-            
-        Returns:
-            适合搜索的关键词列表
-        """
-        # 过滤和优化关键词
-        search_keywords = []
-        
-        for keyword in keywords:
-            keyword = str(keyword).strip()
-            
-            # 过滤条件
-            if (len(keyword) > 1 and 
-                len(keyword) < 20 and  # 不能太长
-                keyword not in search_keywords and
-                not keyword.isdigit() and  # 不是纯数字
-                not re.match(r'^[a-zA-Z]+$', keyword)):  # 不是纯英文（除非是专有名词）
-                
-                search_keywords.append(keyword)
-        
-        return search_keywords[:limit]
-
-if __name__ == "__main__":
-    # 测试话题提取器
-    extractor = TopicExtractor()
-    
-    # 模拟新闻数据
-    test_news = [
-        {"title": "AI技术发展迅速", "source_platform": "科技新闻"},
-        {"title": "股市行情分析", "source_platform": "财经新闻"},
-        {"title": "明星最新动态", "source_platform": "娱乐新闻"}
-    ]
-    
-    keywords, summary = extractor.extract_keywords_and_summary(test_news)
-    
-    print(f"提取的关键词: {keywords}")
-    print(f"新闻总结: {summary}")
-    
-    search_keywords = extractor.get_search_keywords(keywords)
-    print(f"搜索关键词: {search_keywords}")