1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
2025-11-03 22:38:34 +08:00
parent 3ad807778c
commit 96c7d2d3b7
11 changed files with 737 additions and 574 deletions
@@ -11,8 +11,13 @@ import argparse
 from datetime import date, datetime
 from pathlib import Path
 import subprocess
+import asyncio
 import pymysql
 from pymysql.cursors import DictCursor
+from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
+from sqlalchemy import inspect, text
+from config import settings
+from loguru import logger

 # 添加项目根目录到路径
 project_root = Path(__file__).parent
@@ -21,8 +26,8 @@ sys.path.append(str(project_root))
 try:
    import config
 except ImportError:
-    print("错误：无法导入config.py配置文件")
-    print("请确保项目根目录下存在config.py文件，并包含数据库和API配置信息")
+    logger.error("错误：无法导入config.py配置文件")
+    logger.error("请确保项目根目录下存在config.py文件，并包含数据库和API配置信息")
    sys.exit(1)

 class MindSpider:
@@ -35,99 +40,110 @@ class MindSpider:
        self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling"
        self.schema_path = self.project_root / "schema"
        
-        print("MindSpider AI爬虫项目")
-        print(f"项目路径: {self.project_root}")
+        logger.info("MindSpider AI爬虫项目")
+        logger.info(f"项目路径: {self.project_root}")
    
    def check_config(self) -> bool:
        """检查基础配置"""
-        print("\n检查基础配置...")
+        logger.info("检查基础配置...")
        
-        # 检查config.py配置项
+        # 检查settings配置项
        required_configs = [
            'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET',
-            'DEEPSEEK_API_KEY'
+            'MINDSPIDER_API_KEY', 'MINDSPIDER_BASE_URL', 'MINDSPIDER_MODEL_NAME'
        ]
        
        missing_configs = []
        for config_name in required_configs:
-            if not hasattr(config, config_name) or not getattr(config, config_name):
+            if not hasattr(settings, config_name) or not getattr(settings, config_name):
                missing_configs.append(config_name)
        
        if missing_configs:
-            print(f"配置缺失: {', '.join(missing_configs)}")
-            print("请检查config.py文件中的配置信息")
+            logger.error(f"配置缺失: {', '.join(missing_configs)}")
+            logger.error("请检查config.py文件中的配置信息")
            return False
        
-        print("基础配置检查通过")
+        logger.info("基础配置检查通过")
        return True
    
    def check_database_connection(self) -> bool:
        """检查数据库连接"""
-        print("\n检查数据库连接...")
+        logger.info("检查数据库连接...")
        
-        try:
-            connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                cursorclass=DictCursor
+        def build_async_url() -> str:
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect == "postgresql":
+                return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            # 默认使用 mysql 异步驱动 asyncmy
+            return (
+                f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}"
+                f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
            )
-            connection.close()
-            print("数据库连接正常")
+
+        async def _test_connection(db_url: str) -> None:
+            engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
+            try:
+                async with engine.connect() as conn:
+                    await conn.execute(text("SELECT 1"))
+            finally:
+                await engine.dispose()
+
+        try:
+            db_url: str = build_async_url()
+            asyncio.run(_test_connection(db_url))
+            logger.info("数据库连接正常")
            return True
        except Exception as e:
-            print(f"数据库连接失败: {e}")
+            logger.exception(f"数据库连接失败: {e}")
            return False
    
    def check_database_tables(self) -> bool:
        """检查数据库表是否存在"""
-        print("\n检查数据库表...")
+        logger.info("检查数据库表...")
        
-        try:
-            connection = pymysql.connect(
-                host=config.DB_HOST,
-                port=config.DB_PORT,
-                user=config.DB_USER,
-                password=config.DB_PASSWORD,
-                database=config.DB_NAME,
-                charset=config.DB_CHARSET,
-                cursorclass=DictCursor
+        def build_async_url() -> str:
+            dialect = (settings.DB_DIALECT or "mysql").lower()
+            if dialect == "postgresql":
+                return f"postgresql+asyncpg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
+            return (
+                f"mysql+asyncmy://{settings.DB_USER}:{settings.DB_PASSWORD}"
+                f"@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
            )
-            
-            cursor = connection.cursor()
-            
-            # 检查核心表是否存在
+
+        async def _check_tables(db_url: str) -> list[str]:
+            engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
+            try:
+                async with engine.connect() as conn:
+                    def _get_tables(sync_conn):
+                        return inspect(sync_conn).get_table_names()
+                    tables = await conn.run_sync(_get_tables)
+                    return tables
+            finally:
+                await engine.dispose()
+
+        try:
+            db_url: str = build_async_url()
+            existing_tables = asyncio.run(_check_tables(db_url))
            required_tables = ['daily_news', 'daily_topics']
-            cursor.execute("SHOW TABLES")
-            existing_tables = [row[f'Tables_in_{config.DB_NAME}'] for row in cursor.fetchall()]
-            
-            missing_tables = [table for table in required_tables if table not in existing_tables]
-            
-            connection.close()
-            
+            missing_tables = [t for t in required_tables if t not in existing_tables]
            if missing_tables:
-                print(f"缺少数据库表: {', '.join(missing_tables)}")
+                logger.error(f"缺少数据库表: {', '.join(missing_tables)}")
                return False
-            else:
-                print("数据库表检查通过")
-                return True
-                
+            logger.info("数据库表检查通过")
+            return True
        except Exception as e:
-            print(f"检查数据库表失败: {e}")
+            logger.exception(f"检查数据库表失败: {e}")
            return False
    
    def initialize_database(self) -> bool:
        """初始化数据库"""
-        print("\n初始化数据库...")
+        logger.info("初始化数据库...")
        
        try:
            # 运行数据库初始化脚本
            init_script = self.schema_path / "init_database.py"
            if not init_script.exists():
-                print("错误：找不到数据库初始化脚本")
+                logger.error("错误：找不到数据库初始化脚本")
                return False
            
            result = subprocess.run(
@@ -138,19 +154,19 @@ class MindSpider:
            )
            
            if result.returncode == 0:
-                print("数据库初始化成功")
+                logger.info("数据库初始化成功")
                return True
            else:
-                print(f"数据库初始化失败: {result.stderr}")
+                logger.error(f"数据库初始化失败: {result.stderr}")
                return False
                
        except Exception as e:
-            print(f"数据库初始化异常: {e}")
+            logger.exception(f"数据库初始化异常: {e}")
            return False
    
    def check_dependencies(self) -> bool:
        """检查依赖环境"""
-        print("\n检查依赖环境...")
+        logger.info("检查依赖环境...")
        
        # 检查Python包
        required_packages = ['pymysql', 'requests', 'playwright']
@@ -163,22 +179,22 @@ class MindSpider:
                missing_packages.append(package)
        
        if missing_packages:
-            print(f"缺少Python包: {', '.join(missing_packages)}")
-            print("请运行: pip install -r requirements.txt")
+            logger.error(f"缺少Python包: {', '.join(missing_packages)}")
+            logger.info("请运行: pip install -r requirements.txt")
            return False
        
        # 检查MediaCrawler依赖
        mediacrawler_path = self.deep_sentiment_path / "MediaCrawler"
        if not mediacrawler_path.exists():
-            print("错误：找不到MediaCrawler目录")
+            logger.error("错误：找不到MediaCrawler目录")
            return False
        
-        print("依赖环境检查通过")
+        logger.info("依赖环境检查通过")
        return True
    
    def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
        """运行BroadTopicExtraction模块"""
-        print(f"\n运行BroadTopicExtraction模块...")
+        logger.info("运行BroadTopicExtraction模块...")
        
        if not extract_date:
            extract_date = date.today()
@@ -186,11 +202,10 @@ class MindSpider:
        try:
            cmd = [
                sys.executable, "main.py",
-                "--date", extract_date.strftime("%Y-%m-%d"),
                "--keywords", str(keywords_count)
            ]
            
-            print(f"执行命令: {' '.join(cmd)}")
+            logger.info(f"执行命令: {' '.join(cmd)}")
            
            result = subprocess.run(
                cmd,
@@ -199,24 +214,24 @@ class MindSpider:
            )
            
            if result.returncode == 0:
-                print("BroadTopicExtraction模块执行成功")
+                logger.info("BroadTopicExtraction模块执行成功")
                return True
            else:
-                print(f"BroadTopicExtraction模块执行失败，返回码: {result.returncode}")
+                logger.error(f"BroadTopicExtraction模块执行失败，返回码: {result.returncode}")
                return False
                
        except subprocess.TimeoutExpired:
-            print("BroadTopicExtraction模块执行超时")
+            logger.error("BroadTopicExtraction模块执行超时")
            return False
        except Exception as e:
-            print(f"BroadTopicExtraction模块执行异常: {e}")
+            logger.exception(f"BroadTopicExtraction模块执行异常: {e}")
            return False
    
    def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None,
                                   max_keywords: int = 50, max_notes: int = 50,
                                   test_mode: bool = False) -> bool:
        """运行DeepSentimentCrawling模块"""
-        print(f"\n运行DeepSentimentCrawling模块...")
+        logger.info("运行DeepSentimentCrawling模块...")
        
        if not target_date:
            target_date = date.today()
@@ -238,7 +253,7 @@ class MindSpider:
            if test_mode:
                cmd.append("--test")
            
-            print(f"执行命令: {' '.join(cmd)}")
+            logger.info(f"执行命令: {' '.join(cmd)}")
            
            result = subprocess.run(
                cmd,
@@ -247,78 +262,78 @@ class MindSpider:
            )
            
            if result.returncode == 0:
-                print("DeepSentimentCrawling模块执行成功")
+                logger.info("DeepSentimentCrawling模块执行成功")
                return True
            else:
-                print(f"DeepSentimentCrawling模块执行失败，返回码: {result.returncode}")
+                logger.error(f"DeepSentimentCrawling模块执行失败，返回码: {result.returncode}")
                return False
                
        except subprocess.TimeoutExpired:
-            print("DeepSentimentCrawling模块执行超时")
+            logger.error("DeepSentimentCrawling模块执行超时")
            return False
        except Exception as e:
-            print(f"DeepSentimentCrawling模块执行异常: {e}")
+            logger.exception(f"DeepSentimentCrawling模块执行异常: {e}")
            return False
    
    def run_complete_workflow(self, target_date: date = None, platforms: list = None,
                             keywords_count: int = 100, max_keywords: int = 50,
                             max_notes: int = 50, test_mode: bool = False) -> bool:
        """运行完整工作流程"""
-        print(f"\n开始完整的MindSpider工作流程")
+        logger.info("开始完整的MindSpider工作流程")
        
        if not target_date:
            target_date = date.today()
        
-        print(f"目标日期: {target_date}")
-        print(f"平台列表: {platforms if platforms else '所有支持的平台'}")
-        print(f"测试模式: {'是' if test_mode else '否'}")
+        logger.info(f"目标日期: {target_date}")
+        logger.info(f"平台列表: {platforms if platforms else '所有支持的平台'}")
+        logger.info(f"测试模式: {'是' if test_mode else '否'}")
        
        # 第一步：运行话题提取
-        print(f"\n=== 第一步：话题提取 ===")
+        logger.info("=== 第一步：话题提取 ===")
        if not self.run_broad_topic_extraction(target_date, keywords_count):
-            print("话题提取失败，终止流程")
+            logger.error("话题提取失败，终止流程")
            return False
        
        # 第二步：运行情感爬取
-        print(f"\n=== 第二步：情感爬取 ===")
+        logger.info("=== 第二步：情感爬取 ===")
        if not self.run_deep_sentiment_crawling(target_date, platforms, max_keywords, max_notes, test_mode):
-            print("情感爬取失败，但话题提取已完成")
+            logger.error("情感爬取失败，但话题提取已完成")
            return False
        
-        print(f"\n完整工作流程执行成功！")
+        logger.info("完整工作流程执行成功！")
        return True
    
    def show_status(self):
        """显示项目状态"""
-        print(f"\nMindSpider项目状态:")
-        print(f"项目路径: {self.project_root}")
+        logger.info("MindSpider项目状态:")
+        logger.info(f"项目路径: {self.project_root}")
        
        # 配置状态
        config_ok = self.check_config()
-        print(f"配置状态: {'正常' if config_ok else '异常'}")
+        logger.info(f"配置状态: {'正常' if config_ok else '异常'}")
        
        # 数据库状态
        if config_ok:
            db_conn_ok = self.check_database_connection()
-            print(f"数据库连接: {'正常' if db_conn_ok else '异常'}")
+            logger.info(f"数据库连接: {'正常' if db_conn_ok else '异常'}")
            
            if db_conn_ok:
                db_tables_ok = self.check_database_tables()
-                print(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}")
+                logger.info(f"数据库表: {'正常' if db_tables_ok else '需要初始化'}")
        
        # 依赖状态
        deps_ok = self.check_dependencies()
-        print(f"依赖环境: {'正常' if deps_ok else '异常'}")
+        logger.info(f"依赖环境: {'正常' if deps_ok else '异常'}")
        
        # 模块状态
        broad_topic_exists = self.broad_topic_path.exists()
        deep_sentiment_exists = self.deep_sentiment_path.exists()
-        print(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}")
-        print(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}")
+        logger.info(f"BroadTopicExtraction模块: {'存在' if broad_topic_exists else '缺失'}")
+        logger.info(f"DeepSentimentCrawling模块: {'存在' if deep_sentiment_exists else '缺失'}")
    
    def setup_project(self) -> bool:
        """项目初始化设置"""
-        print(f"\n开始MindSpider项目初始化...")
+        logger.info("开始MindSpider项目初始化...")
        
        # 1. 检查配置
        if not self.check_config():
@@ -334,11 +349,11 @@ class MindSpider:
        
        # 4. 检查并初始化数据库表
        if not self.check_database_tables():
-            print("需要初始化数据库表...")
+            logger.info("需要初始化数据库表...")
            if not self.initialize_database():
                return False
        
-        print(f"\nMindSpider项目初始化完成！")
+        logger.info("MindSpider项目初始化完成！")
        return True

 def main():
@@ -373,7 +388,7 @@ def main():
        try:
            target_date = datetime.strptime(args.date, "%Y-%m-%d").date()
        except ValueError:
-            print("错误：日期格式不正确，请使用 YYYY-MM-DD 格式")
+            logger.error("错误：日期格式不正确，请使用 YYYY-MM-DD 格式")
            return
    
    # 创建MindSpider实例
@@ -388,17 +403,17 @@ def main():
        # 项目设置
        if args.setup:
            if spider.setup_project():
-                print("项目设置完成，可以开始使用MindSpider！")
+                logger.info("项目设置完成，可以开始使用MindSpider！")
            else:
-                print("项目设置失败，请检查配置和环境")
+                logger.error("项目设置失败，请检查配置和环境")
            return
        
        # 初始化数据库
        if args.init_db:
            if spider.initialize_database():
-                print("数据库初始化成功")
+                logger.info("数据库初始化成功")
            else:
-                print("数据库初始化失败")
+                logger.error("数据库初始化失败")
            return
        
        # 运行模块
@@ -415,16 +430,16 @@ def main():
            )
        else:
            # 默认运行完整工作流程
-            print("运行完整MindSpider工作流程...")
+            logger.info("运行完整MindSpider工作流程...")
            spider.run_complete_workflow(
                target_date, args.platforms, args.keywords_count,
                args.max_keywords, args.max_notes, args.test
            )
    
    except KeyboardInterrupt:
-        print("\n用户中断操作")
+        logger.info("用户中断操作")
    except Exception as e:
-        print(f"\n执行出错: {e}")
+        logger.exception(f"执行出错: {e}")

 if __name__ == "__main__":
    main()