1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 3ad807778c
commit 96c7d2d3b7
11 changed files with 737 additions and 574 deletions
+117 -102
View File
@@ -11,8 +11,13 @@ import argparse
from datetime import date, datetime
from pathlib import Path
import subprocess
import asyncio
import pymysql
from pymysql.cursors import DictCursor
from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
from sqlalchemy import inspect, text
from config import settings
from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent
@@ -21,8 +26,8 @@ sys.path.append(str(project_root))
try:
import config
except ImportError:
print("错误:无法导入config.py配置文件")
print("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息")
logger.error("错误:无法导入config.py配置文件")
logger.error("请确保项目根目录下存在config.py文件,并包含数据库和API配置信息")
sys.exit(1)
class MindSpider:
@@ -35,99 +40,110 @@ class MindSpider:
self.deep_sentiment_path = self.project_root / "DeepSentimentCrawling"
self.schema_path = self.project_root / "schema"
print("MindSpider AI爬虫项目")
print(f"项目路径: {self.project_root}")
logger.info("MindSpider AI爬虫项目")
logger.info(f"项目路径: {self.project_root}")
def check_config(self) -> bool:
    """Verify that every required setting is present and non-empty.

    Returns:
        True when each required name exists on ``settings`` with a truthy
        value; False otherwise, after logging the names that are missing.
    """
    logger.info("检查基础配置...")

    # Names that must be populated on the settings object.
    required_configs = (
        'DB_HOST', 'DB_PORT', 'DB_USER', 'DB_PASSWORD', 'DB_NAME', 'DB_CHARSET',
        'MINDSPIDER_API_KEY', 'MINDSPIDER_BASE_URL', 'MINDSPIDER_MODEL_NAME',
    )

    # A setting counts as missing when the attribute is absent or falsy.
    missing_configs = [
        name for name in required_configs
        if not getattr(settings, name, None)
    ]

    if not missing_configs:
        logger.info("基础配置检查通过")
        return True

    logger.error(f"配置缺失: {', '.join(missing_configs)}")
    logger.error("请检查config.py文件中的配置信息")
    return False
def check_database_connection(self) -> bool:
    """Check that the configured database accepts a connection.

    Builds an async SQLAlchemy URL from ``settings`` (``postgresql+asyncpg``
    when ``DB_DIALECT`` is "postgresql", otherwise ``mysql+asyncmy``), opens
    one connection and executes ``SELECT 1``.

    Returns:
        True if the probe query succeeds; False if any exception is raised
        (the exception is logged with its traceback).
    """
    from urllib.parse import quote

    logger.info("检查数据库连接...")

    def build_async_url() -> str:
        # Percent-encode the credentials: characters such as '@', ':', '/'
        # or '%' in a user name or password would otherwise be read as URL
        # delimiters and produce a wrong host or password.
        user = quote(str(settings.DB_USER), safe="")
        password = quote(str(settings.DB_PASSWORD), safe="")
        location = f"{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"

        dialect = (settings.DB_DIALECT or "mysql").lower()
        if dialect == "postgresql":
            return f"postgresql+asyncpg://{user}:{password}@{location}"
        # Any other dialect falls back to the MySQL async driver asyncmy.
        return f"mysql+asyncmy://{user}:{password}@{location}?charset={settings.DB_CHARSET}"

    async def _test_connection(db_url: str) -> None:
        engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
        try:
            async with engine.connect() as conn:
                await conn.execute(text("SELECT 1"))
        finally:
            # Always dispose the engine, even when the probe fails.
            await engine.dispose()

    try:
        db_url: str = build_async_url()
        asyncio.run(_test_connection(db_url))
        logger.info("数据库连接正常")
        return True
    except Exception as e:
        logger.exception(f"数据库连接失败: {e}")
        return False
def check_database_tables(self) -> bool:
    """Check that the core tables exist in the configured database.

    Connects through an async SQLAlchemy engine, lists the table names via
    ``inspect(...).get_table_names()`` and verifies that ``daily_news`` and
    ``daily_topics`` are among them.

    Returns:
        True if both required tables exist; False if any is missing or if
        an exception is raised while checking (both cases are logged).
    """
    from urllib.parse import quote

    logger.info("检查数据库表...")

    def build_async_url() -> str:
        # Percent-encode the credentials: characters such as '@', ':', '/'
        # or '%' in a user name or password would otherwise be read as URL
        # delimiters and produce a wrong host or password.
        user = quote(str(settings.DB_USER), safe="")
        password = quote(str(settings.DB_PASSWORD), safe="")
        location = f"{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"

        dialect = (settings.DB_DIALECT or "mysql").lower()
        if dialect == "postgresql":
            return f"postgresql+asyncpg://{user}:{password}@{location}"
        # Any other dialect falls back to the MySQL async driver asyncmy.
        return f"mysql+asyncmy://{user}:{password}@{location}?charset={settings.DB_CHARSET}"

    async def _check_tables(db_url: str) -> list[str]:
        engine: AsyncEngine = create_async_engine(db_url, pool_pre_ping=True)
        try:
            async with engine.connect() as conn:
                # inspect() needs a synchronous connection, so the lookup
                # is routed through run_sync.
                def _get_tables(sync_conn):
                    return inspect(sync_conn).get_table_names()

                return await conn.run_sync(_get_tables)
        finally:
            # Always dispose the engine, even when the lookup fails.
            await engine.dispose()

    try:
        db_url: str = build_async_url()
        existing_tables = asyncio.run(_check_tables(db_url))

        # Core tables that must be present.
        required_tables = ['daily_news', 'daily_topics']
        missing_tables = [t for t in required_tables if t not in existing_tables]

        if missing_tables:
            logger.error(f"缺少数据库表: {', '.join(missing_tables)}")
            return False

        logger.info("数据库表检查通过")
        return True
    except Exception as e:
        logger.exception(f"检查数据库表失败: {e}")
        return False
def initialize_database(self) -> bool:
"""初始化数据库"""
print("\n初始化数据库...")
logger.info("初始化数据库...")
try:
# 运行数据库初始化脚本
init_script = self.schema_path / "init_database.py"
if not init_script.exists():
print("错误:找不到数据库初始化脚本")
logger.error("错误:找不到数据库初始化脚本")
return False
result = subprocess.run(
@@ -138,19 +154,19 @@ class MindSpider:
)
if result.returncode == 0:
print("数据库初始化成功")
logger.info("数据库初始化成功")
return True
else:
print(f"数据库初始化失败: {result.stderr}")
logger.error(f"数据库初始化失败: {result.stderr}")
return False
except Exception as e:
print(f"数据库初始化异常: {e}")
logger.exception(f"数据库初始化异常: {e}")
return False
def check_dependencies(self) -> bool:
"""检查依赖环境"""
print("\n检查依赖环境...")
logger.info("检查依赖环境...")
# 检查Python包
required_packages = ['pymysql', 'requests', 'playwright']
@@ -163,22 +179,22 @@ class MindSpider:
missing_packages.append(package)
if missing_packages:
print(f"缺少Python包: {', '.join(missing_packages)}")
print("请运行: pip install -r requirements.txt")
logger.error(f"缺少Python包: {', '.join(missing_packages)}")
logger.info("请运行: pip install -r requirements.txt")
return False
# 检查MediaCrawler依赖
mediacrawler_path = self.deep_sentiment_path / "MediaCrawler"
if not mediacrawler_path.exists():
print("错误:找不到MediaCrawler目录")
logger.error("错误:找不到MediaCrawler目录")
return False
print("依赖环境检查通过")
logger.info("依赖环境检查通过")
return True
def run_broad_topic_extraction(self, extract_date: date = None, keywords_count: int = 100) -> bool:
"""运行BroadTopicExtraction模块"""
print(f"\n运行BroadTopicExtraction模块...")
logger.info("运行BroadTopicExtraction模块...")
if not extract_date:
extract_date = date.today()
@@ -186,11 +202,10 @@ class MindSpider:
try:
cmd = [
sys.executable, "main.py",
"--date", extract_date.strftime("%Y-%m-%d"),
"--keywords", str(keywords_count)
]
print(f"执行命令: {' '.join(cmd)}")
logger.info(f"执行命令: {' '.join(cmd)}")
result = subprocess.run(
cmd,
@@ -199,24 +214,24 @@ class MindSpider:
)
if result.returncode == 0:
print("BroadTopicExtraction模块执行成功")
logger.info("BroadTopicExtraction模块执行成功")
return True
else:
print(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}")
logger.error(f"BroadTopicExtraction模块执行失败,返回码: {result.returncode}")
return False
except subprocess.TimeoutExpired:
print("BroadTopicExtraction模块执行超时")
logger.error("BroadTopicExtraction模块执行超时")
return False
except Exception as e:
print(f"BroadTopicExtraction模块执行异常: {e}")
logger.exception(f"BroadTopicExtraction模块执行异常: {e}")
return False
def run_deep_sentiment_crawling(self, target_date: date = None, platforms: list = None,
max_keywords: int = 50, max_notes: int = 50,
test_mode: bool = False) -> bool:
"""运行DeepSentimentCrawling模块"""
print(f"\n运行DeepSentimentCrawling模块...")
logger.info("运行DeepSentimentCrawling模块...")
if not target_date:
target_date = date.today()
@@ -238,7 +253,7 @@ class MindSpider:
if test_mode:
cmd.append("--test")
print(f"执行命令: {' '.join(cmd)}")
logger.info(f"执行命令: {' '.join(cmd)}")
result = subprocess.run(
cmd,
@@ -247,78 +262,78 @@ class MindSpider:
)
if result.returncode == 0:
print("DeepSentimentCrawling模块执行成功")
logger.info("DeepSentimentCrawling模块执行成功")
return True
else:
print(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}")
logger.error(f"DeepSentimentCrawling模块执行失败,返回码: {result.returncode}")
return False
except subprocess.TimeoutExpired:
print("DeepSentimentCrawling模块执行超时")
logger.error("DeepSentimentCrawling模块执行超时")
return False
except Exception as e:
print(f"DeepSentimentCrawling模块执行异常: {e}")
logger.exception(f"DeepSentimentCrawling模块执行异常: {e}")
return False
def run_complete_workflow(self, target_date: date = None, platforms: list = None,
                          keywords_count: int = 100, max_keywords: int = 50,
                          max_notes: int = 50, test_mode: bool = False) -> bool:
    """Run topic extraction followed by sentiment crawling.

    Args:
        target_date: Date to process; today's date is used when falsy.
        platforms: Platforms passed to the crawling step; a falsy value is
            logged as "all supported platforms".
        keywords_count: Forwarded to ``run_broad_topic_extraction``.
        max_keywords: Forwarded to ``run_deep_sentiment_crawling``.
        max_notes: Forwarded to ``run_deep_sentiment_crawling``.
        test_mode: Forwarded to ``run_deep_sentiment_crawling``.

    Returns:
        True only if both steps report success; False as soon as one fails.
    """
    logger.info("开始完整的MindSpider工作流程")

    target_date = target_date or date.today()

    logger.info(f"目标日期: {target_date}")
    logger.info(f"平台列表: {platforms if platforms else '所有支持的平台'}")
    logger.info(f"测试模式: {'' if test_mode else ''}")

    # Step 1: topic extraction. A failure here aborts the workflow.
    logger.info("=== 第一步:话题提取 ===")
    extraction_ok = self.run_broad_topic_extraction(target_date, keywords_count)
    if not extraction_ok:
        logger.error("话题提取失败,终止流程")
        return False

    # Step 2: sentiment crawling, run only after step 1 succeeded.
    logger.info("=== 第二步:情感爬取 ===")
    crawling_ok = self.run_deep_sentiment_crawling(
        target_date, platforms, max_keywords, max_notes, test_mode
    )
    if not crawling_ok:
        logger.error("情感爬取失败,但话题提取已完成")
        return False

    logger.info("完整工作流程执行成功!")
    return True
def show_status(self):
    """Log a status report for configuration, database, dependencies and modules.

    The database connection is probed only when the configuration check
    passes, and the table check runs only when the connection succeeds.
    """
    logger.info("MindSpider项目状态:")
    logger.info(f"项目路径: {self.project_root}")

    # Configuration status.
    config_ok = self.check_config()
    config_label = '正常' if config_ok else '异常'
    logger.info(f"配置状态: {config_label}")

    # Database status: each check depends on the previous one passing.
    if config_ok:
        db_conn_ok = self.check_database_connection()
        conn_label = '正常' if db_conn_ok else '异常'
        logger.info(f"数据库连接: {conn_label}")

        if db_conn_ok:
            db_tables_ok = self.check_database_tables()
            tables_label = '正常' if db_tables_ok else '需要初始化'
            logger.info(f"数据库表: {tables_label}")

    # Dependency status.
    deps_ok = self.check_dependencies()
    deps_label = '正常' if deps_ok else '异常'
    logger.info(f"依赖环境: {deps_label}")

    # Module status: report whether each module directory exists.
    broad_label = '存在' if self.broad_topic_path.exists() else '缺失'
    deep_label = '存在' if self.deep_sentiment_path.exists() else '缺失'
    logger.info(f"BroadTopicExtraction模块: {broad_label}")
    logger.info(f"DeepSentimentCrawling模块: {deep_label}")
def setup_project(self) -> bool:
"""项目初始化设置"""
print(f"\n开始MindSpider项目初始化...")
logger.info("开始MindSpider项目初始化...")
# 1. 检查配置
if not self.check_config():
@@ -334,11 +349,11 @@ class MindSpider:
# 4. 检查并初始化数据库表
if not self.check_database_tables():
print("需要初始化数据库表...")
logger.info("需要初始化数据库表...")
if not self.initialize_database():
return False
print(f"\nMindSpider项目初始化完成!")
logger.info("MindSpider项目初始化完成!")
return True
def main():
@@ -373,7 +388,7 @@ def main():
try:
target_date = datetime.strptime(args.date, "%Y-%m-%d").date()
except ValueError:
print("错误:日期格式不正确,请使用 YYYY-MM-DD 格式")
logger.error("错误:日期格式不正确,请使用 YYYY-MM-DD 格式")
return
# 创建MindSpider实例
@@ -388,17 +403,17 @@ def main():
# 项目设置
if args.setup:
if spider.setup_project():
print("项目设置完成,可以开始使用MindSpider!")
logger.info("项目设置完成,可以开始使用MindSpider!")
else:
print("项目设置失败,请检查配置和环境")
logger.error("项目设置失败,请检查配置和环境")
return
# 初始化数据库
if args.init_db:
if spider.initialize_database():
print("数据库初始化成功")
logger.info("数据库初始化成功")
else:
print("数据库初始化失败")
logger.error("数据库初始化失败")
return
# 运行模块
@@ -415,16 +430,16 @@ def main():
)
else:
# 默认运行完整工作流程
print("运行完整MindSpider工作流程...")
logger.info("运行完整MindSpider工作流程...")
spider.run_complete_workflow(
target_date, args.platforms, args.keywords_count,
args.max_keywords, args.max_notes, args.test
)
except KeyboardInterrupt:
print("\n用户中断操作")
logger.info("用户中断操作")
except Exception as e:
print(f"\n执行出错: {e}")
logger.exception(f"执行出错: {e}")
if __name__ == "__main__":
main()