1. 同步MediaCrawler为最新版本

2. 修复数据库 NOT NULL 错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
+32 -31
View File
@@ -9,6 +9,7 @@ import os
import sys
import pymysql
from pathlib import Path
from MindSpider.config import settings
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -26,14 +27,14 @@ def create_database_connection():
"""创建数据库连接"""
try:
connection = pymysql.connect(
host=config.DB_HOST,
port=config.DB_PORT,
user=config.DB_USER,
password=config.DB_PASSWORD,
charset=config.DB_CHARSET,
host=settings.db_host,
port=settings.db_port,
user=settings.db_user,
password=settings.db_password,
charset=settings.db_charset,
autocommit=True
)
print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}")
print(f"成功连接到MySQL服务器: {settings.db_host}:{settings.db_port}")
return connection
except Exception as e:
print(f"连接数据库失败: {e}")
@@ -43,9 +44,9 @@ def create_database(connection):
"""创建数据库"""
try:
cursor = connection.cursor()
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{config.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
cursor.execute(f"USE `{config.DB_NAME}`")
print(f"数据库 '{config.DB_NAME}' 创建/选择成功")
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
cursor.execute(f"USE `{settings.db_name}`")
print(f"数据库 '{settings.db_name}' 创建/选择成功")
return True
except Exception as e:
print(f"创建数据库失败: {e}")
@@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""):
if not os.path.exists(sql_file_path):
print(f"警告: SQL文件不存在: {sql_file_path}")
return False
try:
cursor = connection.cursor()
with open(sql_file_path, 'r', encoding='utf-8') as f:
sql_content = f.read()
# 分割SQL语句(简单实现,按分号分割)
sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()]
success_count = 0
error_count = 0
for stmt in sql_statements:
if not stmt or stmt.startswith('--'):
continue
@@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""):
except Exception as e:
error_count += 1
print(f"执行SQL语句失败: {str(e)[:100]}...")
print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
return error_count == 0
except Exception as e:
print(f"执行SQL文件失败 {sql_file_path}: {e}")
return False
@@ -90,44 +91,44 @@ def main():
print("=" * 60)
print("MindSpider AI爬虫项目 - 数据库初始化")
print("=" * 60)
# 检查配置
print("检查数据库配置...")
print(f"数据库主机: {config.DB_HOST}")
print(f"数据库端口: {config.DB_PORT}")
print(f"数据库名称: {config.DB_NAME}")
print(f"数据库用户: {config.DB_USER}")
print(f"字符集: {config.DB_CHARSET}")
print(f"数据库主机: {settings.db_host}")
print(f"数据库端口: {settings.db_port}")
print(f"数据库名称: {settings.db_name}")
print(f"数据库用户: {settings.db_user}")
print(f"字符集: {settings.db_charset}")
print()
# 创建数据库连接
print("正在连接数据库...")
connection = create_database_connection()
if not connection:
print("数据库初始化失败!")
return False
try:
# 创建数据库
print("正在创建/选择数据库...")
if not create_database(connection):
return False
# 获取SQL文件路径
schema_dir = Path(__file__).parent
mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"
mindspider_sql = schema_dir / "mindspider_tables.sql"
print()
print("开始执行SQL脚本...")
# 1. 执行MediaCrawler的原始表结构
if mediacrawler_sql.exists():
print("1. 创建MediaCrawler基础表...")
execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表")
else:
print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")
# 2. 执行MindSpider扩展表结构
print("2. 创建MindSpider扩展表...")
if mindspider_sql.exists():
@@ -135,18 +136,18 @@ def main():
else:
print("错误: MindSpider SQL文件不存在")
return False
print()
print("=" * 60)
print("数据库初始化完成!")
print("=" * 60)
# 显示创建的表
cursor = connection.cursor()
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:")
print(f"数据库 '{settings.db_name}' 中共创建了 {len(tables)} 个表:")
for table in tables:
print(f" - {table[0]}")