1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
from .models_bigdata import Base as BDBase
|
||||
from .models_sa import Base as SABase
|
||||
|
||||
__all__ = ["BDBase", "SABase"]
|
||||
+150
-118
@@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pymysql
|
||||
from sqlalchemy import create_engine, text, inspect
|
||||
from sqlalchemy.engine import Engine
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from loguru import logger
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -19,125 +21,132 @@ sys.path.append(str(project_root))
|
||||
try:
|
||||
import config
|
||||
except ImportError:
|
||||
print("错误: 无法导入config.py配置文件")
|
||||
logger.error("错误: 无法导入config.py配置文件")
|
||||
sys.exit(1)
|
||||
|
||||
from MindSpider.config import settings
|
||||
|
||||
class DatabaseManager:
|
||||
def __init__(self):
    """Initialise the manager and immediately set up the database engine.

    ``connect()`` is called right away; it terminates the process if the
    engine cannot be set up, so a constructed manager always has an engine.
    """
    # Populated by connect(); None until then.
    self.engine: Engine | None = None
    self.connect()
|
||||
|
||||
def connect(self):
    """Create the SQLAlchemy engine and verify the database is reachable.

    The dialect comes from ``settings.DB_DIALECT`` (MySQL when unset):
    ``postgresql``/``postgres`` select the psycopg driver, anything else
    selects PyMySQL. On any failure the error is logged and the process
    exits with status 1.
    """
    # Local import: only needed to escape credentials for the URL.
    from urllib.parse import quote

    try:
        dialect = (settings.DB_DIALECT or "mysql").lower()
        # Percent-encode credentials so characters such as '@', ':' or '/'
        # in a user name or password cannot corrupt the connection URL.
        user = quote(str(settings.DB_USER), safe="")
        password = quote(str(settings.DB_PASSWORD), safe="")
        location = f"{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
        if dialect in ("postgresql", "postgres"):
            url = f"postgresql+psycopg://{user}:{password}@{location}"
        else:
            url = f"mysql+pymysql://{user}:{password}@{location}?charset={settings.DB_CHARSET}"
        self.engine = create_engine(url, future=True)
        # create_engine() is lazy: open one connection so the success message
        # is only logged when the database can actually be reached.
        with self.engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        logger.info(f"成功连接到数据库: {settings.DB_NAME}")
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        sys.exit(1)
|
||||
|
||||
def close(self):
    """Release the engine's pooled connections, if an engine was created."""
    if self.engine is not None:
        self.engine.dispose()
|
||||
|
||||
def show_tables(self):
    """Log every table in the database, grouped by origin, with row counts.

    Tables named in the MindSpider core list are reported first; all other
    tables are reported as MediaCrawler platform tables. A failing count on
    a platform table is reported inline instead of aborting the listing.
    """
    separator = "=" * 60
    # The header is logged exactly once, each part on its own line.
    logger.info(f"\n{separator}\n数据库表列表\n{separator}")

    tables = inspect(self.engine).get_table_names()
    if not tables:
        logger.info("数据库中没有表")
        return

    # Split the tables into the two reporting groups.
    core_names = ('daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks')
    mindspider_tables = [name for name in tables if name in core_names]
    mediacrawler_tables = [name for name in tables if name not in core_names]

    lines = ["MindSpider核心表:"]
    with self.engine.connect() as conn:
        for table in mindspider_tables:
            # Table names come from the inspector, not from user input.
            count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
            lines.append(f" - {table:<25} ({count:>6} 条记录)")

    lines.append("\nMediaCrawler平台表:")
    for table in mediacrawler_tables:
        try:
            # A fresh connection per table keeps one failed statement from
            # affecting the counts that follow it.
            with self.engine.connect() as conn:
                count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
            lines.append(f" - {table:<25} ({count:>6} 条记录)")
        except Exception:
            lines.append(f" - {table:<25} (查询失败)")

    logger.info("\n".join(lines) + "\n")
|
||||
|
||||
def show_statistics(self):
|
||||
"""显示数据统计"""
|
||||
print("\n" + "=" * 60)
|
||||
print("数据统计")
|
||||
print("=" * 60)
|
||||
|
||||
cursor = self.connection.cursor()
|
||||
data_statistics_message = ""
|
||||
data_statistics_message += "\n" + "=" * 60
|
||||
data_statistics_message += "数据统计"
|
||||
data_statistics_message += "=" * 60
|
||||
data_statistics_message += "\n"
|
||||
|
||||
try:
|
||||
# 新闻统计
|
||||
cursor.execute("SELECT COUNT(*) FROM daily_news")
|
||||
news_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")
|
||||
news_days = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news")
|
||||
platforms = cursor.fetchone()[0]
|
||||
|
||||
print(f"新闻数据:")
|
||||
print(f" - 总新闻数: {news_count}")
|
||||
print(f" - 覆盖天数: {news_days}")
|
||||
print(f" - 新闻平台: {platforms}")
|
||||
with self.engine.connect() as conn:
|
||||
news_count = conn.execute(text("SELECT COUNT(*) FROM daily_news")).scalar_one()
|
||||
news_days = conn.execute(text("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")).scalar_one()
|
||||
platforms = conn.execute(text("SELECT COUNT(DISTINCT source_platform) FROM daily_news")).scalar_one()
|
||||
|
||||
data_statistics_message += "新闻数据:"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 总新闻数: {news_count}"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 覆盖天数: {news_days}"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 新闻平台: {platforms}"
|
||||
data_statistics_message += "\n"
|
||||
# 话题统计
|
||||
cursor.execute("SELECT COUNT(*) FROM daily_topics")
|
||||
topic_count = cursor.fetchone()[0]
|
||||
with self.engine.connect() as conn:
|
||||
topic_count = conn.execute(text("SELECT COUNT(*) FROM daily_topics")).scalar_one()
|
||||
topic_days = conn.execute(text("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")).scalar_one()
|
||||
|
||||
cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")
|
||||
topic_days = cursor.fetchone()[0]
|
||||
|
||||
print(f"\n话题数据:")
|
||||
print(f" - 总话题数: {topic_count}")
|
||||
print(f" - 提取天数: {topic_days}")
|
||||
data_statistics_message += "话题数据:"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 总话题数: {topic_count}"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 提取天数: {topic_days}"
|
||||
data_statistics_message += "\n"
|
||||
|
||||
# 爬取任务统计
|
||||
cursor.execute("SELECT COUNT(*) FROM crawling_tasks")
|
||||
task_count = cursor.fetchone()[0]
|
||||
with self.engine.connect() as conn:
|
||||
task_count = conn.execute(text("SELECT COUNT(*) FROM crawling_tasks")).scalar_one()
|
||||
task_status = conn.execute(text("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")).all()
|
||||
|
||||
cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")
|
||||
task_status = cursor.fetchall()
|
||||
|
||||
print(f"\n爬取任务:")
|
||||
print(f" - 总任务数: {task_count}")
|
||||
data_statistics_message += "爬取任务:"
|
||||
data_statistics_message += "\n"
|
||||
data_statistics_message += f" - 总任务数: {task_count}"
|
||||
data_statistics_message += "\n"
|
||||
for status, count in task_status:
|
||||
print(f" - {status}: {count}")
|
||||
data_statistics_message += f" - {status}: {count}"
|
||||
data_statistics_message += "\n"
|
||||
|
||||
# 爬取内容统计
|
||||
print(f"\n平台内容统计:")
|
||||
data_statistics_message += "平台内容统计:"
|
||||
data_statistics_message += "\n"
|
||||
platform_tables = {
|
||||
'xhs_note': '小红书',
|
||||
'douyin_aweme': '抖音',
|
||||
@@ -150,60 +159,78 @@ class DatabaseManager:
|
||||
|
||||
for table, platform in platform_tables.items():
|
||||
try:
|
||||
cursor.execute(f"SELECT COUNT(*) FROM {table}")
|
||||
count = cursor.fetchone()[0]
|
||||
print(f" - {platform}: {count}")
|
||||
with self.engine.connect() as conn:
|
||||
count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
|
||||
data_statistics_message += f" - {platform}: {count}"
|
||||
data_statistics_message += "\n"
|
||||
except:
|
||||
print(f" - {platform}: 表不存在")
|
||||
|
||||
data_statistics_message += f" - {platform}: 表不存在"
|
||||
data_statistics_message += "\n"
|
||||
logger.info(data_statistics_message)
|
||||
except Exception as e:
|
||||
print(f"统计查询失败: {e}")
|
||||
data_statistics_message += f"统计查询失败: {e}"
|
||||
data_statistics_message += "\n"
|
||||
logger.error(data_statistics_message)
|
||||
|
||||
def show_recent_data(self, days=7):
    """Log per-day news and topic counts for the most recent days.

    Args:
        days: Size of the look-back window, counted back from today.
    """
    from datetime import date, timedelta

    separator = "=" * 60
    # Header parts go on separate lines so the report stays readable.
    lines = [f"\n{separator}", "最近" + str(days) + "天的数据", separator]

    # The cutoff is computed in Python and bound as a parameter, so the same
    # SQL works on both MySQL and PostgreSQL.
    params = {"start_date": date.today() - timedelta(days=days)}
    news_sql = text(
        """
        SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms
        FROM daily_news
        WHERE crawl_date >= :start_date
        GROUP BY crawl_date
        ORDER BY crawl_date DESC
        """
    )
    topic_sql = text(
        """
        SELECT extract_date, COUNT(*) as topic_count
        FROM daily_topics
        WHERE extract_date >= :start_date
        GROUP BY extract_date
        ORDER BY extract_date DESC
        """
    )

    with self.engine.connect() as conn:
        news_data = conn.execute(news_sql, params).all()
        topic_data = conn.execute(topic_sql, params).all()

    # Recent news
    if news_data:
        lines.append("每日新闻统计:")
        # `day` (not `date`) so the imported `date` class is not shadowed.
        for day, count, platforms in news_data:
            lines.append(f" {day}: {count} 条新闻, {platforms} 个平台")

    # Recent topics
    if topic_data:
        lines.append("每日话题统计:")
        for day, count in topic_data:
            lines.append(f" {day}: {count} 个话题")

    logger.info("\n".join(lines))
|
||||
|
||||
def cleanup_old_data(self, days=90, dry_run=True):
|
||||
"""清理旧数据"""
|
||||
print(f"\n" + "=" * 60)
|
||||
print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})")
|
||||
print("=" * 60)
|
||||
cleanup_message = ""
|
||||
cleanup_message += "\n" + "=" * 60
|
||||
cleanup_message += f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})"
|
||||
cleanup_message += "=" * 60
|
||||
|
||||
cursor = self.connection.cursor()
|
||||
cutoff_date = datetime.now() - timedelta(days=days)
|
||||
|
||||
# 检查要删除的数据
|
||||
@@ -213,20 +240,25 @@ class DatabaseManager:
|
||||
("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'")
|
||||
]
|
||||
|
||||
for table, query in cleanup_queries:
|
||||
cursor.execute(query)
|
||||
count = cursor.fetchone()[0]
|
||||
if count > 0:
|
||||
print(f" {table}: {count} 条记录将被删除")
|
||||
if not dry_run:
|
||||
delete_query = query.replace("SELECT COUNT(*)", "DELETE")
|
||||
cursor.execute(delete_query)
|
||||
print(f" 已删除 {count} 条记录")
|
||||
else:
|
||||
print(f" {table}: 无需清理")
|
||||
with self.engine.begin() as conn:
|
||||
for table, query in cleanup_queries:
|
||||
count = conn.execute(text(query)).scalar_one()
|
||||
if count > 0:
|
||||
cleanup_message += f" {table}: {count} 条记录将被删除"
|
||||
cleanup_message += "\n"
|
||||
if not dry_run:
|
||||
delete_query = query.replace("SELECT COUNT(*)", "DELETE")
|
||||
conn.execute(text(delete_query))
|
||||
cleanup_message += f" 已删除 {count} 条记录"
|
||||
cleanup_message += "\n"
|
||||
else:
|
||||
cleanup_message += f" {table}: 无需清理"
|
||||
cleanup_message += "\n"
|
||||
|
||||
if dry_run:
|
||||
print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。")
|
||||
cleanup_message += "\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。"
|
||||
cleanup_message += "\n"
|
||||
logger.info(cleanup_message)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="MindSpider数据库管理工具")
|
||||
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
import sys
|
||||
import pymysql
|
||||
from pathlib import Path
|
||||
from MindSpider.config import settings
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -26,14 +27,14 @@ def create_database_connection():
|
||||
"""创建数据库连接"""
|
||||
try:
|
||||
connection = pymysql.connect(
|
||||
host=config.DB_HOST,
|
||||
port=config.DB_PORT,
|
||||
user=config.DB_USER,
|
||||
password=config.DB_PASSWORD,
|
||||
charset=config.DB_CHARSET,
|
||||
host=settings.db_host,
|
||||
port=settings.db_port,
|
||||
user=settings.db_user,
|
||||
password=settings.db_password,
|
||||
charset=settings.db_charset,
|
||||
autocommit=True
|
||||
)
|
||||
print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}")
|
||||
print(f"成功连接到MySQL服务器: {settings.db_host}:{settings.db_port}")
|
||||
return connection
|
||||
except Exception as e:
|
||||
print(f"连接数据库失败: {e}")
|
||||
@@ -43,9 +44,9 @@ def create_database(connection):
|
||||
"""创建数据库"""
|
||||
try:
|
||||
cursor = connection.cursor()
|
||||
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{config.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
|
||||
cursor.execute(f"USE `{config.DB_NAME}`")
|
||||
print(f"数据库 '{config.DB_NAME}' 创建/选择成功")
|
||||
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
|
||||
cursor.execute(f"USE `{settings.db_name}`")
|
||||
print(f"数据库 '{settings.db_name}' 创建/选择成功")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"创建数据库失败: {e}")
|
||||
@@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""):
|
||||
if not os.path.exists(sql_file_path):
|
||||
print(f"警告: SQL文件不存在: {sql_file_path}")
|
||||
return False
|
||||
|
||||
|
||||
try:
|
||||
cursor = connection.cursor()
|
||||
with open(sql_file_path, 'r', encoding='utf-8') as f:
|
||||
sql_content = f.read()
|
||||
|
||||
|
||||
# 分割SQL语句(简单实现,按分号分割)
|
||||
sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()]
|
||||
|
||||
|
||||
success_count = 0
|
||||
error_count = 0
|
||||
|
||||
|
||||
for stmt in sql_statements:
|
||||
if not stmt or stmt.startswith('--'):
|
||||
continue
|
||||
@@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""):
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
print(f"执行SQL语句失败: {str(e)[:100]}...")
|
||||
|
||||
|
||||
print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
|
||||
return error_count == 0
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"执行SQL文件失败 {sql_file_path}: {e}")
|
||||
return False
|
||||
@@ -90,44 +91,44 @@ def main():
|
||||
print("=" * 60)
|
||||
print("MindSpider AI爬虫项目 - 数据库初始化")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
# 检查配置
|
||||
print("检查数据库配置...")
|
||||
print(f"数据库主机: {config.DB_HOST}")
|
||||
print(f"数据库端口: {config.DB_PORT}")
|
||||
print(f"数据库名称: {config.DB_NAME}")
|
||||
print(f"数据库用户: {config.DB_USER}")
|
||||
print(f"字符集: {config.DB_CHARSET}")
|
||||
print(f"数据库主机: {settings.db_host}")
|
||||
print(f"数据库端口: {settings.db_port}")
|
||||
print(f"数据库名称: {settings.db_name}")
|
||||
print(f"数据库用户: {settings.db_user}")
|
||||
print(f"字符集: {settings.db_charset}")
|
||||
print()
|
||||
|
||||
|
||||
# 创建数据库连接
|
||||
print("正在连接数据库...")
|
||||
connection = create_database_connection()
|
||||
if not connection:
|
||||
print("数据库初始化失败!")
|
||||
return False
|
||||
|
||||
|
||||
try:
|
||||
# 创建数据库
|
||||
print("正在创建/选择数据库...")
|
||||
if not create_database(connection):
|
||||
return False
|
||||
|
||||
|
||||
# 获取SQL文件路径
|
||||
schema_dir = Path(__file__).parent
|
||||
mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"
|
||||
mindspider_sql = schema_dir / "mindspider_tables.sql"
|
||||
|
||||
|
||||
print()
|
||||
print("开始执行SQL脚本...")
|
||||
|
||||
|
||||
# 1. 执行MediaCrawler的原始表结构
|
||||
if mediacrawler_sql.exists():
|
||||
print("1. 创建MediaCrawler基础表...")
|
||||
execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表")
|
||||
else:
|
||||
print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")
|
||||
|
||||
|
||||
# 2. 执行MindSpider扩展表结构
|
||||
print("2. 创建MindSpider扩展表...")
|
||||
if mindspider_sql.exists():
|
||||
@@ -135,18 +136,18 @@ def main():
|
||||
else:
|
||||
print("错误: MindSpider SQL文件不存在")
|
||||
return False
|
||||
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("数据库初始化完成!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
# 显示创建的表
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("SHOW TABLES")
|
||||
tables = cursor.fetchall()
|
||||
|
||||
print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:")
|
||||
|
||||
print(f"数据库 '{settings.db_name}' 中共创建了 {len(tables)} 个表:")
|
||||
for table in tables:
|
||||
print(f" - {table[0]}")
|
||||
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
"""
|
||||
MindSpider 数据库初始化(SQLAlchemy 2.x 异步引擎)
|
||||
|
||||
此脚本创建 MindSpider 扩展表(与 MediaCrawler 原始表分离)。
|
||||
支持 MySQL 与 PostgreSQL,需已有可连接的数据库实例。
|
||||
|
||||
数据模型定义位置:
|
||||
- MindSpider/schema/models_sa.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from sqlalchemy import text
|
||||
|
||||
from models_sa import Base
|
||||
|
||||
# 导入 models_bigdata 以确保所有表类被注册到 Base.metadata
|
||||
# models_bigdata 现在也使用 models_sa 的 Base,所以所有表都在同一个 metadata 中
|
||||
import models_bigdata # noqa: F401 # 导入以注册所有表类
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.append(str(project_root))
|
||||
|
||||
from config import settings
|
||||
|
||||
def _env(key: str, default: Optional[str] = None) -> Optional[str]:
|
||||
v = os.getenv(key)
|
||||
return v if v not in (None, "") else default
|
||||
|
||||
|
||||
def _build_database_url() -> str:
    """Build the async SQLAlchemy URL from ``settings``.

    ``settings.DATABASE_URL`` takes precedence when present and non-empty and
    is returned unchanged. Otherwise the URL is assembled from the individual
    ``DB_*`` settings: ``postgresql``/``postgres`` use the asyncpg driver,
    every other dialect uses aiomysql.

    Returns:
        The database URL as a string.
    """
    # Local import: only needed to escape credentials for the URL.
    from urllib.parse import quote

    # DATABASE_URL has priority over the individual settings.
    database_url = getattr(settings, "DATABASE_URL", None)
    if database_url:
        return database_url

    dialect = (settings.DB_DIALECT or "mysql").lower()
    is_postgres = dialect in ("postgresql", "postgres")
    host = settings.DB_HOST or "localhost"
    # The default port follows the driver actually chosen below, so an
    # unrecognised dialect gets the MySQL port together with the MySQL URL.
    port = str(settings.DB_PORT or ("5432" if is_postgres else "3306"))
    # Percent-encode credentials so '@', ':' or '/' cannot corrupt the URL.
    user = quote(str(settings.DB_USER or "root"), safe="")
    password = quote(str(settings.DB_PASSWORD or ""), safe="")
    db_name = settings.DB_NAME or "mindspider"

    driver = "postgresql+asyncpg" if is_postgres else "mysql+aiomysql"
    return f"{driver}://{user}:{password}@{host}:{port}/{db_name}"
|
||||
|
||||
|
||||
async def _create_views_if_needed(engine_dialect: str):
    """Create or replace the ``v_topic_crawling_stats`` and ``v_daily_summary`` views.

    The views are optional conveniences. Both statements use only generic SQL
    aggregates, so the same text is executed regardless of backend.

    Args:
        engine_dialect: Backend name of the caller's engine. Not used at
            present because identical SQL is issued for every backend; kept
            so existing callers do not change.
    """
    v_topic_crawling_stats = (
        "CREATE OR REPLACE VIEW v_topic_crawling_stats AS\n"
        "SELECT dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status,\n"
        " COUNT(DISTINCT ct.task_id) AS total_tasks,\n"
        " SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) AS completed_tasks,\n"
        " SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) AS failed_tasks,\n"
        " SUM(COALESCE(ct.total_crawled,0)) AS total_content_crawled,\n"
        " SUM(COALESCE(ct.success_count,0)) AS total_success_count,\n"
        " SUM(COALESCE(ct.error_count,0)) AS total_error_count\n"
        "FROM daily_topics dt\n"
        "LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id\n"
        "GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status"
    )

    v_daily_summary = (
        "CREATE OR REPLACE VIEW v_daily_summary AS\n"
        "SELECT dn.crawl_date AS crawl_date,\n"
        " COUNT(DISTINCT dn.news_id) AS total_news,\n"
        " COUNT(DISTINCT dn.source_platform) AS platforms_covered,\n"
        " (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) AS topics_extracted,\n"
        " (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) AS tasks_created\n"
        "FROM daily_news dn\n"
        "GROUP BY dn.crawl_date\n"
        "ORDER BY dn.crawl_date DESC"
    )

    # A dedicated short-lived engine is used for the views; try/finally makes
    # sure it is disposed even when one of the statements fails.
    engine = create_async_engine(_build_database_url())
    try:
        async with engine.begin() as conn:
            await conn.execute(text(v_topic_crawling_stats))
            await conn.execute(text(v_daily_summary))
    finally:
        await engine.dispose()
|
||||
|
||||
|
||||
async def main() -> None:
    """Create every table registered on ``Base.metadata``, then the views.

    The engine is always disposed, including when table or view creation
    raises; the completion message is logged only on success.
    """
    database_url = _build_database_url()
    engine = create_async_engine(database_url, pool_pre_ping=True, pool_recycle=1800)

    try:
        # models_bigdata and models_sa share one Base, so a single create_all
        # covers all tables and lets SQLAlchemy order them by dependency.
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        # Views are created after the tables they select from.
        dialect_name = engine.url.get_backend_name()
        await _create_views_if_needed(dialect_name)
    finally:
        await engine.dispose()

    logger.info("[init_database_sa] 数据表与视图创建完成")
|
||||
|
||||
|
||||
# Run the asynchronous initialisation when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
|
||||
|
||||
@@ -0,0 +1,467 @@
|
||||
"""
|
||||
舆情大数据聚合主表ORM模型(自动由原tables.sql结构同步生成,对应大表批量搜索与内容入库)
|
||||
|
||||
数据模型定义位置:
|
||||
- MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql # 主表结构来源文件
|
||||
- 本模块(自动映射SQL表,适配MySQL/PostgreSQL,推荐手动完善注释、唯一/索引补充)
|
||||
- MindSpider/schema/models_sa.py # Base 定义来源
|
||||
|
||||
本模块以MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py为准
|
||||
"""
|
||||
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import Integer, String, BigInteger, Text, ForeignKey
|
||||
|
||||
# 使用 models_sa 中的 Base,确保所有表在同一个 metadata 中,外键引用可以正常工作
|
||||
from models_sa import Base
|
||||
|
||||
class BilibiliVideo(Base):
    """ORM mapping for the ``bilibili_video`` table.

    ``topic_id`` and ``crawling_task_id`` reference ``daily_topics`` and
    ``crawling_tasks``; both are set to NULL when the referenced row is deleted.
    """

    __tablename__ = "bilibili_video"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    video_id: Mapped[int] = mapped_column(BigInteger, nullable=False, index=True, unique=True)
    video_url: Mapped[str] = mapped_column(Text, nullable=False)
    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    liked_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    disliked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_favorite_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_coin_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_danmaku: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_comment: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    topic_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("daily_topics.topic_id", ondelete="SET NULL"), nullable=True)
    crawling_task_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"), nullable=True)
|
||||
|
||||
class BilibiliVideoComment(Base):
    """ORM mapping for the ``bilibili_video_comment`` table."""

    __tablename__ = "bilibili_video_comment"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    video_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    like_count: Mapped[str | None] = mapped_column(Text, default='0', nullable=True)
|
||||
|
||||
|
||||
class BilibiliUpInfo(Base):
    """ORM mapping for the ``bilibili_up_info`` table."""

    __tablename__ = "bilibili_up_info"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    total_fans: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)
    user_rank: Mapped[int | None] = mapped_column(Integer, nullable=True)
    is_official: Mapped[int | None] = mapped_column(Integer, nullable=True)
|
||||
|
||||
|
||||
class BilibiliContactInfo(Base):
    """ORM mapping for the ``bilibili_contact_info`` table (up/fan pairs)."""

    __tablename__ = "bilibili_contact_info"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    up_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    fan_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    up_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
|
||||
|
||||
class BilibiliUpDynamic(Base):
    """ORM mapping for the ``bilibili_up_dynamic`` table."""

    __tablename__ = "bilibili_up_dynamic"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    dynamic_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    text: Mapped[str | None] = mapped_column(Text, nullable=True)
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    pub_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    total_comments: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_forwards: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
|
||||
|
||||
class DouyinAweme(Base):
    """ORM mapping for the ``douyin_aweme`` table.

    ``topic_id`` and ``crawling_task_id`` reference ``daily_topics`` and
    ``crawling_tasks``; both are set to NULL when the referenced row is deleted.
    """

    __tablename__ = "douyin_aweme"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    aweme_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    music_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    topic_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("daily_topics.topic_id", ondelete="SET NULL"), nullable=True)
    crawling_task_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"), nullable=True)
|
||||
|
||||
class DouyinAwemeComment(Base):
    """ORM model mapped to the ``douyin_aweme_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "douyin_aweme_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment columns; ``comment_id`` and ``aweme_id`` are indexed.
    comment_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    aweme_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    # The counters below are stored as text, not integers.
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    like_count: Mapped[str | None] = mapped_column(Text, default='0', nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
|
||||
|
||||
|
||||
class DyCreator(Base):
    """ORM model mapped to the ``dy_creator`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "dy_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile columns; the counters are stored as text, not integers.
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    videos_count: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
|
||||
|
||||
class KuaishouVideo(Base):
    """ORM model mapped to the ``kuaishou_video`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "kuaishou_video"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Video columns; ``video_id`` and ``create_time`` are indexed.
    video_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    # Counters are stored as text; the ``viewd_count`` spelling is the
    # actual column name and must be kept.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    viewd_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )

    # Foreign keys declared with ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
|
||||
|
||||
class KuaishouVideoComment(Base):
    """ORM model mapped to the ``kuaishou_video_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "kuaishou_video_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns (``user_id`` is Text here, not a bounded String).
    user_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment columns; ``comment_id`` and ``video_id`` are indexed.
    comment_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    video_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    # Stored as text, not an integer.
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
class WeiboNote(Base):
    """ORM model mapped to the ``weibo_note`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "weibo_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note columns; ``note_id``, ``create_time`` and ``create_date_time``
    # are indexed.
    note_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    create_date_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    # Counters are stored as text, not integers.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comments_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    shared_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )

    # Foreign keys declared with ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
|
||||
|
||||
class WeiboNoteComment(Base):
    """ORM model mapped to the ``weibo_note_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "weibo_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment columns; ``comment_id``, ``note_id`` and ``create_date_time``
    # are indexed.
    comment_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    note_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    create_date_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    # Counters are stored as text, not integers.
    comment_like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
|
||||
|
||||
class WeiboCreator(Base):
    """ORM model mapped to the ``weibo_creator`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "weibo_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile columns; the counters are stored as text, not integers.
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
|
||||
class XhsCreator(Base):
    """ORM model mapped to the ``xhs_creator`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "xhs_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile columns; the counters are stored as text, not integers.
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
|
||||
class XhsNote(Base):
    """ORM model mapped to the ``xhs_note`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "xhs_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note columns; ``note_id`` and ``time`` are indexed.
    note_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    # ``type`` and ``time`` are column names and must be kept, even though
    # they shadow a builtin / stdlib module name inside the class body.
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    last_update_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    # Counters are stored as text, not integers.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    image_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    xsec_token: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Foreign keys declared with ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
|
||||
|
||||
|
||||
class XhsNoteComment(Base):
    """ORM model mapped to the ``xhs_note_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "xhs_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment columns; ``comment_id`` and ``create_time`` are indexed,
    # ``note_id`` is not.
    comment_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    create_time: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    note_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``sub_comment_count`` is an Integer here while ``like_count`` is Text.
    sub_comment_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
class TiebaNote(Base):
    """ORM model mapped to the ``tieba_note`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    Where given, ``default`` is a client-side insert default, not a server
    default.
    """

    __tablename__ = "tieba_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Note columns; ``note_id`` and ``publish_time`` are indexed.
    # NOTE(review): String(644) is kept exactly as declared — confirm the
    # width is intentional before changing it.
    note_id: Mapped[str | None] = mapped_column(
        String(644), index=True, nullable=True
    )
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    publish_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )

    # User columns.
    user_link: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    user_avatar: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Tieba (forum) columns.
    tieba_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    # The ``replay`` spelling is the actual column name and must be kept.
    total_replay_num: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    total_replay_page: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )

    # Foreign keys declared with ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
|
||||
|
||||
class TiebaComment(Base):
    """ORM model mapped to the ``tieba_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    Where given, ``default`` is a client-side insert default, not a server
    default.
    """

    __tablename__ = "tieba_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Comment columns; ``comment_id`` is indexed.
    comment_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    parent_comment_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User columns.
    user_link: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    user_avatar: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Tieba (forum) columns.
    tieba_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    # ``publish_time`` is an indexed string column.
    publish_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    sub_comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # Note reference; ``note_id`` is indexed but carries no foreign key.
    note_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
|
||||
|
||||
class TiebaCreator(Base):
    """ORM model mapped to the ``tieba_creator`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "tieba_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile columns; the counters are stored as text, not integers.
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    registration_duration: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
|
||||
class ZhihuContent(Base):
    """ORM model mapped to the ``zhihu_content`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "zhihu_content"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Content columns; ``content_id`` and ``created_time`` are indexed.
    content_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    question_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    # The time columns are strings here, not integers.
    created_time: Mapped[str | None] = mapped_column(
        String(32), index=True, nullable=True
    )
    updated_time: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    voteup_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    # Unlike the sibling tables, ``source_keyword`` has no default here.
    source_keyword: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_url_token: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Foreign keys declared with ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
|
||||
|
||||
class ZhihuComment(Base):
    """ORM model mapped to the ``zhihu_comment`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "zhihu_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Comment columns; ``comment_id`` and ``publish_time`` are indexed.
    comment_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    parent_comment_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    publish_time: Mapped[str | None] = mapped_column(
        String(32), index=True, nullable=True
    )
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    # ``default`` is a client-side insert default, not a server default.
    sub_comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    like_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    dislike_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # Content reference; ``content_id`` is indexed but carries no foreign key.
    content_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User columns.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
|
||||
|
||||
class ZhihuCreator(Base):
    """ORM model mapped to the ``zhihu_creator`` table.

    Every column except the surrogate ``id`` primary key is nullable.
    """

    __tablename__ = "zhihu_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User columns; ``user_id`` is both unique and indexed.
    user_id: Mapped[str | None] = mapped_column(
        String(64), unique=True, index=True, nullable=True
    )
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    url_token: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters; ``default=0`` is a client-side insert default.
    follows: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    fans: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    # The ``anwser_count`` spelling is the actual column name and must be kept.
    anwser_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    video_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    question_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    article_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    column_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    get_voteup_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # Bookkeeping columns; presumably insert / last-modified timestamps
    # (unit is not visible in this file).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
@@ -0,0 +1,126 @@
|
||||
"""
|
||||
MindSpider 数据库ORM模型(SQLAlchemy 2.x)
|
||||
|
||||
此模块定义 MindSpider 扩展表(与原 MediaCrawler 表解耦)的 ORM 模型。
|
||||
数据模型定义位置:
|
||||
- 本文件(MindSpider/schema/models_sa.py)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
from datetime import date
|
||||
|
||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
|
||||
from sqlalchemy import Integer, String, Text, BigInteger, Date, Float, ForeignKey, Index, UniqueConstraint
|
||||
from sqlalchemy.schema import ForeignKeyConstraint
|
||||
from sqlalchemy.orm import relationship
|
||||
__all__ = [
|
||||
"Base",
|
||||
"DailyNews",
|
||||
"DailyTopic",
|
||||
"TopicNewsRelation",
|
||||
"CrawlingTask",
|
||||
]
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models in this module."""
|
||||
|
||||
|
||||
class DailyNews(Base):
    """ORM model mapped to the ``daily_news`` table."""

    __tablename__ = "daily_news"
    __table_args__ = (
        # Unique constraint on ``news_id`` alone, added so that foreign keys
        # can reference it.
        UniqueConstraint("news_id", name="uq_daily_news_id_unique"),
        UniqueConstraint(
            "news_id",
            "source_platform",
            "crawl_date",
            name="uq_daily_news_unique",
        ),
        Index("idx_daily_news_date", "crawl_date"),
        Index("idx_daily_news_platform", "source_platform"),
        Index("idx_daily_news_rank", "rank_position"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identification columns.
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)
    source_platform: Mapped[str] = mapped_column(String(32), nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)

    # Optional columns (nullable via the ``Optional`` annotation; spelled
    # out explicitly here).
    url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True)
    description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    extra_info: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    crawl_date: Mapped[date] = mapped_column(Date, nullable=False)
    rank_position: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)

    # Required bookkeeping columns with no default, so callers must supply
    # them; presumably insert / last-modified timestamps (unit not visible).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
|
||||
|
||||
|
||||
class DailyTopic(Base):
    """ORM model mapped to the ``daily_topics`` table."""

    __tablename__ = "daily_topics"
    __table_args__ = (
        # Unique constraint on ``topic_id`` alone, added so that foreign keys
        # can reference it.
        UniqueConstraint("topic_id", name="uq_daily_topics_id_unique"),
        UniqueConstraint("topic_id", "extract_date", name="uq_daily_topics_unique"),
        Index("idx_daily_topics_date", "extract_date"),
        Index("idx_daily_topics_status", "processing_status"),
        Index("idx_daily_topics_score", "relevance_score"),
        Index("idx_topic_date_status", "extract_date", "processing_status"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identification columns.
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_name: Mapped[str] = mapped_column(String(255), nullable=False)

    # Optional columns (nullable via the ``Optional`` annotation; spelled
    # out explicitly here).
    topic_description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    keywords: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    extract_date: Mapped[date] = mapped_column(Date, nullable=False)
    relevance_score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)

    # ``default`` is a client-side insert default, not a server default.
    news_count: Mapped[Optional[int]] = mapped_column(
        Integer, default=0, nullable=True
    )
    processing_status: Mapped[Optional[str]] = mapped_column(
        String(16), default="pending", nullable=True
    )

    # Required bookkeeping columns with no default, so callers must supply
    # them; presumably insert / last-modified timestamps (unit not visible).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
|
||||
|
||||
|
||||
class TopicNewsRelation(Base):
    """ORM model mapped to the ``topic_news_relation`` table.

    Each row references ``daily_topics.topic_id`` and ``daily_news.news_id``;
    both foreign keys are declared with ON DELETE CASCADE.
    """

    __tablename__ = "topic_news_relation"
    __table_args__ = (
        UniqueConstraint(
            "topic_id",
            "news_id",
            "extract_date",
            name="uq_topic_news_unique",
        ),
        Index("idx_topic_news_topic", "topic_id"),
        Index("idx_topic_news_news", "news_id"),
        Index("idx_topic_news_date", "extract_date"),
        ForeignKeyConstraint(
            ["topic_id"], ["daily_topics.topic_id"], ondelete="CASCADE"
        ),
        ForeignKeyConstraint(
            ["news_id"], ["daily_news.news_id"], ondelete="CASCADE"
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required reference columns (see the foreign keys in ``__table_args__``).
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)

    # Nullable via the ``Optional`` annotation; spelled out explicitly here.
    relation_score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    extract_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required bookkeeping column with no default, so callers must supply it.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
|
||||
|
||||
|
||||
class CrawlingTask(Base):
    """ORM model mapped to the ``crawling_tasks`` table.

    ``topic_id`` references ``daily_topics.topic_id`` with ON DELETE CASCADE.
    """

    __tablename__ = "crawling_tasks"
    __table_args__ = (
        UniqueConstraint("task_id", name="uq_crawling_tasks_unique"),
        Index("idx_crawling_tasks_topic", "topic_id"),
        Index("idx_crawling_tasks_platform", "platform"),
        Index("idx_crawling_tasks_status", "task_status"),
        Index("idx_crawling_tasks_date", "scheduled_date"),
        Index("idx_task_topic_platform", "topic_id", "platform", "task_status"),
        ForeignKeyConstraint(
            ["topic_id"], ["daily_topics.topic_id"], ondelete="CASCADE"
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identification columns.
    task_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    platform: Mapped[str] = mapped_column(String(32), nullable=False)
    search_keywords: Mapped[str] = mapped_column(Text, nullable=False)

    # Optional columns (nullable via the ``Optional`` annotation; spelled
    # out explicitly here). ``default`` is a client-side insert default.
    task_status: Mapped[Optional[str]] = mapped_column(
        String(16), default="pending", nullable=True
    )
    start_time: Mapped[Optional[int]] = mapped_column(BigInteger, nullable=True)
    end_time: Mapped[Optional[int]] = mapped_column(BigInteger, nullable=True)
    total_crawled: Mapped[Optional[int]] = mapped_column(
        Integer, default=0, nullable=True
    )
    success_count: Mapped[Optional[int]] = mapped_column(
        Integer, default=0, nullable=True
    )
    error_count: Mapped[Optional[int]] = mapped_column(
        Integer, default=0, nullable=True
    )
    error_message: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    config_params: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    scheduled_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required bookkeeping columns with no default, so callers must supply
    # them; presumably insert / last-modified timestamps (unit not visible).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user