1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
+4
View File
@@ -0,0 +1,4 @@
from .models_bigdata import Base as BDBase
from .models_sa import Base as SABase
__all__ = ["BDBase", "SABase"]
+150 -118
View File
@@ -7,10 +7,12 @@ MindSpider AI爬虫项目 - 数据库管理工具
import os
import sys
import pymysql
from sqlalchemy import create_engine, text, inspect
from sqlalchemy.engine import Engine
import argparse
from pathlib import Path
from datetime import datetime, timedelta
from loguru import logger
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -19,125 +21,132 @@ sys.path.append(str(project_root))
try:
import config
except ImportError:
print("错误: 无法导入config.py配置文件")
logger.error("错误: 无法导入config.py配置文件")
sys.exit(1)
from MindSpider.config import settings
class DatabaseManager:
def __init__(self):
self.connection = None
self.engine: Engine = None
self.connect()
def connect(self):
"""连接数据库"""
try:
self.connection = pymysql.connect(
host=config.DB_HOST,
port=config.DB_PORT,
user=config.DB_USER,
password=config.DB_PASSWORD,
database=config.DB_NAME,
charset=config.DB_CHARSET,
autocommit=True
)
print(f"成功连接到数据库: {config.DB_NAME}")
dialect = (settings.DB_DIALECT or "mysql").lower()
if dialect in ("postgresql", "postgres"):
url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
else:
url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
self.engine = create_engine(url, future=True)
logger.info(f"成功连接到数据库: {settings.DB_NAME}")
except Exception as e:
print(f"数据库连接失败: {e}")
logger.error(f"数据库连接失败: {e}")
sys.exit(1)
def close(self):
"""关闭数据库连接"""
if self.connection:
self.connection.close()
if self.engine:
self.engine.dispose()
def show_tables(self):
"""显示所有表"""
print("\n" + "=" * 60)
print("数据库表列表")
print("=" * 60)
data_list_message = ""
data_list_message += "\n" + "=" * 60
data_list_message += "数据库表列表"
data_list_message += "=" * 60
logger.info(data_list_message)
cursor = self.connection.cursor()
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
inspector = inspect(self.engine)
tables = inspector.get_table_names()
if not tables:
print("数据库中没有表")
logger.info("数据库中没有表")
return
# 分类显示表
mindspider_tables = []
mediacrawler_tables = []
for table in tables:
table_name = table[0]
for table_name in tables:
if table_name in ['daily_news', 'daily_topics', 'topic_news_relation', 'crawling_tasks']:
mindspider_tables.append(table_name)
else:
mediacrawler_tables.append(table_name)
print("MindSpider核心表:")
data_list_message += "MindSpider核心表:"
data_list_message += "\n"
for table in mindspider_tables:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
count = cursor.fetchone()[0]
print(f" - {table:<25} ({count:>6} 条记录)")
with self.engine.connect() as conn:
count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
data_list_message += f" - {table:<25} ({count:>6} 条记录)"
data_list_message += "\n"
print("\nMediaCrawler平台表:")
data_list_message += "\nMediaCrawler平台表:"
data_list_message += "\n"
for table in mediacrawler_tables:
try:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
count = cursor.fetchone()[0]
print(f" - {table:<25} ({count:>6} 条记录)")
with self.engine.connect() as conn:
count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
data_list_message += f" - {table:<25} ({count:>6} 条记录)"
data_list_message += "\n"
except:
print(f" - {table:<25} (查询失败)")
data_list_message += f" - {table:<25} (查询失败)"
data_list_message += "\n"
logger.info(data_list_message)
def show_statistics(self):
"""显示数据统计"""
print("\n" + "=" * 60)
print("数据统计")
print("=" * 60)
cursor = self.connection.cursor()
data_statistics_message = ""
data_statistics_message += "\n" + "=" * 60
data_statistics_message += "数据统计"
data_statistics_message += "=" * 60
data_statistics_message += "\n"
try:
# 新闻统计
cursor.execute("SELECT COUNT(*) FROM daily_news")
news_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")
news_days = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news")
platforms = cursor.fetchone()[0]
print(f"新闻数据:")
print(f" - 总新闻数: {news_count}")
print(f" - 覆盖天数: {news_days}")
print(f" - 新闻平台: {platforms}")
with self.engine.connect() as conn:
news_count = conn.execute(text("SELECT COUNT(*) FROM daily_news")).scalar_one()
news_days = conn.execute(text("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")).scalar_one()
platforms = conn.execute(text("SELECT COUNT(DISTINCT source_platform) FROM daily_news")).scalar_one()
data_statistics_message += "新闻数据:"
data_statistics_message += "\n"
data_statistics_message += f" - 总新闻数: {news_count}"
data_statistics_message += "\n"
data_statistics_message += f" - 覆盖天数: {news_days}"
data_statistics_message += "\n"
data_statistics_message += f" - 新闻平台: {platforms}"
data_statistics_message += "\n"
# 话题统计
cursor.execute("SELECT COUNT(*) FROM daily_topics")
topic_count = cursor.fetchone()[0]
with self.engine.connect() as conn:
topic_count = conn.execute(text("SELECT COUNT(*) FROM daily_topics")).scalar_one()
topic_days = conn.execute(text("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")).scalar_one()
cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")
topic_days = cursor.fetchone()[0]
print(f"\n话题数据:")
print(f" - 总话题数: {topic_count}")
print(f" - 提取天数: {topic_days}")
data_statistics_message += "话题数据:"
data_statistics_message += "\n"
data_statistics_message += f" - 总话题数: {topic_count}"
data_statistics_message += "\n"
data_statistics_message += f" - 提取天数: {topic_days}"
data_statistics_message += "\n"
# 爬取任务统计
cursor.execute("SELECT COUNT(*) FROM crawling_tasks")
task_count = cursor.fetchone()[0]
with self.engine.connect() as conn:
task_count = conn.execute(text("SELECT COUNT(*) FROM crawling_tasks")).scalar_one()
task_status = conn.execute(text("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")).all()
cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")
task_status = cursor.fetchall()
print(f"\n爬取任务:")
print(f" - 总任务数: {task_count}")
data_statistics_message += "爬取任务:"
data_statistics_message += "\n"
data_statistics_message += f" - 总任务数: {task_count}"
data_statistics_message += "\n"
for status, count in task_status:
print(f" - {status}: {count}")
data_statistics_message += f" - {status}: {count}"
data_statistics_message += "\n"
# 爬取内容统计
print(f"\n平台内容统计:")
data_statistics_message += "平台内容统计:"
data_statistics_message += "\n"
platform_tables = {
'xhs_note': '小红书',
'douyin_aweme': '抖音',
@@ -150,60 +159,78 @@ class DatabaseManager:
for table, platform in platform_tables.items():
try:
cursor.execute(f"SELECT COUNT(*) FROM {table}")
count = cursor.fetchone()[0]
print(f" - {platform}: {count}")
with self.engine.connect() as conn:
count = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
data_statistics_message += f" - {platform}: {count}"
data_statistics_message += "\n"
except:
print(f" - {platform}: 表不存在")
data_statistics_message += f" - {platform}: 表不存在"
data_statistics_message += "\n"
logger.info(data_statistics_message)
except Exception as e:
print(f"统计查询失败: {e}")
data_statistics_message += f"统计查询失败: {e}"
data_statistics_message += "\n"
logger.error(data_statistics_message)
def show_recent_data(self, days=7):
"""显示最近几天的数据"""
print(f"\n" + "=" * 60)
print(f"最近{days}天的数据")
print("=" * 60)
cursor = self.connection.cursor()
data_recent_message = ""
data_recent_message += "\n" + "=" * 60
data_recent_message += "最近" + str(days) + "天的数据"
data_recent_message += "=" * 60
from datetime import date, timedelta
start_date = date.today() - timedelta(days=days)
# 最近的新闻
cursor.execute("""
SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms
FROM daily_news
WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
GROUP BY crawl_date
ORDER BY crawl_date DESC
""", (days,))
news_data = cursor.fetchall()
with self.engine.connect() as conn:
news_data = conn.execute(
text(
"""
SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms
FROM daily_news
WHERE crawl_date >= :start_date
GROUP BY crawl_date
ORDER BY crawl_date DESC
"""
),
{"start_date": start_date},
).all()
if news_data:
print("每日新闻统计:")
data_recent_message += "每日新闻统计:"
data_recent_message += "\n"
for date, count, platforms in news_data:
print(f" {date}: {count} 条新闻, {platforms} 个平台")
data_recent_message += f" {date}: {count} 条新闻, {platforms} 个平台"
data_recent_message += "\n"
# 最近的话题
cursor.execute("""
SELECT extract_date, COUNT(*) as topic_count
FROM daily_topics
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
GROUP BY extract_date
ORDER BY extract_date DESC
""", (days,))
topic_data = cursor.fetchall()
with self.engine.connect() as conn:
topic_data = conn.execute(
text(
"""
SELECT extract_date, COUNT(*) as topic_count
FROM daily_topics
WHERE extract_date >= :start_date
GROUP BY extract_date
ORDER BY extract_date DESC
"""
),
{"start_date": start_date},
).all()
if topic_data:
print("\n每日话题统计:")
data_recent_message += "每日话题统计:"
data_recent_message += "\n"
for date, count in topic_data:
print(f" {date}: {count} 个话题")
data_recent_message += f" {date}: {count} 个话题"
data_recent_message += "\n"
logger.info(data_recent_message)
def cleanup_old_data(self, days=90, dry_run=True):
"""清理旧数据"""
print(f"\n" + "=" * 60)
print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})")
print("=" * 60)
cleanup_message = ""
cleanup_message += "\n" + "=" * 60
cleanup_message += f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})"
cleanup_message += "=" * 60
cursor = self.connection.cursor()
cutoff_date = datetime.now() - timedelta(days=days)
# 检查要删除的数据
@@ -213,20 +240,25 @@ class DatabaseManager:
("crawling_tasks", f"SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date < '{cutoff_date.date()}'")
]
for table, query in cleanup_queries:
cursor.execute(query)
count = cursor.fetchone()[0]
if count > 0:
print(f" {table}: {count} 条记录将被删除")
if not dry_run:
delete_query = query.replace("SELECT COUNT(*)", "DELETE")
cursor.execute(delete_query)
print(f" 已删除 {count} 条记录")
else:
print(f" {table}: 无需清理")
with self.engine.begin() as conn:
for table, query in cleanup_queries:
count = conn.execute(text(query)).scalar_one()
if count > 0:
cleanup_message += f" {table}: {count} 条记录将被删除"
cleanup_message += "\n"
if not dry_run:
delete_query = query.replace("SELECT COUNT(*)", "DELETE")
conn.execute(text(delete_query))
cleanup_message += f" 已删除 {count} 条记录"
cleanup_message += "\n"
else:
cleanup_message += f" {table}: 无需清理"
cleanup_message += "\n"
if dry_run:
print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。")
cleanup_message += "\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。"
cleanup_message += "\n"
logger.info(cleanup_message)
def main():
parser = argparse.ArgumentParser(description="MindSpider数据库管理工具")
+32 -31
View File
@@ -9,6 +9,7 @@ import os
import sys
import pymysql
from pathlib import Path
from MindSpider.config import settings
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
@@ -26,14 +27,14 @@ def create_database_connection():
"""创建数据库连接"""
try:
connection = pymysql.connect(
host=config.DB_HOST,
port=config.DB_PORT,
user=config.DB_USER,
password=config.DB_PASSWORD,
charset=config.DB_CHARSET,
host=settings.db_host,
port=settings.db_port,
user=settings.db_user,
password=settings.db_password,
charset=settings.db_charset,
autocommit=True
)
print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}")
print(f"成功连接到MySQL服务器: {settings.db_host}:{settings.db_port}")
return connection
except Exception as e:
print(f"连接数据库失败: {e}")
@@ -43,9 +44,9 @@ def create_database(connection):
"""创建数据库"""
try:
cursor = connection.cursor()
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{config.DB_NAME}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
cursor.execute(f"USE `{config.DB_NAME}`")
print(f"数据库 '{config.DB_NAME}' 创建/选择成功")
cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{settings.db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
cursor.execute(f"USE `{settings.db_name}`")
print(f"数据库 '{settings.db_name}' 创建/选择成功")
return True
except Exception as e:
print(f"创建数据库失败: {e}")
@@ -56,18 +57,18 @@ def execute_sql_file(connection, sql_file_path, description=""):
if not os.path.exists(sql_file_path):
print(f"警告: SQL文件不存在: {sql_file_path}")
return False
try:
cursor = connection.cursor()
with open(sql_file_path, 'r', encoding='utf-8') as f:
sql_content = f.read()
# 分割SQL语句(简单实现,按分号分割)
sql_statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()]
success_count = 0
error_count = 0
for stmt in sql_statements:
if not stmt or stmt.startswith('--'):
continue
@@ -77,10 +78,10 @@ def execute_sql_file(connection, sql_file_path, description=""):
except Exception as e:
error_count += 1
print(f"执行SQL语句失败: {str(e)[:100]}...")
print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
return error_count == 0
except Exception as e:
print(f"执行SQL文件失败 {sql_file_path}: {e}")
return False
@@ -90,44 +91,44 @@ def main():
print("=" * 60)
print("MindSpider AI爬虫项目 - 数据库初始化")
print("=" * 60)
# 检查配置
print("检查数据库配置...")
print(f"数据库主机: {config.DB_HOST}")
print(f"数据库端口: {config.DB_PORT}")
print(f"数据库名称: {config.DB_NAME}")
print(f"数据库用户: {config.DB_USER}")
print(f"字符集: {config.DB_CHARSET}")
print(f"数据库主机: {settings.db_host}")
print(f"数据库端口: {settings.db_port}")
print(f"数据库名称: {settings.db_name}")
print(f"数据库用户: {settings.db_user}")
print(f"字符集: {settings.db_charset}")
print()
# 创建数据库连接
print("正在连接数据库...")
connection = create_database_connection()
if not connection:
print("数据库初始化失败!")
return False
try:
# 创建数据库
print("正在创建/选择数据库...")
if not create_database(connection):
return False
# 获取SQL文件路径
schema_dir = Path(__file__).parent
mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"
mindspider_sql = schema_dir / "mindspider_tables.sql"
print()
print("开始执行SQL脚本...")
# 1. 执行MediaCrawler的原始表结构
if mediacrawler_sql.exists():
print("1. 创建MediaCrawler基础表...")
execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表")
else:
print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")
# 2. 执行MindSpider扩展表结构
print("2. 创建MindSpider扩展表...")
if mindspider_sql.exists():
@@ -135,18 +136,18 @@ def main():
else:
print("错误: MindSpider SQL文件不存在")
return False
print()
print("=" * 60)
print("数据库初始化完成!")
print("=" * 60)
# 显示创建的表
cursor = connection.cursor()
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:")
print(f"数据库 '{settings.db_name}' 中共创建了 {len(tables)} 个表:")
for table in tables:
print(f" - {table[0]}")
+119
View File
@@ -0,0 +1,119 @@
"""
MindSpider 数据库初始化(SQLAlchemy 2.x 异步引擎)
此脚本创建 MindSpider 扩展表(与 MediaCrawler 原始表分离)。
支持 MySQL 与 PostgreSQL,需已有可连接的数据库实例。
数据模型定义位置:
- MindSpider/schema/models_sa.py
"""
from __future__ import annotations
import asyncio
import os
from typing import Optional
from loguru import logger
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy import text
from models_sa import Base
# 导入 models_bigdata 以确保所有表类被注册到 Base.metadata
# models_bigdata 现在也使用 models_sa 的 Base,所以所有表都在同一个 metadata 中
import models_bigdata # noqa: F401 # 导入以注册所有表类
import sys
from pathlib import Path
# 添加项目根目录到路径
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
from config import settings
def _env(key: str, default: Optional[str] = None) -> Optional[str]:
    """Return the environment variable *key*, or *default* when it is unset or empty.

    Args:
        key: Name of the environment variable to read.
        default: Value returned when the variable is missing or is the empty string.

    Returns:
        The variable's value, or ``default``.
    """
    raw = os.getenv(key)
    # An empty string is treated the same as "not set".
    if raw is None or raw == "":
        return default
    return raw
def _build_database_url() -> str:
    """Build the async SQLAlchemy database URL from ``settings``.

    A non-empty ``settings.DATABASE_URL`` is returned unchanged. Otherwise the
    URL is assembled from the individual ``DB_*`` settings, using the
    ``asyncpg`` driver for PostgreSQL and ``aiomysql`` for everything else.

    The user name and password are percent-encoded so that characters with a
    meaning inside a URL (``@``, ``/``, ``:`` ...) cannot corrupt it.

    Returns:
        The database URL as a string.
    """
    # Imported locally so the block does not depend on a file-level import.
    from urllib.parse import quote_plus

    # An explicit DATABASE_URL takes precedence over the individual settings.
    database_url = getattr(settings, "DATABASE_URL", None)
    if database_url:
        return database_url

    dialect = (settings.DB_DIALECT or "mysql").lower()
    is_postgres = dialect in ("postgresql", "postgres")

    host = settings.DB_HOST or "localhost"
    # The default port follows the driver that is actually selected below, so
    # an unrecognised dialect (which falls back to MySQL) gets the MySQL port.
    port = str(settings.DB_PORT or ("5432" if is_postgres else "3306"))
    user = quote_plus(str(settings.DB_USER or "root"))
    password = quote_plus(str(settings.DB_PASSWORD or ""))
    db_name = settings.DB_NAME or "mindspider"

    driver = "postgresql+asyncpg" if is_postgres else "mysql+aiomysql"
    return f"{driver}://{user}:{password}@{host}:{port}/{db_name}"
async def _create_views_if_needed(engine_dialect: str):
    """Create or replace the ``v_topic_crawling_stats`` and ``v_daily_summary`` views.

    A dedicated async engine is built from ``_build_database_url()``, both
    ``CREATE OR REPLACE VIEW`` statements are executed in one transaction, and
    the engine is disposed afterwards, including when a statement fails.

    Args:
        engine_dialect: Backend name of the calling engine. It is lower-cased
            but the SQL below is not branched on it; the same statements are
            issued for every backend.
    """
    # The views are optional. Only generic SQL aggregates are used so that no
    # dialect-specific functions are required.
    engine_dialect = engine_dialect.lower()
    v_topic_crawling_stats = (
        "CREATE OR REPLACE VIEW v_topic_crawling_stats AS\n"
        "SELECT dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status,\n"
        "       COUNT(DISTINCT ct.task_id) AS total_tasks,\n"
        "       SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) AS completed_tasks,\n"
        "       SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) AS failed_tasks,\n"
        "       SUM(COALESCE(ct.total_crawled,0)) AS total_content_crawled,\n"
        "       SUM(COALESCE(ct.success_count,0)) AS total_success_count,\n"
        "       SUM(COALESCE(ct.error_count,0)) AS total_error_count\n"
        "FROM daily_topics dt\n"
        "LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id\n"
        "GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status"
    )
    v_daily_summary = (
        "CREATE OR REPLACE VIEW v_daily_summary AS\n"
        "SELECT dn.crawl_date AS crawl_date,\n"
        "       COUNT(DISTINCT dn.news_id) AS total_news,\n"
        "       COUNT(DISTINCT dn.source_platform) AS platforms_covered,\n"
        "       (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) AS topics_extracted,\n"
        "       (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) AS tasks_created\n"
        "FROM daily_news dn\n"
        "GROUP BY dn.crawl_date\n"
        "ORDER BY dn.crawl_date DESC"
    )
    # NOTE(review): the original author states CREATE OR REPLACE VIEW is also
    # accepted by PostgreSQL, so both statements run on either backend.
    from sqlalchemy.ext.asyncio import AsyncEngine

    engine: AsyncEngine = create_async_engine(_build_database_url())
    try:
        async with engine.begin() as conn:
            await conn.execute(text(v_topic_crawling_stats))
            await conn.execute(text(v_daily_summary))
    finally:
        # Release pooled connections even when view creation raised.
        await engine.dispose()
async def main() -> None:
    """Create every table registered on ``Base.metadata``, then the reporting views.

    The engine is disposed in a ``finally`` block so its pooled connections
    are released even when table or view creation raises. The success message
    is only logged when both steps completed.
    """
    database_url = _build_database_url()
    engine = create_async_engine(database_url, pool_pre_ping=True, pool_recycle=1800)
    try:
        # Importing models_bigdata at file level registers its tables on the
        # same Base, so a single create_all call covers all of them.
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        # View creation builds and disposes its own engine; only the backend
        # name is passed along.
        dialect_name = engine.url.get_backend_name()
        await _create_views_if_needed(dialect_name)
    finally:
        await engine.dispose()

    logger.info("[init_database_sa] 数据表与视图创建完成")


if __name__ == "__main__":
    asyncio.run(main())
+467
View File
@@ -0,0 +1,467 @@
"""
舆情大数据聚合主表ORM模型(自动由原tables.sql结构同步生成,对应大表批量搜索与内容入库)
数据模型定义位置:
- MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql # 主表结构来源文件
- 本模块(自动映射SQL表,适配MySQL/PostgreSQL,推荐手动完善注释、唯一/索引补充)
- MindSpider/schema/models_sa.py # Base 定义来源
本模块以MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py为准
"""
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import Integer, String, BigInteger, Text, ForeignKey
# 使用 models_sa 中的 Base,确保所有表在同一个 metadata 中,外键引用可以正常工作
from models_sa import Base
class BilibiliVideo(Base):
    """ORM mapping for the ``bilibili_video`` table.

    ``video_id`` is required, indexed and unique. ``topic_id`` and
    ``crawling_task_id`` are nullable foreign keys that are set to NULL when
    the referenced row is deleted.
    """

    __tablename__ = "bilibili_video"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    video_id: Mapped[int] = mapped_column(BigInteger, nullable=False, index=True, unique=True)
    video_url: Mapped[str] = mapped_column(Text, nullable=False)
    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    liked_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    # The remaining counters are declared as Text, unlike liked_count above.
    disliked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_favorite_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_coin_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_danmaku: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_comment: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    # Links to daily_topics / crawling_tasks; ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("daily_topics.topic_id", ondelete="SET NULL"), nullable=True)
    crawling_task_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"), nullable=True)
class BilibiliVideoComment(Base):
    """ORM mapping for the ``bilibili_video_comment`` table.

    ``comment_id`` and ``video_id`` are indexed; no foreign keys are declared.
    """

    __tablename__ = "bilibili_video_comment"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    video_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Stored as text; the Python-side default is the string '0'.
    like_count: Mapped[str | None] = mapped_column(Text, default='0', nullable=True)
class BilibiliUpInfo(Base):
    """ORM mapping for the ``bilibili_up_info`` table.

    ``user_id`` is indexed; every column other than ``id`` is nullable.
    """

    __tablename__ = "bilibili_up_info"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    total_fans: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)
    user_rank: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # Declared as an Integer column, not a Boolean.
    is_official: Mapped[int | None] = mapped_column(Integer, nullable=True)
class BilibiliContactInfo(Base):
    """ORM mapping for the ``bilibili_contact_info`` table.

    Each row pairs an ``up_*`` set of columns with a ``fan_*`` set; ``up_id``
    and ``fan_id`` are both indexed.
    """

    __tablename__ = "bilibili_contact_info"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    up_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    fan_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    up_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class BilibiliUpDynamic(Base):
    """ORM mapping for the ``bilibili_up_dynamic`` table.

    ``dynamic_id`` is indexed; every column other than ``id`` is nullable.
    """

    __tablename__ = "bilibili_up_dynamic"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    dynamic_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Lower-case attribute `text`, distinct from the imported `Text` column type.
    text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Attribute named `type`; it shadows the builtin only inside this class body.
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    pub_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    total_comments: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_forwards: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class DouyinAweme(Base):
    """ORM mapping for the ``douyin_aweme`` table.

    ``aweme_id`` and ``create_time`` are indexed. ``topic_id`` and
    ``crawling_task_id`` are nullable foreign keys that are set to NULL when
    the referenced row is deleted.
    """

    __tablename__ = "douyin_aweme"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    # Counters are declared as Text columns.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    aweme_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    music_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    # Links to daily_topics / crawling_tasks; ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("daily_topics.topic_id", ondelete="SET NULL"), nullable=True)
    crawling_task_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"), nullable=True)
class DouyinAwemeComment(Base):
    """ORM mapping for the ``douyin_aweme_comment`` table.

    ``comment_id`` and ``aweme_id`` are indexed; no foreign keys are declared.
    """

    __tablename__ = "douyin_aweme_comment"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Stored as text; the Python-side defaults are '0' and the empty string.
    like_count: Mapped[str | None] = mapped_column(Text, default='0', nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
class DyCreator(Base):
    """ORM mapping for the ``dy_creator`` table.

    No secondary indexes or foreign keys are declared; every column other
    than ``id`` is nullable.
    """

    __tablename__ = "dy_creator"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Counters are declared as text columns.
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    videos_count: Mapped[str | None] = mapped_column(String(255), nullable=True)
class KuaishouVideo(Base):
    """ORM mapping for the ``kuaishou_video`` table.

    ``video_id`` (a string column) and ``create_time`` are indexed.
    ``topic_id`` and ``crawling_task_id`` are nullable foreign keys that are
    set to NULL when the referenced row is deleted.
    """

    __tablename__ = "kuaishou_video"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    video_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    # NOTE(review): "viewd_count" looks like a misspelling of "viewed_count",
    # but it is the mapped column name, so it must not be renamed here.
    viewd_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    # Links to daily_topics / crawling_tasks; ON DELETE SET NULL.
    topic_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("daily_topics.topic_id", ondelete="SET NULL"), nullable=True)
    crawling_task_id: Mapped[str | None] = mapped_column(String(64), ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"), nullable=True)
class KuaishouVideoComment(Base):
    """ORM mapping for the ``kuaishou_video_comment`` table.

    ``comment_id`` and ``video_id`` are indexed; no foreign keys are declared.
    """

    __tablename__ = "kuaishou_video_comment"
    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Declared as Text here, whereas KuaishouVideo.user_id is String(64).
    user_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    video_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
class WeiboNote(Base):
    """ORM mapping for the ``weibo_note`` table.

    Rows can be linked to ``daily_topics`` and ``crawling_tasks``; both foreign
    keys use ``ON DELETE SET NULL``.
    """

    __tablename__ = "weibo_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note identity and body.
    note_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Creation time is kept twice: as an indexed integer and an indexed string.
    create_time: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    create_date_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )

    # Counters are stored as Text rather than as numeric columns.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comments_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    shared_count: Mapped[str | None] = mapped_column(Text, nullable=True)

    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )

    # Optional links to the topic / crawling task tables.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class WeiboNoteComment(Base):
    """ORM mapping for the ``weibo_note_comment`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "weibo_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment identity and the note it belongs to; both columns are indexed.
    comment_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    note_id: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )

    # Comment payload.
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    create_date_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )

    # Counters are stored as Text rather than as numeric columns.
    comment_like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Note: a String(255) here, while comment_id above is a BigInteger.
    parent_comment_id: Mapped[str | None] = mapped_column(
        String(255), nullable=True
    )
class WeiboCreator(Base):
    """ORM mapping for the ``weibo_creator`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "weibo_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile details; follows/fans are stored as Text, not as numbers.
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
class XhsCreator(Base):
    """ORM mapping for the ``xhs_creator`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "xhs_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile details; the numeric-looking fields are stored as Text.
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
class XhsNote(Base):
    """ORM mapping for the ``xhs_note`` table.

    Rows can be linked to ``daily_topics`` and ``crawling_tasks``; both foreign
    keys use ``ON DELETE SET NULL``.
    """

    __tablename__ = "xhs_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note identity and content. The attribute names `type` and `time` are also
    # the column names, so they cannot be renamed without a schema change.
    note_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    last_update_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Counters are stored as Text rather than as numeric columns.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)

    image_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    xsec_token: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional links to the topic / crawling task tables.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class XhsNoteComment(Base):
    """ORM mapping for the ``xhs_note_comment`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "xhs_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment identity; comment_id and create_time are indexed, note_id is not.
    comment_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    create_time: Mapped[int | None] = mapped_column(
        BigInteger, index=True, nullable=True
    )
    note_id: Mapped[str | None] = mapped_column(String(255), nullable=True)

    # Comment payload.
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    # An Integer column here, whereas like_count below is Text.
    sub_comment_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(
        String(255), nullable=True
    )
    like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
class TiebaNote(Base):
    """ORM mapping for the ``tieba_note`` table.

    Rows can be linked to ``daily_topics`` and ``crawling_tasks``; both foreign
    keys use ``ON DELETE SET NULL``.
    """

    __tablename__ = "tieba_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # NOTE(review): length 644 looks like it might be a typo for 64 - confirm
    # before touching it, since changing the length alters the table schema.
    note_id: Mapped[str | None] = mapped_column(
        String(644), index=True, nullable=True
    )
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Kept as an indexed string, not as an integer.
    publish_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )

    # User fields; each defaults to the empty string on insert.
    user_link: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    user_avatar: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Tieba (forum) fields.
    tieba_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    # The "replay" spelling is part of the column names.
    total_replay_num: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    total_replay_page: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    source_keyword: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )

    # Optional links to the topic / crawling task tables.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class TiebaComment(Base):
    """ORM mapping for the ``tieba_comment`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "tieba_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Comment identity and body.
    comment_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    parent_comment_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    content: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User fields; each defaults to the empty string on insert.
    user_link: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(
        Text, default='', nullable=True
    )
    user_avatar: Mapped[str | None] = mapped_column(Text, default='', nullable=True)

    # Tieba (forum) fields.
    tieba_id: Mapped[str | None] = mapped_column(
        String(255), default='', nullable=True
    )
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Kept as an indexed string, not as an integer.
    publish_time: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    ip_location: Mapped[str | None] = mapped_column(Text, default='', nullable=True)
    sub_comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # The note this comment belongs to.
    note_id: Mapped[str | None] = mapped_column(
        String(255), index=True, nullable=True
    )
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class TiebaCreator(Base):
    """ORM mapping for the ``tieba_creator`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "tieba_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Profile details; follows/fans are stored as Text, not as numbers.
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    registration_duration: Mapped[str | None] = mapped_column(Text, nullable=True)
class ZhihuContent(Base):
    """ORM mapping for the ``zhihu_content`` table.

    Rows can be linked to ``daily_topics`` and ``crawling_tasks``; both foreign
    keys use ``ON DELETE SET NULL``.
    """

    __tablename__ = "zhihu_content"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Content identity and body.
    content_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    question_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Both times are strings; only created_time is indexed.
    created_time: Mapped[str | None] = mapped_column(
        String(32), index=True, nullable=True
    )
    updated_time: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters defaulting to 0 on insert.
    voteup_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    source_keyword: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_url_token: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Optional links to the topic / crawling task tables.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class ZhihuComment(Base):
    """ORM mapping for the ``zhihu_comment`` table.

    Every column other than the surrogate primary key is declared nullable.
    """

    __tablename__ = "zhihu_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Comment identity and body.
    comment_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    parent_comment_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Kept as an indexed string, not as an integer.
    publish_time: Mapped[str | None] = mapped_column(
        String(32), index=True, nullable=True
    )
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters defaulting to 0 on insert.
    sub_comment_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    like_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    dislike_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # The content item this comment belongs to.
    content_id: Mapped[str | None] = mapped_column(
        String(64), index=True, nullable=True
    )
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)

    # User fields.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class ZhihuCreator(Base):
    """ORM mapping for the ``zhihu_creator`` table.

    ``user_id`` is the only column in this table declared unique.
    """

    __tablename__ = "zhihu_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # User fields; user_id is both unique and indexed.
    user_id: Mapped[str | None] = mapped_column(
        String(64), unique=True, index=True, nullable=True
    )
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    url_token: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters defaulting to 0 on insert.
    follows: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    fans: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    # NOTE(review): the "anwser" spelling is the actual column name; correcting
    # it would require a schema migration.
    anwser_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    video_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    question_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    article_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    column_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )
    get_voteup_count: Mapped[int | None] = mapped_column(
        Integer, default=0, nullable=True
    )

    # Row bookkeeping values (presumably epoch timestamps; unit not visible here).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+126
View File
@@ -0,0 +1,126 @@
"""
MindSpider 数据库ORM模型(SQLAlchemy 2.x)
此模块定义 MindSpider 扩展表(与原 MediaCrawler 表解耦)的 ORM 模型。
数据模型定义位置:
- 本文件(MindSpider/schema/models_sa.py)
"""
from __future__ import annotations
from typing import Optional
from datetime import date
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Integer, String, Text, BigInteger, Date, Float, ForeignKey, Index, UniqueConstraint
from sqlalchemy.schema import ForeignKeyConstraint
from sqlalchemy.orm import relationship
# Names exported by `from <this module> import *`: the declarative base plus
# the four ORM model classes defined below.
__all__ = [
"Base",
"DailyNews",
"DailyTopic",
"TopicNewsRelation",
"CrawlingTask",
]
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this module inherit from."""
class DailyNews(Base):
    """ORM mapping for the ``daily_news`` table.

    ``news_id`` carries its own unique constraint so that other tables can
    reference it with a foreign key; a second unique constraint spans
    ``(news_id, source_platform, crawl_date)``.
    """

    __tablename__ = "daily_news"
    __table_args__ = (
        # Unique constraint that lets news_id be the target of foreign keys.
        UniqueConstraint("news_id", name="uq_daily_news_id_unique"),
        UniqueConstraint(
            "news_id",
            "source_platform",
            "crawl_date",
            name="uq_daily_news_unique",
        ),
        Index("idx_daily_news_date", "crawl_date"),
        Index("idx_daily_news_platform", "source_platform"),
        Index("idx_daily_news_rank", "rank_position"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)
    source_platform: Mapped[str] = mapped_column(String(32), nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)

    # Optional descriptive fields.
    url: Mapped[Optional[str]] = mapped_column(String(512))
    description: Mapped[Optional[str]] = mapped_column(Text)
    extra_info: Mapped[Optional[str]] = mapped_column(Text)

    crawl_date: Mapped[date] = mapped_column(Date, nullable=False)
    rank_position: Mapped[Optional[int]] = mapped_column(Integer)

    # Required row bookkeeping values (presumably epoch timestamps; unit not
    # visible here).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class DailyTopic(Base):
    """ORM mapping for the ``daily_topics`` table.

    ``topic_id`` carries its own unique constraint so that other tables can
    reference it with a foreign key; a second unique constraint spans
    ``(topic_id, extract_date)``.
    """

    __tablename__ = "daily_topics"
    __table_args__ = (
        # Unique constraint that lets topic_id be the target of foreign keys.
        UniqueConstraint("topic_id", name="uq_daily_topics_id_unique"),
        UniqueConstraint("topic_id", "extract_date", name="uq_daily_topics_unique"),
        Index("idx_daily_topics_date", "extract_date"),
        Index("idx_daily_topics_status", "processing_status"),
        Index("idx_daily_topics_score", "relevance_score"),
        # Composite index over date + status.
        Index("idx_topic_date_status", "extract_date", "processing_status"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_name: Mapped[str] = mapped_column(String(255), nullable=False)

    # Optional descriptive fields.
    topic_description: Mapped[Optional[str]] = mapped_column(Text)
    keywords: Mapped[Optional[str]] = mapped_column(Text)

    extract_date: Mapped[date] = mapped_column(Date, nullable=False)
    relevance_score: Mapped[Optional[float]] = mapped_column(Float)
    news_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    # New rows start out as "pending" unless a status is supplied.
    processing_status: Mapped[Optional[str]] = mapped_column(
        String(16), default="pending"
    )

    # Required row bookkeeping values (presumably epoch timestamps; unit not
    # visible here).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class TopicNewsRelation(Base):
    """ORM mapping for the ``topic_news_relation`` association table.

    Each row references ``daily_topics.topic_id`` and ``daily_news.news_id``;
    both foreign keys use ``ON DELETE CASCADE``.
    """

    __tablename__ = "topic_news_relation"
    __table_args__ = (
        UniqueConstraint(
            "topic_id",
            "news_id",
            "extract_date",
            name="uq_topic_news_unique",
        ),
        Index("idx_topic_news_topic", "topic_id"),
        Index("idx_topic_news_news", "news_id"),
        Index("idx_topic_news_date", "extract_date"),
        ForeignKeyConstraint(
            ["topic_id"], ["daily_topics.topic_id"], ondelete="CASCADE"
        ),
        ForeignKeyConstraint(
            ["news_id"], ["daily_news.news_id"], ondelete="CASCADE"
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # The two sides of the association; lengths match the referenced columns'
    # String(64) / String(128) declarations used for these ids in this table.
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)

    relation_score: Mapped[Optional[float]] = mapped_column(Float)
    extract_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required row bookkeeping value; this table has no last_modify_ts column.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class CrawlingTask(Base):
    """ORM mapping for the ``crawling_tasks`` table.

    ``task_id`` is unique, and ``topic_id`` references
    ``daily_topics.topic_id`` with ``ON DELETE CASCADE``.
    """

    __tablename__ = "crawling_tasks"
    __table_args__ = (
        UniqueConstraint("task_id", name="uq_crawling_tasks_unique"),
        Index("idx_crawling_tasks_topic", "topic_id"),
        Index("idx_crawling_tasks_platform", "platform"),
        Index("idx_crawling_tasks_status", "task_status"),
        Index("idx_crawling_tasks_date", "scheduled_date"),
        # Composite index over topic + platform + status.
        Index("idx_task_topic_platform", "topic_id", "platform", "task_status"),
        ForeignKeyConstraint(
            ["topic_id"], ["daily_topics.topic_id"], ondelete="CASCADE"
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    task_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    platform: Mapped[str] = mapped_column(String(32), nullable=False)
    search_keywords: Mapped[str] = mapped_column(Text, nullable=False)

    # New rows start out as "pending" unless a status is supplied.
    task_status: Mapped[Optional[str]] = mapped_column(String(16), default="pending")
    start_time: Mapped[Optional[int]] = mapped_column(BigInteger)
    end_time: Mapped[Optional[int]] = mapped_column(BigInteger)

    # Integer counters defaulting to 0 on insert.
    total_crawled: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    success_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    error_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)

    error_message: Mapped[Optional[str]] = mapped_column(Text)
    config_params: Mapped[Optional[str]] = mapped_column(Text)
    scheduled_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required row bookkeeping values (presumably epoch timestamps; unit not
    # visible here).
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)