1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 96c7d2d3b7
commit 4104ee4e80
145 changed files with 8677 additions and 6270 deletions
+4
View File
@@ -0,0 +1,4 @@
# Re-export the declarative bases of both model modules under distinct aliases.
# NOTE(review): models_bigdata obtains its Base through the top-level name
# `models_sa`, while this package imports `.models_sa` relatively; if those
# resolve to two separate module objects the aliases will not be the same
# class — confirm how the package is imported.
from .models_bigdata import Base as BDBase
from .models_sa import Base as SABase
__all__ = ["BDBase", "SABase"]
+119
View File
@@ -0,0 +1,119 @@
"""
MindSpider 数据库初始化(SQLAlchemy 2.x 异步引擎)
此脚本创建 MindSpider 扩展表(与 MediaCrawler 原始表分离)。
支持 MySQL 与 PostgreSQL,需已有可连接的数据库实例。
数据模型定义位置:
- MindSpider/schema/models_sa.py
"""
from __future__ import annotations
import asyncio
import os
from typing import Optional
from loguru import logger
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy import text
from models_sa import Base
# Import models_bigdata so that all of its table classes are registered on Base.metadata.
# models_bigdata uses the Base from models_sa, so every table ends up in the same metadata.
import models_bigdata # noqa: F401 # imported only for its table-registration side effect
import sys
from pathlib import Path
# Put the project root (the parent of this file's directory) on the import path
# so that the `config` module below can be resolved.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
from config import settings
def _env(key: str, default: Optional[str] = None) -> Optional[str]:
    """Return the environment variable *key*, or *default* when it is unset or empty.

    Args:
        key: Name of the environment variable to read.
        default: Value returned when the variable is missing or is the
            empty string.

    Returns:
        The variable's value, or *default*.
    """
    value = os.getenv(key)
    # An empty string is treated exactly like a missing variable.
    if value is None or value == "":
        return default
    return value
def _build_database_url() -> str:
    """Build the async SQLAlchemy database URL from the project settings.

    ``settings.DATABASE_URL`` is returned unchanged when it is set. Otherwise
    the URL is assembled from the individual ``DB_*`` settings, using the
    ``postgresql+asyncpg`` driver for the ``postgresql``/``postgres`` dialects
    and ``mysql+aiomysql`` for everything else.

    Returns:
        A URL string suitable for ``create_async_engine``.
    """
    from urllib.parse import quote

    # An explicit DATABASE_URL always wins over the individual DB_* settings.
    database_url = getattr(settings, "DATABASE_URL", None)
    if database_url:
        return database_url

    dialect = (settings.DB_DIALECT or "mysql").lower()
    is_postgres = dialect in ("postgresql", "postgres")

    host = settings.DB_HOST or "localhost"
    # The default port follows the driver that is actually selected below, so
    # an unrecognised dialect (which falls back to MySQL) gets 3306, not 5432.
    port = str(settings.DB_PORT or ("5432" if is_postgres else "3306"))
    # Percent-encode the credentials: characters such as '@', ':', '/' or '#'
    # would otherwise be parsed as URL delimiters and corrupt the URL.
    user = quote(str(settings.DB_USER or "root"), safe="")
    password = quote(str(settings.DB_PASSWORD or ""), safe="")
    db_name = settings.DB_NAME or "mindspider"

    driver = "postgresql+asyncpg" if is_postgres else "mysql+aiomysql"
    return f"{driver}://{user}:{password}@{host}:{port}/{db_name}"
async def _create_views_if_needed(engine_dialect: str):
    """Create or replace the optional reporting views.

    Two views are (re)created with ``CREATE OR REPLACE VIEW``:
    ``v_topic_crawling_stats`` and ``v_daily_summary``. The SQL uses only
    generic aggregates so the same statements are executed for every dialect.

    A dedicated engine is built from ``_build_database_url()`` and is always
    disposed, even when one of the statements fails.

    Args:
        engine_dialect: Backend name of the calling engine. It is only
            normalised to lower case; the statements do not depend on it.
    """
    from sqlalchemy.ext.asyncio import AsyncEngine

    # Kept from the original: normalise the dialect name (not used further,
    # because the SQL below is dialect-neutral).
    engine_dialect = engine_dialect.lower()

    # Per-topic aggregation of crawling task counters.
    v_topic_crawling_stats = (
        "CREATE OR REPLACE VIEW v_topic_crawling_stats AS\n"
        "SELECT dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status,\n"
        " COUNT(DISTINCT ct.task_id) AS total_tasks,\n"
        " SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) AS completed_tasks,\n"
        " SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) AS failed_tasks,\n"
        " SUM(COALESCE(ct.total_crawled,0)) AS total_content_crawled,\n"
        " SUM(COALESCE(ct.success_count,0)) AS total_success_count,\n"
        " SUM(COALESCE(ct.error_count,0)) AS total_error_count\n"
        "FROM daily_topics dt\n"
        "LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id\n"
        "GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status"
    )

    # Per-crawl-date summary of news, platforms, topics and tasks.
    v_daily_summary = (
        "CREATE OR REPLACE VIEW v_daily_summary AS\n"
        "SELECT dn.crawl_date AS crawl_date,\n"
        " COUNT(DISTINCT dn.news_id) AS total_news,\n"
        " COUNT(DISTINCT dn.source_platform) AS platforms_covered,\n"
        " (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) AS topics_extracted,\n"
        " (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) AS tasks_created\n"
        "FROM daily_news dn\n"
        "GROUP BY dn.crawl_date\n"
        "ORDER BY dn.crawl_date DESC"
    )

    engine: AsyncEngine = create_async_engine(_build_database_url())
    try:
        # Both statements run in one transaction; begin() commits on success.
        async with engine.begin() as conn:
            await conn.execute(text(v_topic_crawling_stats))
            await conn.execute(text(v_daily_summary))
    finally:
        # Release the connection pool even if view creation raised.
        await engine.dispose()
async def main() -> None:
    """Create all mapped tables and the optional views, then log completion.

    All table classes are registered on the single shared ``Base.metadata``,
    so one ``create_all`` call creates every table. The engine is disposed in
    all cases, including when table or view creation raises.
    """
    database_url = _build_database_url()
    engine = create_async_engine(database_url, pool_pre_ping=True, pool_recycle=1800)
    try:
        # create_all is a synchronous API, so it is bridged through run_sync.
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        # Views are created afterwards because they select from the tables above.
        dialect_name = engine.url.get_backend_name()
        await _create_views_if_needed(dialect_name)
    finally:
        # Release the connection pool even if initialisation failed part-way.
        await engine.dispose()

    logger.info("[init_database_sa] 数据表与视图创建完成")
# Script entry point: run the async initialisation when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
+467
View File
@@ -0,0 +1,467 @@
"""
舆情大数据聚合主表ORM模型(自动由原tables.sql结构同步生成,对应大表批量搜索与内容入库)
数据模型定义位置:
- MindSpider/DeepSentimentCrawling/MediaCrawler/schema/tables.sql # 主表结构来源文件
- 本模块(自动映射SQL表,适配MySQL/PostgreSQL,推荐手动完善注释、唯一/索引补充)
- MindSpider/schema/models_sa.py # Base 定义来源
本模块以MindSpider/DeepSentimentCrawling/MediaCrawler/database/models.py为准
"""
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import Integer, String, BigInteger, Text, ForeignKey
# 使用 models_sa 中的 Base,确保所有表在同一个 metadata 中,外键引用可以正常工作
from models_sa import Base
class BilibiliVideo(Base):
    """ORM mapping for the ``bilibili_video`` table."""

    __tablename__ = "bilibili_video"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # The only required columns on this table; video_id is also unique.
    video_id: Mapped[int] = mapped_column(BigInteger, nullable=False, index=True, unique=True)
    video_url: Mapped[str] = mapped_column(Text, nullable=False)

    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    liked_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)

    # The following counters are stored as text columns.
    disliked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_favorite_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_coin_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_danmaku: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_comment: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class BilibiliVideoComment(Base):
    """ORM mapping for the ``bilibili_video_comment`` table."""

    __tablename__ = "bilibili_video_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Commenter profile fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment fields; comment_id and video_id are indexed.
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    video_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Stored as text, with the string "0" as the Python-side default.
    like_count: Mapped[str | None] = mapped_column(Text, default="0", nullable=True)
class BilibiliUpInfo(Base):
    """ORM mapping for the ``bilibili_up_info`` table."""

    __tablename__ = "bilibili_up_info"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    sex: Mapped[str | None] = mapped_column(Text, nullable=True)
    sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Integer-typed account statistics.
    total_fans: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)
    user_rank: Mapped[int | None] = mapped_column(Integer, nullable=True)
    is_official: Mapped[int | None] = mapped_column(Integer, nullable=True)
class BilibiliContactInfo(Base):
    """ORM mapping for the ``bilibili_contact_info`` table."""

    __tablename__ = "bilibili_contact_info"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Both sides of the relation are indexed by their numeric id.
    up_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    fan_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)

    up_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_sign: Mapped[str | None] = mapped_column(Text, nullable=True)
    up_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    fan_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)

    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class BilibiliUpDynamic(Base):
    """ORM mapping for the ``bilibili_up_dynamic`` table."""

    __tablename__ = "bilibili_up_dynamic"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    dynamic_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Column names `text` and `type` mirror the table schema as declared here.
    text: Mapped[str | None] = mapped_column(Text, nullable=True)
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    pub_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Integer-typed totals.
    total_comments: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_forwards: Mapped[int | None] = mapped_column(Integer, nullable=True)
    total_liked: Mapped[int | None] = mapped_column(Integer, nullable=True)

    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class DouyinAweme(Base):
    """ORM mapping for the ``douyin_aweme`` table."""

    __tablename__ = "douyin_aweme"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Author fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Post fields; aweme_id and create_time are indexed.
    aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    aweme_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)

    # Counters are stored as text columns.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)

    # URL fields.
    aweme_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    music_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_download_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class DouyinAwemeComment(Base):
    """ORM mapping for the ``douyin_aweme_comment`` table."""

    __tablename__ = "douyin_aweme_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Commenter fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    sec_uid: Mapped[str | None] = mapped_column(String(255), nullable=True)
    short_user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_unique_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_signature: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment fields; comment_id and aweme_id are indexed.
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    aweme_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Text columns with Python-side defaults of "0" and "" respectively.
    like_count: Mapped[str | None] = mapped_column(Text, default="0", nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
class DyCreator(Base):
    """ORM mapping for the ``dy_creator`` table."""

    __tablename__ = "dy_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Statistics are stored as text/string columns.
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    videos_count: Mapped[str | None] = mapped_column(String(255), nullable=True)
class KuaishouVideo(Base):
    """ORM mapping for the ``kuaishou_video`` table."""

    __tablename__ = "kuaishou_video"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Video fields; video_id and create_time are indexed.
    video_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    video_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Spelling `viewd_count` is the actual column name and must be kept.
    viewd_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_cover_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_play_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class KuaishouVideoComment(Base):
    """ORM mapping for the ``kuaishou_video_comment`` table."""

    __tablename__ = "kuaishou_video_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Unlike most sibling tables, user_id is an unbounded Text column here.
    user_id: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment fields; comment_id and video_id are indexed.
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    video_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
class WeiboNote(Base):
    """ORM mapping for the ``weibo_note`` table."""

    __tablename__ = "weibo_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Author fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note fields; note_id, create_time and create_date_time are indexed.
    note_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    create_date_time: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comments_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    shared_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class WeiboNoteComment(Base):
    """ORM mapping for the ``weibo_note_comment`` table."""

    __tablename__ = "weibo_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Commenter fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    profile_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment fields; comment_id, note_id and create_date_time are indexed.
    comment_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    note_id: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    create_date_time: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    comment_like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    sub_comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
class WeiboCreator(Base):
    """ORM mapping for the ``weibo_creator`` table."""

    __tablename__ = "weibo_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Statistics and tags are stored as text columns.
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
class XhsCreator(Base):
    """ORM mapping for the ``xhs_creator`` table."""

    __tablename__ = "xhs_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Statistics and tags are stored as text columns.
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    interaction: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
class XhsNote(Base):
    """ORM mapping for the ``xhs_note`` table."""

    __tablename__ = "xhs_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Author fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Note fields; note_id and time are indexed.
    note_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    type: Mapped[str | None] = mapped_column(Text, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    video_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    last_update_time: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Counters and lists are stored as text columns.
    liked_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    collected_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    comment_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_count: Mapped[str | None] = mapped_column(Text, nullable=True)
    image_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    tag_list: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    xsec_token: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class XhsNoteComment(Base):
    """ORM mapping for the ``xhs_note_comment`` table."""

    __tablename__ = "xhs_note_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Commenter fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Comment fields; comment_id and create_time are indexed, note_id is not.
    comment_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    create_time: Mapped[int | None] = mapped_column(BigInteger, index=True, nullable=True)
    note_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    # sub_comment_count is an Integer here, while like_count is Text.
    sub_comment_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    pictures: Mapped[str | None] = mapped_column(Text, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    like_count: Mapped[str | None] = mapped_column(Text, nullable=True)
class TiebaNote(Base):
    """ORM mapping for the ``tieba_note`` table."""

    __tablename__ = "tieba_note"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # NOTE(review): length 644 is kept exactly as declared — confirm it is intentional.
    note_id: Mapped[str | None] = mapped_column(String(644), index=True, nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    publish_time: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)

    # Author fields default to the empty string on the Python side.
    user_link: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Forum fields.
    tieba_id: Mapped[str | None] = mapped_column(String(255), default="", nullable=True)
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Spelling `replay` is the actual column name and must be kept.
    total_replay_num: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    total_replay_page: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    source_keyword: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class TiebaComment(Base):
    """ORM mapping for the ``tieba_comment`` table."""

    __tablename__ = "tieba_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    comment_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(255), default="", nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Author fields default to the empty string on the Python side.
    user_link: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, default="", nullable=True)

    # Forum fields.
    tieba_id: Mapped[str | None] = mapped_column(String(255), default="", nullable=True)
    tieba_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    tieba_link: Mapped[str | None] = mapped_column(Text, nullable=True)

    publish_time: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, default="", nullable=True)
    sub_comment_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)

    # Indexed reference to the note this comment belongs to (no FK constraint is declared).
    note_id: Mapped[str | None] = mapped_column(String(255), index=True, nullable=True)
    note_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class TiebaCreator(Base):
    """ORM mapping for the ``tieba_creator`` table."""

    __tablename__ = "tieba_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_name: Mapped[str | None] = mapped_column(Text, nullable=True)
    nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Statistics are stored as text columns.
    follows: Mapped[str | None] = mapped_column(Text, nullable=True)
    fans: Mapped[str | None] = mapped_column(Text, nullable=True)
    registration_duration: Mapped[str | None] = mapped_column(Text, nullable=True)
class ZhihuContent(Base):
    """ORM mapping for the ``zhihu_content`` table."""

    __tablename__ = "zhihu_content"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Content fields; content_id and created_time are indexed.
    content_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    question_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    title: Mapped[str | None] = mapped_column(Text, nullable=True)
    desc: Mapped[str | None] = mapped_column(Text, nullable=True)
    created_time: Mapped[str | None] = mapped_column(String(32), index=True, nullable=True)
    updated_time: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters with a Python-side default of 0.
    voteup_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    comment_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    # Unlike sibling tables, source_keyword has no default here.
    source_keyword: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Author fields.
    user_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_url_token: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    # Optional links to daily_topics / crawling_tasks; set to NULL when the parent row is deleted.
    topic_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("daily_topics.topic_id", ondelete="SET NULL"),
        nullable=True,
    )
    crawling_task_id: Mapped[str | None] = mapped_column(
        String(64),
        ForeignKey("crawling_tasks.task_id", ondelete="SET NULL"),
        nullable=True,
    )
class ZhihuComment(Base):
    """ORM mapping for the ``zhihu_comment`` table."""

    __tablename__ = "zhihu_comment"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Comment fields; comment_id and publish_time are indexed.
    comment_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
    parent_comment_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    publish_time: Mapped[str | None] = mapped_column(String(32), index=True, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Integer counters with a Python-side default of 0.
    sub_comment_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    like_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)
    dislike_count: Mapped[int | None] = mapped_column(Integer, default=0, nullable=True)

    # Indexed reference to the commented content (no FK constraint is declared).
    content_id: Mapped[str | None] = mapped_column(String(64), index=True, nullable=True)
    content_type: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Commenter fields.
    user_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
class ZhihuCreator(Base):
    """ORM model mapped to the ``zhihu_creator`` table.

    ``user_id`` is declared unique and indexed. Every column other than the
    auto-increment ``id`` primary key is nullable, and all counter columns
    carry a Python-side default of ``0``.
    """

    __tablename__ = "zhihu_creator"

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Unique, indexed user identifier.
    user_id: Mapped[str | None] = mapped_column(
        String(64), nullable=True, index=True, unique=True
    )

    # Free-text profile columns.
    user_link: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_nickname: Mapped[str | None] = mapped_column(Text, nullable=True)
    user_avatar: Mapped[str | None] = mapped_column(Text, nullable=True)
    url_token: Mapped[str | None] = mapped_column(Text, nullable=True)
    gender: Mapped[str | None] = mapped_column(Text, nullable=True)
    ip_location: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Counters, each defaulting to 0 when no value is supplied.
    follows: Mapped[int | None] = mapped_column(Integer, nullable=True, default=0)
    fans: Mapped[int | None] = mapped_column(Integer, nullable=True, default=0)
    # NOTE: "anwser_count" is the attribute and column name as declared;
    # renaming it would change the database schema.
    anwser_count: Mapped[int | None] = mapped_column(
        Integer, nullable=True, default=0
    )
    video_count: Mapped[int | None] = mapped_column(Integer, nullable=True, default=0)
    question_count: Mapped[int | None] = mapped_column(
        Integer, nullable=True, default=0
    )
    article_count: Mapped[int | None] = mapped_column(
        Integer, nullable=True, default=0
    )
    column_count: Mapped[int | None] = mapped_column(
        Integer, nullable=True, default=0
    )
    get_voteup_count: Mapped[int | None] = mapped_column(
        Integer, nullable=True, default=0
    )

    # Integer timestamp columns (unit not visible here; confirm against writers).
    add_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    last_modify_ts: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+126
View File
@@ -0,0 +1,126 @@
"""
MindSpider 数据库ORM模型(SQLAlchemy 2.x)
此模块定义 MindSpider 扩展表(与原 MediaCrawler 表解耦)的 ORM 模型。
数据模型定义位置:
- 本文件(MindSpider/schema/models_sa.py)
"""
from __future__ import annotations
from typing import Optional
from datetime import date
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Integer, String, Text, BigInteger, Date, Float, ForeignKey, Index, UniqueConstraint
from sqlalchemy.schema import ForeignKeyConstraint
from sqlalchemy.orm import relationship
# Public names exported by this module: the declarative base class and the
# four ORM model classes defined below.
__all__ = [
"Base",
"DailyNews",
"DailyTopic",
"TopicNewsRelation",
"CrawlingTask",
]
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this module inherit from."""
class DailyNews(Base):
    """ORM model mapped to the ``daily_news`` table.

    ``news_id`` is unique on its own and also as part of the composite
    (``news_id``, ``source_platform``, ``crawl_date``) constraint. Secondary
    indexes cover ``crawl_date``, ``source_platform`` and ``rank_position``.
    """

    __tablename__ = "daily_news"
    __table_args__ = (
        # Stand-alone uniqueness on news_id so that foreign keys can
        # reference this column.
        UniqueConstraint("news_id", name="uq_daily_news_id_unique"),
        UniqueConstraint(
            "news_id",
            "source_platform",
            "crawl_date",
            name="uq_daily_news_unique",
        ),
        Index("idx_daily_news_date", "crawl_date"),
        Index("idx_daily_news_platform", "source_platform"),
        Index("idx_daily_news_rank", "rank_position"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required columns.
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)
    source_platform: Mapped[str] = mapped_column(String(32), nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)

    # Optional columns.
    url: Mapped[Optional[str]] = mapped_column(String(512))
    description: Mapped[Optional[str]] = mapped_column(Text)
    extra_info: Mapped[Optional[str]] = mapped_column(Text)

    crawl_date: Mapped[date] = mapped_column(Date, nullable=False)
    rank_position: Mapped[Optional[int]] = mapped_column(Integer)

    # Required integer timestamp columns.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class DailyTopic(Base):
    """ORM model mapped to the ``daily_topics`` table.

    ``topic_id`` is unique on its own and also as part of the composite
    (``topic_id``, ``extract_date``) constraint. ``news_count`` defaults to
    ``0`` and ``processing_status`` defaults to ``"pending"``.
    """

    __tablename__ = "daily_topics"
    __table_args__ = (
        # Stand-alone uniqueness on topic_id so that foreign keys can
        # reference this column.
        UniqueConstraint("topic_id", name="uq_daily_topics_id_unique"),
        UniqueConstraint(
            "topic_id",
            "extract_date",
            name="uq_daily_topics_unique",
        ),
        Index("idx_daily_topics_date", "extract_date"),
        Index("idx_daily_topics_status", "processing_status"),
        Index("idx_daily_topics_score", "relevance_score"),
        Index("idx_topic_date_status", "extract_date", "processing_status"),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required columns.
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_name: Mapped[str] = mapped_column(String(255), nullable=False)

    # Optional text columns.
    topic_description: Mapped[Optional[str]] = mapped_column(Text)
    keywords: Mapped[Optional[str]] = mapped_column(Text)

    extract_date: Mapped[date] = mapped_column(Date, nullable=False)
    relevance_score: Mapped[Optional[float]] = mapped_column(Float)

    # Columns with Python-side defaults.
    news_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    processing_status: Mapped[Optional[str]] = mapped_column(
        String(16), default="pending"
    )

    # Required integer timestamp columns.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class TopicNewsRelation(Base):
    """ORM model mapped to the ``topic_news_relation`` table.

    Each row references ``daily_topics.topic_id`` and ``daily_news.news_id``
    through foreign keys declared with ``ON DELETE CASCADE``. The triple
    (``topic_id``, ``news_id``, ``extract_date``) is unique.
    """

    __tablename__ = "topic_news_relation"
    __table_args__ = (
        UniqueConstraint(
            "topic_id",
            "news_id",
            "extract_date",
            name="uq_topic_news_unique",
        ),
        Index("idx_topic_news_topic", "topic_id"),
        Index("idx_topic_news_news", "news_id"),
        Index("idx_topic_news_date", "extract_date"),
        # Both foreign keys are declared with ondelete="CASCADE".
        ForeignKeyConstraint(
            ["topic_id"],
            ["daily_topics.topic_id"],
            ondelete="CASCADE",
        ),
        ForeignKeyConstraint(
            ["news_id"],
            ["daily_news.news_id"],
            ondelete="CASCADE",
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Foreign-key columns (constraints are declared in __table_args__).
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)

    relation_score: Mapped[Optional[float]] = mapped_column(Float)
    extract_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required integer timestamp column.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class CrawlingTask(Base):
    """ORM model mapped to the ``crawling_tasks`` table.

    ``task_id`` is unique. ``topic_id`` references ``daily_topics.topic_id``
    through a foreign key declared with ``ON DELETE CASCADE``.
    ``task_status`` defaults to ``"pending"`` and the three counters default
    to ``0``.
    """

    __tablename__ = "crawling_tasks"
    __table_args__ = (
        UniqueConstraint("task_id", name="uq_crawling_tasks_unique"),
        Index("idx_crawling_tasks_topic", "topic_id"),
        Index("idx_crawling_tasks_platform", "platform"),
        Index("idx_crawling_tasks_status", "task_status"),
        Index("idx_crawling_tasks_date", "scheduled_date"),
        Index(
            "idx_task_topic_platform",
            "topic_id",
            "platform",
            "task_status",
        ),
        ForeignKeyConstraint(
            ["topic_id"],
            ["daily_topics.topic_id"],
            ondelete="CASCADE",
        ),
    )

    # Surrogate auto-increment primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required columns.
    task_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    platform: Mapped[str] = mapped_column(String(32), nullable=False)
    search_keywords: Mapped[str] = mapped_column(Text, nullable=False)

    task_status: Mapped[Optional[str]] = mapped_column(String(16), default="pending")

    # Optional integer time columns (unit not visible here; confirm against writers).
    start_time: Mapped[Optional[int]] = mapped_column(BigInteger)
    end_time: Mapped[Optional[int]] = mapped_column(BigInteger)

    # Counters, each defaulting to 0 when no value is supplied.
    total_crawled: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    success_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    error_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)

    # Optional free-text columns.
    error_message: Mapped[Optional[str]] = mapped_column(Text)
    config_params: Mapped[Optional[str]] = mapped_column(Text)

    scheduled_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required integer timestamp columns.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)