1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
+126
View File
@@ -0,0 +1,126 @@
"""
MindSpider 数据库ORM模型(SQLAlchemy 2.x)
此模块定义 MindSpider 扩展表(与原 MediaCrawler 表解耦)的 ORM 模型。
数据模型定义位置:
- 本文件(MindSpider/schema/models_sa.py)
"""
from __future__ import annotations
from typing import Optional
from datetime import date
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Integer, String, Text, BigInteger, Date, Float, ForeignKey, Index, UniqueConstraint
from sqlalchemy.schema import ForeignKeyConstraint
from sqlalchemy.orm import relationship
# Public names of this module: the declarative base followed by the four
# ORM model classes defined below.
__all__ = [
    "Base",
    "DailyNews",
    "DailyTopic",
    "TopicNewsRelation",
    "CrawlingTask",
]
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this module inherit from."""
class DailyNews(Base):
    """ORM model mapped to the ``daily_news`` table.

    ``news_id`` has a single-column unique constraint of its own so that it
    can be used as a foreign-key target; a second unique constraint covers
    the ``(news_id, source_platform, crawl_date)`` combination.
    """

    __tablename__ = "daily_news"

    __table_args__ = (
        # Single-column uniqueness, added so news_id can be referenced by
        # foreign keys.
        UniqueConstraint("news_id", name="uq_daily_news_id_unique"),
        UniqueConstraint(
            "news_id",
            "source_platform",
            "crawl_date",
            name="uq_daily_news_unique",
        ),
        # Single-column lookup indexes.
        Index("idx_daily_news_date", "crawl_date"),
        Index("idx_daily_news_platform", "source_platform"),
        Index("idx_daily_news_rank", "rank_position"),
    )

    # Auto-incrementing integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)
    source_platform: Mapped[str] = mapped_column(String(32), nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)

    # Optional fields (nullable, no default declared).
    url: Mapped[Optional[str]] = mapped_column(String(512))
    description: Mapped[Optional[str]] = mapped_column(Text)
    # NOTE(review): stored as free-form text; the serialization format is not
    # visible in this module.
    extra_info: Mapped[Optional[str]] = mapped_column(Text)

    crawl_date: Mapped[date] = mapped_column(Date, nullable=False)
    rank_position: Mapped[Optional[int]] = mapped_column(Integer)

    # Required big-integer timestamps with no default: writers must supply
    # them. NOTE(review): presumably epoch-based; unit not shown here.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class DailyTopic(Base):
    """ORM model mapped to the ``daily_topics`` table.

    ``topic_id`` has a single-column unique constraint of its own so that it
    can be used as a foreign-key target; a second unique constraint covers
    the ``(topic_id, extract_date)`` pair.
    """

    __tablename__ = "daily_topics"

    __table_args__ = (
        # Single-column uniqueness, added so topic_id can be referenced by
        # foreign keys.
        UniqueConstraint("topic_id", name="uq_daily_topics_id_unique"),
        UniqueConstraint("topic_id", "extract_date", name="uq_daily_topics_unique"),
        # Single-column lookup indexes.
        Index("idx_daily_topics_date", "extract_date"),
        Index("idx_daily_topics_status", "processing_status"),
        Index("idx_daily_topics_score", "relevance_score"),
        # Composite index over date and status.
        Index("idx_topic_date_status", "extract_date", "processing_status"),
    )

    # Auto-incrementing integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_name: Mapped[str] = mapped_column(String(255), nullable=False)

    # Optional free-text fields.
    topic_description: Mapped[Optional[str]] = mapped_column(Text)
    # NOTE(review): keywords are stored as text; the encoding/delimiter is
    # not visible in this module.
    keywords: Mapped[Optional[str]] = mapped_column(Text)

    extract_date: Mapped[date] = mapped_column(Date, nullable=False)
    relevance_score: Mapped[Optional[float]] = mapped_column(Float)

    # Nullable columns with Python-side defaults (no server_default declared).
    news_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    processing_status: Mapped[Optional[str]] = mapped_column(
        String(16),
        default="pending",
    )

    # Required big-integer timestamps with no default: writers must supply
    # them. NOTE(review): presumably epoch-based; unit not shown here.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class TopicNewsRelation(Base):
    """ORM model mapped to the ``topic_news_relation`` table.

    Each row links a ``topic_id`` to a ``news_id`` for a given
    ``extract_date``; that triple is unique. Both identifiers are foreign
    keys declared with ``ON DELETE CASCADE``.
    """

    __tablename__ = "topic_news_relation"

    __table_args__ = (
        UniqueConstraint(
            "topic_id",
            "news_id",
            "extract_date",
            name="uq_topic_news_unique",
        ),
        # Single-column lookup indexes.
        Index("idx_topic_news_topic", "topic_id"),
        Index("idx_topic_news_news", "news_id"),
        Index("idx_topic_news_date", "extract_date"),
        # Foreign keys target the business identifiers of the referenced
        # tables rather than their integer primary keys.
        ForeignKeyConstraint(
            ["topic_id"],
            ["daily_topics.topic_id"],
            ondelete="CASCADE",
        ),
        ForeignKeyConstraint(
            ["news_id"],
            ["daily_news.news_id"],
            ondelete="CASCADE",
        ),
    )

    # Auto-incrementing integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Both sides of the link; lengths match the referenced columns' declared
    # String sizes (64 and 128).
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    news_id: Mapped[str] = mapped_column(String(128), nullable=False)

    relation_score: Mapped[Optional[float]] = mapped_column(Float)
    extract_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required big-integer timestamp with no default: writers must supply it.
    # NOTE(review): presumably epoch-based; unit not shown here.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
class CrawlingTask(Base):
    """ORM model mapped to the ``crawling_tasks`` table.

    ``task_id`` is unique, and ``topic_id`` is a foreign key to
    ``daily_topics.topic_id`` declared with ``ON DELETE CASCADE``.
    """

    __tablename__ = "crawling_tasks"

    __table_args__ = (
        UniqueConstraint("task_id", name="uq_crawling_tasks_unique"),
        # Single-column lookup indexes.
        Index("idx_crawling_tasks_topic", "topic_id"),
        Index("idx_crawling_tasks_platform", "platform"),
        Index("idx_crawling_tasks_status", "task_status"),
        Index("idx_crawling_tasks_date", "scheduled_date"),
        # Composite index over topic, platform and status.
        Index(
            "idx_task_topic_platform",
            "topic_id",
            "platform",
            "task_status",
        ),
        ForeignKeyConstraint(
            ["topic_id"],
            ["daily_topics.topic_id"],
            ondelete="CASCADE",
        ),
    )

    # Auto-incrementing integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Required identifying fields.
    task_id: Mapped[str] = mapped_column(String(64), nullable=False)
    topic_id: Mapped[str] = mapped_column(String(64), nullable=False)
    platform: Mapped[str] = mapped_column(String(32), nullable=False)
    # NOTE(review): keywords are stored as text; the encoding/delimiter is
    # not visible in this module.
    search_keywords: Mapped[str] = mapped_column(Text, nullable=False)

    # Nullable status column with a Python-side default (no server_default).
    task_status: Mapped[Optional[str]] = mapped_column(String(16), default="pending")

    # Optional big-integer fields.
    # NOTE(review): presumably timestamps; unit not shown here.
    start_time: Mapped[Optional[int]] = mapped_column(BigInteger)
    end_time: Mapped[Optional[int]] = mapped_column(BigInteger)

    # Nullable counters with a Python-side default of 0.
    total_crawled: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    success_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)
    error_count: Mapped[Optional[int]] = mapped_column(Integer, default=0)

    # Optional free-text fields.
    error_message: Mapped[Optional[str]] = mapped_column(Text)
    # NOTE(review): stored as text; the serialization format is not visible
    # in this module.
    config_params: Mapped[Optional[str]] = mapped_column(Text)

    scheduled_date: Mapped[date] = mapped_column(Date, nullable=False)

    # Required big-integer timestamps with no default: writers must supply
    # them. NOTE(review): presumably epoch-based; unit not shown here.
    add_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)
    last_modify_ts: Mapped[int] = mapped_column(BigInteger, nullable=False)