1. 同步MediaCrawler为最新版本

2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
Doiiars
2025-11-03 22:38:34 +08:00
parent 62fac9ee2e
commit f4fe4141d4
155 changed files with 9414 additions and 6247 deletions
@@ -0,0 +1,35 @@
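# persist-1<persist1@126.com>
# Reason: turn db.py into an importable module, remove the direct-execution entry point, and fix relative-import problems.
# Side effects: none
# Rollback strategy: restore this file.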
import asyncio
import sys
from pathlib import Path
# Add project root to sys.path.
# The root is taken to be the parent of the directory that contains this file
# (``parents[1]`` of the resolved path). It is appended only when missing, so
# re-importing this module does not add duplicate entries.
# NOTE(review): presumably needed so the `tools` and `database` imports that
# follow resolve when this module is loaded from another working directory.
project_root = Path(__file__).resolve().parents[1]
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from tools import utils
from database.db_session import create_tables
async def init_table_schema(db_type: str):
    """Create the table schema for the given database backend.

    Logs a start message, awaits ``create_tables(db_type)`` (which builds the
    tables from the ORM models) and logs a success message afterwards.

    Args:
        db_type: The type of database, 'sqlite', 'mysql', or 'postgresql'.
    """
    log = utils.logger
    log.info(f"[init_table_schema] begin init {db_type} table schema ...")
    await create_tables(db_type)
    log.info(f"[init_table_schema] {db_type} table schema init successful")
async def init_db(db_type: str = None):
    """Initialise the database by delegating to ``init_table_schema``.

    Args:
        db_type: Backend identifier, forwarded unchanged (defaults to None).
    """
    schema_init = init_table_schema(db_type)
    await schema_init
async def close():
    """No-op hook kept so database connections can be closed here in the future."""
    return None
@@ -0,0 +1,87 @@
from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from contextlib import asynccontextmanager
from .models import Base
import config
from config.db_config import mysql_db_config, sqlite_db_config, postgresql_db_config
# Keep a cache of engines.
# Keys are the db_type strings the engine was requested with; values are the
# async engines created for them, so each db_type gets at most one engine.
_engines = {}
async def create_database_if_not_exists(db_type: str):
    """Create the configured MySQL or PostgreSQL database when it is missing.

    * ``'mysql'`` / ``'db'``: connects to the server without selecting a
      database and runs ``CREATE DATABASE IF NOT EXISTS``.
    * ``'postgresql'``: connects to the default ``postgres`` database in
      AUTOCOMMIT mode, checks ``pg_database`` for the target name and creates
      it only when absent.
    * Any other value: nothing is done.

    The temporary engine is always disposed, even when a statement fails.

    Args:
        db_type: Backend identifier selecting which branch runs.
    """
    if db_type == "mysql" or db_type == "db":
        # Connect to the server without a database
        server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}"
        # Backtick-quote the identifier (doubling embedded backticks) so names
        # containing special characters or reserved words are accepted.
        db_ident = "`" + str(mysql_db_config['db_name']).replace("`", "``") + "`"
        engine = create_async_engine(server_url, echo=False)
        try:
            async with engine.connect() as conn:
                await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {db_ident}"))
        finally:
            # Release the throw-away connection pool on success and on failure.
            await engine.dispose()
    elif db_type == "postgresql":
        # Connect to PostgreSQL default database (postgres) to create target database
        server_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/postgres"
        db_name = str(postgresql_db_config['db_name'])
        # Double-quote the identifier so its exact spelling (case, hyphens, ...)
        # is preserved and matches the name later used in the connection URL.
        db_ident = '"' + db_name.replace('"', '""') + '"'
        # CREATE DATABASE cannot run inside a transaction block, hence AUTOCOMMIT.
        engine = create_async_engine(server_url, echo=False, isolation_level="AUTOCOMMIT")
        try:
            async with engine.connect() as conn:
                # PostgreSQL has no CREATE DATABASE IF NOT EXISTS, so look the
                # name up first; the value is bound, not spliced into the SQL.
                result = await conn.execute(
                    text("SELECT 1 FROM pg_database WHERE datname = :db_name"),
                    {"db_name": db_name},
                )
                if result.scalar() is None:
                    await conn.execute(text(f"CREATE DATABASE {db_ident}"))
        finally:
            # Release the throw-away connection pool on success and on failure.
            await engine.dispose()
def get_async_engine(db_type: str = None):
    """Return the cached async engine for a backend, creating it on first use.

    Args:
        db_type: 'sqlite', 'mysql' (alias 'db') or 'postgresql'. 'json' and
            'csv' have no engine. When None, ``config.SAVE_DATA_OPTION`` is
            used instead.

    Returns:
        The async engine for the backend, or None for 'json' / 'csv'.

    Raises:
        ValueError: If the backend is none of the values listed above.
    """
    backend = config.SAVE_DATA_OPTION if db_type is None else db_type

    # Reuse an engine that was already built for this backend string.
    try:
        return _engines[backend]
    except KeyError:
        pass

    # File-based outputs do not use a database engine.
    if backend in ("json", "csv"):
        return None

    if backend == "sqlite":
        db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
    elif backend in ("mysql", "db"):
        db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
    elif backend == "postgresql":
        db_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
    else:
        raise ValueError(f"Unsupported database type: {backend}")

    _engines[backend] = create_async_engine(db_url, echo=False)
    return _engines[backend]
async def create_tables(db_type: str = None):
    """Ensure the database exists, then create all tables declared on ``Base``.

    Args:
        db_type: Backend identifier; falls back to ``config.SAVE_DATA_OPTION``
            when None.
    """
    target = db_type if db_type is not None else config.SAVE_DATA_OPTION
    await create_database_if_not_exists(target)

    engine = get_async_engine(target)
    if not engine:
        # No engine for this backend (get_async_engine returned None).
        return

    async with engine.begin() as conn:
        # create_all is synchronous, so it is run through run_sync.
        await conn.run_sync(Base.metadata.create_all)
@asynccontextmanager
async def get_session() -> AsyncSession:
    """Async context manager yielding a session for ``config.SAVE_DATA_OPTION``.

    Yields None when no engine is available for the configured option.
    Otherwise yields a new ``AsyncSession``, commits after the ``with`` body
    finishes normally, rolls back and re-raises on an exception, and closes
    the session in every case.
    """
    engine = get_async_engine(config.SAVE_DATA_OPTION)
    if engine:
        session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
        session = session_factory()
        try:
            yield session
            await session.commit()
        except Exception as exc:
            # Undo partial work before propagating the caller's error.
            await session.rollback()
            raise exc
        finally:
            await session.close()
    else:
        yield None
@@ -0,0 +1,434 @@
from sqlalchemy import create_engine, Column, Integer, Text, String, BigInteger
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Declarative base shared by every ORM model class defined in this module.
Base = declarative_base()
class BilibiliVideo(Base):
    """ORM model mapped to the ``bilibili_video`` table."""

    __tablename__ = 'bilibili_video'
    id = Column(Integer, primary_key=True)  # integer primary key
    # video_id is required, unique and indexed; video_url is required.
    video_id = Column(BigInteger, nullable=False, index=True, unique=True)
    video_url = Column(Text, nullable=False)
    user_id = Column(BigInteger, index=True)
    nickname = Column(Text)
    avatar = Column(Text)
    liked_count = Column(Integer)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    video_type = Column(Text)
    title = Column(Text)
    desc = Column(Text)
    create_time = Column(BigInteger, index=True)
    # Unlike liked_count (Integer), the counters below are declared as Text.
    disliked_count = Column(Text)
    video_play_count = Column(Text)
    video_favorite_count = Column(Text)
    video_share_count = Column(Text)
    video_coin_count = Column(Text)
    video_danmaku = Column(Text)
    video_comment = Column(Text)
    video_cover_url = Column(Text)
    source_keyword = Column(Text, default='')  # defaults to an empty string
class BilibiliVideoComment(Base):
    """ORM model mapped to the ``bilibili_video_comment`` table."""

    __tablename__ = 'bilibili_video_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    # NOTE(review): user_id is String(255) here but BigInteger in
    # BilibiliVideo / BilibiliUpInfo — confirm the difference is intended.
    user_id = Column(String(255))
    nickname = Column(Text)
    sex = Column(Text)
    sign = Column(Text)
    avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    # Both identifiers are indexed.
    comment_id = Column(BigInteger, index=True)
    video_id = Column(BigInteger, index=True)
    content = Column(Text)
    create_time = Column(BigInteger)
    sub_comment_count = Column(Text)
    parent_comment_id = Column(String(255))
    like_count = Column(Text, default='0')  # Text column defaulting to the string '0'
class BilibiliUpInfo(Base):
    """ORM model mapped to the ``bilibili_up_info`` table."""

    __tablename__ = 'bilibili_up_info'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(BigInteger, index=True)  # indexed
    nickname = Column(Text)
    sex = Column(Text)
    sign = Column(Text)
    avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    # Statistics and flags are all plain Integer columns.
    total_fans = Column(Integer)
    total_liked = Column(Integer)
    user_rank = Column(Integer)
    is_official = Column(Integer)
class BilibiliContactInfo(Base):
    """ORM model mapped to the ``bilibili_contact_info`` table.

    Each row stores paired ``up_*`` and ``fan_*`` columns.
    """

    __tablename__ = 'bilibili_contact_info'
    id = Column(Integer, primary_key=True)  # integer primary key
    # Both sides of the pair are indexed.
    up_id = Column(BigInteger, index=True)
    fan_id = Column(BigInteger, index=True)
    up_name = Column(Text)
    fan_name = Column(Text)
    up_sign = Column(Text)
    fan_sign = Column(Text)
    up_avatar = Column(Text)
    fan_avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
class BilibiliUpDynamic(Base):
    """ORM model mapped to the ``bilibili_up_dynamic`` table."""

    __tablename__ = 'bilibili_up_dynamic'
    id = Column(Integer, primary_key=True)  # integer primary key
    dynamic_id = Column(BigInteger, index=True)  # indexed
    user_id = Column(String(255))
    user_name = Column(Text)
    # Column attributes literally named `text` and `type`.
    text = Column(Text)
    type = Column(Text)
    pub_ts = Column(BigInteger)
    total_comments = Column(Integer)
    total_forwards = Column(Integer)
    total_liked = Column(Integer)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
class DouyinAweme(Base):
    """ORM model mapped to the ``douyin_aweme`` table."""

    __tablename__ = 'douyin_aweme'
    id = Column(Integer, primary_key=True)  # integer primary key
    # User identifiers are all String(255) columns.
    user_id = Column(String(255))
    sec_uid = Column(String(255))
    short_user_id = Column(String(255))
    user_unique_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    user_signature = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    aweme_id = Column(BigInteger, index=True)  # indexed
    aweme_type = Column(Text)
    title = Column(Text)
    desc = Column(Text)
    create_time = Column(BigInteger, index=True)  # indexed
    # Counters are declared as Text, not integers.
    liked_count = Column(Text)
    comment_count = Column(Text)
    share_count = Column(Text)
    collected_count = Column(Text)
    aweme_url = Column(Text)
    cover_url = Column(Text)
    video_download_url = Column(Text)
    music_download_url = Column(Text)
    note_download_url = Column(Text)
    source_keyword = Column(Text, default='')  # defaults to an empty string
class DouyinAwemeComment(Base):
    """ORM model mapped to the ``douyin_aweme_comment`` table."""

    __tablename__ = 'douyin_aweme_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    # User identifiers are all String(255) columns.
    user_id = Column(String(255))
    sec_uid = Column(String(255))
    short_user_id = Column(String(255))
    user_unique_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    user_signature = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    # Both identifiers are indexed.
    comment_id = Column(BigInteger, index=True)
    aweme_id = Column(BigInteger, index=True)
    content = Column(Text)
    create_time = Column(BigInteger)
    sub_comment_count = Column(Text)
    parent_comment_id = Column(String(255))
    like_count = Column(Text, default='0')  # Text column defaulting to the string '0'
    pictures = Column(Text, default='')  # defaults to an empty string
class DyCreator(Base):
    """ORM model mapped to the ``dy_creator`` table."""

    __tablename__ = 'dy_creator'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    desc = Column(Text)
    gender = Column(Text)
    # Counts are declared as text types (Text / String), not integers.
    follows = Column(Text)
    fans = Column(Text)
    interaction = Column(Text)
    videos_count = Column(String(255))
class KuaishouVideo(Base):
    """ORM model mapped to the ``kuaishou_video`` table."""

    __tablename__ = 'kuaishou_video'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(64))
    nickname = Column(Text)
    avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    video_id = Column(String(255), index=True)  # indexed string identifier
    video_type = Column(Text)
    title = Column(Text)
    desc = Column(Text)
    create_time = Column(BigInteger, index=True)  # indexed
    liked_count = Column(Text)
    # NOTE(review): "viewd" looks like a typo for "viewed"; renaming it would
    # change the column name, so it is left untouched.
    viewd_count = Column(Text)
    video_url = Column(Text)
    video_cover_url = Column(Text)
    video_play_url = Column(Text)
    source_keyword = Column(Text, default='')  # defaults to an empty string
class KuaishouVideoComment(Base):
    """ORM model mapped to the ``kuaishou_video_comment`` table."""

    __tablename__ = 'kuaishou_video_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    # NOTE(review): user_id is Text here but String(64) in KuaishouVideo —
    # confirm the difference is intended.
    user_id = Column(Text)
    nickname = Column(Text)
    avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    comment_id = Column(BigInteger, index=True)  # indexed
    video_id = Column(String(255), index=True)  # indexed; same type as KuaishouVideo.video_id
    content = Column(Text)
    create_time = Column(BigInteger)
    sub_comment_count = Column(Text)
class WeiboNote(Base):
    """ORM model mapped to the ``weibo_note`` table."""

    __tablename__ = 'weibo_note'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    gender = Column(Text)
    profile_url = Column(Text)
    ip_location = Column(Text, default='')  # defaults to an empty string
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    note_id = Column(BigInteger, index=True)  # indexed
    content = Column(Text)
    # The creation time is stored twice: as a BigInteger and as a string,
    # both indexed.
    create_time = Column(BigInteger, index=True)
    create_date_time = Column(String(255), index=True)
    # Counters are declared as Text, not integers.
    liked_count = Column(Text)
    comments_count = Column(Text)
    shared_count = Column(Text)
    note_url = Column(Text)
    source_keyword = Column(Text, default='')  # defaults to an empty string
class WeiboNoteComment(Base):
    """ORM model mapped to the ``weibo_note_comment`` table."""

    __tablename__ = 'weibo_note_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    gender = Column(Text)
    profile_url = Column(Text)
    ip_location = Column(Text, default='')  # defaults to an empty string
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    # Both identifiers are indexed.
    comment_id = Column(BigInteger, index=True)
    note_id = Column(BigInteger, index=True)
    content = Column(Text)
    # Only the string form of the creation time is indexed here.
    create_time = Column(BigInteger)
    create_date_time = Column(String(255), index=True)
    comment_like_count = Column(Text)
    sub_comment_count = Column(Text)
    parent_comment_id = Column(String(255))
class WeiboCreator(Base):
    """ORM model mapped to the ``weibo_creator`` table."""

    __tablename__ = 'weibo_creator'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    desc = Column(Text)
    gender = Column(Text)
    # Counts are declared as Text, not integers.
    follows = Column(Text)
    fans = Column(Text)
    tag_list = Column(Text)
class XhsCreator(Base):
    """ORM model mapped to the ``xhs_creator`` table."""

    __tablename__ = 'xhs_creator'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    desc = Column(Text)
    gender = Column(Text)
    # Counts are declared as Text, not integers.
    follows = Column(Text)
    fans = Column(Text)
    interaction = Column(Text)
    tag_list = Column(Text)
class XhsNote(Base):
    """ORM model mapped to the ``xhs_note`` table."""

    __tablename__ = 'xhs_note'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    note_id = Column(String(255), index=True)  # indexed string identifier
    type = Column(Text)
    title = Column(Text)
    desc = Column(Text)
    video_url = Column(Text)
    time = Column(BigInteger, index=True)  # indexed
    last_update_time = Column(BigInteger)
    # Counters are declared as Text, not integers.
    liked_count = Column(Text)
    collected_count = Column(Text)
    comment_count = Column(Text)
    share_count = Column(Text)
    image_list = Column(Text)
    tag_list = Column(Text)
    note_url = Column(Text)
    source_keyword = Column(Text, default='')  # defaults to an empty string
    xsec_token = Column(Text)
class XhsNoteComment(Base):
    """ORM model mapped to the ``xhs_note_comment`` table."""

    __tablename__ = 'xhs_note_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(255))
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    # comment_id and create_time are indexed; note_id is not.
    comment_id = Column(String(255), index=True)
    create_time = Column(BigInteger, index=True)
    note_id = Column(String(255))
    content = Column(Text)
    # sub_comment_count is an Integer here, while like_count is Text.
    sub_comment_count = Column(Integer)
    pictures = Column(Text)
    parent_comment_id = Column(String(255))
    like_count = Column(Text)
class TiebaNote(Base):
    """ORM model mapped to the ``tieba_note`` table."""

    __tablename__ = 'tieba_note'
    id = Column(Integer, primary_key=True)  # integer primary key
    # NOTE(review): length 644 differs from the String(255) used for
    # TiebaComment.note_id — confirm it is intentional before changing it,
    # since altering the length changes the schema.
    note_id = Column(String(644), index=True)
    title = Column(Text)
    desc = Column(Text)
    note_url = Column(Text)
    publish_time = Column(String(255), index=True)  # stored as an indexed string
    # User fields default to an empty string.
    user_link = Column(Text, default='')
    user_nickname = Column(Text, default='')
    user_avatar = Column(Text, default='')
    tieba_id = Column(String(255), default='')
    tieba_name = Column(Text)
    tieba_link = Column(Text)
    # Integer counters defaulting to 0.
    total_replay_num = Column(Integer, default=0)
    total_replay_page = Column(Integer, default=0)
    ip_location = Column(Text, default='')
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    source_keyword = Column(Text, default='')  # defaults to an empty string
class TiebaComment(Base):
    """ORM model mapped to the ``tieba_comment`` table."""

    __tablename__ = 'tieba_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    comment_id = Column(String(255), index=True)  # indexed string identifier
    parent_comment_id = Column(String(255), default='')  # defaults to an empty string
    content = Column(Text)
    # User fields default to an empty string.
    user_link = Column(Text, default='')
    user_nickname = Column(Text, default='')
    user_avatar = Column(Text, default='')
    tieba_id = Column(String(255), default='')
    tieba_name = Column(Text)
    tieba_link = Column(Text)
    publish_time = Column(String(255), index=True)  # stored as an indexed string
    ip_location = Column(Text, default='')
    sub_comment_count = Column(Integer, default=0)  # Integer defaulting to 0
    note_id = Column(String(255), index=True)  # indexed
    note_url = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
class TiebaCreator(Base):
    """ORM model mapped to the ``tieba_creator`` table."""

    __tablename__ = 'tieba_creator'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(64))
    user_name = Column(Text)
    nickname = Column(Text)
    avatar = Column(Text)
    ip_location = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
    gender = Column(Text)
    # Counts and duration are declared as Text.
    follows = Column(Text)
    fans = Column(Text)
    registration_duration = Column(Text)
class ZhihuContent(Base):
    """ORM model mapped to the ``zhihu_content`` table."""

    __tablename__ = 'zhihu_content'
    id = Column(Integer, primary_key=True)  # integer primary key
    content_id = Column(String(64), index=True)  # indexed string identifier
    content_type = Column(Text)
    content_text = Column(Text)
    content_url = Column(Text)
    question_id = Column(String(255))
    title = Column(Text)
    desc = Column(Text)
    # created_time is an indexed String(32); updated_time is plain Text.
    created_time = Column(String(32), index=True)
    updated_time = Column(Text)
    # Integer counters defaulting to 0.
    voteup_count = Column(Integer, default=0)
    comment_count = Column(Integer, default=0)
    source_keyword = Column(Text)  # no default here, unlike the other models
    user_id = Column(String(255))
    user_link = Column(Text)
    user_nickname = Column(Text)
    user_avatar = Column(Text)
    user_url_token = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
# persist-1<persist1@126.com>
# Reason: fix ORM model definition errors so the model matches the database table schema.
# Side effects: none
# Rollback strategy: restore this line
class ZhihuComment(Base):
    """ORM model mapped to the ``zhihu_comment`` table."""

    __tablename__ = 'zhihu_comment'
    id = Column(Integer, primary_key=True)  # integer primary key
    comment_id = Column(String(64), index=True)  # indexed string identifier
    parent_comment_id = Column(String(64))
    content = Column(Text)
    publish_time = Column(String(32), index=True)  # stored as an indexed string
    ip_location = Column(Text)
    # Integer counters defaulting to 0.
    sub_comment_count = Column(Integer, default=0)
    like_count = Column(Integer, default=0)
    dislike_count = Column(Integer, default=0)
    content_id = Column(String(64), index=True)  # indexed; same type as ZhihuContent.content_id
    content_type = Column(Text)
    user_id = Column(String(64))
    user_link = Column(Text)
    user_nickname = Column(Text)
    user_avatar = Column(Text)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)
class ZhihuCreator(Base):
    """ORM model mapped to the ``zhihu_creator`` table."""

    __tablename__ = 'zhihu_creator'
    id = Column(Integer, primary_key=True)  # integer primary key
    user_id = Column(String(64), unique=True, index=True)  # unique and indexed
    user_link = Column(Text)
    user_nickname = Column(Text)
    user_avatar = Column(Text)
    url_token = Column(Text)
    gender = Column(Text)
    ip_location = Column(Text)
    # All counters below are Integer columns defaulting to 0.
    follows = Column(Integer, default=0)
    fans = Column(Integer, default=0)
    # NOTE(review): "anwser" looks like a typo for "answer"; renaming it would
    # change the column name, so it is left untouched.
    anwser_count = Column(Integer, default=0)
    video_count = Column(Integer, default=0)
    question_count = Column(Integer, default=0)
    article_count = Column(Integer, default=0)
    column_count = Column(Integer, default=0)
    get_voteup_count = Column(Integer, default=0)
    add_ts = Column(BigInteger)
    last_modify_ts = Column(BigInteger)