1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误
3. 支持PG数据库
4. 规范环境变量及配置使用
5. 规范为uv安装
6. 使用loguru
This commit is contained in:
@@ -7,11 +7,12 @@ BroadTopicExtraction模块 - 数据库管理器
|
||||
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime, date
|
||||
from datetime import datetime, date, timedelta
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
import pymysql
|
||||
from pymysql.cursors import DictCursor
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.engine import Engine
|
||||
from loguru import logger
|
||||
|
||||
# 添加项目根目录到路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -22,37 +23,44 @@ try:
|
||||
except ImportError:
|
||||
raise ImportError("无法导入config.py配置文件")
|
||||
|
||||
from config import settings
|
||||
|
||||
class DatabaseManager:
|
||||
"""数据库管理器"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化数据库管理器"""
|
||||
self.connection = None
|
||||
self.engine: Engine = None
|
||||
self.connect()
|
||||
|
||||
def connect(self):
|
||||
"""连接数据库"""
|
||||
try:
|
||||
self.connection = pymysql.connect(
|
||||
host=config.DB_HOST,
|
||||
port=config.DB_PORT,
|
||||
user=config.DB_USER,
|
||||
password=config.DB_PASSWORD,
|
||||
database=config.DB_NAME,
|
||||
charset=config.DB_CHARSET,
|
||||
autocommit=True,
|
||||
cursorclass=DictCursor
|
||||
)
|
||||
print(f"成功连接到数据库: {config.DB_NAME}")
|
||||
dialect = (settings.DB_DIALECT or "mysql").lower()
|
||||
if dialect in ("postgresql", "postgres"):
|
||||
url = f"postgresql+psycopg://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}"
|
||||
else:
|
||||
url = f"mysql+pymysql://{settings.DB_USER}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}?charset={settings.DB_CHARSET}"
|
||||
self.engine = create_engine(url, future=True)
|
||||
logger.info(f"成功连接到数据库: {settings.DB_NAME}")
|
||||
except ModuleNotFoundError as e:
|
||||
missing: str = str(e)
|
||||
if "psycopg" in missing:
|
||||
logger.error("数据库连接失败: 未安装PostgreSQL驱动 psycopg。请安装: psycopg[binary]。参考指令:uv pip install psycopg[binary]")
|
||||
elif "pymysql" in missing:
|
||||
logger.error("数据库连接失败: 未安装MySQL驱动 pymysql。请安装: pymysql。参考指令:uv pip install pymysql")
|
||||
else:
|
||||
logger.error(f"数据库连接失败(缺少驱动): {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"数据库连接失败: {e}")
|
||||
logger.error(f"数据库连接失败: {e}")
|
||||
raise
|
||||
|
||||
def close(self):
|
||||
"""关闭数据库连接"""
|
||||
if self.connection:
|
||||
self.connection.close()
|
||||
print("数据库连接已关闭")
|
||||
if self.engine:
|
||||
self.engine.dispose()
|
||||
logger.info("数据库连接已关闭")
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
@@ -79,48 +87,49 @@ class DatabaseManager:
|
||||
current_timestamp = int(datetime.now().timestamp())
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 先删除当天所有的新闻记录(覆盖模式)
|
||||
delete_query = "DELETE FROM daily_news WHERE crawl_date = %s"
|
||||
deleted_count = cursor.execute(delete_query, (crawl_date,))
|
||||
if deleted_count > 0:
|
||||
print(f"覆盖模式:删除了当天已有的 {deleted_count} 条新闻记录")
|
||||
|
||||
# 批量插入新记录
|
||||
saved_count = 0
|
||||
# 先独立事务执行删除,防止后续插入失败导致无法清理
|
||||
with self.engine.begin() as conn:
|
||||
deleted = conn.execute(text("DELETE FROM daily_news WHERE crawl_date = :d"), {"d": crawl_date}).rowcount
|
||||
if deleted and deleted > 0:
|
||||
logger.info(f"覆盖模式:删除了当天已有的 {deleted} 条新闻记录")
|
||||
|
||||
# 逐条插入,单条失败不影响后续(每条独立事务)
|
||||
for news_item in news_data:
|
||||
try:
|
||||
# 简化的新闻ID生成
|
||||
news_id = f"{news_item.get('source', 'unknown')}_{news_item.get('id', news_item.get('rank', 0))}"
|
||||
|
||||
# 插入新记录
|
||||
insert_query = """
|
||||
INSERT INTO daily_news (
|
||||
news_id, source_platform, title, url, crawl_date,
|
||||
rank_position, add_ts
|
||||
) VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
"""
|
||||
cursor.execute(insert_query, (
|
||||
news_id,
|
||||
news_item.get('source', 'unknown'),
|
||||
news_item.get('title', ''),
|
||||
news_item.get('url', ''),
|
||||
crawl_date,
|
||||
news_item.get('rank', None),
|
||||
current_timestamp
|
||||
))
|
||||
title_val = (news_item.get("title", "") or "")
|
||||
if len(title_val) > 500:
|
||||
title_val = title_val[:500]
|
||||
with self.engine.begin() as conn:
|
||||
conn.execute(
|
||||
text(
|
||||
"""
|
||||
INSERT INTO daily_news (
|
||||
news_id, source_platform, title, url, crawl_date,
|
||||
rank_position, add_ts, last_modify_ts
|
||||
) VALUES (:news_id, :source_platform, :title, :url, :crawl_date, :rank_position, :add_ts, :last_modify_ts)
|
||||
"""
|
||||
),
|
||||
{
|
||||
"news_id": news_id,
|
||||
"source_platform": news_item.get("source", "unknown"),
|
||||
"title": title_val,
|
||||
"url": news_item.get("url", ""),
|
||||
"crawl_date": crawl_date,
|
||||
"rank_position": news_item.get("rank", None),
|
||||
"add_ts": current_timestamp,
|
||||
"last_modify_ts": current_timestamp,
|
||||
},
|
||||
)
|
||||
saved_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存单条新闻失败: {e}")
|
||||
logger.warning(f"保存单条新闻失败: {e}")
|
||||
continue
|
||||
|
||||
print(f"成功保存 {saved_count} 条新闻记录")
|
||||
logger.info(f"成功保存 {saved_count} 条新闻记录")
|
||||
return saved_count
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存新闻数据失败: {e}")
|
||||
logger.exception(f"保存新闻数据失败: {e}")
|
||||
return 0
|
||||
|
||||
def get_daily_news(self, crawl_date: date = None) -> List[Dict]:
|
||||
@@ -136,15 +145,13 @@ class DatabaseManager:
|
||||
if not crawl_date:
|
||||
crawl_date = date.today()
|
||||
|
||||
query = """
|
||||
SELECT * FROM daily_news
|
||||
WHERE crawl_date = %s
|
||||
ORDER BY rank_position ASC
|
||||
"""
|
||||
|
||||
cursor = self.connection.cursor()
|
||||
cursor.execute(query, (crawl_date,))
|
||||
return cursor.fetchall()
|
||||
query = (
|
||||
"SELECT * FROM daily_news WHERE crawl_date = :d ORDER BY rank_position ASC"
|
||||
)
|
||||
with self.engine.connect() as conn:
|
||||
result = conn.execute(text(query), {"d": crawl_date})
|
||||
rows = result.mappings().all()
|
||||
return rows
|
||||
|
||||
# ==================== 话题数据操作 ====================
|
||||
|
||||
@@ -166,37 +173,31 @@ class DatabaseManager:
|
||||
current_timestamp = int(datetime.now().timestamp())
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 检查今天是否已有记录
|
||||
check_query = "SELECT id FROM daily_topics WHERE extract_date = %s"
|
||||
cursor.execute(check_query, (extract_date,))
|
||||
existing = cursor.fetchone()
|
||||
|
||||
keywords_json = json.dumps(keywords, ensure_ascii=False)
|
||||
|
||||
if existing:
|
||||
# 更新现有记录
|
||||
update_query = """
|
||||
UPDATE daily_topics
|
||||
SET keywords = %s, summary = %s, add_ts = %s
|
||||
WHERE extract_date = %s
|
||||
"""
|
||||
cursor.execute(update_query, (keywords_json, summary, current_timestamp, extract_date))
|
||||
print(f"更新了 {extract_date} 的话题分析")
|
||||
else:
|
||||
# 插入新记录
|
||||
insert_query = """
|
||||
INSERT INTO daily_topics (extract_date, keywords, summary, add_ts)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
"""
|
||||
cursor.execute(insert_query, (extract_date, keywords_json, summary, current_timestamp))
|
||||
print(f"保存了 {extract_date} 的话题分析")
|
||||
|
||||
with self.engine.begin() as conn:
|
||||
check = conn.execute(
|
||||
text("SELECT id FROM daily_topics WHERE extract_date = :d AND topic_id = :tid"),
|
||||
{"d": extract_date, "tid": "summary"},
|
||||
).first()
|
||||
if check:
|
||||
conn.execute(
|
||||
text(
|
||||
"UPDATE daily_topics SET keywords = :k, topic_description = :s, add_ts = :ts, last_modify_ts = :lmt, topic_name = :tn WHERE extract_date = :d AND topic_id = :tid"
|
||||
),
|
||||
{"k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp, "d": extract_date, "tid": "summary", "tn": "每日新闻分析"},
|
||||
)
|
||||
logger.info(f"更新了 {extract_date} 的话题分析")
|
||||
else:
|
||||
conn.execute(
|
||||
text(
|
||||
"INSERT INTO daily_topics (extract_date, topic_id, topic_name, keywords, topic_description, add_ts, last_modify_ts) VALUES (:d, :tid, :tn, :k, :s, :ts, :lmt)"
|
||||
),
|
||||
{"d": extract_date, "tid": "summary", "tn": "每日新闻分析", "k": keywords_json, "s": summary, "ts": current_timestamp, "lmt": current_timestamp},
|
||||
)
|
||||
logger.info(f"保存了 {extract_date} 的话题分析")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"保存话题分析失败: {e}")
|
||||
logger.exception(f"保存话题分析失败: {e}")
|
||||
return False
|
||||
|
||||
def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
|
||||
@@ -213,20 +214,15 @@ class DatabaseManager:
|
||||
extract_date = date.today()
|
||||
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = "SELECT * FROM daily_topics WHERE extract_date = %s"
|
||||
cursor.execute(query, (extract_date,))
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result:
|
||||
# 解析关键词JSON
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
return result
|
||||
else:
|
||||
with self.engine.connect() as conn:
|
||||
result = conn.execute(text("SELECT * FROM daily_topics WHERE extract_date = :d"), {"d": extract_date}).mappings().first()
|
||||
if result:
|
||||
result = dict(result) # 转为可变dict以支持item赋值
|
||||
result["keywords"] = json.loads(result["keywords"]) if result.get("keywords") else []
|
||||
return result
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取话题分析失败: {e}")
|
||||
logger.exception(f"获取话题分析失败: {e}")
|
||||
return None
|
||||
|
||||
def get_recent_topics(self, days: int = 7) -> List[Dict]:
|
||||
@@ -240,23 +236,23 @@ class DatabaseManager:
|
||||
话题分析列表
|
||||
"""
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
query = """
|
||||
SELECT * FROM daily_topics
|
||||
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
cursor.execute(query, (days,))
|
||||
results = cursor.fetchall()
|
||||
|
||||
# 解析每个结果的关键词JSON
|
||||
for result in results:
|
||||
result['keywords'] = json.loads(result['keywords'])
|
||||
|
||||
return results
|
||||
|
||||
start_date = date.today() - timedelta(days=days)
|
||||
with self.engine.connect() as conn:
|
||||
results = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT * FROM daily_topics
|
||||
WHERE extract_date >= :start_date
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
),
|
||||
{"start_date": start_date},
|
||||
).mappings().all()
|
||||
for r in results:
|
||||
r["keywords"] = json.loads(r["keywords"]) if r.get("keywords") else []
|
||||
return results
|
||||
except Exception as e:
|
||||
print(f"获取最近话题分析失败: {e}")
|
||||
logger.exception(f"获取最近话题分析失败: {e}")
|
||||
return []
|
||||
|
||||
# ==================== 统计查询 ====================
|
||||
@@ -264,56 +260,48 @@ class DatabaseManager:
|
||||
def get_summary_stats(self, days: int = 7) -> Dict:
|
||||
"""获取统计摘要"""
|
||||
try:
|
||||
cursor = self.connection.cursor()
|
||||
|
||||
# 新闻统计
|
||||
news_query = """
|
||||
SELECT
|
||||
crawl_date,
|
||||
COUNT(*) as news_count,
|
||||
COUNT(DISTINCT source_platform) as platforms_count
|
||||
FROM daily_news
|
||||
WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
GROUP BY crawl_date
|
||||
ORDER BY crawl_date DESC
|
||||
"""
|
||||
cursor.execute(news_query, (days,))
|
||||
news_stats = cursor.fetchall()
|
||||
|
||||
# 话题统计
|
||||
topics_query = """
|
||||
SELECT
|
||||
extract_date,
|
||||
keywords,
|
||||
CHAR_LENGTH(summary) as summary_length
|
||||
FROM daily_topics
|
||||
WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
cursor.execute(topics_query, (days,))
|
||||
topics_stats = cursor.fetchall()
|
||||
|
||||
return {
|
||||
'news_stats': news_stats,
|
||||
'topics_stats': topics_stats
|
||||
}
|
||||
|
||||
start_date = date.today() - timedelta(days=days)
|
||||
with self.engine.connect() as conn:
|
||||
news_stats = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms_count
|
||||
FROM daily_news
|
||||
WHERE crawl_date >= :start_date
|
||||
GROUP BY crawl_date
|
||||
ORDER BY crawl_date DESC
|
||||
"""
|
||||
),
|
||||
{"start_date": start_date},
|
||||
).all()
|
||||
topics_stats = conn.execute(
|
||||
text(
|
||||
"""
|
||||
SELECT extract_date, keywords, CHAR_LENGTH(topic_description) as summary_length
|
||||
FROM daily_topics
|
||||
WHERE extract_date >= :start_date
|
||||
ORDER BY extract_date DESC
|
||||
"""
|
||||
),
|
||||
{"start_date": start_date},
|
||||
).all()
|
||||
return {"news_stats": news_stats, "topics_stats": topics_stats}
|
||||
except Exception as e:
|
||||
print(f"获取统计摘要失败: {e}")
|
||||
return {'news_stats': [], 'topics_stats': []}
|
||||
logger.exception(f"获取统计摘要失败: {e}")
|
||||
return {"news_stats": [], "topics_stats": []}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 测试数据库管理器
|
||||
with DatabaseManager() as db:
|
||||
# 测试获取新闻
|
||||
news = db.get_daily_news()
|
||||
print(f"今日新闻数量: {len(news)}")
|
||||
logger.info(f"今日新闻数量: {len(news)}")
|
||||
|
||||
# 测试获取话题
|
||||
topics = db.get_daily_topics()
|
||||
if topics:
|
||||
print(f"今日话题关键词: {topics['keywords']}")
|
||||
logger.info(f"今日话题关键词: {topics['keywords']}")
|
||||
else:
|
||||
print("今日暂无话题分析")
|
||||
logger.info("今日暂无话题分析")
|
||||
|
||||
print("简化数据库管理器测试完成!")
|
||||
logger.info("简化数据库管理器测试完成!")
|
||||
|
||||
Reference in New Issue
Block a user