生成日报、周报

This commit is contained in:
z66
2025-10-30 09:54:47 +08:00
parent c894e344aa
commit c5a5a0a99c
17 changed files with 3129 additions and 1 deletions
+459
View File
@@ -0,0 +1,459 @@
"""
报告生成器基类
提供数据源接口、AI处理接口等扩展能力
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
import os
import sys
from loguru import logger
# 添加父目录到路径
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(current_dir))
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
from utils.mysql_agent import MySQLAgent
from config import Config
class DataSource(ABC):
"""数据源接口基类,用于后续扩展其他数据源"""
@abstractmethod
def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
"""
获取指定时间范围内的数据
Args:
start_time: 开始时间
end_time: 结束时间
Returns:
数据列表,每条数据应包含:标题、链接、摘要、发布时间等字段
"""
pass
@abstractmethod
def get_source_name(self) -> str:
"""获取数据源名称"""
pass
class RSSDataSource(DataSource):
"""RSS数据源实现"""
def __init__(self, db_agent: MySQLAgent, table_name: str = "collector_rss_subscriptions"):
self.db_agent = db_agent
self.table_name = table_name
self.logger = logger.bind(module="RSSDataSource")
def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
"""从数据库获取RSS数据"""
try:
sql = f"""
SELECT
`文章标题` as title,
`文章链接` as link,
`文章摘要` as summary,
`发布时间` as publish_time,
`来源URL` as source_url,
`创建时间` as create_time
FROM `{self.table_name}`
WHERE `发布时间` >= %s AND `发布时间` < %s
ORDER BY `发布时间` DESC
"""
params = (
start_time.strftime('%Y-%m-%d %H:%M:%S'),
end_time.strftime('%Y-%m-%d %H:%M:%S')
)
df = self.db_agent.query_to_df(sql, params=params, is_print=False)
if df.empty:
self.logger.info(f"时间范围 {start_time}{end_time} 内没有RSS数据")
return []
# 转换为字典列表
data_list = df.to_dict('records')
self.logger.info(f"获取到 {len(data_list)} 条RSS数据")
return data_list
except Exception as e:
self.logger.error(f"获取RSS数据失败: {str(e)}", exc_info=True)
return []
def get_source_name(self) -> str:
return "RSS订阅"
class AIAnalysisDataSource(DataSource):
"""AI分析结果数据源实现 - 从ai_processor_rss_analysis表获取已筛选的相关内容"""
def __init__(self, db_agent: MySQLAgent, table_name: str = "ai_processor_rss_analysis"):
self.db_agent = db_agent
self.table_name = table_name
self.logger = logger.bind(module="AIAnalysisDataSource")
def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
"""从AI分析结果表获取相关数据(是否相关=1)"""
try:
sql = f"""
SELECT
`文章标题` as title,
`文章链接` as link,
`文章摘要` as summary,
`发布时间` as publish_time,
`来源URL` as source_url,
`分类` as category,
`标签` as tags,
`相关度评分` as relevance_score,
`分析说明` as analysis_note,
`处理时间` as process_time
FROM `{self.table_name}`
WHERE `发布时间` >= %s AND `发布时间` < %s
AND `是否相关` = 1
ORDER BY `发布时间` DESC, `相关度评分` DESC
"""
params = (
start_time.strftime('%Y-%m-%d %H:%M:%S'),
end_time.strftime('%Y-%m-%d %H:%M:%S')
)
df = self.db_agent.query_to_df(sql, params=params, is_print=False)
if df.empty:
self.logger.info(f"时间范围 {start_time}{end_time} 内没有相关数据(是否相关=1")
return []
# 转换为字典列表
data_list = df.to_dict('records')
self.logger.info(f"获取到 {len(data_list)} 条相关数据(是否相关=1")
return data_list
except Exception as e:
self.logger.error(f"获取AI分析数据失败: {str(e)}", exc_info=True)
return []
def get_source_name(self) -> str:
return "AI分析结果"
class AIProcessor:
"""AI处理器,用于筛选和分析内容"""
def __init__(self, api_key: str = None, model: str = None):
from openai import OpenAI
self.base_url = 'https://qianfan.baidubce.com/v2'
self.api_key = api_key or Config.BAIDU_AI_CONFIG.get('api_key')
self.model = model or Config.BAIDU_AI_CONFIG.get('model', 'ernie-x1-turbo-32k')
self.client = OpenAI(
base_url=self.base_url,
api_key=self.api_key
)
self.logger = logger.bind(module="AIProcessor")
def filter_automotive_content(self, articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
筛选与汽车后市场相关的内容
Args:
articles: 文章列表
Returns:
筛选后的文章列表(包含AI标记信息)
"""
if not articles:
return []
self.logger.info(f"开始AI筛选 {len(articles)} 篇文章")
# 批量处理,避免API限流
batch_size = 10
filtered_articles = []
for i in range(0, len(articles), batch_size):
batch = articles[i:i + batch_size]
try:
# 构建批量分析的prompt
articles_text = ""
for idx, article in enumerate(batch):
articles_text += f"\n[{idx + i}] 标题: {article.get('title', '')}\n"
articles_text += f"摘要: {article.get('summary', '')}\n"
prompt = f"""请分析以下新闻文章,判断哪些与汽车后市场相关。
汽车后市场的定义:汽车销售以后,围绕汽车使用过程中的各种服务,包括:
- 汽车维修保养
- 汽车配件
- 汽车改装
- 汽车美容
- 汽车用品
- 汽车金融
- 汽车保险
- 二手车交易
- 汽车租赁
- 汽车检测
- 汽车报废回收
- 汽车相关法律法规和政策
文章列表:
{articles_text}
请按以下JSON格式返回结果:
{{
"related_articles": [
{{
"index": 文章的序号(从0开始),
"is_related": true/false,
"reason": "判断理由",
"category": "所属类别(如:维修保养、配件、政策等)"
}}
]
}}
只返回JSON,不要其他文字说明。"""
response = self.client.chat.completions.create(
model=self.model,
messages=[{
"role": "user",
"content": prompt
}]
)
result_text = response.choices[0].message.content.strip()
# 尝试解析JSON(去除可能的markdown代码块标记)
import json
import re
# 提取JSON部分(尝试多种方式)
result_json = None
# 方式1:查找markdown代码块中的JSON
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result_text, re.DOTALL)
if json_match:
try:
result_json = json.loads(json_match.group(1))
except:
pass
# 方式2:直接查找JSON对象
if result_json is None:
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
if json_match:
try:
result_json = json.loads(json_match.group())
except:
pass
# 方式3:尝试直接解析
if result_json is None:
try:
result_json = json.loads(result_text)
except:
self.logger.warning(f"无法解析AI返回的JSON: {result_text[:200]}")
result_json = {'related_articles': []}
# 处理结果
for item in result_json.get('related_articles', []):
idx = item.get('index', -1)
if 0 <= idx < len(batch):
article = batch[idx]
if item.get('is_related', False):
article['ai_marked'] = True
article['ai_category'] = item.get('category', '其他')
article['ai_reason'] = item.get('reason', '')
filtered_articles.append(article)
# 避免API限流
import time
if i + batch_size < len(articles):
time.sleep(1.5)
except Exception as e:
self.logger.error(f"AI筛选批处理失败: {str(e)}", exc_info=True)
# 如果AI处理失败,保留所有文章但标记为未筛选
for article in batch:
article['ai_marked'] = False
article['ai_error'] = str(e)
self.logger.info(f"AI筛选完成,找到 {len(filtered_articles)} 篇相关文章")
return filtered_articles
def generate_news_summary(self, articles: List[Dict[str, Any]]) -> str:
"""
生成新闻摘要
Args:
articles: 筛选后的文章列表
Returns:
Markdown格式的新闻摘要
"""
if not articles:
return "## 相关新闻\n\n暂无相关新闻。\n"
articles_text = ""
for idx, article in enumerate(articles, 1):
category = article.get('ai_category', '其他')
reason = article.get('ai_reason', '')
articles_text += f"\n### {idx}. {article.get('title', '无标题')}\n"
articles_text += f"- **类别**: {category}\n"
articles_text += f"- **摘要**: {article.get('summary', '无摘要')}\n"
articles_text += f"- **链接**: [{article.get('link', '')}]({article.get('link', '')})\n"
articles_text += f"- **发布时间**: {article.get('publish_time', '')}\n"
if reason:
articles_text += f"- **相关性说明**: {reason}\n"
articles_text += "\n"
return f"## 汽车后市场相关新闻\n\n共找到 {len(articles)} 篇相关新闻:\n\n{articles_text}"
class BaseReporter:
"""报告生成器基类"""
def __init__(self, data_sources: List[DataSource] = None):
self.data_sources = data_sources or []
self.ai_processor = AIProcessor()
self.logger = logger.bind(module="BaseReporter")
def add_data_source(self, data_source: DataSource):
"""添加数据源"""
self.data_sources.append(data_source)
self.logger.info(f"添加数据源: {data_source.get_source_name()}")
def collect_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
"""从所有数据源收集数据"""
all_data = []
for source in self.data_sources:
try:
data = source.fetch_data(start_time, end_time)
# 标记数据来源
for item in data:
item['data_source'] = source.get_source_name()
all_data.extend(data)
except Exception as e:
self.logger.error(f"{source.get_source_name()} 收集数据失败: {str(e)}")
# 按发布时间排序
all_data.sort(key=lambda x: x.get('publish_time', ''), reverse=True)
return all_data
def generate_report_content(self, articles: List[Dict[str, Any]], report_type: str = "日报") -> str:
"""
生成报告内容(Markdown格式)
Args:
articles: 文章列表(已从AI分析结果表筛选,是否相关=1)
report_type: 报告类型("日报""周报"),用于无数据时的提示
"""
# 数据已经是从AI分析结果表筛选过的(是否相关=1),直接使用
related_articles = articles
# 生成统计信息
related_count = len(related_articles)
# 如果没有相关数据,返回提示信息
if related_count == 0:
if report_type == "日报":
message = "昨日无汽车后市场相关的新闻"
else:
message = "上周无汽车后市场相关的新闻"
return f"""
## 数据统计
- **相关文章数**: 0
## 相关新闻
{message}
"""
# 生成新闻摘要
news_summary = self._generate_news_summary_from_analysis(related_articles)
stats = f"""
## 数据统计
- **相关文章数**: {related_count}
"""
return stats + news_summary
def _generate_news_summary_from_analysis(self, articles: List[Dict[str, Any]]) -> str:
"""
从AI分析结果生成新闻摘要(使用数据库中已有的分类和分析说明)
Args:
articles: 文章列表(包含category、tags、analysis_note等字段)
Returns:
Markdown格式的新闻摘要
"""
if not articles:
return "## 相关新闻\n\n暂无相关新闻。\n"
articles_text = ""
for idx, article in enumerate(articles, 1):
category = article.get('category', '其他')
tags = article.get('tags', '')
analysis_note = article.get('analysis_note', '')
relevance_score = article.get('relevance_score', '')
articles_text += f"\n### {idx}. {article.get('title', '无标题')}\n"
articles_text += f"- **分类**: {category}\n"
if tags:
articles_text += f"- **标签**: {tags}\n"
articles_text += f"- **摘要**: {article.get('summary', '无摘要')}\n"
articles_text += f"- **链接**: [{article.get('link', '')}]({article.get('link', '')})\n"
articles_text += f"- **发布时间**: {article.get('publish_time', '')}\n"
if relevance_score:
articles_text += f"- **相关度评分**: {relevance_score}\n"
if analysis_note:
articles_text += f"- **分析说明**: {analysis_note}\n"
articles_text += "\n"
return f"## 汽车后市场相关新闻\n\n共找到 {len(articles)} 篇相关新闻:\n\n{articles_text}"
def generate_html_report(self, markdown_content: str, template_path: str = None) -> str:
"""生成HTML报告"""
# 使用相对导入避免循环依赖
from .html_template import HTMLTemplateManager
template_manager = HTMLTemplateManager()
if template_path and os.path.exists(template_path):
# 使用外部模板
html_content = template_manager.render_external_template(template_path, markdown_content)
else:
# 使用内置模板
html_content = template_manager.render_builtin_template(markdown_content)
return html_content
def save_report(self, html_content: str, output_path: str):
"""保存HTML报告到文件"""
os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html_content)
self.logger.info(f"HTML报告已保存到: {output_path}")
def save_markdown_report(self, markdown_content: str, output_path: str):
"""保存Markdown报告到文件"""
os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
self.logger.info(f"Markdown报告已保存到: {output_path}")