intelligence_system/applications/reporter/base_reporter.py

"""
报告生成器基类
提供数据源接口、AI处理接口等扩展能力
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
import os
import sys
from loguru import logger

# 添加父目录到路径
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(current_dir))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from utils.mysql_agent import MySQLAgent
from config import Config


class DataSource(ABC):
    """数据源接口基类，用于后续扩展其他数据源"""

    @abstractmethod
    def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
        """
        获取指定时间范围内的数据

        Args:
            start_time: 开始时间
            end_time: 结束时间

        Returns:
            数据列表，每条数据应包含：标题、链接、摘要、发布时间等字段
        """
        pass

    @abstractmethod
    def get_source_name(self) -> str:
        """获取数据源名称"""
        pass


class RSSDataSource(DataSource):
    """RSS数据源实现"""

    def __init__(self, db_agent: MySQLAgent, table_name: str = "collector_rss_subscriptions"):
        self.db_agent = db_agent
        self.table_name = table_name
        self.logger = logger.bind(module="RSSDataSource")

    def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
        """从数据库获取RSS数据"""
        try:
            sql = f"""
                SELECT
                    `文章标题` as title,
                    `文章链接` as link,
                    `文章摘要` as summary,
                    `发布时间` as publish_time,
                    `来源URL` as source_url,
                    `创建时间` as create_time
                FROM `{self.table_name}`
                WHERE `发布时间` >= %s AND `发布时间` < %s
                ORDER BY `发布时间` DESC
            """

            params = (
                start_time.strftime('%Y-%m-%d %H:%M:%S'),
                end_time.strftime('%Y-%m-%d %H:%M:%S')
            )

            df = self.db_agent.query_to_df(sql, params=params, is_print=False)

            if df.empty:
                self.logger.info(f"时间范围 {start_time} 到 {end_time} 内没有RSS数据")
                return []

            # 转换为字典列表
            data_list = df.to_dict('records')
            self.logger.info(f"获取到 {len(data_list)} 条RSS数据")
            return data_list

        except Exception as e:
            self.logger.error(f"获取RSS数据失败: {str(e)}", exc_info=True)
            return []

    def get_source_name(self) -> str:
        return "RSS订阅"


class AIAnalysisDataSource(DataSource):
    """AI分析结果数据源实现 - 从ai_processor_rss_analysis表获取已筛选的相关内容"""

    def __init__(self, db_agent: MySQLAgent, table_name: str = "ai_processor_rss_analysis"):
        self.db_agent = db_agent
        self.table_name = table_name
        self.logger = logger.bind(module="AIAnalysisDataSource")

    def fetch_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
        """从AI分析结果表获取相关数据（是否相关=1）"""
        try:
            sql = f"""
                SELECT
                    `文章标题` as title,
                    `文章链接` as link,
                    `文章摘要` as summary,
                    `发布时间` as publish_time,
                    `来源URL` as source_url,
                    `分类` as category,
                    `标签` as tags,
                    `相关度评分` as relevance_score,
                    `分析说明` as analysis_note,
                    `处理时间` as process_time
                FROM `{self.table_name}`
                WHERE `发布时间` >= %s AND `发布时间` < %s
                  AND `是否相关` = 1
                ORDER BY `发布时间` DESC, `相关度评分` DESC
            """

            params = (
                start_time.strftime('%Y-%m-%d %H:%M:%S'),
                end_time.strftime('%Y-%m-%d %H:%M:%S')
            )

            df = self.db_agent.query_to_df(sql, params=params, is_print=False)

            if df.empty:
                self.logger.info(f"时间范围 {start_time} 到 {end_time} 内没有相关数据（是否相关=1）")
                return []

            # 转换为字典列表
            data_list = df.to_dict('records')
            self.logger.info(f"获取到 {len(data_list)} 条相关数据（是否相关=1）")
            return data_list

        except Exception as e:
            self.logger.error(f"获取AI分析数据失败: {str(e)}", exc_info=True)
            return []

    def get_source_name(self) -> str:
        return "AI分析结果"


class AIProcessor:
    """AI处理器，用于筛选和分析内容"""

    def __init__(self, api_key: str = None, model: str = None):
        from openai import OpenAI

        self.base_url = 'https://qianfan.baidubce.com/v2'
        self.api_key = api_key or Config.BAIDU_AI_CONFIG.get('api_key')
        self.model = model or Config.BAIDU_AI_CONFIG.get('model', 'ernie-x1-turbo-32k')
        self.client = OpenAI(
            base_url=self.base_url,
            api_key=self.api_key
        )
        self.logger = logger.bind(module="AIProcessor")

    def filter_automotive_content(self, articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        筛选与汽车后市场相关的内容

        Args:
            articles: 文章列表

        Returns:
            筛选后的文章列表（包含AI标记信息）
        """
        if not articles:
            return []

        self.logger.info(f"开始AI筛选 {len(articles)} 篇文章")

        # 批量处理，避免API限流
        batch_size = 10
        filtered_articles = []

        for i in range(0, len(articles), batch_size):
            batch = articles[i:i + batch_size]
            try:
                # 构建批量分析的prompt
                articles_text = ""
                for idx, article in enumerate(batch):
                    articles_text += f"\n[{idx + i}] 标题: {article.get('title', '')}\n"
                    articles_text += f"摘要: {article.get('summary', '')}\n"

                prompt = f"""请分析以下新闻文章，判断哪些与汽车后市场相关。

汽车后市场的定义：汽车销售以后，围绕汽车使用过程中的各种服务，包括：
- 汽车维修保养
- 汽车配件
- 汽车改装
- 汽车美容
- 汽车用品
- 汽车金融
- 汽车保险
- 二手车交易
- 汽车租赁
- 汽车检测
- 汽车报废回收
- 汽车相关法律法规和政策

文章列表：
{articles_text}

请按以下JSON格式返回结果：
{{
    "related_articles": [
        {{
            "index": 文章的序号（从0开始）,
            "is_related": true/false,
            "reason": "判断理由",
            "category": "所属类别（如：维修保养、配件、政策等）"
        }}
    ]
}}

只返回JSON，不要其他文字说明。"""

                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{
                        "role": "user",
                        "content": prompt
                    }]
                )

                result_text = response.choices[0].message.content.strip()

                # 尝试解析JSON（去除可能的markdown代码块标记）
                import json
                import re

                # 提取JSON部分（尝试多种方式）
                result_json = None
                # 方式1：查找markdown代码块中的JSON
                json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', result_text, re.DOTALL)
                if json_match:
                    try:
                        result_json = json.loads(json_match.group(1))
                    except:
                        pass

                # 方式2：直接查找JSON对象
                if result_json is None:
                    json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
                    if json_match:
                        try:
                            result_json = json.loads(json_match.group())
                        except:
                            pass

                # 方式3：尝试直接解析
                if result_json is None:
                    try:
                        result_json = json.loads(result_text)
                    except:
                        self.logger.warning(f"无法解析AI返回的JSON: {result_text[:200]}")
                        result_json = {'related_articles': []}

                # 处理结果
                for item in result_json.get('related_articles', []):
                    idx = item.get('index', -1)
                    if 0 <= idx < len(batch):
                        article = batch[idx]
                        if item.get('is_related', False):
                            article['ai_marked'] = True
                            article['ai_category'] = item.get('category', '其他')
                            article['ai_reason'] = item.get('reason', '')
                            filtered_articles.append(article)

                # 避免API限流
                import time
                if i + batch_size < len(articles):
                    time.sleep(1.5)

            except Exception as e:
                self.logger.error(f"AI筛选批处理失败: {str(e)}", exc_info=True)
                # 如果AI处理失败，保留所有文章但标记为未筛选
                for article in batch:
                    article['ai_marked'] = False
                    article['ai_error'] = str(e)

        self.logger.info(f"AI筛选完成，找到 {len(filtered_articles)} 篇相关文章")
        return filtered_articles

    def generate_news_summary(self, articles: List[Dict[str, Any]]) -> str:
        """
        生成新闻摘要

        Args:
            articles: 筛选后的文章列表

        Returns:
            Markdown格式的新闻摘要
        """
        if not articles:
            return "## 相关新闻\n\n暂无相关新闻。\n"

        articles_text = ""
        for idx, article in enumerate(articles, 1):
            category = article.get('ai_category', '其他')
            reason = article.get('ai_reason', '')
            articles_text += f"\n### {idx}. {article.get('title', '无标题')}\n"
            articles_text += f"- **类别**: {category}\n"
            articles_text += f"- **摘要**: {article.get('summary', '无摘要')}\n"
            articles_text += f"- **链接**: [{article.get('link', '')}]({article.get('link', '')})\n"
            articles_text += f"- **发布时间**: {article.get('publish_time', '')}\n"
            if reason:
                articles_text += f"- **相关性说明**: {reason}\n"
            articles_text += "\n"

        return f"## 汽车后市场相关新闻\n\n共找到 {len(articles)} 篇相关新闻：\n\n{articles_text}"


class BaseReporter:
    """报告生成器基类"""

    def __init__(self, data_sources: List[DataSource] = None):
        self.data_sources = data_sources or []
        self.ai_processor = AIProcessor()
        self.logger = logger.bind(module="BaseReporter")

    def add_data_source(self, data_source: DataSource):
        """添加数据源"""
        self.data_sources.append(data_source)
        self.logger.info(f"添加数据源: {data_source.get_source_name()}")

    def collect_data(self, start_time: datetime, end_time: datetime) -> List[Dict[str, Any]]:
        """从所有数据源收集数据"""
        all_data = []
        for source in self.data_sources:
            try:
                data = source.fetch_data(start_time, end_time)
                # 标记数据来源
                for item in data:
                    item['data_source'] = source.get_source_name()
                all_data.extend(data)
            except Exception as e:
                self.logger.error(f"从 {source.get_source_name()} 收集数据失败: {str(e)}")

        # 按发布时间排序
        all_data.sort(key=lambda x: x.get('publish_time', ''), reverse=True)
        return all_data

    def generate_report_content(self, articles: List[Dict[str, Any]], report_type: str = "日报") -> str:
        """
        生成报告内容（Markdown格式）

        Args:
            articles: 文章列表（已从AI分析结果表筛选，是否相关=1）
            report_type: 报告类型（"日报"或"周报"），用于无数据时的提示
        """
        # 数据已经是从AI分析结果表筛选过的（是否相关=1），直接使用
        related_articles = articles

        # 生成统计信息
        related_count = len(related_articles)

        # 如果没有相关数据，返回提示信息
        if related_count == 0:
            if report_type == "日报":
                message = "昨日无汽车后市场相关的新闻"
            else:
                message = "上周无汽车后市场相关的新闻"

            return f"""
## 数据统计

- **相关文章数**: 0

## 相关新闻

{message}
"""

        # 生成新闻摘要
        news_summary = self._generate_news_summary_from_analysis(related_articles)

        stats = f"""
## 数据统计

- **相关文章数**: {related_count}

"""

        return stats + news_summary

    def _generate_news_summary_from_analysis(self, articles: List[Dict[str, Any]]) -> str:
        """
        从AI分析结果生成新闻摘要（使用数据库中已有的分类和分析说明）

        Args:
            articles: 文章列表（包含category、tags、analysis_note等字段）

        Returns:
            Markdown格式的新闻摘要
        """
        if not articles:
            return "## 相关新闻\n\n暂无相关新闻。\n"

        articles_text = ""
        for idx, article in enumerate(articles, 1):
            category = article.get('category', '其他')
            tags = article.get('tags', '')
            analysis_note = article.get('analysis_note', '')
            relevance_score = article.get('relevance_score', '')

            articles_text += f"\n### {idx}. {article.get('title', '无标题')}\n"
            articles_text += f"- **分类**: {category}\n"
            if tags:
                articles_text += f"- **标签**: {tags}\n"
            articles_text += f"- **摘要**: {article.get('summary', '无摘要')}\n"
            articles_text += f"- **链接**: [{article.get('link', '')}]({article.get('link', '')})\n"
            articles_text += f"- **发布时间**: {article.get('publish_time', '')}\n"
            if relevance_score:
                articles_text += f"- **相关度评分**: {relevance_score}\n"
            if analysis_note:
                articles_text += f"- **分析说明**: {analysis_note}\n"
            articles_text += "\n"

        return f"## 汽车后市场相关新闻\n\n共找到 {len(articles)} 篇相关新闻：\n\n{articles_text}"

    def generate_html_report(self, markdown_content: str, template_path: str = None) -> str:
        """生成HTML报告"""
        # 使用相对导入避免循环依赖
        from .html_template import HTMLTemplateManager

        template_manager = HTMLTemplateManager()

        if template_path and os.path.exists(template_path):
            # 使用外部模板
            html_content = template_manager.render_external_template(template_path, markdown_content)
        else:
            # 使用内置模板
            html_content = template_manager.render_builtin_template(markdown_content)

        return html_content

    def save_report(self, html_content: str, output_path: str):
        """保存HTML报告到文件"""
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        self.logger.info(f"HTML报告已保存到: {output_path}")

    def save_markdown_report(self, markdown_content: str, output_path: str):
        """保存Markdown报告到文件"""
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        self.logger.info(f"Markdown报告已保存到: {output_path}")