The final report agent has been largely completed.

2025-08-26 17:34:36 +08:00
parent 197e68f7ba
commit f0788b64f3
52 changed files with 7853 additions and 825 deletions
@@ -0,0 +1,15 @@
+"""
+Report Engine节点处理模块
+实现报告生成的各个处理步骤
+"""
+
+from .base_node import BaseNode, StateMutationNode
+from .template_selection_node import TemplateSelectionNode
+from .html_generation_node import HTMLGenerationNode
+
+# Names re-exported by this package: the two base classes imported from
+# base_node plus the two concrete nodes imported above.
+__all__ = [
+    "BaseNode",
+    "StateMutationNode", 
+    "TemplateSelectionNode",
+    "HTMLGenerationNode"
+]
@@ -0,0 +1,93 @@
+"""
+Report Engine节点基类
+定义所有处理节点的基础接口
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+from ..llms.base import BaseLLM
+from ..state.state import ReportState
+
+
+class BaseNode(ABC):
+    """节点基类"""
+    
+    def __init__(self, llm_client: BaseLLM, node_name: str = ""):
+        """
+        初始化节点
+        
+        Args:
+            llm_client: LLM客户端
+            node_name: 节点名称
+        """
+        self.llm_client = llm_client
+        self.node_name = node_name or self.__class__.__name__
+        self.logger = logging.getLogger('ReportEngine')
+    
+    @abstractmethod
+    def run(self, input_data: Any, **kwargs) -> Any:
+        """
+        执行节点处理逻辑
+        
+        Args:
+            input_data: 输入数据
+            **kwargs: 额外参数
+            
+        Returns:
+            处理结果
+        """
+        pass
+    
+    def validate_input(self, input_data: Any) -> bool:
+        """
+        验证输入数据
+        
+        Args:
+            input_data: 输入数据
+            
+        Returns:
+            验证是否通过
+        """
+        return True
+    
+    def process_output(self, output: Any) -> Any:
+        """
+        处理输出数据
+        
+        Args:
+            output: 原始输出
+            
+        Returns:
+            处理后的输出
+        """
+        return output
+    
+    def log_info(self, message: str):
+        """记录信息日志"""
+        formatted_message = f"[{self.node_name}] {message}"
+        self.logger.info(formatted_message)
+    
+    def log_error(self, message: str):
+        """记录错误日志"""
+        formatted_message = f"[{self.node_name}] {message}"
+        self.logger.error(formatted_message)
+
+
+class StateMutationNode(BaseNode):
+    """带状态修改功能的节点基类"""
+    
+    @abstractmethod
+    def mutate_state(self, input_data: Any, state: ReportState, **kwargs) -> ReportState:
+        """
+        修改状态
+        
+        Args:
+            input_data: 输入数据
+            state: 当前状态
+            **kwargs: 额外参数
+            
+        Returns:
+            修改后的状态
+        """
+        pass
@@ -0,0 +1,340 @@
+"""
+HTML生成节点
+将整合后的内容转换为美观的HTML报告
+"""
+
+import json
+from datetime import datetime
+from typing import Dict, Any
+
+from .base_node import StateMutationNode
+from ..llms.base import BaseLLM
+from ..state.state import ReportState
+from ..prompts import SYSTEM_PROMPT_HTML_GENERATION
+# 不再需要text_processing依赖
+
+
+class HTMLGenerationNode(StateMutationNode):
+    """HTML生成处理节点"""
+    
+    def __init__(self, llm_client: BaseLLM):
+        """
+        初始化HTML生成节点
+        
+        Args:
+            llm_client: LLM客户端
+        """
+        super().__init__(llm_client, "HTMLGenerationNode")
+    
+    def run(self, input_data: Dict[str, Any], **kwargs) -> str:
+        """
+        执行HTML生成
+        
+        Args:
+            input_data: 包含报告数据的字典
+                - query: 原始查询
+                - query_engine_report: QueryEngine报告内容
+                - media_engine_report: MediaEngine报告内容  
+                - insight_engine_report: InsightEngine报告内容
+                - forum_logs: 论坛日志内容
+                - selected_template: 选择的模板内容
+                
+        Returns:
+            生成的HTML内容
+        """
+        self.log_info("开始生成HTML报告...")
+        
+        try:
+            # 准备LLM输入数据
+            llm_input = {
+                "query": input_data.get('query', ''),
+                "query_engine_report": input_data.get('query_engine_report', ''),
+                "media_engine_report": input_data.get('media_engine_report', ''),
+                "insight_engine_report": input_data.get('insight_engine_report', ''),
+                "forum_logs": input_data.get('forum_logs', ''),
+                "selected_template": input_data.get('selected_template', '')
+            }
+            
+            # 转换为JSON格式
+            message = json.dumps(llm_input, ensure_ascii=False, indent=2)
+            
+            # 调用LLM生成HTML
+            response = self.llm_client.invoke(SYSTEM_PROMPT_HTML_GENERATION, message)
+            
+            # 处理响应
+            processed_response = self.process_output(response)
+            
+            self.log_info("HTML报告生成完成")
+            return processed_response
+            
+        except Exception as e:
+            self.log_error(f"HTML生成失败: {str(e)}")
+            # 返回备用HTML
+            return self._generate_fallback_html(input_data)
+    
+    def mutate_state(self, input_data: Dict[str, Any], state: ReportState, **kwargs) -> ReportState:
+        """
+        修改报告状态，添加生成的HTML内容
+        
+        Args:
+            input_data: 输入数据
+            state: 当前报告状态
+            **kwargs: 额外参数
+            
+        Returns:
+            更新后的报告状态
+        """
+        # 生成HTML
+        html_content = self.run(input_data, **kwargs)
+        
+        # 更新状态
+        state.html_content = html_content
+        state.mark_completed()
+        
+        return state
+    
+    def process_output(self, output: str) -> str:
+        """
+        处理LLM输出，提取HTML内容
+        
+        Args:
+            output: LLM原始输出
+            
+        Returns:
+            清理后的HTML内容
+        """
+        try:
+            self.log_info(f"处理LLM原始输出，长度: {len(output)} 字符")
+            
+            html_content = ""
+            
+            # 尝试解析JSON响应
+            try:
+                result = json.loads(output)
+                html_content = result.get('html_content', '')
+                self.log_info("成功从JSON中提取html_content")
+            except json.JSONDecodeError:
+                self.log_info("不是JSON格式，直接使用原始输出")
+                html_content = output
+            
+            # 如果还是没有内容，尝试其他提取方法
+            if not html_content.strip():
+                # 查找HTML标记
+                if '<!DOCTYPE html>' in output:
+                    start_idx = output.find('<!DOCTYPE html>')
+                    html_content = output[start_idx:]
+                elif '<html' in output:
+                    start_idx = output.find('<html')
+                    html_content = output[start_idx:]
+                else:
+                    html_content = output
+            
+            # 清理markdown代码块标记
+            if html_content.startswith('```html'):
+                html_content = html_content.replace('```html', '').replace('```', '').strip()
+            elif html_content.startswith('```'):
+                html_content = html_content.replace('```', '').strip()
+            
+            # 处理转义字符
+            html_content = html_content.replace('\\n', '\n')
+            html_content = html_content.replace('\\t', '\t')
+            html_content = html_content.replace('\\r', '\r')
+            html_content = html_content.replace('\\"', '"')
+            html_content = html_content.replace("\\'", "'")
+            
+            # 验证HTML内容
+            if not html_content.strip():
+                raise ValueError("生成的HTML内容为空")
+            
+            # 确保HTML有基本结构
+            if not html_content.strip().startswith('<!DOCTYPE') and not html_content.strip().startswith('<html'):
+                self.log_info("HTML缺少基本结构，添加包装")
+                html_content = f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>智能舆情分析报告</title>
+</head>
+<body>
+{html_content}
+</body>
+</html>"""
+            
+            self.log_info(f"HTML处理完成，最终长度: {len(html_content)} 字符")
+            return html_content.strip()
+            
+        except Exception as e:
+            self.log_error(f"处理HTML输出失败: {str(e)}")
+            return self._generate_error_html(str(e))
+    
+    def _generate_fallback_html(self, input_data: Dict[str, Any]) -> str:
+        """
+        生成备用HTML报告（当LLM失败时使用）
+        
+        Args:
+            input_data: 输入数据
+            
+        Returns:
+            备用HTML内容
+        """
+        self.log_info("使用备用HTML生成方法")
+        
+        query = input_data.get('query', '智能舆情分析报告')
+        query_report = input_data.get('query_engine_report', '')
+        media_report = input_data.get('media_engine_report', '')
+        insight_report = input_data.get('insight_engine_report', '')
+        forum_logs = input_data.get('forum_logs', '')
+        
+        generation_time = datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")
+        
+        html_content = f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{query} - 智能舆情分析报告</title>
+    <style>
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+            background: #f5f5f5;
+        }}
+        .container {{
+            background: white;
+            padding: 40px;
+            border-radius: 8px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        }}
+        h1 {{
+            color: #2c3e50;
+            border-bottom: 3px solid #3498db;
+            padding-bottom: 10px;
+        }}
+        h2 {{
+            color: #34495e;
+            margin-top: 30px;
+            margin-bottom: 15px;
+        }}
+        .section {{
+            margin-bottom: 30px;
+            padding: 20px;
+            border-left: 4px solid #3498db;
+            background: #f8f9fa;
+        }}
+        .meta {{
+            background: #e9ecef;
+            padding: 15px;
+            border-radius: 5px;
+            margin-bottom: 20px;
+        }}
+        .footer {{
+            margin-top: 40px;
+            padding-top: 20px;
+            border-top: 1px solid #eee;
+            text-align: center;
+            color: #666;
+        }}
+        pre {{
+            background: #f4f4f4;
+            padding: 15px;
+            border-radius: 5px;
+            overflow-x: auto;
+            white-space: pre-wrap;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>{query}</h1>
+        
+        <div class="meta">
+            <strong>报告生成时间:</strong> {generation_time}<br>
+            <strong>数据来源:</strong> QueryEngine、MediaEngine、InsightEngine、ForumEngine<br>
+            <strong>报告类型:</strong> 综合舆情分析报告
+        </div>
+        
+        <h2>执行摘要</h2>
+        <div class="section">
+            本报告整合了多个分析引擎的研究结果，为您提供全面的舆情分析洞察。
+            通过对查询主题"{query}"的深度分析，我们从多个维度展现了当前的舆情态势。
+        </div>
+        
+        {f'<h2>QueryEngine分析结果</h2><div class="section"><pre>{query_report}</pre></div>' if query_report else ''}
+        
+        {f'<h2>MediaEngine分析结果</h2><div class="section"><pre>{media_report}</pre></div>' if media_report else ''}
+        
+        {f'<h2>InsightEngine分析结果</h2><div class="section"><pre>{insight_report}</pre></div>' if insight_report else ''}
+        
+        {f'<h2>论坛监控数据</h2><div class="section"><pre>{forum_logs[:2000]}{"..." if len(forum_logs) > 2000 else ""}</pre></div>' if forum_logs else ''}
+        
+        <h2>综合结论</h2>
+        <div class="section">
+            基于多个分析引擎的综合研究，我们对"{query}"主题进行了全面分析。
+            各引擎从不同角度提供了深入洞察，为决策提供了重要参考。
+        </div>
+        
+        <div class="footer">
+            <p>本报告由智能舆情分析平台自动生成</p>
+            <p>ReportEngine v1.0 | 生成时间: {generation_time}</p>
+        </div>
+    </div>
+</body>
+</html>"""
+        
+        return html_content
+    
+    def _generate_error_html(self, error_message: str) -> str:
+        """
+        生成错误HTML页面
+        
+        Args:
+            error_message: 错误信息
+            
+        Returns:
+            错误HTML内容
+        """
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>报告生成失败</title>
+    <style>
+        body {{
+            font-family: Arial, sans-serif;
+            text-align: center;
+            padding: 50px;
+            background: #f8f9fa;
+        }}
+        .error-container {{
+            background: white;
+            padding: 40px;
+            border-radius: 8px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+            max-width: 600px;
+            margin: 0 auto;
+        }}
+        .error-title {{
+            color: #e74c3c;
+            font-size: 24px;
+            margin-bottom: 20px;
+        }}
+        .error-message {{
+            color: #666;
+            margin-bottom: 20px;
+        }}
+    </style>
+</head>
+<body>
+    <div class="error-container">
+        <div class="error-title">报告生成失败</div>
+        <div class="error-message">错误信息: {error_message}</div>
+        <p>请检查输入数据或稍后重试。</p>
+    </div>
+</body>
+</html>"""
@@ -0,0 +1,274 @@
+"""
+模板选择节点
+根据查询内容和可用模板选择最合适的报告模板
+"""
+
+import os
+import json
+from typing import Dict, Any, List, Optional
+
+from .base_node import BaseNode
+from ..prompts import SYSTEM_PROMPT_TEMPLATE_SELECTION
+
+
+class TemplateSelectionNode(BaseNode):
+    """模板选择处理节点"""
+    
+    def __init__(self, llm_client, template_dir: str = "ReportEngine/report_template"):
+        """
+        初始化模板选择节点
+        
+        Args:
+            llm_client: LLM客户端
+            template_dir: 模板目录路径
+        """
+        super().__init__(llm_client, "TemplateSelectionNode")
+        self.template_dir = template_dir
+        
+    def run(self, input_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """
+        执行模板选择
+        
+        Args:
+            input_data: 包含查询和报告内容的字典
+                - query: 原始查询
+                - reports: 三个子agent的报告列表
+                - forum_logs: 论坛日志内容
+                
+        Returns:
+            选择的模板信息
+        """
+        self.log_info("开始模板选择...")
+        
+        query = input_data.get('query', '')
+        reports = input_data.get('reports', [])
+        forum_logs = input_data.get('forum_logs', '')
+        
+        # 获取可用模板
+        available_templates = self._get_available_templates()
+        
+        if not available_templates:
+            self.log_info("未找到预设模板，使用内置默认模板")
+            return self._get_fallback_template()
+        
+        # 首先尝试简单关键词匹配
+        simple_match = self._simple_keyword_matching(query, available_templates)
+        if simple_match:
+            self.log_info(f"通过关键词匹配选择模板: {simple_match['template_name']}")
+            return simple_match
+        
+        # 如果关键词匹配失败，尝试LLM选择
+        try:
+            llm_result = self._llm_template_selection(query, reports, forum_logs, available_templates)
+            if llm_result:
+                return llm_result
+        except Exception as e:
+            self.log_error(f"LLM模板选择失败: {str(e)}")
+        
+        # 所有方法都失败，使用默认的社会热点事件模板
+        default_template = self._get_default_social_event_template(available_templates)
+        if default_template:
+            return default_template
+        
+        # 最后备选方案
+        return self._get_fallback_template()
+    
+    def _simple_keyword_matching(self, query: str, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """基于关键词的简单模板匹配"""
+        query_lower = query.lower()
+        
+        # 关键词映射
+        keyword_mapping = {
+            '企业': ['企业品牌'],
+            '品牌': ['企业品牌'],
+            '声誉': ['企业品牌'],
+            '市场': ['市场竞争'],
+            '竞争': ['市场竞争'],
+            '格局': ['市场竞争'],
+            '政策': ['政策', '行业'],
+            '行业': ['政策', '行业'],
+            '动态': ['政策', '行业'],
+            '突发': ['突发事件', '危机'],
+            '危机': ['突发事件', '危机'],
+            '公关': ['突发事件', '危机'],
+            '日常': ['日常', '定期'],
+            '定期': ['日常', '定期'],
+            '监测': ['日常', '定期'],
+            '热点': ['社会公共热点'],
+            '社会': ['社会公共热点'],
+            '事件': ['社会公共热点'],
+        }
+        
+        # 检查查询中的关键词
+        for keyword, template_keywords in keyword_mapping.items():
+            if keyword in query_lower:
+                # 查找匹配的模板
+                for template in available_templates:
+                    for template_keyword in template_keywords:
+                        if template_keyword in template['name']:
+                            return {
+                                'template_name': template['name'],
+                                'template_content': template['content'],
+                                'selection_reason': f'基于关键词"{keyword}"匹配选择'
+                            }
+        
+        return None
+    
+    def _llm_template_selection(self, query: str, reports: List[Any], forum_logs: str, 
+                              available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """使用LLM进行模板选择"""
+        self.log_info("尝试使用LLM进行模板选择...")
+        
+        # 构建模板列表
+        template_list = "\n".join([f"- {t['name']}: {t['description']}" for t in available_templates])
+        
+        user_message = f"""查询内容: {query}
+
+报告数量: {len(reports)} 个分析引擎报告
+论坛日志: {'有' if forum_logs else '无'}
+
+可用模板:
+{template_list}
+
+请选择最合适的模板。"""
+        
+        # 调用LLM
+        response = self.llm_client.invoke(SYSTEM_PROMPT_TEMPLATE_SELECTION, user_message)
+        
+        # 检查响应是否为空
+        if not response or not response.strip():
+            self.log_error("LLM返回空响应")
+            return None
+        
+        self.log_info(f"LLM原始响应: {response[:200]}...")
+        
+        # 尝试解析JSON响应
+        try:
+            # 清理响应文本
+            cleaned_response = self._clean_llm_response(response)
+            result = json.loads(cleaned_response)
+            
+            # 验证选择的模板是否存在
+            selected_template_name = result.get('template_name', '')
+            for template in available_templates:
+                if template['name'] == selected_template_name or selected_template_name in template['name']:
+                    self.log_info(f"LLM选择模板: {selected_template_name}")
+                    return {
+                        'template_name': template['name'],
+                        'template_content': template['content'],
+                        'selection_reason': result.get('selection_reason', 'LLM智能选择')
+                    }
+            
+            self.log_error(f"LLM选择的模板不存在: {selected_template_name}")
+            return None
+            
+        except json.JSONDecodeError as e:
+            self.log_error(f"JSON解析失败: {str(e)}")
+            # 尝试从文本响应中提取模板信息
+            return self._extract_template_from_text(response, available_templates)
+    
+    def _clean_llm_response(self, response: str) -> str:
+        """清理LLM响应"""
+        # 移除可能的markdown代码块标记
+        if '```json' in response:
+            response = response.split('```json')[1].split('```')[0]
+        elif '```' in response:
+            response = response.split('```')[1].split('```')[0]
+        
+        # 移除前后空白
+        response = response.strip()
+        
+        return response
+    
+    def _extract_template_from_text(self, response: str, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """从文本响应中提取模板信息"""
+        self.log_info("尝试从文本响应中提取模板信息")
+        
+        # 查找响应中是否包含模板名称
+        for template in available_templates:
+            template_name_variants = [
+                template['name'],
+                template['name'].replace('.md', ''),
+                template['name'].replace('模板', ''),
+            ]
+            
+            for variant in template_name_variants:
+                if variant in response:
+                    self.log_info(f"在响应中找到模板: {template['name']}")
+                    return {
+                        'template_name': template['name'],
+                        'template_content': template['content'],
+                        'selection_reason': '从文本响应中提取'
+                    }
+        
+        return None
+    
+    def _get_available_templates(self) -> List[Dict[str, Any]]:
+        """获取可用的模板列表"""
+        templates = []
+        
+        if not os.path.exists(self.template_dir):
+            self.log_error(f"模板目录不存在: {self.template_dir}")
+            return templates
+        
+        # 查找所有markdown模板文件
+        for filename in os.listdir(self.template_dir):
+            if filename.endswith('.md'):
+                template_path = os.path.join(self.template_dir, filename)
+                try:
+                    with open(template_path, 'r', encoding='utf-8') as f:
+                        content = f.read()
+                    
+                    template_name = filename.replace('.md', '')
+                    description = self._extract_template_description(template_name)
+                    
+                    templates.append({
+                        'name': template_name,
+                        'path': template_path,
+                        'content': content,
+                        'description': description
+                    })
+                except Exception as e:
+                    self.log_error(f"读取模板文件失败 {filename}: {str(e)}")
+        
+        return templates
+    
+    def _extract_template_description(self, template_name: str) -> str:
+        """根据模板名称生成描述"""
+        if '企业品牌' in template_name:
+            return "适用于企业品牌声誉和形象分析"
+        elif '市场竞争' in template_name:
+            return "适用于市场竞争格局和对手分析"
+        elif '日常' in template_name or '定期' in template_name:
+            return "适用于日常监测和定期汇报"
+        elif '政策' in template_name or '行业' in template_name:
+            return "适用于政策影响和行业动态分析"
+        elif '热点' in template_name or '社会' in template_name:
+            return "适用于社会热点和公共事件分析"
+        elif '突发' in template_name or '危机' in template_name:
+            return "适用于突发事件和危机公关"
+        
+        return "通用报告模板"
+    
+    def _get_default_social_event_template(self, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        """获取默认的社会热点事件分析模板"""
+        # 查找社会热点事件分析模板
+        for template in available_templates:
+            if '社会公共热点事件' in template['name'] or '热点' in template['name']:
+                self.log_info(f"使用默认模板: {template['name']}")
+                return {
+                    'template_name': template['name'],
+                    'template_content': template['content'],
+                    'selection_reason': '默认使用社会热点事件分析模板'
+                }
+        return None
+    
+    def _get_fallback_template(self) -> Dict[str, Any]:
+        """获取备用默认模板（空模板，让LLM自行发挥）"""
+        self.log_info("未找到合适模板，使用空模板让LLM自行发挥")
+        
+        return {
+            'template_name': '自由发挥模板',
+            'template_content': '',
+            'selection_reason': '未找到合适的预设模板，让LLM根据内容自行设计报告结构'
+        }