bettafish-company/QueryEngine/nodes/summary_node.py

"""
总结节点实现
负责根据搜索结果生成和更新段落内容
"""

import json
from typing import Dict, Any, List
from json.decoder import JSONDecodeError

from .base_node import StateMutationNode
from ..state.state import State
from ..prompts import SYSTEM_PROMPT_FIRST_SUMMARY, SYSTEM_PROMPT_REFLECTION_SUMMARY
from ..utils.text_processing import (
    remove_reasoning_from_output,
    clean_json_tags,
    extract_clean_response,
    fix_incomplete_json,
    format_search_results_for_prompt
)

# 导入论坛读取工具
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
try:
    from utils.forum_reader import get_latest_host_speech, format_host_speech_for_prompt
    FORUM_READER_AVAILABLE = True
except ImportError:
    FORUM_READER_AVAILABLE = False
    print("警告: 无法导入forum_reader模块，将跳过HOST发言读取功能")


class FirstSummaryNode(StateMutationNode):
    """根据搜索结果生成段落首次总结的节点"""

    def __init__(self, llm_client):
        """
        初始化首次总结节点

        Args:
            llm_client: LLM客户端
        """
        super().__init__(llm_client, "FirstSummaryNode")

    def validate_input(self, input_data: Any) -> bool:
        """验证输入数据"""
        if isinstance(input_data, str):
            try:
                data = json.loads(input_data)
                required_fields = ["title", "content", "search_query", "search_results"]
                return all(field in data for field in required_fields)
            except JSONDecodeError:
                return False
        elif isinstance(input_data, dict):
            required_fields = ["title", "content", "search_query", "search_results"]
            return all(field in input_data for field in required_fields)
        return False

    def run(self, input_data: Any, **kwargs) -> str:
        """
        调用LLM生成段落总结

        Args:
            input_data: 包含title、content、search_query和search_results的数据
            **kwargs: 额外参数

        Returns:
            段落总结内容
        """
        try:
            if not self.validate_input(input_data):
                raise ValueError("输入数据格式错误")

            # 准备输入数据
            if isinstance(input_data, str):
                data = json.loads(input_data)
            else:
                data = input_data.copy() if isinstance(input_data, dict) else input_data

            # 读取最新的HOST发言（如果可用）
            if FORUM_READER_AVAILABLE:
                try:
                    host_speech = get_latest_host_speech()
                    if host_speech:
                        # 将HOST发言添加到输入数据中
                        data['host_speech'] = host_speech
                        self.log_info(f"已读取HOST发言，长度: {len(host_speech)}字符")
                except Exception as e:
                    self.log_info(f"读取HOST发言失败: {str(e)}")

            # 转换为JSON字符串
            message = json.dumps(data, ensure_ascii=False)

            # 如果有HOST发言，添加到消息前面作为参考
            if FORUM_READER_AVAILABLE and 'host_speech' in data and data['host_speech']:
                formatted_host = format_host_speech_for_prompt(data['host_speech'])
                message = formatted_host + "\n" + message

            self.log_info("正在生成首次段落总结")

            # 调用LLM，增加max_tokens以支持更长的总结
            response = self.llm_client.invoke(
                SYSTEM_PROMPT_FIRST_SUMMARY,
                message,
                max_tokens=8192  # 支持更长的总结内容
            )

            # 处理响应
            processed_response = self.process_output(response)

            self.log_info("成功生成首次段落总结")
            return processed_response

        except Exception as e:
            self.log_error(f"生成首次总结失败: {str(e)}")
            raise e

    def process_output(self, output: str) -> str:
        """
        处理LLM输出，提取段落内容

        Args:
            output: LLM原始输出

        Returns:
            段落内容
        """
        try:
            # 清理响应文本
            cleaned_output = remove_reasoning_from_output(output)
            cleaned_output = clean_json_tags(cleaned_output)

            # 记录清理后的输出用于调试
            self.log_info(f"清理后的输出: {cleaned_output}")

            # 解析JSON
            try:
                result = json.loads(cleaned_output)
                self.log_info("JSON解析成功")
            except JSONDecodeError as e:
                self.log_info(f"JSON解析失败: {str(e)}")
                # 尝试修复JSON
                fixed_json = fix_incomplete_json(cleaned_output)
                if fixed_json:
                    try:
                        result = json.loads(fixed_json)
                        self.log_info("JSON修复成功")
                    except JSONDecodeError:
                        self.log_info("JSON修复失败，直接使用清理后的文本")
                        # 如果不是JSON格式，直接返回清理后的文本
                        return cleaned_output
                else:
                    self.log_info("无法修复JSON，直接使用清理后的文本")
                    # 如果不是JSON格式，直接返回清理后的文本
                    return cleaned_output

            # 提取段落内容
            if isinstance(result, dict):
                paragraph_content = result.get("paragraph_latest_state", "")
                if paragraph_content:
                    return paragraph_content

            # 如果提取失败，返回原始清理后的文本
            return cleaned_output

        except Exception as e:
            self.log_error(f"处理输出失败: {str(e)}")
            return "段落总结生成失败"

    def mutate_state(self, input_data: Any, state: State, paragraph_index: int, **kwargs) -> State:
        """
        更新段落的最新总结到状态

        Args:
            input_data: 输入数据
            state: 当前状态
            paragraph_index: 段落索引
            **kwargs: 额外参数

        Returns:
            更新后的状态
        """
        try:
            # 生成总结
            summary = self.run(input_data, **kwargs)

            # 更新状态
            if 0 <= paragraph_index < len(state.paragraphs):
                state.paragraphs[paragraph_index].research.latest_summary = summary
                self.log_info(f"已更新段落 {paragraph_index} 的首次总结")
            else:
                raise ValueError(f"段落索引 {paragraph_index} 超出范围")

            state.update_timestamp()
            return state

        except Exception as e:
            self.log_error(f"状态更新失败: {str(e)}")
            raise e


class ReflectionSummaryNode(StateMutationNode):
    """根据反思搜索结果更新段落总结的节点"""

    def __init__(self, llm_client):
        """
        初始化反思总结节点

        Args:
            llm_client: LLM客户端
        """
        super().__init__(llm_client, "ReflectionSummaryNode")

    def validate_input(self, input_data: Any) -> bool:
        """验证输入数据"""
        if isinstance(input_data, str):
            try:
                data = json.loads(input_data)
                required_fields = ["title", "content", "search_query", "search_results", "paragraph_latest_state"]
                return all(field in data for field in required_fields)
            except JSONDecodeError:
                return False
        elif isinstance(input_data, dict):
            required_fields = ["title", "content", "search_query", "search_results", "paragraph_latest_state"]
            return all(field in input_data for field in required_fields)
        return False

    def run(self, input_data: Any, **kwargs) -> str:
        """
        调用LLM更新段落内容

        Args:
            input_data: 包含完整反思信息的数据
            **kwargs: 额外参数

        Returns:
            更新后的段落内容
        """
        try:
            if not self.validate_input(input_data):
                raise ValueError("输入数据格式错误")

            # 准备输入数据
            if isinstance(input_data, str):
                data = json.loads(input_data)
            else:
                data = input_data.copy() if isinstance(input_data, dict) else input_data

            # 读取最新的HOST发言（如果可用）
            if FORUM_READER_AVAILABLE:
                try:
                    host_speech = get_latest_host_speech()
                    if host_speech:
                        # 将HOST发言添加到输入数据中
                        data['host_speech'] = host_speech
                        self.log_info(f"已读取HOST发言，长度: {len(host_speech)}字符")
                except Exception as e:
                    self.log_info(f"读取HOST发言失败: {str(e)}")

            # 转换为JSON字符串
            message = json.dumps(data, ensure_ascii=False)

            # 如果有HOST发言，添加到消息前面作为参考
            if FORUM_READER_AVAILABLE and 'host_speech' in data and data['host_speech']:
                formatted_host = format_host_speech_for_prompt(data['host_speech'])
                message = formatted_host + "\n" + message

            self.log_info("正在生成反思总结")

            # 调用LLM，增加max_tokens以支持更长的总结
            response = self.llm_client.invoke(
                SYSTEM_PROMPT_REFLECTION_SUMMARY,
                message,
                max_tokens=8192  # 支持更长的总结内容
            )

            # 处理响应
            processed_response = self.process_output(response)

            self.log_info("成功生成反思总结")
            return processed_response

        except Exception as e:
            self.log_error(f"生成反思总结失败: {str(e)}")
            raise e

    def process_output(self, output: str) -> str:
        """
        处理LLM输出，提取更新后的段落内容

        Args:
            output: LLM原始输出

        Returns:
            更新后的段落内容
        """
        try:
            # 清理响应文本
            cleaned_output = remove_reasoning_from_output(output)
            cleaned_output = clean_json_tags(cleaned_output)

            # 记录清理后的输出用于调试
            self.log_info(f"清理后的输出: {cleaned_output}")

            # 解析JSON
            try:
                result = json.loads(cleaned_output)
                self.log_info("JSON解析成功")
            except JSONDecodeError as e:
                self.log_info(f"JSON解析失败: {str(e)}")
                # 尝试修复JSON
                fixed_json = fix_incomplete_json(cleaned_output)
                if fixed_json:
                    try:
                        result = json.loads(fixed_json)
                        self.log_info("JSON修复成功")
                    except JSONDecodeError:
                        self.log_info("JSON修复失败，直接使用清理后的文本")
                        # 如果不是JSON格式，直接返回清理后的文本
                        return cleaned_output
                else:
                    self.log_info("无法修复JSON，直接使用清理后的文本")
                    # 如果不是JSON格式，直接返回清理后的文本
                    return cleaned_output

            # 提取更新后的段落内容
            if isinstance(result, dict):
                updated_content = result.get("updated_paragraph_latest_state", "")
                if updated_content:
                    return updated_content

            # 如果提取失败，返回原始清理后的文本
            return cleaned_output

        except Exception as e:
            self.log_error(f"处理输出失败: {str(e)}")
            return "反思总结生成失败"

    def mutate_state(self, input_data: Any, state: State, paragraph_index: int, **kwargs) -> State:
        """
        将更新后的总结写入状态

        Args:
            input_data: 输入数据
            state: 当前状态
            paragraph_index: 段落索引
            **kwargs: 额外参数

        Returns:
            更新后的状态
        """
        try:
            # 生成更新后的总结
            updated_summary = self.run(input_data, **kwargs)

            # 更新状态
            if 0 <= paragraph_index < len(state.paragraphs):
                state.paragraphs[paragraph_index].research.latest_summary = updated_summary
                state.paragraphs[paragraph_index].research.increment_reflection()
                self.log_info(f"已更新段落 {paragraph_index} 的反思总结")
            else:
                raise ValueError(f"段落索引 {paragraph_index} 超出范围")

            state.update_timestamp()
            return state

        except Exception as e:
            self.log_error(f"状态更新失败: {str(e)}")
            raise e