# agent_jrxml/tests/test_continuation_extraction.py

"""续写 + JRXML 提取单元测试。

测试 _strip_continuation_wrapper、_extract_jrxml 在
多轮续写场景下的鲁棒性，以及 _generate_with_continuation 的完成检测。
"""

from __future__ import annotations

import pytest
from agent.nodes import _strip_continuation_wrapper, _extract_jrxml

# ── Fixtures: complete JRXML and simulated continuation rounds ─────

# A small, complete document: XML declaration through the closing
# </jasperReport> tag, with one field, a query and a title band.
COMPLETE_JRXML = """<?xml version="1.0" encoding="UTF-8"?>
<jasperReport name="test" pageWidth="595" pageHeight="842">
    <field name="field_1" class="java.lang.String"/>
    <queryString><![CDATA[SELECT * FROM t]]></queryString>
    <title>
        <band height="50">
            <staticText>
                <reportElement x="0" y="0" width="100" height="20"/>
                <text><![CDATA[$F{field_1}]]></text>
            </staticText>
        </band>
    </title>
</jasperReport>"""

# Round 1 output: a well-formed beginning with no closing </jasperReport>
# (simulates truncation). The text stops in the middle of a closing tag,
# right after "</".
ROUND1_TRUNCATED = """<?xml version="1.0" encoding="UTF-8"?>
<jasperReport name="test" pageWidth="595" pageHeight="842">
    <field name="field_1" class="java.lang.String"/>
    <field name="field_2" class="java.lang.String"/>
    <queryString><![CDATA[SELECT * FROM t]]></queryString>
    <title>
        <band height="50">
            <staticText>
                <reportElement x="0" y="0" width="100" height="20"/>
                <text><![CDATA[$F{field_1}]]></text>
            </staticText>
        </band>
    </title>
    <detail>
        <band height="30">
            <textField>
                <reportElement x="0" y="0" width="100" height="20"/>
                <textFieldExpression><![CDATA[$F{field_1}]]></"""

# Round 2 continuation: a prose lead-in followed by a markdown-fenced block
# (described in the original note as real LLM behaviour). It re-opens
# <textFieldExpression> instead of completing the "</" that round 1 ended on,
# so the plain concatenation ROUND1 + ROUND2 has a mismatched closing tag.
ROUND2_MARKDOWN_CONTINUATION = """继续输出剩余的 JRXML 内容：

```
                <textFieldExpression><![CDATA[$F{field_2}]]></textFieldExpression>
            </textField>
        </band>
    </detail>
</jasperReport>
```"""

# Round 2 variant: the document is closed with </report> rather than
# </jasperReport> (another common LLM mistake, per the original note).
ROUND2_REPORT_CLOSE = """继续输出：

```
                <textFieldExpression><![CDATA[$F{field_2}]]></textFieldExpression>
            </textField>
        </band>
    </detail>
</report>
```"""

# Round 2 variant: no prose lead-in; starts with a newline and a ```xml fence.
# NOTE(review): the original comment described this as an opening fence with
# no closing fence (an incomplete code block), but the literal below does end
# with a closing ``` — confirm which shape was intended.
ROUND2_PARTIAL_MARKDOWN = """
```xml
                <textFieldExpression><![CDATA[$F{field_2}]]></textFieldExpression>
            </textField>
        </band>
    </detail>
</jasperReport>
```"""


# ── Tests for _strip_continuation_wrapper ──────────────────────────

class TestStripContinuationWrapper:
    """Unit tests for ``_strip_continuation_wrapper``.

    Every case passes a short continuation snippet (optionally preceded by a
    prose lead-in and/or wrapped in markdown fences) and checks what is left.
    """

    def test_removes_complete_markdown_block(self):
        """Lead-in text plus a fully fenced block reduces to the fenced body."""
        raw = "继续输出：\n\n```\n<band>test</band>\n```"
        assert _strip_continuation_wrapper(raw) == "<band>test</band>"

    def test_removes_xml_fenced_block(self):
        """A ```xml ... ``` block reduces to its body."""
        raw = "```xml\n<band>test</band>\n```"
        assert _strip_continuation_wrapper(raw) == "<band>test</band>"

    def test_removes_opening_fence_only(self):
        """An opening fence without a closing one is still removed."""
        cleaned = _strip_continuation_wrapper("```xml\n<band>test</band>")
        assert "<band>test</band>" in cleaned
        assert "```" not in cleaned

    def test_removes_closing_fence_only(self):
        """A dangling closing fence is removed while the XML is kept."""
        cleaned = _strip_continuation_wrapper("<band>test</band>\n```")
        assert "<band>test</band>" in cleaned
        assert "```" not in cleaned

    def test_removes_continuation_prefix_chinese(self):
        """A Chinese "continuing the output" lead-in line is dropped."""
        raw = "继续输出剩余的 JRXML 内容：\n<band>test</band>"
        assert _strip_continuation_wrapper(raw) == "<band>test</band>"

    def test_pure_xml_passes_through(self):
        """Bare XML with no wrapper comes back unchanged."""
        bare = "<band>test</band>"
        assert _strip_continuation_wrapper(bare) == "<band>test</band>"

    def test_empty_becomes_empty(self):
        """Empty and whitespace-only input both yield the empty string."""
        for blank in ("", "   "):
            assert _strip_continuation_wrapper(blank) == ""

    def test_empty_markdown_block_returns_empty(self):
        """A fenced block with no body yields the empty string."""
        assert _strip_continuation_wrapper("```xml\n```") == ""

    def test_multiple_backtick_pairs_extracts_first_valid(self):
        """With two fenced blocks, the body of the first one is returned."""
        raw = "```\nfragment\n```\n```xml\ncomplete<?xml ...\n```"
        assert _strip_continuation_wrapper(raw) == "fragment"


# ── Tests for _extract_jrxml on multi-round continuation output ────

class TestExtractJrxmlMultiRound:
    """Unit tests for ``_extract_jrxml`` on single- and multi-round output.

    Multi-round inputs are built by concatenating the truncated round-1
    fixture with one of the round-2 continuation fixtures.
    """

    def test_extracts_from_mixed_multi_round_output(self):
        """Round 1 without markdown followed by a markdown-wrapped round 2."""
        extracted = _extract_jrxml(ROUND1_TRUNCATED + ROUND2_MARKDOWN_CONTINUATION)
        assert extracted.startswith("<?xml")
        for needle in ("</jasperReport>", "$F{field_1}", "$F{field_2}"):
            assert needle in extracted

    def test_extracts_with_report_close_tag(self):
        """Round 2 closes with </report> instead of </jasperReport>."""
        extracted = _extract_jrxml(ROUND1_TRUNCATED + ROUND2_REPORT_CLOSE)
        assert extracted.startswith("<?xml")
        for needle in ("</report>", "$F{field_2}"):
            assert needle in extracted

    def test_extracts_with_partial_markdown(self):
        """Round 2 opens with a ```xml fence and ends with a ``` fence."""
        extracted = _extract_jrxml(ROUND1_TRUNCATED + ROUND2_PARTIAL_MARKDOWN)
        assert extracted.startswith("<?xml")
        assert "</jasperReport>" in extracted

    def test_single_round_complete_jrxml_in_markdown(self):
        """Single round: the complete JRXML sits inside a markdown code block."""
        fenced = f"```xml\n{COMPLETE_JRXML}\n```"
        assert _extract_jrxml(fenced) == COMPLETE_JRXML

    def test_single_round_pure_jrxml(self):
        """Single round: bare JRXML with no markdown around it."""
        assert _extract_jrxml(COMPLETE_JRXML) == COMPLETE_JRXML

    def test_jrxml_with_leading_explanation(self):
        """Natural-language text precedes the JRXML."""
        with_preamble = f"这是生成的报表模板：\n{COMPLETE_JRXML}"
        assert _extract_jrxml(with_preamble) == COMPLETE_JRXML

    def test_two_markdown_blocks_skips_fragment(self):
        """Two fenced blocks: a fragment first, the complete JRXML second."""
        fragment_block = "```\nsome fragment\n```\n"
        jrxml_block = f"```xml\n{COMPLETE_JRXML}\n```"
        assert _extract_jrxml(fragment_block + jrxml_block) == COMPLETE_JRXML

    def test_two_markdown_blocks_first_is_complete(self):
        """Two fenced blocks: the complete JRXML comes first."""
        jrxml_block = f"```xml\n{COMPLETE_JRXML}\n```\n"
        trailing_block = "```\nsome other stuff\n```"
        assert _extract_jrxml(jrxml_block + trailing_block) == COMPLETE_JRXML

    def test_no_xml_passes_through(self):
        """Text with no XML content is returned as-is."""
        plain = "Hello, this has no XML at all."
        assert _extract_jrxml(plain) == plain


# ── Completion-detection tests ─────────────────────────────────────

class TestCompletionDetection:
    """Tests for the closing-tag pattern that marks a JRXML as complete.

    NOTE(review): these tests check a pattern defined in this class, not the
    production code. It presumably mirrors the completion check in the
    generation path — confirm the two stay in sync, or import the production
    pattern here instead.
    """

    # Matches a closing root tag at the very end of the text: "</", an optional
    # namespace prefix ending in ":", then "jasperReport" or "report", then
    # ">", followed only by whitespace. Kept in one place so the four tests
    # below cannot drift apart.
    _JRXML_END = r"</(?:[\w:]+:)?(?:jasperReport|report)>\s*$"

    @classmethod
    def _looks_complete(cls, jrxml: str) -> bool:
        """Return True when ``jrxml`` (stripped) ends with a closing root tag.

        The match is case-insensitive, as in the original per-test checks.
        """
        # Function-scope import: this module does not import ``re`` at the top.
        import re

        return re.search(cls._JRXML_END, jrxml.strip(), re.IGNORECASE) is not None

    def test_jasperreport_close_detected(self):
        """JRXML ending in </jasperReport> is recognised as complete."""
        assert self._looks_complete(COMPLETE_JRXML)

    def test_report_close_detected(self):
        """JRXML ending in </report> is also recognised as complete."""
        closed_with_report = COMPLETE_JRXML.replace('</jasperReport>', '</report>')
        assert self._looks_complete(closed_with_report)

    def test_namespaced_jasperreport_close_detected(self):
        """JRXML ending in </ns0:jasperReport> is also recognised."""
        namespaced = COMPLETE_JRXML.replace('</jasperReport>', '</ns0:jasperReport>')
        assert self._looks_complete(namespaced)

    def test_truncated_jrxml_not_detected(self):
        """Truncated JRXML (no closing root tag) is not recognised as complete."""
        assert not self._looks_complete(ROUND1_TRUNCATED)