fix: accept namespace-prefixed tags in the regexes that count textField/staticText/field elements in the OCR fidelity check

2026-05-25 23:56:31 +08:00
parent cacff6f63a
commit 6e6199bd26
1 changed file with 6 additions and 4 deletions
@@ -1266,9 +1266,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
    issues = []
-    # 1. 元素数量对比
+    # 1. 元素数量对比（支持 namespace 前缀，如 <jrxml:textField>）
-    text_fields = len(re.findall(r"<textField", jrxml))
+    text_fields = len(re.findall(r"<[a-zA-Z0-9_-]+:textField|<textField", jrxml))
-    static_texts = len(re.findall(r"<staticText", jrxml))
+    static_texts = len(re.findall(r"<[a-zA-Z0-9_-]+:staticText|<staticText", jrxml))
    total_jrxml_elements = text_fields + static_texts
    ocr_text_count = 0
@@ -1288,7 +1288,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
        element_coverage = 1.0
    # 2. 字段名覆盖（英文字段名 vs OCR 中文字段名天然不匹配，权重降低）
-    jrxml_fields = set(re.findall(r'<field name="([^"]+)"', jrxml))
+    # 支持 namespace 前缀的 field 声明（如 <jrxml:field>）
+    raw_fields = re.findall(r'(?:<[a-zA-Z0-9_-]+:)?field\s+name="([^"]+)"', jrxml)
+    jrxml_fields = set(raw_fields)
    ocr_field_names = set()
    ocr_fields = ocr_result.get("fields", []) if isinstance(ocr_result, dict) else []
    for f in ocr_fields: