fix: match namespace-prefixed tags when counting textField/staticText/field in the OCR fidelity check

This commit is contained in:
2026-05-25 23:56:31 +08:00
parent cacff6f63a
commit 6e6199bd26
+6 -4
View File
@@ -1266,9 +1266,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
issues = []
# 1. 元素数量对比
text_fields = len(re.findall(r"<textField", jrxml))
static_texts = len(re.findall(r"<staticText", jrxml))
# 1. 元素数量对比(支持 namespace 前缀,如 <jrxml:textField>)
text_fields = len(re.findall(r"<[a-zA-Z0-9_-]+:textField|<textField", jrxml))
static_texts = len(re.findall(r"<[a-zA-Z0-9_-]+:staticText|<staticText", jrxml))
total_jrxml_elements = text_fields + static_texts
ocr_text_count = 0
@@ -1288,7 +1288,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
element_coverage = 1.0
# 2. 字段名覆盖(英文字段名 vs OCR 中文字段名天然不匹配,权重降低)
jrxml_fields = set(re.findall(r'<field name="([^"]+)"', jrxml))
# 支持 namespace 前缀的 field 声明(如 <jrxml:field>)
raw_fields = re.findall(r'(?:<[a-zA-Z0-9_-]+:)?field\s+name="([^"]+)"', jrxml)
jrxml_fields = set(raw_fields)
ocr_field_names = set()
ocr_fields = ocr_result.get("fields", []) if isinstance(ocr_result, dict) else []
for f in ocr_fields: