fix: accept namespace-prefixed tags (e.g. <jrxml:textField>) in the textField/staticText/field regexes used by the OCR fidelity check
This commit is contained in:
+6
-4
@@ -1266,9 +1266,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
|
|||||||
|
|
||||||
issues = []
|
issues = []
|
||||||
|
|
||||||
# 1. 元素数量对比
|
# 1. 元素数量对比(支持 namespace 前缀,如 <jrxml:textField>)
|
||||||
text_fields = len(re.findall(r"<textField", jrxml))
|
text_fields = len(re.findall(r"<[a-zA-Z0-9_-]+:textField|<textField", jrxml))
|
||||||
static_texts = len(re.findall(r"<staticText", jrxml))
|
static_texts = len(re.findall(r"<[a-zA-Z0-9_-]+:staticText|<staticText", jrxml))
|
||||||
total_jrxml_elements = text_fields + static_texts
|
total_jrxml_elements = text_fields + static_texts
|
||||||
|
|
||||||
ocr_text_count = 0
|
ocr_text_count = 0
|
||||||
@@ -1288,7 +1288,9 @@ def _check_ocr_fidelity(jrxml: str, state: dict) -> dict:
|
|||||||
element_coverage = 1.0
|
element_coverage = 1.0
|
||||||
|
|
||||||
# 2. 字段名覆盖(英文字段名 vs OCR 中文字段名天然不匹配,权重降低)
|
# 2. 字段名覆盖(英文字段名 vs OCR 中文字段名天然不匹配,权重降低)
|
||||||
jrxml_fields = set(re.findall(r'<field name="([^"]+)"', jrxml))
|
# 支持 namespace 前缀的 field 声明(如 <jrxml:field>)
|
||||||
|
raw_fields = re.findall(r'(?:<[a-zA-Z0-9_-]+:)?field\s+name="([^"]+)"', jrxml)
|
||||||
|
jrxml_fields = set(raw_fields)
|
||||||
ocr_field_names = set()
|
ocr_field_names = set()
|
||||||
ocr_fields = ocr_result.get("fields", []) if isinstance(ocr_result, dict) else []
|
ocr_fields = ocr_result.get("fields", []) if isinstance(ocr_result, dict) else []
|
||||||
for f in ocr_fields:
|
for f in ocr_fields:
|
||||||
|
|||||||
Reference in New Issue
Block a user