Improve the Rendering of Inline Formulas

This commit is contained in:
马一丁
2025-11-27 15:12:33 +08:00
parent 336e24f03a
commit c9023982c9
2 changed files with 79 additions and 17 deletions
+54 -10
View File
@@ -1094,9 +1094,37 @@ class HTMLRenderer:
def _render_paragraph(self, block: Dict[str, Any]) -> str: def _render_paragraph(self, block: Dict[str, Any]) -> str:
"""渲染段落,内部通过inline run保持混排样式""" """渲染段落,内部通过inline run保持混排样式"""
inlines = "".join(self._render_inline(run) for run in block.get("inlines", [])) inlines_data = block.get("inlines", [])
# 仅包含单个display公式时直接渲染为块,避免<p>内嵌<div>
if len(inlines_data) == 1:
standalone = self._render_standalone_math_inline(inlines_data[0])
if standalone:
return standalone
inlines = "".join(self._render_inline(run) for run in inlines_data)
return f"<p>{inlines}</p>" return f"<p>{inlines}</p>"
def _render_standalone_math_inline(self, run: Dict[str, Any] | str) -> str | None:
    """Render a lone display formula as a block-level element.

    Intended for a paragraph that consists of a single inline run: when that
    run is nothing but one display formula, it is emitted as a ``math-block``
    div so the caller does not have to nest a ``<div>`` inside ``<p>``.

    Args:
        run: The single inline run. A dict is normalized through
            ``_normalize_inline_payload``; any other value is stringified
            (``None`` becomes an empty string).

    Returns:
        The rendered HTML when it begins with a ``math-block`` div,
        otherwise ``None`` so the caller can fall back to its normal
        inline rendering.
    """
    if isinstance(run, dict):
        text_value, marks = self._normalize_inline_payload(run)
        if marks:
            # A run that carries marks is never promoted to a block.
            return None
        math_id_hint = run.get("mathIds") or run.get("mathId")
    else:
        text_value = "" if run is None else str(run)
        math_id_hint = None

    rendered = self._render_text_with_inline_math(
        text_value,
        math_id_hint,
        allow_display_block=True,
    )
    # Leading whitespace is ignored only for detection; the rendered string
    # is returned unmodified.
    if rendered and rendered.lstrip().startswith('<div class="math-block"'):
        return rendered
    return None
def _render_list(self, block: Dict[str, Any]) -> str: def _render_list(self, block: Dict[str, Any]) -> str:
"""渲染有序/无序/任务列表""" """渲染有序/无序/任务列表"""
list_type = block.get("listType", "bullet") list_type = block.get("listType", "bullet")
@@ -2034,7 +2062,12 @@ class HTMLRenderer:
break break
return latex return latex
def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None: def _render_text_with_inline_math(
self,
text: Any,
math_id: str | list | None = None,
allow_display_block: bool = False
) -> str | None:
""" """
识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。 识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。
@@ -2045,17 +2078,19 @@ class HTMLRenderer:
return None return None
pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S) pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
matches = list(pattern.finditer(text))
if not matches:
return None
cursor = 0 cursor = 0
parts: List[str] = [] parts: List[str] = []
idx = 0
id_iter = iter(math_id) if isinstance(math_id, list) else None id_iter = iter(math_id) if isinstance(math_id, list) else None
for m in pattern.finditer(text):
for idx, m in enumerate(matches, start=1):
start, end = m.span() start, end = m.span()
if start > cursor: prefix = text[cursor:start]
parts.append(self._escape_html(text[cursor:start]))
raw = next(g for g in m.groups()[1:] if g is not None) raw = next(g for g in m.groups()[1:] if g is not None)
latex = self._normalize_latex_string(raw) latex = self._normalize_latex_string(raw)
idx += 1
# 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成 # 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成
if id_iter: if id_iter:
mid = next(id_iter, f"auto-math-{idx}") mid = next(id_iter, f"auto-math-{idx}")
@@ -2063,14 +2098,23 @@ class HTMLRenderer:
mid = math_id or f"auto-math-{idx}" mid = math_id or f"auto-math-{idx}"
id_attr = f' data-math-id="{self._escape_attr(mid)}"' id_attr = f' data-math-id="{self._escape_attr(mid)}"'
is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[') is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[')
if is_display: is_standalone = (
len(matches) == 1 and
not text[:start].strip() and
not text[end:].strip()
)
use_block = allow_display_block and is_display and is_standalone
if use_block:
# 独立display公式,跳过两侧空白,直接渲染块级
parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>') parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
cursor = len(text)
break
else: else:
if prefix:
parts.append(self._escape_html(prefix))
parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>') parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
cursor = end cursor = end
if cursor == 0:
return None
if cursor < len(text): if cursor < len(text):
parts.append(self._escape_html(text[cursor:])) parts.append(self._escape_html(text[cursor:]))
return "".join(parts) return "".join(parts)
+25 -7
View File
@@ -550,7 +550,8 @@ class PDFRenderer:
# 仅单个math mark # 仅单个math mark
raw = math_mark.get('value') or run.get('text') or '' raw = math_mark.get('value') or run.get('text') or ''
latex = self._normalize_latex(raw) latex = self._normalize_latex(raw)
is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw))) # 行内mark统一按inline处理,避免误将行内公式当成display
is_display = False
if not latex: if not latex:
continue continue
block_counter[0] += 1 block_counter[0] += 1
@@ -748,13 +749,19 @@ class PDFRenderer:
if not isinstance(text, str): if not isinstance(text, str):
return None return None
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
m = pattern.search(text) matches = list(pattern.finditer(text))
if not m: if not matches:
return None return None
m = matches[0]
raw = next(g for g in m.groups() if g is not None) raw = next(g for g in m.groups() if g is not None)
latex = raw.strip() latex = raw.strip()
is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \] is_display_raw = bool(m.group(1) or m.group(4)) # $$ or \[ \]
return latex, is_display is_standalone = (
len(matches) == 1 and
not text[:m.start()].strip() and
not text[m.end():].strip()
)
return latex, bool(is_display_raw and is_standalone)
@staticmethod @staticmethod
def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]: def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
@@ -763,10 +770,21 @@ class PDFRenderer:
return [] return []
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S) pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
results = [] results = []
for m in pattern.finditer(text): matches = list(pattern.finditer(text))
if not matches:
return results
total = len(matches)
for m in matches:
raw = next(g for g in m.groups() if g is not None) raw = next(g for g in m.groups() if g is not None)
latex = raw.strip() latex = raw.strip()
is_display = bool(m.group(1) or m.group(4)) is_display_raw = bool(m.group(1) or m.group(4))
is_standalone = (
total == 1 and
not text[:m.start()].strip() and
not text[m.end():].strip()
)
is_display = is_display_raw and is_standalone
results.append((latex, is_display)) results.append((latex, is_display))
return results return results