Add Support for Rendering Various Inline and Block-level Mathematical Formulas

This commit is contained in:
马一丁
2025-11-27 10:29:27 +08:00
parent 4e882560da
commit 5e9da9cfbf
3 changed files with 195 additions and 21 deletions
+71 -3
View File
@@ -1262,7 +1262,8 @@ class HTMLRenderer:
def _render_math(self, block: Dict[str, Any]) -> str: def _render_math(self, block: Dict[str, Any]) -> str:
"""渲染数学公式,占位符交给外部MathJax或后处理""" """渲染数学公式,占位符交给外部MathJax或后处理"""
latex = self._escape_html(block.get("latex", "")) latex_raw = block.get("latex", "")
latex = self._escape_html(self._normalize_latex_string(latex_raw))
math_id = self._escape_attr(block.get("mathId", "")) if block.get("mathId") else "" math_id = self._escape_attr(block.get("mathId", "")) if block.get("mathId") else ""
id_attr = f' data-math-id="{math_id}"' if math_id else "" id_attr = f' data-math-id="{math_id}"' if math_id else ""
return f'<div class="math-block"{id_attr}>$$ {latex} $$</div>' return f'<div class="math-block"{id_attr}>$$ {latex} $$</div>'
@@ -1988,6 +1989,66 @@ class HTMLRenderer:
return text_value, marks return text_value, marks
@staticmethod
def _normalize_latex_string(raw: Any) -> str:
"""去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
if not isinstance(raw, str):
return ""
latex = raw.strip()
patterns = [
r'^\$\$(.*)\$\$$',
r'^\$(.*)\$$',
r'^\\\[(.*)\\\]$',
r'^\\\((.*)\\\)$',
]
for pat in patterns:
m = re.match(pat, latex, re.DOTALL)
if m:
latex = m.group(1).strip()
break
return latex
def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None:
"""
识别纯文本中的数学定界符并渲染为math-inline/math-block,提升兼容性。
- 支持 $...$、$$...$$、\\(\\)、\\[\\]。
- 若未检测到公式,返回None。
"""
if not isinstance(text, str) or not text:
return None
pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
cursor = 0
parts: List[str] = []
idx = 0
id_iter = iter(math_id) if isinstance(math_id, list) else None
for m in pattern.finditer(text):
start, end = m.span()
if start > cursor:
parts.append(self._escape_html(text[cursor:start]))
raw = next(g for g in m.groups()[1:] if g is not None)
latex = self._normalize_latex_string(raw)
idx += 1
# 若已有math_id,直接使用,避免与SVG注入ID不一致;否则按局部序号生成
if id_iter:
mid = next(id_iter, f"auto-math-{idx}")
else:
mid = math_id or f"auto-math-{idx}"
id_attr = f' data-math-id="{self._escape_attr(mid)}"'
is_display = m.group(1).startswith('$$') or m.group(1).startswith('\\[')
if is_display:
parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
else:
parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')
cursor = end
if cursor == 0:
return None
if cursor < len(text):
parts.append(self._escape_html(text[cursor:]))
return "".join(parts)
@staticmethod @staticmethod
def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None: def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None:
"""尽力将字符串里的内联节点恢复为dict,修复渲染遗漏""" """尽力将字符串里的内联节点恢复为dict,修复渲染遗漏"""
@@ -2013,12 +2074,19 @@ class HTMLRenderer:
text_value, marks = self._normalize_inline_payload(run) text_value, marks = self._normalize_inline_payload(run)
math_mark = next((mark for mark in marks if mark.get("type") == "math"), None) math_mark = next((mark for mark in marks if mark.get("type") == "math"), None)
if math_mark: if math_mark:
latex = math_mark.get("value") latex = self._normalize_latex_string(math_mark.get("value"))
if not isinstance(latex, str) or not latex.strip(): if not isinstance(latex, str) or not latex.strip():
latex = text_value latex = self._normalize_latex_string(text_value)
math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else "" math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else ""
id_attr = f' data-math-id="{math_id}"' if math_id else "" id_attr = f' data-math-id="{math_id}"' if math_id else ""
return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>' return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>'
# 尝试从纯文本中提取数学公式(即便没有math mark)
math_id_hint = run.get("mathIds") or run.get("mathId")
mathified = self._render_text_with_inline_math(text_value, math_id_hint)
if mathified is not None:
return mathified
text = self._escape_html(text_value) text = self._escape_html(text_value)
styles: List[str] = [] styles: List[str] = []
prefix: List[str] = [] prefix: List[str] = []
+17 -2
View File
@@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器
""" """
import io import io
import re
from typing import Optional from typing import Optional
import matplotlib import matplotlib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@@ -40,8 +41,22 @@ class MathToSVG:
SVG 字符串如果转换失败则返回 None SVG 字符串如果转换失败则返回 None
""" """
try: try:
# 清理 LaTeX 字符串 # 清理 LaTeX 字符串,去除外层定界符,兼容 $...$ / $$...$$ / \\( \\) / \\[ \\]
latex = latex.strip() latex = (latex or "").strip()
patterns = [
r'^\$\$(.*)\$\$$',
r'^\$(.*)\$$',
r'^\\\[(.*)\\\]$',
r'^\\\((.*)\\\)$',
]
for pat in patterns:
m = re.match(pat, latex, re.DOTALL)
if m:
latex = m.group(1).strip()
break
# 清理控制字符并做常见兼容
latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
latex = latex.replace(r'\\tfrac', r'\\frac').replace(r'\\dfrac', r'\\frac')
if not latex: if not latex:
logger.warning("空的 LaTeX 公式") logger.warning("空的 LaTeX 公式")
return None return None
+107 -16
View File
@@ -10,6 +10,7 @@ import copy
import os import os
import sys import sys
import io import io
import re
from pathlib import Path from pathlib import Path
from typing import Any, Dict from typing import Any, Dict
from datetime import datetime from datetime import datetime
@@ -544,23 +545,62 @@ class PDFRenderer:
continue continue
marks = run.get('marks') or [] marks = run.get('marks') or []
math_mark = next((m for m in marks if m.get('type') == 'math'), None) math_mark = next((m for m in marks if m.get('type') == 'math'), None)
if not math_mark:
if math_mark:
# 仅单个math mark
raw = math_mark.get('value') or run.get('text') or ''
latex = self._normalize_latex(raw)
is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw)))
if not latex:
continue
block_counter[0] += 1
math_id = run.get('mathId') or f"math-inline-{block_counter[0]}"
run['mathId'] = math_id
try:
svg_content = (
self.math_converter.convert_display_to_svg(latex)
if is_display else
self.math_converter.convert_inline_to_svg(latex)
)
if svg_content:
svg_map[math_id] = svg_content
logger.debug(f"公式 {math_id} 转换为SVG成功")
else:
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
except Exception as exc:
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
continue continue
latex = (math_mark.get('value') or run.get('text') or '').strip()
if not latex: # 无math mark,尝试解析文本中的多个公式
text_val = run.get('text')
if not isinstance(text_val, str):
continue continue
block_counter[0] += 1 segments = self._find_all_math_in_text(text_val)
math_id = f"math-inline-{block_counter[0]}" if not segments:
try: continue
svg_content = self.math_converter.convert_inline_to_svg(latex) ids_for_html: list[str] = []
if svg_content: for idx, (latex, is_display) in enumerate(segments, start=1):
svg_map[math_id] = svg_content if not latex:
run['mathId'] = math_id continue
logger.debug(f"公式 {math_id} 转换为SVG成功") block_counter[0] += 1
else: math_id = f"auto-math-{block_counter[0]}"
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...") ids_for_html.append(math_id)
except Exception as exc: try:
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}") svg_content = (
self.math_converter.convert_display_to_svg(latex)
if is_display else
self.math_converter.convert_inline_to_svg(latex)
)
if svg_content:
svg_map[math_id] = svg_content
logger.debug(f"公式 {math_id} 转换为SVG成功")
else:
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
except Exception as exc:
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
if ids_for_html:
# 将ID列表写回run,便于HTML渲染时使用相同ID(顺序对应segments)
run['mathIds'] = ids_for_html
for block in blocks: for block in blocks:
if not isinstance(block, dict): if not isinstance(block, dict):
@@ -570,7 +610,7 @@ class PDFRenderer:
# 处理math类型 # 处理math类型
if block_type == 'math': if block_type == 'math':
latex = block.get('latex', '').strip() latex = self._normalize_latex(block.get('latex', ''))
if latex: if latex:
block_counter[0] += 1 block_counter[0] += 1
math_id = f"math-block-{block_counter[0]}" math_id = f"math-block-{block_counter[0]}"
@@ -679,6 +719,57 @@ class PDFRenderer:
return html return html
@staticmethod
def _normalize_latex(raw: Any) -> str:
"""去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
if not isinstance(raw, str):
return ""
latex = raw.strip()
patterns = [
r'^\$\$(.*)\$\$$',
r'^\$(.*)\$$',
r'^\\\[(.*)\\\]$',
r'^\\\((.*)\\\)$',
]
for pat in patterns:
m = re.match(pat, latex, re.DOTALL)
if m:
latex = m.group(1).strip()
break
# 清理控制字符、防止mathtext解析失败
latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
# 常见兼容:\tfrac/\dfrac -> \frac
latex = latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
return latex
@staticmethod
def _find_first_math_in_text(text: Any) -> tuple[str, bool] | None:
"""从纯文本中提取首个数学片段,返回(内容, 是否display)"""
if not isinstance(text, str):
return None
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
m = pattern.search(text)
if not m:
return None
raw = next(g for g in m.groups() if g is not None)
latex = raw.strip()
is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \]
return latex, is_display
@staticmethod
def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
"""从纯文本中提取所有数学片段,返回[(内容, 是否display)]"""
if not isinstance(text, str):
return []
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
results = []
for m in pattern.finditer(text):
raw = next(g for g in m.groups() if g is not None)
latex = raw.strip()
is_display = bool(m.group(1) or m.group(4))
results.append((latex, is_display))
return results
def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str:
""" """
将词云PNG data URI注入HTML替换对应canvas 将词云PNG data URI注入HTML替换对应canvas