Add Support for Rendering Various Inline and Block-level Mathematical Formulas
This commit is contained in:
@@ -1262,7 +1262,8 @@ class HTMLRenderer:
|
|||||||
|
|
||||||
def _render_math(self, block: Dict[str, Any]) -> str:
    """Render a math block as a ``$$ ... $$`` placeholder inside a div.

    The formula is not typeset here; the placeholder is left for an
    external MathJax pass or a later post-processing step.

    Args:
        block: Block dict; its ``latex`` entry is the formula source and
            its optional ``mathId`` entry becomes the ``data-math-id``
            attribute.

    Returns:
        The HTML for a ``<div class="math-block">`` element.
    """
    # Outer delimiters are removed first so the formula is not wrapped twice.
    body = self._escape_html(self._normalize_latex_string(block.get("latex", "")))

    attr = ""
    if block.get("mathId"):
        escaped_id = self._escape_attr(block.get("mathId", ""))
        if escaped_id:
            attr = f' data-math-id="{escaped_id}"'

    return f'<div class="math-block"{attr}>$$ {body} $$</div>'
|
||||||
@@ -1988,6 +1989,66 @@ class HTMLRenderer:
|
|||||||
|
|
||||||
return text_value, marks
|
return text_value, marks
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _normalize_latex_string(raw: Any) -> str:
|
||||||
|
"""去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
|
||||||
|
if not isinstance(raw, str):
|
||||||
|
return ""
|
||||||
|
latex = raw.strip()
|
||||||
|
patterns = [
|
||||||
|
r'^\$\$(.*)\$\$$',
|
||||||
|
r'^\$(.*)\$$',
|
||||||
|
r'^\\\[(.*)\\\]$',
|
||||||
|
r'^\\\((.*)\\\)$',
|
||||||
|
]
|
||||||
|
for pat in patterns:
|
||||||
|
m = re.match(pat, latex, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
latex = m.group(1).strip()
|
||||||
|
break
|
||||||
|
return latex
|
||||||
|
|
||||||
|
def _render_text_with_inline_math(self, text: Any, math_id: str | list | None = None) -> str | None:
    """Render math delimiters found in plain text as math elements.

    Supports ``$...$``, ``$$...$$``, ``\\(...\\)`` and ``\\[...\\]``.
    ``$$`` and ``\\[`` fragments become ``<div class="math-block">``;
    the others become ``<span class="math-inline">``. Text between
    fragments is HTML-escaped.

    Args:
        text: Candidate text. Non-string or empty input returns ``None``.
        math_id: Either a single ID applied to every formula, or a list
            of IDs consumed in order, one per non-blank formula. When
            the list runs out, or no ID is given, a local
            ``auto-math-N`` ID is generated.

    Returns:
        The rendered HTML, or ``None`` when the text contains no
        non-blank formula, so the caller can fall back to plain
        text rendering.
    """
    if not isinstance(text, str) or not text:
        return None

    pattern = re.compile(r'(\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\])', re.S)
    id_iter = iter(math_id) if isinstance(math_id, list) else None
    parts: List[str] = []
    cursor = 0
    idx = 0

    for m in pattern.finditer(text):
        start, end = m.span()
        if start > cursor:
            parts.append(self._escape_html(text[cursor:start]))
        cursor = end

        whole = m.group(1)
        raw = next(g for g in m.groups()[1:] if g is not None)
        if not raw.strip():
            # A whitespace-only fragment such as "$ $" is not a formula.
            # Keep it as literal text and do NOT consume an ID: the ID
            # list is expected to hold one entry per non-blank formula,
            # so taking one here would shift every later formula onto
            # the wrong ID.
            parts.append(self._escape_html(whole))
            continue

        latex = self._normalize_latex_string(raw)
        idx += 1
        # Prefer a supplied ID so it stays consistent with IDs used for
        # SVG injection; otherwise fall back to a local sequence number.
        if id_iter is not None:
            mid = next(id_iter, f"auto-math-{idx}")
        else:
            mid = math_id or f"auto-math-{idx}"
        id_attr = f' data-math-id="{self._escape_attr(mid)}"'

        if whole.startswith(('$$', '\\[')):
            parts.append(f'<div class="math-block"{id_attr}>$$ {self._escape_html(latex)} $$</div>')
        else:
            parts.append(f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>')

    if idx == 0:
        # No real formula was rendered; let the caller handle the text.
        return None
    if cursor < len(text):
        parts.append(self._escape_html(text[cursor:]))
    return "".join(parts)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None:
|
def _coerce_inline_payload(payload: Dict[str, Any]) -> Dict[str, Any] | None:
|
||||||
"""尽力将字符串里的内联节点恢复为dict,修复渲染遗漏"""
|
"""尽力将字符串里的内联节点恢复为dict,修复渲染遗漏"""
|
||||||
@@ -2013,12 +2074,19 @@ class HTMLRenderer:
|
|||||||
text_value, marks = self._normalize_inline_payload(run)
|
text_value, marks = self._normalize_inline_payload(run)
|
||||||
math_mark = next((mark for mark in marks if mark.get("type") == "math"), None)
|
math_mark = next((mark for mark in marks if mark.get("type") == "math"), None)
|
||||||
if math_mark:
|
if math_mark:
|
||||||
latex = math_mark.get("value")
|
latex = self._normalize_latex_string(math_mark.get("value"))
|
||||||
if not isinstance(latex, str) or not latex.strip():
|
if not isinstance(latex, str) or not latex.strip():
|
||||||
latex = text_value
|
latex = self._normalize_latex_string(text_value)
|
||||||
math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else ""
|
math_id = self._escape_attr(run.get("mathId", "")) if run.get("mathId") else ""
|
||||||
id_attr = f' data-math-id="{math_id}"' if math_id else ""
|
id_attr = f' data-math-id="{math_id}"' if math_id else ""
|
||||||
return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>'
|
return f'<span class="math-inline"{id_attr}>\\( {self._escape_html(latex)} \\)</span>'
|
||||||
|
|
||||||
|
# 尝试从纯文本中提取数学公式(即便没有math mark)
|
||||||
|
math_id_hint = run.get("mathIds") or run.get("mathId")
|
||||||
|
mathified = self._render_text_with_inline_math(text_value, math_id_hint)
|
||||||
|
if mathified is not None:
|
||||||
|
return mathified
|
||||||
|
|
||||||
text = self._escape_html(text_value)
|
text = self._escape_html(text_value)
|
||||||
styles: List[str] = []
|
styles: List[str] = []
|
||||||
prefix: List[str] = []
|
prefix: List[str] = []
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ LaTeX 数学公式转 SVG 渲染器
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import re
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import matplotlib
|
import matplotlib
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@@ -40,8 +41,22 @@ class MathToSVG:
|
|||||||
SVG 字符串,如果转换失败则返回 None
|
SVG 字符串,如果转换失败则返回 None
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 清理 LaTeX 字符串
|
# 清理 LaTeX 字符串,去除外层定界符,兼容 $...$ / $$...$$ / \\( \\) / \\[ \\]
|
||||||
latex = latex.strip()
|
latex = (latex or "").strip()
|
||||||
|
patterns = [
|
||||||
|
r'^\$\$(.*)\$\$$',
|
||||||
|
r'^\$(.*)\$$',
|
||||||
|
r'^\\\[(.*)\\\]$',
|
||||||
|
r'^\\\((.*)\\\)$',
|
||||||
|
]
|
||||||
|
for pat in patterns:
|
||||||
|
m = re.match(pat, latex, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
latex = m.group(1).strip()
|
||||||
|
break
|
||||||
|
# 清理控制字符并做常见兼容
|
||||||
|
latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
|
||||||
|
latex = latex.replace(r'\\tfrac', r'\\frac').replace(r'\\dfrac', r'\\frac')
|
||||||
if not latex:
|
if not latex:
|
||||||
logger.warning("空的 LaTeX 公式")
|
logger.warning("空的 LaTeX 公式")
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import copy
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import io
|
import io
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -544,23 +545,62 @@ class PDFRenderer:
|
|||||||
continue
|
continue
|
||||||
marks = run.get('marks') or []
|
marks = run.get('marks') or []
|
||||||
math_mark = next((m for m in marks if m.get('type') == 'math'), None)
|
math_mark = next((m for m in marks if m.get('type') == 'math'), None)
|
||||||
if not math_mark:
|
|
||||||
|
if math_mark:
|
||||||
|
# 仅单个math mark
|
||||||
|
raw = math_mark.get('value') or run.get('text') or ''
|
||||||
|
latex = self._normalize_latex(raw)
|
||||||
|
is_display = bool(re.match(r'^\s*(\$\$|\\\[)', str(raw)))
|
||||||
|
if not latex:
|
||||||
|
continue
|
||||||
|
block_counter[0] += 1
|
||||||
|
math_id = run.get('mathId') or f"math-inline-{block_counter[0]}"
|
||||||
|
run['mathId'] = math_id
|
||||||
|
try:
|
||||||
|
svg_content = (
|
||||||
|
self.math_converter.convert_display_to_svg(latex)
|
||||||
|
if is_display else
|
||||||
|
self.math_converter.convert_inline_to_svg(latex)
|
||||||
|
)
|
||||||
|
if svg_content:
|
||||||
|
svg_map[math_id] = svg_content
|
||||||
|
logger.debug(f"公式 {math_id} 转换为SVG成功")
|
||||||
|
else:
|
||||||
|
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
|
||||||
continue
|
continue
|
||||||
latex = (math_mark.get('value') or run.get('text') or '').strip()
|
|
||||||
if not latex:
|
# 无math mark,尝试解析文本中的多个公式
|
||||||
|
text_val = run.get('text')
|
||||||
|
if not isinstance(text_val, str):
|
||||||
continue
|
continue
|
||||||
block_counter[0] += 1
|
segments = self._find_all_math_in_text(text_val)
|
||||||
math_id = f"math-inline-{block_counter[0]}"
|
if not segments:
|
||||||
try:
|
continue
|
||||||
svg_content = self.math_converter.convert_inline_to_svg(latex)
|
ids_for_html: list[str] = []
|
||||||
if svg_content:
|
for idx, (latex, is_display) in enumerate(segments, start=1):
|
||||||
svg_map[math_id] = svg_content
|
if not latex:
|
||||||
run['mathId'] = math_id
|
continue
|
||||||
logger.debug(f"公式 {math_id} 转换为SVG成功")
|
block_counter[0] += 1
|
||||||
else:
|
math_id = f"auto-math-{block_counter[0]}"
|
||||||
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
|
ids_for_html.append(math_id)
|
||||||
except Exception as exc:
|
try:
|
||||||
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
|
svg_content = (
|
||||||
|
self.math_converter.convert_display_to_svg(latex)
|
||||||
|
if is_display else
|
||||||
|
self.math_converter.convert_inline_to_svg(latex)
|
||||||
|
)
|
||||||
|
if svg_content:
|
||||||
|
svg_map[math_id] = svg_content
|
||||||
|
logger.debug(f"公式 {math_id} 转换为SVG成功")
|
||||||
|
else:
|
||||||
|
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
|
||||||
|
if ids_for_html:
|
||||||
|
# 将ID列表写回run,便于HTML渲染时使用相同ID(顺序对应segments)
|
||||||
|
run['mathIds'] = ids_for_html
|
||||||
|
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
if not isinstance(block, dict):
|
if not isinstance(block, dict):
|
||||||
@@ -570,7 +610,7 @@ class PDFRenderer:
|
|||||||
|
|
||||||
# 处理math类型
|
# 处理math类型
|
||||||
if block_type == 'math':
|
if block_type == 'math':
|
||||||
latex = block.get('latex', '').strip()
|
latex = self._normalize_latex(block.get('latex', ''))
|
||||||
if latex:
|
if latex:
|
||||||
block_counter[0] += 1
|
block_counter[0] += 1
|
||||||
math_id = f"math-block-{block_counter[0]}"
|
math_id = f"math-block-{block_counter[0]}"
|
||||||
@@ -679,6 +719,57 @@ class PDFRenderer:
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _normalize_latex(raw: Any) -> str:
|
||||||
|
"""去除外层数学定界符,兼容 $...$、$$...$$、\\(\\)、\\[\\] 等格式"""
|
||||||
|
if not isinstance(raw, str):
|
||||||
|
return ""
|
||||||
|
latex = raw.strip()
|
||||||
|
patterns = [
|
||||||
|
r'^\$\$(.*)\$\$$',
|
||||||
|
r'^\$(.*)\$$',
|
||||||
|
r'^\\\[(.*)\\\]$',
|
||||||
|
r'^\\\((.*)\\\)$',
|
||||||
|
]
|
||||||
|
for pat in patterns:
|
||||||
|
m = re.match(pat, latex, re.DOTALL)
|
||||||
|
if m:
|
||||||
|
latex = m.group(1).strip()
|
||||||
|
break
|
||||||
|
# 清理控制字符、防止mathtext解析失败
|
||||||
|
latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
|
||||||
|
# 常见兼容:\tfrac/\dfrac -> \frac
|
||||||
|
latex = latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
|
||||||
|
return latex
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _find_first_math_in_text(text: Any) -> tuple[str, bool] | None:
|
||||||
|
"""从纯文本中提取首个数学片段,返回(内容, 是否display)"""
|
||||||
|
if not isinstance(text, str):
|
||||||
|
return None
|
||||||
|
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
|
||||||
|
m = pattern.search(text)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
raw = next(g for g in m.groups() if g is not None)
|
||||||
|
latex = raw.strip()
|
||||||
|
is_display = bool(m.group(1) or m.group(4)) # $$ or \[ \]
|
||||||
|
return latex, is_display
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _find_all_math_in_text(text: Any) -> list[tuple[str, bool]]:
|
||||||
|
"""从纯文本中提取所有数学片段,返回[(内容, 是否display)]"""
|
||||||
|
if not isinstance(text, str):
|
||||||
|
return []
|
||||||
|
pattern = re.compile(r'\$\$(.+?)\$\$|\$(.+?)\$|\\\((.+?)\\\)|\\\[(.+?)\\\]', re.S)
|
||||||
|
results = []
|
||||||
|
for m in pattern.finditer(text):
|
||||||
|
raw = next(g for g in m.groups() if g is not None)
|
||||||
|
latex = raw.strip()
|
||||||
|
is_display = bool(m.group(1) or m.group(4))
|
||||||
|
results.append((latex, is_display))
|
||||||
|
return results
|
||||||
|
|
||||||
def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str:
|
def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str:
|
||||||
"""
|
"""
|
||||||
将词云PNG data URI注入HTML,替换对应canvas
|
将词云PNG data URI注入HTML,替换对应canvas
|
||||||
|
|||||||
Reference in New Issue
Block a user