Optimize the Rendering of Inline Formulas, Subscripts and Superscripts, Bubble Charts, and Horizontal Bars
This commit is contained in:
@@ -535,6 +535,33 @@ class PDFRenderer:
|
||||
if block_counter is None:
|
||||
block_counter = [0]
|
||||
|
||||
def _extract_inline_math_from_inlines(inlines: list):
|
||||
"""从段落内联节点中提取数学公式"""
|
||||
if not isinstance(inlines, list):
|
||||
return
|
||||
for run in inlines:
|
||||
if not isinstance(run, dict):
|
||||
continue
|
||||
marks = run.get('marks') or []
|
||||
math_mark = next((m for m in marks if m.get('type') == 'math'), None)
|
||||
if not math_mark:
|
||||
continue
|
||||
latex = (math_mark.get('value') or run.get('text') or '').strip()
|
||||
if not latex:
|
||||
continue
|
||||
block_counter[0] += 1
|
||||
math_id = f"math-inline-{block_counter[0]}"
|
||||
try:
|
||||
svg_content = self.math_converter.convert_inline_to_svg(latex)
|
||||
if svg_content:
|
||||
svg_map[math_id] = svg_content
|
||||
run['mathId'] = math_id
|
||||
logger.debug(f"公式 {math_id} 转换为SVG成功")
|
||||
else:
|
||||
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
|
||||
except Exception as exc:
|
||||
logger.error(f"转换内联公式 {latex[:50]}... 时出错: {exc}")
|
||||
|
||||
for block in blocks:
|
||||
if not isinstance(block, dict):
|
||||
continue
|
||||
@@ -547,7 +574,6 @@ class PDFRenderer:
|
||||
if latex:
|
||||
block_counter[0] += 1
|
||||
math_id = f"math-block-{block_counter[0]}"
|
||||
|
||||
try:
|
||||
svg_content = self.math_converter.convert_display_to_svg(latex)
|
||||
if svg_content:
|
||||
@@ -559,6 +585,11 @@ class PDFRenderer:
|
||||
logger.warning(f"公式 {math_id} 转换为SVG失败: {latex[:50]}...")
|
||||
except Exception as e:
|
||||
logger.error(f"转换公式 {latex[:50]}... 时出错: {e}")
|
||||
else:
|
||||
# 提取段落、表格等内部的内联公式
|
||||
inlines = block.get('inlines')
|
||||
if inlines:
|
||||
_extract_inline_math_from_inlines(inlines)
|
||||
|
||||
# 递归处理嵌套的blocks
|
||||
nested_blocks = block.get('blocks')
|
||||
@@ -614,9 +645,8 @@ class PDFRenderer:
|
||||
# 创建SVG容器HTML
|
||||
svg_html = f'<div class="chart-svg-container">{svg_content}</div>'
|
||||
|
||||
# 查找包含此widgetId的配置脚本
|
||||
# 格式: <script type="application/json" id="chart-config-N">{"widgetId":"widget_id",...}</script>
|
||||
config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>\s*\{{[^}}]*"widgetId"\s*:\s*"{re.escape(widget_id)}"[^}}]*\}}'
|
||||
# 查找包含此widgetId的配置脚本(限制在同一个</script>内,避免跨标签误配)
|
||||
config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>(?:(?!</script>).)*?"widgetId"\s*:\s*"{re.escape(widget_id)}"(?:(?!</script>).)*?</script>'
|
||||
match = re.search(config_pattern, html, re.DOTALL)
|
||||
|
||||
if match:
|
||||
@@ -627,8 +657,11 @@ class PDFRenderer:
|
||||
canvas_pattern = rf'<canvas[^>]+data-config-id="{re.escape(config_id)}"[^>]*></canvas>'
|
||||
|
||||
# 【修复】替换canvas为SVG,使用lambda避免反斜杠转义问题
|
||||
html = re.sub(canvas_pattern, lambda m: svg_html, html)
|
||||
logger.debug(f"已替换图表 {widget_id} 的canvas为SVG")
|
||||
html, replaced = re.subn(canvas_pattern, lambda m: svg_html, html, count=1)
|
||||
if replaced:
|
||||
logger.debug(f"已替换图表 {widget_id} 的canvas为SVG")
|
||||
else:
|
||||
logger.warning(f"未找到图表 {widget_id} 的canvas进行替换")
|
||||
|
||||
# 将对应fallback标记为隐藏,避免PDF中出现重复表格
|
||||
fallback_pattern = rf'<div class="chart-fallback"([^>]*data-widget-id="{re.escape(widget_id)}"[^>]*)>'
|
||||
@@ -661,7 +694,7 @@ class PDFRenderer:
|
||||
f'</div>'
|
||||
)
|
||||
|
||||
config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>\s*\{{[^}}]*"widgetId"\s*:\s*"{re.escape(widget_id)}"[^}}]*\}}'
|
||||
config_pattern = rf'<script[^>]+id="([^"]+)"[^>]*>(?:(?!</script>).)*?"widgetId"\s*:\s*"{re.escape(widget_id)}"(?:(?!</script>).)*?</script>'
|
||||
match = re.search(config_pattern, html, re.DOTALL)
|
||||
if not match:
|
||||
logger.debug(f"未找到词云 {widget_id} 的配置脚本,跳过注入")
|
||||
@@ -670,8 +703,11 @@ class PDFRenderer:
|
||||
config_id = match.group(1)
|
||||
canvas_pattern = rf'<canvas[^>]+data-config-id="{re.escape(config_id)}"[^>]*></canvas>'
|
||||
|
||||
html = re.sub(canvas_pattern, lambda m: img_html, html)
|
||||
logger.debug(f"已替换词云 {widget_id} 的canvas为PNG图片")
|
||||
html, replaced = re.subn(canvas_pattern, lambda m: img_html, html, count=1)
|
||||
if replaced:
|
||||
logger.debug(f"已替换词云 {widget_id} 的canvas为PNG图片")
|
||||
else:
|
||||
logger.warning(f"未找到词云 {widget_id} 的canvas进行替换")
|
||||
|
||||
fallback_pattern = rf'<div class="chart-fallback"([^>]*data-widget-id="{re.escape(widget_id)}"[^>]*)>'
|
||||
|
||||
@@ -701,32 +737,40 @@ class PDFRenderer:
|
||||
|
||||
import re
|
||||
|
||||
# 为每个math block查找对应的div并替换为SVG
|
||||
# 优先替换内联公式,再替换块级公式,保持顺序一致
|
||||
for math_id, svg_content in svg_map.items():
|
||||
# 清理SVG内容(移除XML声明,因为SVG将嵌入HTML)
|
||||
svg_content = re.sub(r'<\?xml[^>]+\?>', '', svg_content)
|
||||
svg_content = re.sub(r'<!DOCTYPE[^>]+>', '', svg_content)
|
||||
svg_content = svg_content.strip()
|
||||
|
||||
# 创建SVG容器HTML
|
||||
svg_html = f'<div class="math-svg-container">{svg_content}</div>'
|
||||
svg_block_html = f'<div class="math-svg-container">{svg_content}</div>'
|
||||
svg_inline_html = f'<span class="math-svg-inline">{svg_content}</span>'
|
||||
|
||||
# 查找对应的math-block div
|
||||
# 格式: <div class="math-block">$$ latex $$</div>
|
||||
# 我们需要找到包含特定LaTeX内容的div
|
||||
# 但由于我们在转换时已经给block添加了mathId,我们可以用另一种方式
|
||||
replaced = False
|
||||
# 优先按 data-math-id 精确替换
|
||||
inline_pattern = rf'<span class="math-inline"[^>]*data-math-id="{re.escape(math_id)}"[^>]*>.*?</span>'
|
||||
if re.search(inline_pattern, html, re.DOTALL):
|
||||
html = re.sub(inline_pattern, lambda m: svg_inline_html, html, count=1)
|
||||
replaced = True
|
||||
else:
|
||||
block_pattern = rf'<div class="math-block"[^>]*data-math-id="{re.escape(math_id)}"[^>]*>.*?</div>'
|
||||
if re.search(block_pattern, html, re.DOTALL):
|
||||
html = re.sub(block_pattern, lambda m: svg_block_html, html, count=1)
|
||||
replaced = True
|
||||
|
||||
# 方案:在HTML渲染器中为math-block添加data-math-id属性
|
||||
# 但这需要修改HTMLRenderer,暂时我们使用更简单的方法:
|
||||
# 按顺序替换所有math-block
|
||||
# 如果没有找到特定ID,按出现顺序兜底替换
|
||||
if not replaced:
|
||||
html, sub_inline = re.subn(r'<span class="math-inline">[^<]*</span>', lambda m: svg_inline_html, html, count=1)
|
||||
if sub_inline:
|
||||
replaced = True
|
||||
else:
|
||||
html, sub_block = re.subn(r'<div class="math-block">\$\$[^$]*\$\$</div>', lambda m: svg_block_html, html, count=1)
|
||||
if sub_block:
|
||||
replaced = True
|
||||
|
||||
# 暂时使用简单的替换方案
|
||||
# 找到第一个math-block div并替换
|
||||
math_block_pattern = r'<div class="math-block">\$\$[^$]*\$\$</div>'
|
||||
# 【修复】使用lambda函数避免re.sub将SVG内容中的反斜杠解释为转义序列
|
||||
# lambda函数中的返回值会被当作字面字符串,不会进行转义处理
|
||||
html = re.sub(math_block_pattern, lambda m: svg_html, html, count=1)
|
||||
logger.debug(f"已替换公式 {math_id} 为SVG")
|
||||
if replaced:
|
||||
logger.debug(f"已替换公式 {math_id} 为SVG")
|
||||
|
||||
return html
|
||||
|
||||
@@ -787,10 +831,8 @@ class PDFRenderer:
|
||||
logger.info("开始转换数学公式为SVG矢量图形...")
|
||||
math_svg_map = self._convert_math_to_svg(preprocessed_ir)
|
||||
|
||||
# 使用HTML渲染器生成基础HTML(使用原始IR,因为HTMLRenderer会自己修复)
|
||||
# 注意:这里仍使用原始document_ir,因为HTMLRenderer内部会进行相同的修复
|
||||
# 这确保了HTML和SVG使用相同的修复逻辑
|
||||
html = self.html_renderer.render(document_ir)
|
||||
# 使用HTML渲染器生成基础HTML(使用预处理后的IR,以便复用mathId等标记)
|
||||
html = self.html_renderer.render(preprocessed_ir)
|
||||
|
||||
# 注入图表SVG
|
||||
if svg_map:
|
||||
|
||||
Reference in New Issue
Block a user