diff --git a/ReportEngine/renderers/pdf_renderer.py b/ReportEngine/renderers/pdf_renderer.py index bfadad7..a9bbf20 100644 --- a/ReportEngine/renderers/pdf_renderer.py +++ b/ReportEngine/renderers/pdf_renderer.py @@ -9,6 +9,7 @@ import base64 import copy import os import sys +import io from pathlib import Path from typing import Any, Dict from datetime import datetime @@ -69,6 +70,12 @@ from .html_renderer import HTMLRenderer from .pdf_layout_optimizer import PDFLayoutOptimizer, PDFLayoutConfig from .chart_to_svg import create_chart_converter from .math_to_svg import MathToSVG +try: + from wordcloud import WordCloud + WORDCLOUD_AVAILABLE = True +except ImportError: + WORDCLOUD_AVAILABLE = False + logger = logger # ensure logger exists even before declaration class PDFRenderer: @@ -272,6 +279,26 @@ class PDFRenderer: logger.info(f"成功转换 {len(svg_map)} 个图表为SVG") return svg_map + def _convert_wordclouds_to_images(self, document_ir: Dict[str, Any]) -> Dict[str, str]: + """ + 将document_ir中的词云widget转换为PNG并返回data URI映射 + """ + img_map: Dict[str, str] = {} + + if not WORDCLOUD_AVAILABLE: + logger.debug("wordcloud库未安装,词云将使用表格兜底") + return img_map + + # 遍历所有章节 + chapters = document_ir.get('chapters', []) + for chapter in chapters: + blocks = chapter.get('blocks', []) + self._extract_wordcloud_widgets(blocks, img_map) + + if img_map: + logger.info(f"成功转换 {len(img_map)} 个词云为图片") + return img_map + def _extract_and_convert_widgets( self, blocks: list, @@ -334,6 +361,109 @@ class PDFRenderer: if isinstance(cell_blocks, list): self._extract_and_convert_widgets(cell_blocks, svg_map) + def _extract_wordcloud_widgets( + self, + blocks: list, + img_map: Dict[str, str] + ) -> None: + """ + 递归遍历blocks,找到词云widget并生成图片 + """ + for block in blocks: + if not isinstance(block, dict): + continue + + block_type = block.get('type') + if block_type == 'widget': + widget_id = block.get('widgetId') + widget_type = block.get('widgetType', '') + + if widget_id and isinstance(widget_type, str) and 'wordcloud' in widget_type.lower(): + try: + data_uri = self._generate_wordcloud_image(block) + if data_uri: + img_map[widget_id] = data_uri + logger.debug(f"词云 {widget_id} 转换为图片成功") + except Exception as exc: + logger.warning(f"生成词云图片失败 {widget_id}: {exc}") + + nested_blocks = block.get('blocks') + if isinstance(nested_blocks, list): + self._extract_wordcloud_widgets(nested_blocks, img_map) + + if block_type == 'list': + items = block.get('items', []) + for item in items: + if isinstance(item, list): + self._extract_wordcloud_widgets(item, img_map) + + if block_type == 'table': + rows = block.get('rows', []) + for row in rows: + cells = row.get('cells', []) + for cell in cells: + cell_blocks = cell.get('blocks', []) + if isinstance(cell_blocks, list): + self._extract_wordcloud_widgets(cell_blocks, img_map) + + def _normalize_wordcloud_items(self, block: Dict[str, Any]) -> list: + """ + 从widget block中提取词云数据 + """ + props = block.get('props') or {} + raw_items = props.get('data') + if not isinstance(raw_items, list): + return [] + normalized = [] + for item in raw_items: + if not isinstance(item, dict): + continue + word = item.get('word') or item.get('text') or item.get('label') + if not word: + continue + weight = item.get('weight') + try: + weight_val = float(weight) + if weight_val <= 0: + weight_val = 1.0 + except (TypeError, ValueError): + weight_val = 1.0 + category = (item.get('category') or '').lower() + normalized.append({'word': str(word), 'weight': weight_val, 'category': category}) + return normalized + + def _generate_wordcloud_image(self, block: Dict[str, Any]) -> str | None: + """ + 生成词云PNG并返回data URI + """ + items = self._normalize_wordcloud_items(block) + if not items: + return None + + # 使用频次形式馈入wordcloud库 + frequencies = {} + for item in items: + weight = item['weight'] + # 兼容权重为0-1的小数,放大以体现差异 + freq = weight * 100 if 0 < weight <= 1.5 else weight + frequencies[item['word']] = max(1, freq) + + font_path = str(self._get_font_path()) + wc = WordCloud( + width=900, + height=520, + background_color="white", + font_path=font_path, + prefer_horizontal=0.9, + random_state=42, + ) + wc.generate_from_frequencies(frequencies) + + buffer = io.BytesIO() + wc.to_image().save(buffer, format='PNG') + encoded = base64.b64encode(buffer.getvalue()).decode('ascii') + return f"data:image/png;base64,{encoded}" + def _convert_math_to_svg(self, document_ir: Dict[str, Any]) -> Dict[str, str]: """ 将document_ir中的所有数学公式转换为SVG @@ -486,6 +616,46 @@ class PDFRenderer: return html + def _inject_wordcloud_images(self, html: str, img_map: Dict[str, str]) -> str: + """ + 将词云PNG data URI注入HTML,替换对应canvas + """ + if not img_map: + return html + + import re + + for widget_id, data_uri in img_map.items(): + img_html = ( + f'
' + f'词云' + f'
' + ) + + config_pattern = rf']+id="([^"]+)"[^>]*>\s*\{{[^}}]*"widgetId"\s*:\s*"{re.escape(widget_id)}"[^}}]*\}}' + match = re.search(config_pattern, html, re.DOTALL) + if not match: + logger.debug(f"未找到词云 {widget_id} 的配置脚本,跳过注入") + continue + + config_id = match.group(1) + canvas_pattern = rf']+data-config-id="{re.escape(config_id)}"[^>]*>' + + html = re.sub(canvas_pattern, lambda m: img_html, html) + logger.debug(f"已替换词云 {widget_id} 的canvas为PNG图片") + + fallback_pattern = rf'
]*data-widget-id="{re.escape(widget_id)}"[^>]*)>' + + def _hide_fallback(m: re.Match) -> str: + tag = m.group(0) + if 'svg-hidden' in tag: + return tag + return tag.replace('chart-fallback"', 'chart-fallback svg-hidden"', 1) + + html = re.sub(fallback_pattern, _hide_fallback, html, count=1) + + return html + def _inject_math_svg_into_html(self, html: str, svg_map: Dict[str, str]) -> str: """ 将数学公式SVG内容注入到HTML中 @@ -580,6 +750,10 @@ class PDFRenderer: logger.info("开始转换图表为SVG矢量图形...") svg_map = self._convert_charts_to_svg(preprocessed_ir) + # 转换词云为PNG + logger.info("开始转换词云为图片...") + wordcloud_map = self._convert_wordclouds_to_images(preprocessed_ir) + # 转换数学公式为SVG logger.info("开始转换数学公式为SVG矢量图形...") math_svg_map = self._convert_math_to_svg(preprocessed_ir) @@ -594,6 +768,10 @@ class PDFRenderer: html = self._inject_svg_into_html(html, svg_map) logger.info(f"已注入 {len(svg_map)} 个SVG图表") + if wordcloud_map: + html = self._inject_wordcloud_images(html, wordcloud_map) + logger.info(f"已注入 {len(wordcloud_map)} 个词云图片") + # 注入数学公式SVG if math_svg_map: html = self._inject_math_svg_into_html(html, math_svg_map) @@ -652,6 +830,10 @@ body {{ max-width: 100%; height: auto; }} +.chart-svg-container img {{ + max-width: 100%; + height: auto; +}} /* 数学公式SVG容器样式 */ .math-svg-container {{