"""文件解析器:将上传文件转为文本,供 LLM 处理。 支持: - 图片 (.png/.jpg/.jpeg/.bmp) → OCR 提取文本 - PDF (.pdf) → 文本提取 - Word (.docx) → 文本提取 - 纯文本 (.txt/.csv/.json/.xml) → 直接读取 策略选择: - 原生多模态: 模型支持图片时直接传文件(当前 MiniMax 不支持,自动退回文本转换) - 文本转换: 所有文件转为 UTF-8 文本后注入 prompt """ import os import io from pathlib import Path from typing import Optional import PIL.Image MODELS_WITH_VISION = { "gpt-4o", "gpt-4-turbo", "gpt-4-vision-preview", "claude-3", "claude-3.5", "claude-4", "gemini-1.5", "gemini-2", } def can_use_vision(model: str = "") -> bool: """检查当前模型是否支持原生多模态(图片直接上传)。""" if not model: model = os.getenv("LLM_MODEL", "") return any(v in model.lower() for v in MODELS_WITH_VISION) def parse_file(file_path: str, file_type: str = "") -> dict: """解析任意文件为文本。 返回: {"text": str, "file_type": str, "method": str, "error": Optional[str]} """ path = Path(file_path) if not path.exists(): return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"} suffix = file_type or path.suffix.lower() parsers = { ".png": _parse_image, ".jpg": _parse_image, ".jpeg": _parse_image, ".bmp": _parse_image, ".webp": _parse_image, ".pdf": _parse_pdf, ".docx": _parse_docx, ".xlsx": _parse_xlsx, ".xls": _parse_xls, ".doc": _parse_doc, } parser = parsers.get(suffix) if parser: return parser(path) else: return _parse_text(path) # --------------------------------------------------------------------------- # 各类型解析器 # --------------------------------------------------------------------------- def _parse_image(path: Path) -> dict: """OCR 提取图片中的文字。优先 EasyOCR,回退 PaddleOCR。""" try: img = PIL.Image.open(path) info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]" except Exception: info = "[图片: 无法读取元数据]" # 优先 PaddleOCR(精确识别) try: from paddleocr import PaddleOCR ocr = PaddleOCR(lang="ch") result = ocr.ocr(str(path)) lines = [] if result and result[0]: for line in result[0]: text = line[1][0] if len(line) > 1 else "" if text.strip(): lines.append(text.strip()) if lines: return { "text": f"{info}\n识别文本:\n" + "\n".join(lines), "file_type": "image", "method": "paddleocr", "error": None, } except ImportError: pass except Exception: pass # 回退 EasyOCR try: import easyocr import numpy as np reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False) result = reader.readtext(np.array(img)) lines = [text.strip() for (_, text, _) in result if text.strip()] if lines: return { "text": f"{info}\n识别文本:\n" + "\n".join(lines), "file_type": "image", "method": "easyocr", "error": None, } except ImportError: pass except Exception: pass # OCR 不可用 → 返回图片元信息 + 安装提示 return { "text": f"{info}\n(如需 OCR 文字识别,请安装: pip install easyocr)", "file_type": "image", "method": "metadata_only", "error": "OCR 引擎未安装,已返回图片元信息", } def _parse_pdf(path: Path) -> dict: """提取 PDF 中的文本。""" try: import pdfplumber with pdfplumber.open(path) as pdf: pages = [] for page in pdf.pages: text = page.extract_text() if text: pages.append(text) full = "\n\n".join(pages) return { "text": full, "file_type": "pdf", "method": "pdfplumber", "error": None, } except ImportError: pass except Exception as e: pass # Fallback: 尝试 PyMuPDF try: import fitz doc = fitz.open(path) pages = [] for page in doc: pages.append(page.get_text()) doc.close() return { "text": "\n\n".join(pages), "file_type": "pdf", "method": "pymupdf", "error": None, } except ImportError: pass except Exception: pass return {"text": "", "file_type": "pdf", "method": "none", "error": "PDF 解析需要安装 pdfplumber 或 PyMuPDF"} def _parse_docx(path: Path) -> dict: """提取 Word 文档中的文本。""" try: from docx import Document doc = Document(path) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] # 同时提取表格内容 for table in doc.tables: for row in table.rows: cells = [cell.text for cell in row.cells if cell.text.strip()] if cells: paragraphs.append(" | ".join(cells)) return { "text": "\n\n".join(paragraphs), "file_type": "docx", "method": "python-docx", "error": None, } except ImportError: pass except Exception as e: pass return {"text": "", "file_type": "docx", "method": "none", "error": "DOCX 解析需要安装 python-docx"} def _parse_xlsx(path: Path) -> dict: """提取 Excel .xlsx 文件中的文本。""" try: from openpyxl import load_workbook wb = load_workbook(path, read_only=True, data_only=True) parts = [] for name in wb.sheetnames: ws = wb[name] rows = [] for row in ws.iter_rows(values_only=True): cells = [str(c) if c is not None else "" for c in row] if any(c for c in cells): rows.append("\t".join(cells)) if rows: parts.append(f"[Sheet: {name}]\n" + "\n".join(rows)) wb.close() text = "\n\n".join(parts) return {"text": text, "file_type": "xlsx", "method": "openpyxl", "error": None} except ImportError: pass except Exception as e: return {"text": "", "file_type": "xlsx", "method": "none", "error": f"XLSX 解析失败: {e}"} return {"text": "", "file_type": "xlsx", "method": "none", "error": "XLSX 解析需要安装 openpyxl"} def _parse_xls(path: Path) -> dict: """提取旧版 Excel .xls 文件中的文本。""" try: import xlrd wb = xlrd.open_workbook(path) parts = [] for name in wb.sheet_names(): ws = wb.sheet_by_name(name) rows = [] for rx in range(ws.nrows): cells = [str(ws.cell_value(rx, cx)) if ws.cell_value(rx, cx) != "" else "" for cx in range(ws.ncols)] if any(c for c in cells): rows.append("\t".join(cells)) if rows: parts.append(f"[Sheet: {name}]\n" + "\n".join(rows)) text = "\n\n".join(parts) return {"text": text, "file_type": "xls", "method": "xlrd", "error": None} except ImportError: pass except Exception as e: return {"text": "", "file_type": "xls", "method": "none", "error": f"XLS 解析失败: {e}"} return {"text": "", "file_type": "xls", "method": "none", "error": "XLS 解析需要安装 xlrd"} def _parse_doc(path: Path) -> dict: """提取旧版 Word .doc 文件中的文本(尽力而为,二进制格式)。""" try: import olefile ole = olefile.OleFileIO(path) if not ole.exists("WordDocument"): ole.close() return {"text": "", "file_type": "doc", "method": "none", "error": "不是有效的 .doc 文件"} raw = ole.openstream("WordDocument").read() ole.close() # 提取可打印 UTF-16LE 字符段 text = "" try: decoded = raw.decode("utf-16-le", errors="ignore") text = "".join(c for c in decoded if c.isprintable() or c in "\n\r\t") except Exception: pass if not text.strip(): return {"text": "", "file_type": "doc", "method": "olefile", "error": "无法提取文本(.doc 为二进制格式,建议转换为 .docx)"} return {"text": text.strip(), "file_type": "doc", "method": "olefile", "error": None} except ImportError: pass except Exception as e: return {"text": "", "file_type": "doc", "method": "none", "error": f"DOC 解析失败: {e}"} return {"text": "", "file_type": "doc", "method": "none", "error": "DOC 解析需要安装 olefile"} def _parse_text(path: Path) -> dict: """读取纯文本文件。""" try: text = path.read_text(encoding="utf-8") return {"text": text, "file_type": path.suffix, "method": "direct", "error": None} except UnicodeDecodeError: try: text = path.read_text(encoding="gbk") return {"text": text, "file_type": path.suffix, "method": "direct_gbk", "error": None} except Exception: return {"text": "", "file_type": path.suffix, "method": "none", "error": "无法解码文件"} except Exception: return {"text": "", "file_type": path.suffix, "method": "none", "error": "读取失败"}