# agent_jrxml/backend/file_parser.py

"""文件解析器：将上传文件转为文本，供 LLM 处理。

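Supported formats:
- Images (.png/.jpg/.jpeg/.bmp/.webp) → text extracted via OCR
- PDF (.pdf) → text extraction
- Word (.docx, legacy .doc) → text extraction
- Excel (.xlsx, legacy .xls) → text extraction
- Anything else (.txt/.csv/.json/.xml, ...) → read directly as text

Strategy selection:
- Native multimodal: pass the file straight to the model when it supports images (MiniMax currently does not, so this automatically falls back to text conversion)
- Text conversion: every file is converted to UTF-8 text and injected into the prompt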
"""

import os
import io
from pathlib import Path
from typing import Optional

import PIL.Image

# Model-name fragments that mark a model as accepting image input.
# can_use_vision() tests each entry as a substring of the lowercased model
# name, so an entry also matches any longer model name that contains it.
MODELS_WITH_VISION = {
    "gpt-4o", "gpt-4-turbo", "gpt-4-vision-preview",
    "claude-3", "claude-3.5", "claude-4",
    "gemini-1.5", "gemini-2",
}


def can_use_vision(model: str = "") -> bool:
    """Report whether a model accepts images natively (direct image upload).

    Args:
        model: Model name to check. When empty, the ``LLM_MODEL`` environment
            variable is consulted instead (defaulting to an empty string).

    Returns:
        True if the lowercased model name contains any entry of
        ``MODELS_WITH_VISION``, otherwise False.
    """
    name = model if model else os.getenv("LLM_MODEL", "")
    lowered = name.lower()
    for marker in MODELS_WITH_VISION:
        if marker in lowered:
            return True
    return False


def parse_file(file_path: str, file_type: str = "") -> dict:
    """Parse an arbitrary file into text.

    Args:
        file_path: Path of the file to parse.
        file_type: Optional extension that overrides the one in ``file_path``.
            It is matched case-insensitively and the leading dot is optional,
            so ``".pdf"``, ``"pdf"`` and ``".PDF"`` all select the PDF parser.

    Returns:
        {"text": str, "file_type": str, "method": str, "error": Optional[str]}
        Files whose extension has no dedicated parser are read as plain text.
    """
    path = Path(file_path)
    if not path.exists():
        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}

    # Normalise the dispatch key: a caller-supplied override such as "PDF" or
    # "pdf" would otherwise miss the table and be misread as plain text.
    suffix = (file_type or path.suffix).strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"

    parsers = {
        ".png":  _parse_image,
        ".jpg":  _parse_image,
        ".jpeg": _parse_image,
        ".bmp":  _parse_image,
        ".webp": _parse_image,
        ".pdf":  _parse_pdf,
        ".docx": _parse_docx,
        ".xlsx": _parse_xlsx,
        ".xls":  _parse_xls,
        ".doc":  _parse_doc,
    }

    # Unknown extensions fall back to the plain-text reader.
    parser = parsers.get(suffix, _parse_text)
    return parser(path)


# ---------------------------------------------------------------------------
# 各类型解析器
# ---------------------------------------------------------------------------

def _parse_image(path: Path) -> dict:
    """Extract the text in an image via OCR.

    PaddleOCR is tried first and EasyOCR is the fallback. When neither yields
    text, only the image metadata line is returned and ``error`` says why:
    no engine installed, an engine raised, or no text was detected.

    Args:
        path: Image file to read.

    Returns:
        {"text": str, "file_type": "image", "method": str, "error": Optional[str]}
        where ``method`` is "paddleocr", "easyocr" or "metadata_only".
    """
    # Only the header is needed for the metadata line; the context manager
    # releases the file handle instead of leaving it open.
    try:
        with PIL.Image.open(path) as img:
            info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
    except Exception:
        info = "[图片: 无法读取元数据]"

    engine_ran = False  # an engine completed but found no text
    failures = []       # "<engine>: <reason>" for engines that raised

    # Preferred engine: PaddleOCR.
    try:
        from paddleocr import PaddleOCR
        ocr = PaddleOCR(lang="ch")
        result = ocr.ocr(str(path))
        lines = []
        if result and result[0]:
            for line in result[0]:
                text = line[1][0] if len(line) > 1 else ""
                if text.strip():
                    lines.append(text.strip())
        if lines:
            return {
                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
                "file_type": "image",
                "method": "paddleocr",
                "error": None,
            }
        engine_ran = True
    except ImportError:
        pass  # not installed; try the next engine
    except Exception as exc:
        failures.append(f"paddleocr: {exc}")

    # Fallback engine: EasyOCR.
    try:
        import easyocr
        import numpy as np
        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
        # Open the image here so this branch does not depend on the metadata
        # read above having succeeded. Converting to RGB hands the engine
        # ordinary 3-channel pixels even for palette or 1-bit images, whose
        # raw arrays hold palette indices / booleans rather than colours.
        with PIL.Image.open(path) as img:
            pixels = np.array(img.convert("RGB"))
        result = reader.readtext(pixels)
        lines = [text.strip() for (_, text, _) in result if text.strip()]
        if lines:
            return {
                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
                "file_type": "image",
                "method": "easyocr",
                "error": None,
            }
        engine_ran = True
    except ImportError:
        pass  # not installed
    except Exception as exc:
        failures.append(f"easyocr: {exc}")

    # No text obtained: return the metadata line and report the real cause
    # rather than always claiming that no OCR engine is installed.
    if failures:
        return {
            "text": f"{info}\n(OCR 识别失败)",
            "file_type": "image",
            "method": "metadata_only",
            "error": "OCR 识别失败，已返回图片元信息: " + "; ".join(failures),
        }
    if engine_ran:
        return {
            "text": f"{info}\n(OCR 未识别到文字)",
            "file_type": "image",
            "method": "metadata_only",
            "error": "OCR 未识别到文字，已返回图片元信息",
        }
    return {
        "text": f"{info}\n(如需 OCR 文字识别，请安装: pip install easyocr)",
        "file_type": "image",
        "method": "metadata_only",
        "error": "OCR 引擎未安装，已返回图片元信息",
    }


def _parse_pdf(path: Path) -> dict:
    """Extract the text of a PDF.

    pdfplumber is tried first; PyMuPDF (``fitz``) is the fallback when
    pdfplumber is missing or raises.

    Args:
        path: PDF file to read.

    Returns:
        {"text": str, "file_type": "pdf", "method": str, "error": Optional[str]}
        Pages are joined with a blank line. ``error`` distinguishes "no
        library installed" from "a library was present but parsing failed".
    """
    failures = []  # "<library>: <reason>" for libraries that raised

    try:
        import pdfplumber
        with pdfplumber.open(path) as pdf:
            extracted = (page.extract_text() for page in pdf.pages)
            # Skip pages without a text layer (extract_text gives None/"").
            pages = [text for text in extracted if text]
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pdfplumber",
            "error": None,
        }
    except ImportError:
        pass  # not installed; try PyMuPDF
    except Exception as exc:
        failures.append(f"pdfplumber: {exc}")

    # Fallback: PyMuPDF.
    try:
        import fitz
        doc = fitz.open(str(path))
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Close the document even when a page fails to extract.
            doc.close()
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pymupdf",
            "error": None,
        }
    except ImportError:
        pass  # not installed
    except Exception as exc:
        failures.append(f"pymupdf: {exc}")

    if failures:
        # A library was available but could not parse the file; say so
        # instead of telling the user to install something they already have.
        return {"text": "", "file_type": "pdf", "method": "none",
                "error": "PDF 解析失败: " + "; ".join(failures)}
    return {"text": "", "file_type": "pdf", "method": "none",
            "error": "PDF 解析需要安装 pdfplumber 或 PyMuPDF"}


def _parse_docx(path: Path) -> dict:
    """Extract the text of a Word .docx document.

    Non-empty paragraphs come first, followed by one line per table row with
    its non-empty cells joined by " | ".

    Args:
        path: .docx file to read.

    Returns:
        {"text": str, "file_type": "docx", "method": str, "error": Optional[str]}
        ``error`` distinguishes a missing python-docx install from a file
        that python-docx could not parse.
    """
    try:
        from docx import Document
        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        # Tables are not part of doc.paragraphs, so collect them separately.
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text for cell in row.cells if cell.text.strip()]
                if cells:
                    paragraphs.append(" | ".join(cells))
        return {
            "text": "\n\n".join(paragraphs),
            "file_type": "docx",
            "method": "python-docx",
            "error": None,
        }
    except ImportError:
        pass  # reported as "needs install" below
    except Exception as e:
        # Report the real failure, matching the other Office parsers, rather
        # than claiming the library is missing.
        return {"text": "", "file_type": "docx", "method": "none",
                "error": f"DOCX 解析失败: {e}"}

    return {"text": "", "file_type": "docx", "method": "none",
            "error": "DOCX 解析需要安装 python-docx"}


def _parse_xlsx(path: Path) -> dict:
    """Extract the text of an Excel .xlsx workbook.

    Each non-empty sheet becomes a "[Sheet: name]" header followed by its
    non-empty rows, with cells tab-separated; sheets are separated by a blank
    line. The workbook is opened with ``data_only=True``.

    Args:
        path: .xlsx file to read.

    Returns:
        {"text": str, "file_type": "xlsx", "method": str, "error": Optional[str]}
    """
    try:
        from openpyxl import load_workbook
        wb = load_workbook(path, read_only=True, data_only=True)
        try:
            parts = []
            for name in wb.sheetnames:
                ws = wb[name]
                rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = [str(c) if c is not None else "" for c in row]
                    if any(cells):
                        rows.append("\t".join(cells))
                if rows:
                    parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
        finally:
            # Close the workbook even if reading a sheet raises, so the
            # handle opened by the read-only workbook is not leaked.
            wb.close()
        text = "\n\n".join(parts)
        return {"text": text, "file_type": "xlsx", "method": "openpyxl", "error": None}
    except ImportError:
        pass  # reported as "needs install" below
    except Exception as e:
        return {"text": "", "file_type": "xlsx", "method": "none",
                "error": f"XLSX 解析失败: {e}"}
    return {"text": "", "file_type": "xlsx", "method": "none",
            "error": "XLSX 解析需要安装 openpyxl"}


def _parse_xls(path: Path) -> dict:
    """Extract the text of a legacy Excel .xls workbook.

    Each non-empty sheet becomes a "[Sheet: name]" header followed by its
    non-empty rows, with cells tab-separated; sheets are separated by a blank
    line.

    Args:
        path: .xls file to read.

    Returns:
        {"text": str, "file_type": "xls", "method": str, "error": Optional[str]}
    """
    try:
        import xlrd
        wb = xlrd.open_workbook(path)
        parts = []
        for name in wb.sheet_names():
            ws = wb.sheet_by_name(name)
            rows = []
            for rx in range(ws.nrows):
                cells = []
                for cx in range(ws.ncols):
                    value = ws.cell_value(rx, cx)  # single lookup per cell
                    if value == "":
                        cells.append("")
                    elif isinstance(value, float) and value.is_integer():
                        # Whole numbers arrive as floats; render 3.0 as "3"
                        # so the text matches what the .xlsx parser emits.
                        cells.append(str(int(value)))
                    else:
                        cells.append(str(value))
                if any(cells):
                    rows.append("\t".join(cells))
            if rows:
                parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
        text = "\n\n".join(parts)
        return {"text": text, "file_type": "xls", "method": "xlrd", "error": None}
    except ImportError:
        pass  # reported as "needs install" below
    except Exception as e:
        return {"text": "", "file_type": "xls", "method": "none",
                "error": f"XLS 解析失败: {e}"}
    return {"text": "", "file_type": "xls", "method": "none",
            "error": "XLS 解析需要安装 xlrd"}


def _parse_doc(path: Path) -> dict:
    """Extract text from a legacy Word .doc file (best effort, binary format).

    The "WordDocument" OLE stream is decoded as UTF-16LE and only printable
    characters plus newline, carriage return and tab are kept.

    Args:
        path: .doc file to read.

    Returns:
        {"text": str, "file_type": "doc", "method": str, "error": Optional[str]}
    """
    try:
        import olefile
        ole = olefile.OleFileIO(path)
        try:
            if not ole.exists("WordDocument"):
                return {"text": "", "file_type": "doc", "method": "none",
                        "error": "不是有效的 .doc 文件"}
            raw = ole.openstream("WordDocument").read()
        finally:
            # Release the OLE container on every path, including when
            # opening or reading the stream raises.
            ole.close()
        # Keep the printable UTF-16LE characters; errors="ignore" drops
        # undecodable bytes instead of raising.
        decoded = raw.decode("utf-16-le", errors="ignore")
        text = "".join(c for c in decoded if c.isprintable() or c in "\n\r\t")
        if not text.strip():
            return {"text": "", "file_type": "doc", "method": "olefile",
                    "error": "无法提取文本（.doc 为二进制格式，建议转换为 .docx）"}
        return {"text": text.strip(), "file_type": "doc", "method": "olefile", "error": None}
    except ImportError:
        pass  # reported as "needs install" below
    except Exception as e:
        return {"text": "", "file_type": "doc", "method": "none",
                "error": f"DOC 解析失败: {e}"}
    return {"text": "", "file_type": "doc", "method": "none",
            "error": "DOC 解析需要安装 olefile"}


def _parse_text(path: Path) -> dict:
    """Read a plain-text file, trying UTF-8 first and GBK second.

    Args:
        path: Text file to read.

    Returns:
        {"text": str, "file_type": str, "method": str, "error": Optional[str]}
        ``file_type`` is the path's suffix; ``method`` is "direct" for UTF-8,
        "direct_gbk" for the GBK fallback and "none" on failure.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 identically but also strips a
        # leading byte-order mark, which would otherwise end up in the text
        # as a stray U+FEFF character.
        text = path.read_text(encoding="utf-8-sig")
        return {"text": text, "file_type": path.suffix, "method": "direct", "error": None}
    except UnicodeDecodeError:
        # Not valid UTF-8: retry as GBK.
        try:
            text = path.read_text(encoding="gbk")
            return {"text": text, "file_type": path.suffix, "method": "direct_gbk", "error": None}
        except Exception:
            return {"text": "", "file_type": path.suffix, "method": "none",
                    "error": "无法解码文件"}
    except Exception:
        return {"text": "", "file_type": path.suffix, "method": "none",
                "error": "读取失败"}