fix: image files silently falling to text parser due to suffix dot mismatch
api_server.py passed the extension without a leading dot ("jpg", taken
from rsplit), but the parser dict keys in file_parser.py all include the
dot (".jpg"). Image files therefore fell through to _parse_text(), which
fails on binary data, so ALL OCR and layout analysis was skipped. Every
image upload was affected.
- file_parser.py: normalize file_type to always have a leading dot
- api_server.py: use Path.suffix instead of manual rsplit
This commit is contained in:
+1
-1
@@ -400,7 +400,7 @@ def _process_files(file_ids: list[str], session_id: str) -> dict:
|
|||||||
file_path = info["path"]
|
file_path = info["path"]
|
||||||
uploaded_paths.append(file_path)
|
uploaded_paths.append(file_path)
|
||||||
|
|
||||||
parsed = parse_file(file_path, info["filename"].rsplit(".", 1)[-1] if "." in info["filename"] else "")
|
parsed = parse_file(file_path, Path(info["filename"]).suffix)
|
||||||
if parsed.get("error"):
|
if parsed.get("error"):
|
||||||
parts.append(f"[文件: {info['filename']}]\n解析失败: {parsed['error']}")
|
parts.append(f"[文件: {info['filename']}]\n解析失败: {parsed['error']}")
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -41,7 +41,9 @@ def parse_file(file_path: str, file_type: str = "") -> dict:
|
|||||||
if not path.exists():
|
if not path.exists():
|
||||||
return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}
|
return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}
|
||||||
|
|
||||||
suffix = file_type or path.suffix.lower()
|
suffix = path.suffix.lower()
|
||||||
|
if file_type:
|
||||||
|
suffix = file_type if file_type.startswith(".") else f".{file_type}"
|
||||||
|
|
||||||
parsers = {
|
parsers = {
|
||||||
".png": _parse_image,
|
".png": _parse_image,
|
||||||
|
|||||||
Reference in New Issue
Block a user