refactor: 重构项目配置管理,统一使用.env配置

- 新增config.py统一读取.env配置,移除硬编码路径和参数
- 重构collect_jrxml.py支持命令行参数和环境变量配置源目录
- 新增.env.example示例配置文件,整理所有可配置项
- 重构down_embedding_model.py、import_to_chroma.py等所有脚本使用统一配置
- 新增Windows一键部署脚本setup.bat
- 修正jrxml_banch_chunker.py的文件名拼写错误
This commit is contained in:
2026-05-12 08:29:17 +08:00
parent bd98486de0
commit 9d78a49625
9 changed files with 396 additions and 67 deletions
+27 -20
View File
@@ -1,7 +1,7 @@
"""
embed_chunks.py
使用本地 Qwen3-Embedding-4B 模型对 JRXML chunks 进行向量化
支持 GPU (CUDA) 或 CPU
使用嵌入模型对 JRXML chunks 进行向量化
支持 GPU (CUDA) 或 CPU,模型通过 .env / config.py 配置
"""
import os
@@ -12,6 +12,10 @@ from pathlib import Path
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from config import (
EMBEDDING_MODEL_PATH, CHUNKER_OUTPUT_DIR, EMBEDDINGS_DIR,
USE_FP16, BATCH_SIZE, resolve_model_path
)
def build_text_for_embedding(chunk: dict) -> str:
"""
@@ -48,8 +52,8 @@ def build_text_for_embedding(chunk: dict) -> str:
def main(chunks_json_path: str = None, output_dir: str = None,
model_path: str = None, batch_size: int = 64, normalize: bool = True,
use_fp16: bool = True):
model_path: str = None, batch_size: int = None, normalize: bool = True,
use_fp16: bool = None):
"""
主流程:
1. 加载 chunk JSON
@@ -60,19 +64,25 @@ def main(chunks_json_path: str = None, output_dir: str = None,
project_root = Path(__file__).resolve().parent
if chunks_json_path is None:
chunks_json_path = project_root / "jrxml_chunker_output" / "all_chunks.json"
chunks_json_path = CHUNKER_OUTPUT_DIR / "all_chunks.json"
else:
chunks_json_path = Path(chunks_json_path)
if output_dir is None:
output_dir = project_root / "embeddings"
output_dir = EMBEDDINGS_DIR
else:
output_dir = Path(output_dir)
if model_path is None:
model_path = project_root / "models" / "Qwen3-Embedding-4B"
model_path = resolve_model_path()
else:
model_path = Path(model_path)
model_path = str(model_path)
if batch_size is None:
batch_size = BATCH_SIZE
if use_fp16 is None:
use_fp16 = USE_FP16
if not chunks_json_path.exists():
print(f"❌ Chunks 文件不存在: {chunks_json_path}")
@@ -91,19 +101,16 @@ def main(chunks_json_path: str = None, output_dir: str = None,
print(f"\n🧠 加载嵌入模型: {model_path}")
print(f" 设备: {device}")
# 检查是否是 HuggingFace Hub 模型(格式为 username/model_name)
model_path_str = str(model_path)
# Windows PowerShell 会把 / 自动转成 \,需要还原
if "\\" in model_path_str and not os.path.exists(model_path_str):
model_path_str = model_path_str.replace("\\", "/")
is_hub_model = "/" in model_path_str and not os.path.exists(model_path_str)
# 如果是本地路径但不存在,则报错
if not is_hub_model and not os.path.exists(model_path_str):
print(f"❌ 模型目录不存在: {model_path}")
print(f" 请先下载模型到 {model_path}")
print(f"者使用 HuggingFace Hub 模型,例如: sentence-transformers/all-MiniLM-L6-v2")
print(f" 请先运行 down_embedding_model.py 下载模型")
print(f"在 .env 中配置 EMBEDDING_MODEL_NAME 为 Hub 模型名")
return None
model = SentenceTransformer(model_path_str, device=device)
@@ -183,17 +190,17 @@ def main(chunks_json_path: str = None, output_dir: str = None,
if __name__ == "__main__":
import argparse
project_root = Path(__file__).resolve().parent
default_chunks = project_root / "jrxml_chunker_output" / "all_chunks.json"
default_chunks = CHUNKER_OUTPUT_DIR / "all_chunks.json"
parser = argparse.ArgumentParser(description="JRXML Chunks 向量化工具")
parser.add_argument("chunks_json", nargs="?", default=str(default_chunks),
help=f"Chunks JSON 文件路径 (默认: {default_chunks})")
parser.add_argument("--output_dir", "-o", default=None,
help="输出目录 (默认: embeddings)")
help=f"输出目录 (默认: {EMBEDDINGS_DIR})")
parser.add_argument("--model_path", "-m", default=None,
help="模型路径 (默认: models/Qwen3-Embedding-4B)")
parser.add_argument("--batch_size", "-b", type=int, default=64,
help="批处理大小 (默认: 64)")
help=f"模型路径 (默认: {resolve_model_path()})")
parser.add_argument("--batch_size", "-b", type=int, default=BATCH_SIZE,
help=f"批处理大小 (默认: {BATCH_SIZE})")
parser.add_argument("--no_normalize", action="store_true",
help="不做向量归一化")
parser.add_argument("--no_fp16", action="store_true",