refactor: 重构项目配置管理,统一使用.env配置
- 新增 config.py 统一读取 .env 配置,移除硬编码路径和参数
- 重构 collect_jrxml.py 支持命令行参数和环境变量配置源目录
- 新增 .env.example 示例配置文件,整理所有可配置项
- 重构 down_embedding_model.py、import_to_chroma.py 等所有脚本使用统一配置
- 新增 Windows 一键部署脚本 setup.bat
- 修正 jrxml_banch_chunker.py 的文件名拼写错误
This commit is contained in:
+27
-20
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
embed_chunks.py
|
||||
使用本地 Qwen3-Embedding-4B 模型对 JRXML chunks 进行向量化
|
||||
支持 GPU (CUDA) 或 CPU
|
||||
使用嵌入模型对 JRXML chunks 进行向量化
|
||||
支持 GPU (CUDA) 或 CPU,模型通过 .env / config.py 配置
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -12,6 +12,10 @@ from pathlib import Path
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from config import (
|
||||
EMBEDDING_MODEL_PATH, CHUNKER_OUTPUT_DIR, EMBEDDINGS_DIR,
|
||||
USE_FP16, BATCH_SIZE, resolve_model_path
|
||||
)
|
||||
|
||||
def build_text_for_embedding(chunk: dict) -> str:
|
||||
"""
|
||||
@@ -48,8 +52,8 @@ def build_text_for_embedding(chunk: dict) -> str:
|
||||
|
||||
|
||||
def main(chunks_json_path: str = None, output_dir: str = None,
|
||||
model_path: str = None, batch_size: int = 64, normalize: bool = True,
|
||||
use_fp16: bool = True):
|
||||
model_path: str = None, batch_size: int = None, normalize: bool = True,
|
||||
use_fp16: bool = None):
|
||||
"""
|
||||
主流程:
|
||||
1. 加载 chunk JSON
|
||||
@@ -60,19 +64,25 @@ def main(chunks_json_path: str = None, output_dir: str = None,
|
||||
project_root = Path(__file__).resolve().parent
|
||||
|
||||
if chunks_json_path is None:
|
||||
chunks_json_path = project_root / "jrxml_chunker_output" / "all_chunks.json"
|
||||
chunks_json_path = CHUNKER_OUTPUT_DIR / "all_chunks.json"
|
||||
else:
|
||||
chunks_json_path = Path(chunks_json_path)
|
||||
|
||||
if output_dir is None:
|
||||
output_dir = project_root / "embeddings"
|
||||
output_dir = EMBEDDINGS_DIR
|
||||
else:
|
||||
output_dir = Path(output_dir)
|
||||
|
||||
if model_path is None:
|
||||
model_path = project_root / "models" / "Qwen3-Embedding-4B"
|
||||
model_path = resolve_model_path()
|
||||
else:
|
||||
model_path = Path(model_path)
|
||||
model_path = str(model_path)
|
||||
|
||||
if batch_size is None:
|
||||
batch_size = BATCH_SIZE
|
||||
|
||||
if use_fp16 is None:
|
||||
use_fp16 = USE_FP16
|
||||
|
||||
if not chunks_json_path.exists():
|
||||
print(f"❌ Chunks 文件不存在: {chunks_json_path}")
|
||||
@@ -91,19 +101,16 @@ def main(chunks_json_path: str = None, output_dir: str = None,
|
||||
print(f"\n🧠 加载嵌入模型: {model_path}")
|
||||
print(f" 设备: {device}")
|
||||
|
||||
# 检查是否是 HuggingFace Hub 模型(格式为 username/model_name)
|
||||
model_path_str = str(model_path)
|
||||
# Windows PowerShell 会把 / 自动转成 \,需要还原
|
||||
if "\\" in model_path_str and not os.path.exists(model_path_str):
|
||||
model_path_str = model_path_str.replace("\\", "/")
|
||||
|
||||
|
||||
is_hub_model = "/" in model_path_str and not os.path.exists(model_path_str)
|
||||
|
||||
# 如果是本地路径但不存在,则报错
|
||||
|
||||
if not is_hub_model and not os.path.exists(model_path_str):
|
||||
print(f"❌ 模型目录不存在: {model_path}")
|
||||
print(f" 请先下载模型到 {model_path}")
|
||||
print(f" 或者使用 HuggingFace Hub 模型,例如: sentence-transformers/all-MiniLM-L6-v2")
|
||||
print(f" 请先运行 down_embedding_model.py 下载模型")
|
||||
print(f" 或在 .env 中配置 EMBEDDING_MODEL_NAME 为 Hub 模型名")
|
||||
return None
|
||||
|
||||
model = SentenceTransformer(model_path_str, device=device)
|
||||
@@ -183,17 +190,17 @@ def main(chunks_json_path: str = None, output_dir: str = None,
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
project_root = Path(__file__).resolve().parent
|
||||
default_chunks = project_root / "jrxml_chunker_output" / "all_chunks.json"
|
||||
default_chunks = CHUNKER_OUTPUT_DIR / "all_chunks.json"
|
||||
|
||||
parser = argparse.ArgumentParser(description="JRXML Chunks 向量化工具")
|
||||
parser.add_argument("chunks_json", nargs="?", default=str(default_chunks),
|
||||
help=f"Chunks JSON 文件路径 (默认: {default_chunks})")
|
||||
parser.add_argument("--output_dir", "-o", default=None,
|
||||
help="输出目录 (默认: embeddings)")
|
||||
help=f"输出目录 (默认: {EMBEDDINGS_DIR})")
|
||||
parser.add_argument("--model_path", "-m", default=None,
|
||||
help="模型路径 (默认: models/Qwen3-Embedding-4B)")
|
||||
parser.add_argument("--batch_size", "-b", type=int, default=64,
|
||||
help="批处理大小 (默认: 64)")
|
||||
help=f"模型路径 (默认: {resolve_model_path()})")
|
||||
parser.add_argument("--batch_size", "-b", type=int, default=BATCH_SIZE,
|
||||
help=f"批处理大小 (默认: {BATCH_SIZE})")
|
||||
parser.add_argument("--no_normalize", action="store_true",
|
||||
help="不做向量归一化")
|
||||
parser.add_argument("--no_fp16", action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user