refactor: 重构项目配置管理,统一使用.env配置

- 新增config.py统一读取.env配置,移除硬编码路径和参数
- 重构collect_jrxml.py,支持通过命令行参数和环境变量配置源目录
- 新增.env.example示例配置文件,整理所有可配置项
- 重构down_embedding_model.py、import_to_chroma.py等所有脚本,改为使用统一配置
- 新增Windows一键部署脚本setup.bat
- 修正jrxml_banch_chunker.py的文件名拼写错误
This commit is contained in:
2026-05-12 08:29:17 +08:00
parent bd98486de0
commit 9d78a49625
9 changed files with 396 additions and 67 deletions
+19 -14
View File
@@ -2,6 +2,7 @@
query_chroma.py
查询 Chroma 数据库,从自然语言查找相关 JRXML chunk
支持命令行单次查询和交互式连续查询
模型通过 .env / config.py 配置
"""
import os
@@ -12,19 +13,27 @@ import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from config import (
CHROMA_DB_PATH, CHROMA_COLLECTION_NAME, USE_FP16,
DEFAULT_N_RESULTS, SIMILARITY_THRESHOLD, resolve_model_path
)
class JRXMLSearcher:
def __init__(self, chroma_path: str = None,
collection_name: str = "jrxml_chunks",
collection_name: str = None,
model_path: str = None,
use_fp16: bool = True):
use_fp16: bool = None):
project_root = Path(__file__).resolve().parent
if chroma_path is None:
chroma_path = str(project_root / "chroma_db")
chroma_path = str(CHROMA_DB_PATH)
if collection_name is None:
collection_name = CHROMA_COLLECTION_NAME
if model_path is None:
model_path = str(project_root / "models" / "Qwen3-Embedding-4B")
model_path = resolve_model_path()
if use_fp16 is None:
use_fp16 = USE_FP16
# 处理 Hub 模型名称
model_path_str = str(model_path)
@@ -110,13 +119,13 @@ def main():
parser.add_argument("query", nargs="?", default="",
help="搜索关键词(不提供则进入交互模式)")
parser.add_argument("--chroma_path", "-c", default=None,
help=f"Chroma 数据库路径 (默认: chroma_db)")
parser.add_argument("--collection", "-n", default="jrxml_chunks",
help=f"Chroma 数据库路径 (默认: {CHROMA_DB_PATH})")
parser.add_argument("--collection", "-n", default=CHROMA_COLLECTION_NAME,
help="集合名称")
parser.add_argument("--model_path", "-m", default=None,
help="嵌入模型路径")
parser.add_argument("--n_results", "-k", type=int, default=5,
help="返回结果数 (默认: 5)")
parser.add_argument("--n_results", "-k", type=int, default=DEFAULT_N_RESULTS,
help=f"返回结果数 (默认: {DEFAULT_N_RESULTS})")
parser.add_argument("--filter_field", "-f",
help="按 chunk_type 过滤,例如: field, query, chart")
parser.add_argument("--threshold", "-t", type=float,
@@ -127,14 +136,10 @@ def main():
args = parser.parse_args()
if args.chroma_path is None:
args.chroma_path = str(project_root / "chroma_db")
args.chroma_path = str(CHROMA_DB_PATH)
if args.model_path is None:
default_model = project_root / "models" / "Qwen3-Embedding-4B"
if not default_model.exists():
args.model_path = "sentence-transformers/all-MiniLM-L6-v2"
else:
args.model_path = str(default_model)
args.model_path = resolve_model_path()
# 检查数据库
if not os.path.exists(args.chroma_path):