chore: 初始化JRXML RAG项目，添加基础文件

创建了完整的 JRXML 语义检索 RAG 项目，包含：
1. 新增 .gitignore，忽略项目生成的缓存、依赖目录和本地文件
2. 编写详细的项目 README 文档
3. 补充文件功能说明文档
4. 实现向量导入、向量化、查询等核心脚本
2026-05-12 08:14:55 +08:00
parent 4f475e9e36
commit bd98486de0
6 changed files with 1030 additions and 42 deletions
@@ -4,10 +4,13 @@ embed_chunks.py
 支持 GPU (CUDA) 或 CPU
 """

-import os, sys, json, pickle
+import os
+import sys
+import json
+import pickle
+from pathlib import Path
 import numpy as np
 import torch
-from tqdm import tqdm
 from sentence_transformers import SentenceTransformer

 def build_text_for_embedding(chunk: dict) -> str:
@@ -22,13 +25,11 @@ def build_text_for_embedding(chunk: dict) -> str:
    context = chunk.get('context', '')
    if context:
        parts.append(f"Context: {context}")
-    
-    # 添加部分 XML (前500字符)
+
    raw_xml = chunk.get('raw_xml', '')
    if raw_xml:
        parts.append(f"XML: {raw_xml[:500]}")
-    
-    # 添加元数据
+
    meta = chunk.get('metadata', {})
    if meta:
        if 'field_names' in meta:
@@ -45,9 +46,10 @@ def build_text_for_embedding(chunk: dict) -> str:
            parts.append(f"QueryLang: {meta['query_language']}")
    return "\n".join(parts)

-def main(chunks_json_path: str, output_dir: str = "./embeddings",
-         model_path: str = "./models/Qwen3-Embedding-4B",
-         batch_size: int = 16, normalize: bool = True):
+
+def main(chunks_json_path: str = None, output_dir: str = None,
+         model_path: str = None, batch_size: int = 64, normalize: bool = True,
+         use_fp16: bool = True):
    """
    主流程：
    1. 加载 chunk JSON
@@ -55,29 +57,80 @@ def main(chunks_json_path: str, output_dir: str = "./embeddings",
    3. 构造文本并向量化
    4. 保存向量及映射文件
    """
-    # --- 1. 加载 chunks ---
-    print(f"📄 Loading chunks from {chunks_json_path}")
+    project_root = Path(__file__).resolve().parent
+
+    if chunks_json_path is None:
+        chunks_json_path = project_root / "jrxml_chunker_output" / "all_chunks.json"
+    else:
+        chunks_json_path = Path(chunks_json_path)
+
+    if output_dir is None:
+        output_dir = project_root / "embeddings"
+    else:
+        output_dir = Path(output_dir)
+
+    if model_path is None:
+        model_path = project_root / "models" / "Qwen3-Embedding-4B"
+    else:
+        model_path = Path(model_path)
+
+    if not chunks_json_path.exists():
+        print(f"❌ Chunks 文件不存在: {chunks_json_path}")
+        print(f"   请先运行 jrxml_banch_chunker.py 生成 chunks")
+        return None
+
+    print(f"\n{'='*60}")
+    print(f"JRXML Chunks 向量化")
+    print(f"{'='*60}")
+    print(f"📄 加载 chunks: {chunks_json_path}")
    with open(chunks_json_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)
    print(f"   Total chunks: {len(chunks)}")

-    # --- 2. 加载模型 ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"🧠 Loading embedding model from {model_path} on {device}")
-    model = SentenceTransformer(model_path, device=device)
-    if device == "cuda":
-        print(f"   GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
+    print(f"\n🧠 加载嵌入模型: {model_path}")
+    print(f"   设备: {device}")

-    # --- 3. 构造文本 ---
-    print("🛠️  Building text representations...")
+    # 检查是否是 HuggingFace Hub 模型（格式为 username/model_name）
+    model_path_str = str(model_path)
+    # Windows PowerShell 会把 / 自动转成 \，需要还原
+    if "\\" in model_path_str and not os.path.exists(model_path_str):
+        model_path_str = model_path_str.replace("\\", "/")
+    
+    is_hub_model = "/" in model_path_str and not os.path.exists(model_path_str)
+    
+    # 如果是本地路径但不存在，则报错
+    if not is_hub_model and not os.path.exists(model_path_str):
+        print(f"❌ 模型目录不存在: {model_path}")
+        print(f"   请先下载模型到 {model_path}")
+        print(f"   或者使用 HuggingFace Hub 模型，例如: sentence-transformers/all-MiniLM-L6-v2")
+        return None
+
+    model = SentenceTransformer(model_path_str, device=device)
+
+    if device == "cuda" and use_fp16:
+        model = model.half()
+        torch.cuda.empty_cache()
+        mem_used = torch.cuda.memory_allocated(0) / 1024**3
+        total_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3
+        print(f"   FP16 已启用")
+        print(f"   GPU: {torch.cuda.get_device_name(0)}")
+        print(f"   GPU memory: {mem_used:.2f} GB / {total_mem:.2f} GB (FP16)")
+    elif device == "cuda":
+        print(f"   GPU: {torch.cuda.get_device_name(0)}")
+        print(f"   GPU memory: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB / {torch.cuda.get_device_properties(0).total_memory/1024**3:.2f} GB")
+
+    print(f"\n🛠️ 构建文本表示...")
    texts = []
    chunk_ids = []
+    chunk_types = []
+
    for chunk in chunks:
        texts.append(build_text_for_embedding(chunk))
        chunk_ids.append(chunk.get('chunk_id', -1))
+        chunk_types.append(chunk.get('chunk_type', 'unknown'))

-    # --- 4. 向量化 ---
-    print(f"🔢 Encoding {len(texts)} texts (batch_size={batch_size})...")
+    print(f"\n🔢 向量化 {len(texts)} 个文本 (batch_size={batch_size})...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
@@ -87,19 +140,16 @@ def main(chunks_json_path: str, output_dir: str = "./embeddings",
    )
    print(f"   Embeddings shape: {embeddings.shape}")

-    # --- 5. 保存到输出目录 ---
-    os.makedirs(output_dir, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)

-    # 向量矩阵 (float32)
-    np.save(os.path.join(output_dir, "embeddings.npy"), embeddings.astype('float32'))
-    # chunk_id 映射
-    with open(os.path.join(output_dir, "chunk_id_map.json"), 'w') as f:
+    np.save(output_dir / "embeddings.npy", embeddings.astype('float32'))
+    with open(output_dir / "chunk_id_map.json", 'w', encoding='utf-8') as f:
        json.dump(chunk_ids, f, ensure_ascii=False, indent=2)
-    # 原始 chunks 副本
-    with open(os.path.join(output_dir, "chunks.json"), 'w') as f:
+    with open(output_dir / "chunk_type_map.json", 'w', encoding='utf-8') as f:
+        json.dump(chunk_types, f, ensure_ascii=False, indent=2)
+    with open(output_dir / "chunks.json", 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)
-    # pickle 方便调试
-    with open(os.path.join(output_dir, "embeddings.pkl"), 'wb') as f:
+    with open(output_dir / "embeddings.pkl", 'wb') as f:
        pickle.dump({
            'chunks': chunks,
            'embeddings': embeddings,
@@ -107,24 +157,48 @@ def main(chunks_json_path: str, output_dir: str = "./embeddings",
            'normalized': normalize
        }, f)

-    # --- 6. 质量检查 ---
    nan_count = np.isnan(embeddings).sum()
-    print(f"\n📊 Quality check:")
+    print(f"\n📊 质量检查:")
    print(f"   NaN values: {nan_count}")
    norms = np.linalg.norm(embeddings, axis=1)
    print(f"   Norms: min={norms.min():.4f}, max={norms.max():.4f}, mean={norms.mean():.4f}")
-    print(f"\n✅ Embeddings saved to {output_dir}/")
-    print(f"   Files: embeddings.npy, chunk_id_map.json, chunks.json, embeddings.pkl")
+
+    print(f"\n✅ 向量数据已保存到: {output_dir}/")
+    print(f"   文件: embeddings.npy, chunk_id_map.json, chunk_type_map.json, chunks.json, embeddings.pkl")
+
+    type_counts = {}
+    for ct in chunk_types:
+        type_counts[ct] = type_counts.get(ct, 0) + 1
+    print(f"\n📈 Chunk 类型分布:")
+    for ct, count in sorted(type_counts.items(), key=lambda x: -x[1]):
+        print(f"   {ct}: {count}")
+
+    return {
+        "chunks": len(chunks),
+        "embedding_dim": embeddings.shape[1],
+        "output_dir": str(output_dir)
+    }
+

 if __name__ == "__main__":
    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("chunks_json", help="Path to all_chunks.json")
-    parser.add_argument("--output_dir", "-o", default="./embeddings")
-    parser.add_argument("--model_path", "-m", default="./models/Qwen3-Embedding-4B")
-    parser.add_argument("--batch_size", "-b", type=int, default=8,
-                        help="Batch size (lower if OOM)")
-    parser.add_argument("--no_normalize", action="store_true")
+    project_root = Path(__file__).resolve().parent
+    default_chunks = project_root / "jrxml_chunker_output" / "all_chunks.json"
+
+    parser = argparse.ArgumentParser(description="JRXML Chunks 向量化工具")
+    parser.add_argument("chunks_json", nargs="?", default=str(default_chunks),
+                        help=f"Chunks JSON 文件路径 (默认: {default_chunks})")
+    parser.add_argument("--output_dir", "-o", default=None,
+                        help="输出目录 (默认: embeddings)")
+    parser.add_argument("--model_path", "-m", default=None,
+                        help="模型路径 (默认: models/Qwen3-Embedding-4B)")
+    parser.add_argument("--batch_size", "-b", type=int, default=64,
+                        help="批处理大小 (默认: 64)")
+    parser.add_argument("--no_normalize", action="store_true",
+                        help="不做向量归一化")
+    parser.add_argument("--no_fp16", action="store_true",
+                        help="禁用 FP16 半精度（默认启用，可节省约 50%% 显存）")
+
    args = parser.parse_args()

    main(
@@ -132,5 +206,6 @@ if __name__ == "__main__":
        output_dir=args.output_dir,
        model_path=args.model_path,
        batch_size=args.batch_size,
-        normalize=not args.no_normalize
+        normalize=not args.no_normalize,
+        use_fp16=not args.no_fp16
    )