""" embed_chunks.py 使用本地 Qwen3-Embedding-4B 模型对 JRXML chunks 进行向量化 支持 GPU (CUDA) 或 CPU """ import os, sys, json, pickle import numpy as np import torch from tqdm import tqdm from sentence_transformers import SentenceTransformer def build_text_for_embedding(chunk: dict) -> str: """ 将单个 chunk 转换为适合向量化的文本 拼接:类型、描述、上下文、关键元数据、部分 XML """ parts = [ f"[ChunkType: {chunk.get('chunk_type', 'unknown')}]", chunk.get('human_description', ''), ] context = chunk.get('context', '') if context: parts.append(f"Context: {context}") # 添加部分 XML (前500字符) raw_xml = chunk.get('raw_xml', '') if raw_xml: parts.append(f"XML: {raw_xml[:500]}") # 添加元数据 meta = chunk.get('metadata', {}) if meta: if 'field_names' in meta: parts.append(f"Fields: {', '.join(meta['field_names'])}") if 'parameter_names' in meta: parts.append(f"Parameters: {', '.join(meta['parameter_names'])}") if 'report_name' in meta: parts.append(f"Report: {meta['report_name']}") if 'band_name' in meta: parts.append(f"Band: {meta['band_name']}") if 'element_kind' in meta: parts.append(f"Element: {meta['element_kind']}") if 'query_language' in meta: parts.append(f"QueryLang: {meta['query_language']}") return "\n".join(parts) def main(chunks_json_path: str, output_dir: str = "./embeddings", model_path: str = "./models/Qwen3-Embedding-4B", batch_size: int = 16, normalize: bool = True): """ 主流程: 1. 加载 chunk JSON 2. 加载嵌入模型 3. 构造文本并向量化 4. 保存向量及映射文件 """ # --- 1. 加载 chunks --- print(f"📄 Loading chunks from {chunks_json_path}") with open(chunks_json_path, 'r', encoding='utf-8') as f: chunks = json.load(f) print(f" Total chunks: {len(chunks)}") # --- 2. 加载模型 --- device = "cuda" if torch.cuda.is_available() else "cpu" print(f"🧠 Loading embedding model from {model_path} on {device}") model = SentenceTransformer(model_path, device=device) if device == "cuda": print(f" GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB") # --- 3. 构造文本 --- print("🛠️ Building text representations...") texts = [] chunk_ids = [] for chunk in chunks: texts.append(build_text_for_embedding(chunk)) chunk_ids.append(chunk.get('chunk_id', -1)) # --- 4. 向量化 --- print(f"🔢 Encoding {len(texts)} texts (batch_size={batch_size})...") embeddings = model.encode( texts, batch_size=batch_size, show_progress_bar=True, normalize_embeddings=normalize, convert_to_numpy=True ) print(f" Embeddings shape: {embeddings.shape}") # --- 5. 保存到输出目录 --- os.makedirs(output_dir, exist_ok=True) # 向量矩阵 (float32) np.save(os.path.join(output_dir, "embeddings.npy"), embeddings.astype('float32')) # chunk_id 映射 with open(os.path.join(output_dir, "chunk_id_map.json"), 'w') as f: json.dump(chunk_ids, f, ensure_ascii=False, indent=2) # 原始 chunks 副本 with open(os.path.join(output_dir, "chunks.json"), 'w') as f: json.dump(chunks, f, ensure_ascii=False, indent=2) # pickle 方便调试 with open(os.path.join(output_dir, "embeddings.pkl"), 'wb') as f: pickle.dump({ 'chunks': chunks, 'embeddings': embeddings, 'texts': texts, 'normalized': normalize }, f) # --- 6. 质量检查 --- nan_count = np.isnan(embeddings).sum() print(f"\n📊 Quality check:") print(f" NaN values: {nan_count}") norms = np.linalg.norm(embeddings, axis=1) print(f" Norms: min={norms.min():.4f}, max={norms.max():.4f}, mean={norms.mean():.4f}") print(f"\n✅ Embeddings saved to {output_dir}/") print(f" Files: embeddings.npy, chunk_id_map.json, chunks.json, embeddings.pkl") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("chunks_json", help="Path to all_chunks.json") parser.add_argument("--output_dir", "-o", default="./embeddings") parser.add_argument("--model_path", "-m", default="./models/Qwen3-Embedding-4B") parser.add_argument("--batch_size", "-b", type=int, default=8, help="Batch size (lower if OOM)") parser.add_argument("--no_normalize", action="store_true") args = parser.parse_args() main( chunks_json_path=args.chunks_json, output_dir=args.output_dir, model_path=args.model_path, batch_size=args.batch_size, normalize=not args.no_normalize )