4f475e9e36
添加Qwen3-4B嵌入模型配置文件及权重文件 添加多个JRXML报告的数据查询和字段定义文件 添加PdfEncryptReport.jrxml示例报告文件
136 lines
4.8 KiB
Python
136 lines
4.8 KiB
Python
"""
|
|
embed_chunks.py

Vectorize JRXML chunks with a local Qwen3-Embedding-4B model.
Supports GPU (CUDA) or CPU.
|
|
"""
|
|
|
|
import os, sys, json, pickle
|
|
import numpy as np
|
|
import torch
|
|
from tqdm import tqdm
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
def build_text_for_embedding(chunk: dict) -> str:
    """Render one chunk dict as the newline-joined text that gets embedded.

    The text is assembled, in this order, from:
      * the chunk type (``'unknown'`` when the key is absent),
      * the human description (an empty line when absent or null),
      * the context, if non-empty,
      * the first 500 characters of the raw XML, if non-empty,
      * selected metadata entries: field names, parameter names, report
        name, band name, element kind and query language.

    Metadata entries whose value is null are skipped, and list items are
    converted with ``str`` before joining, so a chunk carrying JSON nulls or
    non-string names no longer raises ``TypeError``.

    Args:
        chunk: A chunk mapping as loaded from the chunks JSON file.

    Returns:
        The text representation, one component per line.
    """
    # A null description must not reach str.join (it would raise TypeError);
    # it is rendered as '' exactly like a missing key.
    description = chunk.get('human_description')
    parts = [
        f"[ChunkType: {chunk.get('chunk_type', 'unknown')}]",
        '' if description is None else str(description),
    ]

    context = chunk.get('context', '')
    if context:
        parts.append(f"Context: {context}")

    # Only a prefix of the XML (first 500 characters) is included.
    raw_xml = chunk.get('raw_xml', '')
    if raw_xml:
        parts.append(f"XML: {raw_xml[:500]}")

    # `or {}` also covers an explicit null metadata value.
    meta = chunk.get('metadata') or {}

    # (metadata key, label) pairs; the tuple order fixes the output order.
    list_entries = (
        ('field_names', 'Fields'),
        ('parameter_names', 'Parameters'),
    )
    scalar_entries = (
        ('report_name', 'Report'),
        ('band_name', 'Band'),
        ('element_kind', 'Element'),
        ('query_language', 'QueryLang'),
    )

    for key, label in list_entries:
        names = meta.get(key)
        if names is not None:
            # str() guards against non-string items in the list.
            parts.append(f"{label}: {', '.join(map(str, names))}")

    for key, label in scalar_entries:
        value = meta.get(key)
        if value is not None:
            parts.append(f"{label}: {value}")

    return "\n".join(parts)
|
|
|
|
def main(chunks_json_path: str, output_dir: str = "./embeddings",
         model_path: str = "./models/Qwen3-Embedding-4B",
         batch_size: int = 16, normalize: bool = True):
    """Embed every chunk of a JSON file and write the results to disk.

    Steps:
      1. Load the chunk list from ``chunks_json_path``.
      2. Load the SentenceTransformer model from ``model_path`` on CUDA when
         available, otherwise on CPU.
      3. Build one text per chunk with ``build_text_for_embedding``.
      4. Encode the texts.
      5. Write ``embeddings.npy``, ``chunk_id_map.json``, ``chunks.json`` and
         ``embeddings.pkl`` into ``output_dir``.
      6. Print the NaN count and norm statistics of the embeddings.

    Args:
        chunks_json_path: Path of the UTF-8 JSON file holding the chunk list.
        output_dir: Directory for the output files; created if missing.
        model_path: Path handed to ``SentenceTransformer``.
        batch_size: Batch size passed to ``model.encode``.
        normalize: Passed to ``model.encode`` as ``normalize_embeddings``.

    Raises:
        ValueError: If the JSON file does not contain a non-empty list.
    """
    # --- 1. Load chunks ---
    print(f"📄 Loading chunks from {chunks_json_path}")
    with open(chunks_json_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    # Fail fast, before the model is loaded: with no rows the norm
    # statistics at the end cannot be computed.
    if not isinstance(chunks, list) or not chunks:
        raise ValueError(
            f"{chunks_json_path} must contain a non-empty JSON list of chunks"
        )
    print(f" Total chunks: {len(chunks)}")

    # --- 2. Load model ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🧠 Loading embedding model from {model_path} on {device}")
    model = SentenceTransformer(model_path, device=device)
    if device == "cuda":
        print(f" GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")

    # --- 3. Build texts ---
    print("🛠️ Building text representations...")
    texts = [build_text_for_embedding(chunk) for chunk in chunks]
    # Chunks without an id are recorded as -1 so the map stays row-aligned.
    chunk_ids = [chunk.get('chunk_id', -1) for chunk in chunks]

    # --- 4. Encode ---
    print(f"🔢 Encoding {len(texts)} texts (batch_size={batch_size})...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=normalize,
        convert_to_numpy=True,
    )
    print(f" Embeddings shape: {embeddings.shape}")

    # --- 5. Save to the output directory ---
    os.makedirs(output_dir, exist_ok=True)

    # Vector matrix (float32)
    np.save(os.path.join(output_dir, "embeddings.npy"), embeddings.astype('float32'))

    # chunk_id map. ensure_ascii=False emits raw non-ASCII characters, so the
    # file encoding is pinned to UTF-8 instead of the locale default.
    with open(os.path.join(output_dir, "chunk_id_map.json"), 'w', encoding='utf-8') as f:
        json.dump(chunk_ids, f, ensure_ascii=False, indent=2)

    # Copy of the original chunks (UTF-8, matching how the input was read).
    with open(os.path.join(output_dir, "chunks.json"), 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    # Pickle bundle, handy for debugging.
    with open(os.path.join(output_dir, "embeddings.pkl"), 'wb') as f:
        pickle.dump({
            'chunks': chunks,
            'embeddings': embeddings,
            'texts': texts,
            'normalized': normalize,
        }, f)

    # --- 6. Quality check ---
    nan_count = np.isnan(embeddings).sum()
    print("\n📊 Quality check:")
    print(f" NaN values: {nan_count}")
    norms = np.linalg.norm(embeddings, axis=1)
    print(f" Norms: min={norms.min():.4f}, max={norms.max():.4f}, mean={norms.mean():.4f}")
    print(f"\n✅ Embeddings saved to {output_dir}/")
    print(" Files: embeddings.npy, chunk_id_map.json, chunks.json, embeddings.pkl")
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line front end: one positional input path plus optional
    # overrides for the output directory, model path and batch size.
    cli = argparse.ArgumentParser()
    cli.add_argument("chunks_json", help="Path to all_chunks.json")
    cli.add_argument("--output_dir", "-o", default="./embeddings")
    cli.add_argument("--model_path", "-m", default="./models/Qwen3-Embedding-4B")
    # NOTE(review): the CLI default here is 8 while main() defaults to 16 —
    # confirm the difference is intentional.
    cli.add_argument(
        "--batch_size", "-b",
        type=int,
        default=8,
        help="Batch size (lower if OOM)",
    )
    cli.add_argument("--no_normalize", action="store_true")
    opts = cli.parse_args()

    # --no_normalize is an opt-out switch, so it is inverted for main().
    run_kwargs = {
        "chunks_json_path": opts.chunks_json,
        "output_dir": opts.output_dir,
        "model_path": opts.model_path,
        "batch_size": opts.batch_size,
        "normalize": not opts.no_normalize,
    }
    main(**run_kwargs)