feat: 添加Markdown分块器与统一批量分块入口,支持增量向量化与导入
- 新增 md_chunker.py: Markdown语义分块引擎,支持标题/代码块/表格智能拆分
- 新增 batch_chunker.py: 统一批量分块入口,支持JRXML+Markdown混合处理
- 新增 requirements.txt: 整理项目依赖
- embed_chunks.py: 新增 --incremental 增量模式,追加新向量到已有数据
- import_to_chroma.py: 新增 --incremental 增量模式,不再每次清空数据库
- 更新 README.md 与 docs/file_guide.md 反映最新架构

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+59
-17
@@ -1,6 +1,7 @@
|
||||
"""
|
||||
import_to_chroma.py
|
||||
将已生成的 chunk 向量导入 Chroma 数据库
|
||||
将 chunk 向量导入 Chroma 数据库
|
||||
支持 JRXML chunks 和 Markdown chunks 混合导入
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -16,7 +17,8 @@ from config import EMBEDDINGS_DIR, CHROMA_DB_PATH, CHROMA_COLLECTION_NAME
|
||||
|
||||
def main(embeddings_dir: str = None,
|
||||
chroma_path: str = None,
|
||||
collection_name: str = None):
|
||||
collection_name: str = None,
|
||||
incremental: bool = False):
|
||||
"""
|
||||
从 embeddings 目录读取向量和 chunks,导入 Chroma 持久化数据库
|
||||
|
||||
@@ -69,33 +71,55 @@ def main(embeddings_dir: str = None,
|
||||
chroma_path.mkdir(parents=True, exist_ok=True)
|
||||
client = chromadb.PersistentClient(path=str(chroma_path))
|
||||
|
||||
try:
|
||||
client.delete_collection(collection_name)
|
||||
print(f" 已删除旧集合 '{collection_name}'")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
if incremental:
|
||||
try:
|
||||
collection = client.get_collection(collection_name)
|
||||
existing_ids = set(collection.get()['ids'])
|
||||
print(f" 增量模式: 集合 '{collection_name}' 已有 {len(existing_ids)} 条记录")
|
||||
except Exception:
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
existing_ids = set()
|
||||
print(f" 增量模式: 创建新集合 '{collection_name}'")
|
||||
else:
|
||||
try:
|
||||
client.delete_collection(collection_name)
|
||||
print(f" 已删除旧集合 '{collection_name}'")
|
||||
except Exception:
|
||||
pass
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata={"hnsw:space": "cosine"}
|
||||
)
|
||||
existing_ids = set()
|
||||
|
||||
print(f"\n🛠️ 准备导入数据...")
|
||||
ids = []
|
||||
documents = []
|
||||
metadatas = []
|
||||
embeddings_list = []
|
||||
skipped = 0
|
||||
|
||||
seen_ids = {}
|
||||
for i, chunk in enumerate(tqdm(chunks, desc="准备数据")):
|
||||
raw_id = str(chunk.get("chunk_id", i))
|
||||
context = chunk.get("context", "")
|
||||
|
||||
if raw_id in seen_ids:
|
||||
seen_ids[raw_id] += 1
|
||||
chunk_id = f"{raw_id}_{seen_ids[raw_id]}"
|
||||
unique_chunk_id = f"{raw_id}_{seen_ids[raw_id]}"
|
||||
else:
|
||||
seen_ids[raw_id] = 0
|
||||
chunk_id = raw_id
|
||||
ids.append(chunk_id)
|
||||
unique_chunk_id = raw_id
|
||||
|
||||
# 增量模式:跳过已导入的
|
||||
if incremental and unique_chunk_id in existing_ids:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
ids.append(unique_chunk_id)
|
||||
|
||||
doc_text = chunk.get("human_description", "")
|
||||
documents.append(doc_text)
|
||||
@@ -105,7 +129,6 @@ def main(embeddings_dir: str = None,
|
||||
if chunk_type:
|
||||
meta["chunk_type"] = chunk_type
|
||||
|
||||
context = chunk.get("context", "")
|
||||
if context:
|
||||
meta["context"] = context
|
||||
|
||||
@@ -118,10 +141,26 @@ def main(embeddings_dir: str = None,
|
||||
meta["element_kind"] = chunk_meta["element_kind"]
|
||||
if "query_language" in chunk_meta:
|
||||
meta["query_language"] = chunk_meta["query_language"]
|
||||
# Markdown-specific metadata
|
||||
if "heading" in chunk_meta:
|
||||
meta["heading"] = chunk_meta["heading"]
|
||||
if "heading_level" in chunk_meta:
|
||||
meta["heading_level"] = chunk_meta["heading_level"]
|
||||
if "language" in chunk_meta:
|
||||
meta["code_language"] = chunk_meta["language"]
|
||||
|
||||
metadatas.append(meta)
|
||||
embeddings_list.append(embeddings[i].tolist())
|
||||
|
||||
if incremental and skipped > 0:
|
||||
print(f" 增量模式: 跳过 {skipped} 条已存在记录")
|
||||
|
||||
if not ids:
|
||||
print(f"\n✅ 没有新数据需要导入,集合已是最新")
|
||||
print(f" 数据库路径: {chroma_path}")
|
||||
print(f" 集合数量: {collection.count()}")
|
||||
return collection
|
||||
|
||||
print(f"\n📥 分批导入到 Chroma (每批 1000 条)...")
|
||||
import_batch_size = 1000
|
||||
start_time = time.time()
|
||||
@@ -173,11 +212,14 @@ if __name__ == "__main__":
|
||||
help=f"Chroma 数据库路径 (默认: {CHROMA_DB_PATH})")
|
||||
parser.add_argument("--collection_name", "-n", default=CHROMA_COLLECTION_NAME,
|
||||
help=f"集合名称 (默认: {CHROMA_COLLECTION_NAME})")
|
||||
parser.add_argument("--incremental", "-i", action="store_true",
|
||||
help="增量模式:只导入新增记录,不删除已有数据")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(
|
||||
embeddings_dir=args.embeddings_dir,
|
||||
chroma_path=args.chroma_path,
|
||||
collection_name=args.collection_name
|
||||
collection_name=args.collection_name,
|
||||
incremental=args.incremental
|
||||
)
|
||||
Reference in New Issue
Block a user