Preserve ingestion data across rebuilds
This commit is contained in:
+49
-35
@@ -15,7 +15,7 @@ from .config import settings
|
||||
from .chunking import chunk_text, estimate_tokens
|
||||
from .embeddings import embed_texts
|
||||
from .vector_store import upsert_chunks
|
||||
from .db import insert_document_chunk, upsert_library, clear_library_documents
|
||||
from .db import replace_library_documents, upsert_library
|
||||
from .git_source import ingest_git_source
|
||||
|
||||
SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json',
|
||||
@@ -123,15 +123,7 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
|
||||
|
||||
print(f" [Library] Found {len(doc_files)} document(s)")
|
||||
|
||||
# Clear old chunks for this library
|
||||
print(f" [Library] Clearing existing chunks...")
|
||||
clear_result = clear_library_documents(library_id)
|
||||
if not clear_result.get('success'):
|
||||
print(f" Warning: Could not clear library docs: {clear_result}")
|
||||
else:
|
||||
print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")
|
||||
|
||||
# Process documents
|
||||
# Prepare the complete replacement before touching the existing index.
|
||||
all_chunks = []
|
||||
processed_files = 0
|
||||
|
||||
@@ -158,56 +150,78 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
|
||||
embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||
|
||||
# Build chunk dicts
|
||||
chunk_dicts = []
|
||||
base_path = file_path.relative_to(library_dir).as_posix()
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunk_dict = {
|
||||
"id": f"{file_path.stem}-{i}",
|
||||
"id": f"{base_path}:{i}",
|
||||
"library_id": library_id,
|
||||
"path": base_path,
|
||||
"title": Path(base_path).stem,
|
||||
"content": chunk,
|
||||
"chunk_index": i,
|
||||
"embedding": embeddings[i]
|
||||
"embedding": embeddings[i],
|
||||
"token_estimate": estimate_tokens(chunk),
|
||||
}
|
||||
all_chunks.append(chunk_dict)
|
||||
|
||||
processed_files += 1
|
||||
|
||||
print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
|
||||
|
||||
# Save chunks to SQLite
|
||||
if all_chunks:
|
||||
for chunk in all_chunks:
|
||||
insert_result = insert_document_chunk(
|
||||
doc_id=chunk["id"],
|
||||
library_id=chunk["library_id"],
|
||||
path=chunk["path"],
|
||||
title=chunk.get("title"),
|
||||
content=chunk["content"],
|
||||
chunk_index=chunk["chunk_index"],
|
||||
token_estimate=estimate_tokens(chunk["content"])
|
||||
)
|
||||
if insert_result.get('success'):
|
||||
continue
|
||||
print(f" [Library] Saved {len(all_chunks)} chunks to SQLite")
|
||||
else:
|
||||
print(f" [Library] No chunks to save to SQLite")
|
||||
|
||||
# Save vectors to Qdrant
|
||||
|
||||
if doc_files and not all_chunks:
|
||||
error = "No document chunks were produced; keeping the existing index"
|
||||
print(f" [Library] {error}")
|
||||
return {
|
||||
"success": False,
|
||||
"library_id": library_id,
|
||||
"files_processed": processed_files,
|
||||
"chunks_created": 0,
|
||||
"vectors_added": 0,
|
||||
"error": error,
|
||||
}
|
||||
|
||||
# Update vectors first. If this fails, the previous SQLite index remains usable.
|
||||
if all_chunks:
|
||||
upsert_result = await upsert_chunks(all_chunks)
|
||||
print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
|
||||
if not upsert_result.get("success"):
|
||||
return {
|
||||
"success": False,
|
||||
"library_id": library_id,
|
||||
"files_processed": processed_files,
|
||||
"chunks_created": len(all_chunks),
|
||||
"vectors_added": 0,
|
||||
"error": upsert_result.get("error", "Vector store update failed"),
|
||||
}
|
||||
else:
|
||||
print(f" [Library] No vectors to add to Qdrant")
|
||||
|
||||
upsert_result = {"success": True, "points_added": 0}
|
||||
|
||||
# Replace SQLite rows in one transaction only after preparation succeeds.
|
||||
replace_result = replace_library_documents(library_id, all_chunks)
|
||||
if not replace_result.get("success"):
|
||||
print(f" [Library] SQLite replacement failed: {replace_result.get('error')}")
|
||||
return {
|
||||
"success": False,
|
||||
"library_id": library_id,
|
||||
"files_processed": processed_files,
|
||||
"chunks_created": len(all_chunks),
|
||||
"vectors_added": upsert_result.get("points_added", 0),
|
||||
"error": replace_result.get("error", "SQLite replacement failed"),
|
||||
}
|
||||
|
||||
print(
|
||||
f" [Library] Replaced {replace_result.get('deleted', 0)} old chunks "
|
||||
f"with {replace_result.get('inserted', 0)} new chunks"
|
||||
)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"library_id": library_id,
|
||||
"files_processed": processed_files,
|
||||
"chunks_created": len(all_chunks),
|
||||
"vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks)
|
||||
"vectors_added": upsert_result.get("points_added", 0),
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user