Preserve ingestion data across rebuilds

2026-06-06 12:44:02 +01:00
parent f3509a363e
commit 7707a6306d
6 changed files with 194 additions and 38 deletions
@@ -15,7 +15,7 @@ from .config import settings
 from .chunking import chunk_text, estimate_tokens
 from .embeddings import embed_texts
 from .vector_store import upsert_chunks
-from .db import insert_document_chunk, upsert_library, clear_library_documents
+from .db import replace_library_documents, upsert_library
 from .git_source import ingest_git_source

 SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json', 
@@ -123,15 +123,7 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
    
    print(f"  [Library] Found {len(doc_files)} document(s)")
    
-    # Clear old chunks for this library
-    print(f"  [Library] Clearing existing chunks...")
-    clear_result = clear_library_documents(library_id)
-    if not clear_result.get('success'):
-        print(f"  Warning: Could not clear library docs: {clear_result}")
-    else:
-        print(f"  [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")
-    
-    # Process documents
+    # Prepare the complete replacement before touching the existing index.
    all_chunks = []
    processed_files = 0
    
@@ -158,56 +150,78 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
        embeddings = await asyncio.to_thread(embed_texts, chunks)
        
        # Build chunk dicts
-        chunk_dicts = []
        base_path = file_path.relative_to(library_dir).as_posix()
        
        for i, chunk in enumerate(chunks):
            chunk_dict = {
-                "id": f"{file_path.stem}-{i}",
+                "id": f"{base_path}:{i}",
                "library_id": library_id,
                "path": base_path,
                "title": Path(base_path).stem,
                "content": chunk,
                "chunk_index": i,
-                "embedding": embeddings[i]
+                "embedding": embeddings[i],
+                "token_estimate": estimate_tokens(chunk),
            }
            all_chunks.append(chunk_dict)
        
        processed_files += 1
    
    print(f"  [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
-    
-    # Save chunks to SQLite
-    if all_chunks:
-        for chunk in all_chunks:
-            insert_result = insert_document_chunk(
-                doc_id=chunk["id"],
-                library_id=chunk["library_id"],
-                path=chunk["path"],
-                title=chunk.get("title"),
-                content=chunk["content"],
-                chunk_index=chunk["chunk_index"],
-                token_estimate=estimate_tokens(chunk["content"])
-            )
-            if insert_result.get('success'):
-                continue
-        print(f"  [Library] Saved {len(all_chunks)} chunks to SQLite")
-    else:
-        print(f"  [Library] No chunks to save to SQLite")
-    
-    # Save vectors to Qdrant
+
+    if doc_files and not all_chunks:
+        error = "No document chunks were produced; keeping the existing index"
+        print(f"  [Library] {error}")
+        return {
+            "success": False,
+            "library_id": library_id,
+            "files_processed": processed_files,
+            "chunks_created": 0,
+            "vectors_added": 0,
+            "error": error,
+        }
+
+    # Update vectors first. If this fails, the previous SQLite index remains usable.
    if all_chunks:
        upsert_result = await upsert_chunks(all_chunks)
        print(f"  [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
+        if not upsert_result.get("success"):
+            return {
+                "success": False,
+                "library_id": library_id,
+                "files_processed": processed_files,
+                "chunks_created": len(all_chunks),
+                "vectors_added": 0,
+                "error": upsert_result.get("error", "Vector store update failed"),
+            }
    else:
        print(f"  [Library] No vectors to add to Qdrant")
-    
+        upsert_result = {"success": True, "points_added": 0}
+
+    # Replace SQLite rows in one transaction only after preparation succeeds.
+    replace_result = replace_library_documents(library_id, all_chunks)
+    if not replace_result.get("success"):
+        print(f"  [Library] SQLite replacement failed: {replace_result.get('error')}")
+        return {
+            "success": False,
+            "library_id": library_id,
+            "files_processed": processed_files,
+            "chunks_created": len(all_chunks),
+            "vectors_added": upsert_result.get("points_added", 0),
+            "error": replace_result.get("error", "SQLite replacement failed"),
+        }
+
+    print(
+        f"  [Library] Replaced {replace_result.get('deleted', 0)} old chunks "
+        f"with {replace_result.get('inserted', 0)} new chunks"
+    )
+
    return {
        "success": True,
        "library_id": library_id,
        "files_processed": processed_files,
        "chunks_created": len(all_chunks),
-        "vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks)
+        "vectors_added": upsert_result.get("points_added", 0),
    }