diff --git a/README.md b/README.md index cc42848..ec6274e 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,21 @@ docker compose exec qdrant sh -c "tar czf /backups/qdrant-backup-$(date +%Y%m%d) docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage ``` +### Rebuild Without Losing Sources or Ingestion + +Normal image rebuilds preserve Git source definitions, cloned repositories, +uploaded documents, SQLite metadata, Qdrant vectors, and the embedding model +cache because they are bind-mounted from the host. + +```bash +git pull +docker compose up -d --build +``` + +Do not delete `data/`, `docs/`, or `docs_sources.yaml`. Do not run the reset +commands below unless you intentionally want to erase the indexed data and +source configuration. + ### Safe Reset Command To reset both SQLite and Qdrant cleanly: @@ -428,4 +443,4 @@ For Docker-based deployment, consider using an authentication middleware or a de - Chunk overlap configuration via environment variables - Batch index endpoint improvements - Metrics/logging aggregation (e.g., Prometheus + Grafana) -- Plugin system for additional data sources \ No newline at end of file +- Plugin system for additional data sources diff --git a/backend/app/db.py b/backend/app/db.py index e4277fa..971013d 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -245,6 +245,57 @@ def clear_library_documents(library_id: str) -> Dict[str, Any]: conn.close() +def replace_library_documents( + library_id: str, + chunks: List[Dict[str, Any]], +) -> Dict[str, Any]: + """Atomically replace all document chunks for a library.""" + conn = get_connection() + + try: + now = datetime.utcnow().isoformat() + conn.execute("BEGIN") + cursor = conn.execute( + "DELETE FROM documents WHERE library_id = ?", (library_id,) + ) + deleted = cursor.rowcount + + conn.executemany( + """ + INSERT INTO documents + (id, library_id, path, title, content, 
chunk_index, + token_estimate, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + ( + chunk["id"], + library_id, + chunk["path"], + chunk.get("title"), + chunk.get("content"), + chunk.get("chunk_index"), + chunk.get("token_estimate", 0), + now, + ) + for chunk in chunks + ], + ) + + conn.commit() + return { + "success": True, + "deleted": deleted, + "inserted": len(chunks), + "library_id": library_id, + } + except Exception as e: + conn.rollback() + return {"success": False, "error": str(e)} + finally: + conn.close() + + def delete_library(library_id: str) -> Dict[str, Any]: """Delete a library row and its document chunks.""" conn = get_connection() diff --git a/backend/app/embeddings.py b/backend/app/embeddings.py index 735e305..cc41efc 100644 --- a/backend/app/embeddings.py +++ b/backend/app/embeddings.py @@ -1,5 +1,6 @@ # Local Embedding Generation using FastEmbed import asyncio +import os from typing import List from functools import lru_cache @@ -20,7 +21,11 @@ def _load_model(): print("Loading embedding model (this may take a few minutes on first run)...") # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline - _embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache") + cache_dir = os.getenv("EMBEDDING_CACHE_DIR", ".embed_cache") + _embedding_model = TextEmbedding( + model_name="BAAI/bge-small-en-v1.5", + cache_dir=cache_dir, + ) print("Embedding model loaded successfully.") return _embedding_model @@ -178,4 +183,4 @@ if __name__ == "__main__": assert embed_texts([]) == [], "Empty list should return empty list" print("✓ Empty input handling works") - print("\n✅ All tests passed!") \ No newline at end of file + print("\n✅ All tests passed!") diff --git a/backend/app/ingest.py b/backend/app/ingest.py index 4283517..1cb2dec 100644 --- a/backend/app/ingest.py +++ b/backend/app/ingest.py @@ -15,7 +15,7 @@ from .config import settings from .chunking import chunk_text, estimate_tokens from .embeddings 
import embed_texts from .vector_store import upsert_chunks -from .db import insert_document_chunk, upsert_library, clear_library_documents +from .db import replace_library_documents, upsert_library from .git_source import ingest_git_source SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json', @@ -123,15 +123,7 @@ async def ingest_library(library_id: str, name: str, description: Optional[str] print(f" [Library] Found {len(doc_files)} document(s)") - # Clear old chunks for this library - print(f" [Library] Clearing existing chunks...") - clear_result = clear_library_documents(library_id) - if not clear_result.get('success'): - print(f" Warning: Could not clear library docs: {clear_result}") - else: - print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks") - - # Process documents + # Prepare the complete replacement before touching the existing index. all_chunks = [] processed_files = 0 @@ -158,56 +150,78 @@ async def ingest_library(library_id: str, name: str, description: Optional[str] embeddings = await asyncio.to_thread(embed_texts, chunks) # Build chunk dicts - chunk_dicts = [] base_path = file_path.relative_to(library_dir).as_posix() for i, chunk in enumerate(chunks): chunk_dict = { - "id": f"{file_path.stem}-{i}", + "id": f"{base_path}:{i}", "library_id": library_id, "path": base_path, "title": Path(base_path).stem, "content": chunk, "chunk_index": i, - "embedding": embeddings[i] + "embedding": embeddings[i], + "token_estimate": estimate_tokens(chunk), } all_chunks.append(chunk_dict) processed_files += 1 print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks") - - # Save chunks to SQLite - if all_chunks: - for chunk in all_chunks: - insert_result = insert_document_chunk( - doc_id=chunk["id"], - library_id=chunk["library_id"], - path=chunk["path"], - title=chunk.get("title"), - content=chunk["content"], - chunk_index=chunk["chunk_index"], - token_estimate=estimate_tokens(chunk["content"]) - ) - 
if insert_result.get('success'): - continue - print(f" [Library] Saved {len(all_chunks)} chunks to SQLite") - else: - print(f" [Library] No chunks to save to SQLite") - - # Save vectors to Qdrant + + if doc_files and not all_chunks: + error = "No document chunks were produced; keeping the existing index" + print(f" [Library] {error}") + return { + "success": False, + "library_id": library_id, + "files_processed": processed_files, + "chunks_created": 0, + "vectors_added": 0, + "error": error, + } + + # Update vectors first. If this fails, the previous SQLite index remains usable. if all_chunks: upsert_result = await upsert_chunks(all_chunks) print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)") + if not upsert_result.get("success"): + return { + "success": False, + "library_id": library_id, + "files_processed": processed_files, + "chunks_created": len(all_chunks), + "vectors_added": 0, + "error": upsert_result.get("error", "Vector store update failed"), + } else: print(f" [Library] No vectors to add to Qdrant") - + upsert_result = {"success": True, "points_added": 0} + + # Replace SQLite rows in one transaction only after preparation succeeds. 
+ replace_result = replace_library_documents(library_id, all_chunks) + if not replace_result.get("success"): + print(f" [Library] SQLite replacement failed: {replace_result.get('error')}") + return { + "success": False, + "library_id": library_id, + "files_processed": processed_files, + "chunks_created": len(all_chunks), + "vectors_added": upsert_result.get("points_added", 0), + "error": replace_result.get("error", "SQLite replacement failed"), + } + + print( + f" [Library] Replaced {replace_result.get('deleted', 0)} old chunks " + f"with {replace_result.get('inserted', 0)} new chunks" + ) + return { "success": True, "library_id": library_id, "files_processed": processed_files, "chunks_created": len(all_chunks), - "vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks) + "vectors_added": upsert_result.get("points_added", 0), } diff --git a/docker-compose.yml b/docker-compose.yml index aef617b..0f41953 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -32,6 +32,7 @@ services: - VECTOR_STORE_PORT=6333 - DOCS_PATH=/docs - DB_PATH=/data/db.sqlite + - EMBEDDING_CACHE_DIR=/data/embed_cache - LOG_LEVEL=INFO - API_KEY_DOCS_API=${DOCS_API_KEY:-} volumes: diff --git a/tests/test_db.py b/tests/test_db.py index b854d3c..e04ef65 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -233,6 +233,76 @@ class TestDocumentChunkOperations: remaining = get_chunks_for_library("/local/cleartest") assert len(remaining) == 0 + def test_replace_library_documents_is_atomic(self, test_database): + """Replacing chunks should remove old rows and insert the new set.""" + from backend.app.db import ( + get_chunks_for_library, + insert_document_chunk, + replace_library_documents, + upsert_library, + ) + + library_id = "/local/replacetest" + upsert_library(library_id, "Replace test", source_path=library_id) + insert_document_chunk( + "old-chunk", + library_id, + "old.md", + content="old content", + chunk_index=0, + ) + + result = 
replace_library_documents( + library_id, + [ + { + "id": "new-chunk", + "path": "new.md", + "title": "new", + "content": "new content", + "chunk_index": 0, + "token_estimate": 2, + } + ], + ) + + chunks = get_chunks_for_library(library_id) + assert result["success"] is True + assert result["deleted"] >= 1 + assert result["inserted"] == 1 + assert [chunk["id"] for chunk in chunks] == ["new-chunk"] + + def test_failed_replacement_keeps_existing_chunks(self, test_database): + """A bad replacement must roll back instead of erasing the old index.""" + from backend.app.db import ( + get_chunks_for_library, + insert_document_chunk, + replace_library_documents, + upsert_library, + ) + + library_id = "/local/rollbacktest" + upsert_library(library_id, "Rollback test", source_path=library_id) + insert_document_chunk( + "old-chunk", + library_id, + "old.md", + content="old content", + chunk_index=0, + ) + + duplicate = { + "id": "duplicate", + "path": "new.md", + "content": "new content", + "chunk_index": 0, + } + result = replace_library_documents(library_id, [duplicate, duplicate]) + + chunks = get_chunks_for_library(library_id) + assert result["success"] is False + assert [chunk["id"] for chunk in chunks] == ["old-chunk"] + class TestDatabaseEdgeCases: """Tests for edge cases and error handling."""