Preserve ingestion data across rebuilds

This commit is contained in:
george
2026-06-06 12:44:02 +01:00
parent f3509a363e
commit 7707a6306d
6 changed files with 194 additions and 38 deletions
+15
View File
@@ -64,6 +64,21 @@ docker compose exec qdrant sh -c "tar czf /backups/qdrant-backup-$(date +%Y%m%d)
docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage
``` ```
### Rebuild Without Losing Sources or Ingestion
Normal image rebuilds preserve Git source definitions, cloned repositories,
uploaded documents, SQLite metadata, Qdrant vectors, and the embedding model
cache because they are bind-mounted from the host.
```bash
git pull
docker compose up -d --build
```
Do not delete `data/`, `docs/`, or `docs_sources.yaml`. Do not run the reset
commands below unless you intentionally want to erase the indexed data and
source configuration.
### Safe Reset Command ### Safe Reset Command
To reset both SQLite and Qdrant cleanly: To reset both SQLite and Qdrant cleanly:
+51
View File
@@ -245,6 +245,57 @@ def clear_library_documents(library_id: str) -> Dict[str, Any]:
conn.close() conn.close()
def replace_library_documents(
    library_id: str,
    chunks: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Swap all stored chunks of a library for a new set in one transaction.

    The existing rows for ``library_id`` are deleted and the rows built from
    ``chunks`` are inserted before a single commit. Any exception triggers a
    rollback and is reported in the returned dict instead of being raised.

    Args:
        library_id: Library whose ``documents`` rows are replaced.
        chunks: Replacement chunks. Each dict must provide ``id`` and
            ``path``; ``title``, ``content`` and ``chunk_index`` are optional
            and ``token_estimate`` falls back to 0.

    Returns:
        On success: ``success``, ``deleted`` (rowcount of the DELETE),
        ``inserted`` (``len(chunks)``) and ``library_id``.
        On failure: ``success`` set to False and ``error`` with the message.
    """
    delete_sql = "DELETE FROM documents WHERE library_id = ?"
    insert_sql = """
        INSERT INTO documents
        (id, library_id, path, title, content, chunk_index,
        token_estimate, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """

    conn = get_connection()
    try:
        # One timestamp is shared by every row written in this replacement.
        created_at = datetime.utcnow().isoformat()

        conn.execute("BEGIN")
        removed = conn.execute(delete_sql, (library_id,)).rowcount

        rows = []
        for item in chunks:
            rows.append(
                (
                    item["id"],
                    library_id,
                    item["path"],
                    item.get("title"),
                    item.get("content"),
                    item.get("chunk_index"),
                    item.get("token_estimate", 0),
                    created_at,
                )
            )
        conn.executemany(insert_sql, rows)

        conn.commit()
        return {
            "success": True,
            "deleted": removed,
            "inserted": len(chunks),
            "library_id": library_id,
        }
    except Exception as exc:
        # Undo the DELETE (and any partial inserts) so the old rows survive.
        conn.rollback()
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def delete_library(library_id: str) -> Dict[str, Any]: def delete_library(library_id: str) -> Dict[str, Any]:
"""Delete a library row and its document chunks.""" """Delete a library row and its document chunks."""
conn = get_connection() conn = get_connection()
+6 -1
View File
@@ -1,5 +1,6 @@
# Local Embedding Generation using FastEmbed # Local Embedding Generation using FastEmbed
import asyncio import asyncio
import os
from typing import List from typing import List
from functools import lru_cache from functools import lru_cache
@@ -20,7 +21,11 @@ def _load_model():
print("Loading embedding model (this may take a few minutes on first run)...") print("Loading embedding model (this may take a few minutes on first run)...")
# Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache") cache_dir = os.getenv("EMBEDDING_CACHE_DIR", ".embed_cache")
_embedding_model = TextEmbedding(
model_name="BAAI/bge-small-en-v1.5",
cache_dir=cache_dir,
)
print("Embedding model loaded successfully.") print("Embedding model loaded successfully.")
return _embedding_model return _embedding_model
+46 -32
View File
@@ -15,7 +15,7 @@ from .config import settings
from .chunking import chunk_text, estimate_tokens from .chunking import chunk_text, estimate_tokens
from .embeddings import embed_texts from .embeddings import embed_texts
from .vector_store import upsert_chunks from .vector_store import upsert_chunks
from .db import insert_document_chunk, upsert_library, clear_library_documents from .db import replace_library_documents, upsert_library
from .git_source import ingest_git_source from .git_source import ingest_git_source
SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json', SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json',
@@ -123,15 +123,7 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
print(f" [Library] Found {len(doc_files)} document(s)") print(f" [Library] Found {len(doc_files)} document(s)")
# Clear old chunks for this library # Prepare the complete replacement before touching the existing index.
print(f" [Library] Clearing existing chunks...")
clear_result = clear_library_documents(library_id)
if not clear_result.get('success'):
print(f" Warning: Could not clear library docs: {clear_result}")
else:
print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")
# Process documents
all_chunks = [] all_chunks = []
processed_files = 0 processed_files = 0
@@ -158,18 +150,18 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
embeddings = await asyncio.to_thread(embed_texts, chunks) embeddings = await asyncio.to_thread(embed_texts, chunks)
# Build chunk dicts # Build chunk dicts
chunk_dicts = []
base_path = file_path.relative_to(library_dir).as_posix() base_path = file_path.relative_to(library_dir).as_posix()
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
chunk_dict = { chunk_dict = {
"id": f"{file_path.stem}-{i}", "id": f"{base_path}:{i}",
"library_id": library_id, "library_id": library_id,
"path": base_path, "path": base_path,
"title": Path(base_path).stem, "title": Path(base_path).stem,
"content": chunk, "content": chunk,
"chunk_index": i, "chunk_index": i,
"embedding": embeddings[i] "embedding": embeddings[i],
"token_estimate": estimate_tokens(chunk),
} }
all_chunks.append(chunk_dict) all_chunks.append(chunk_dict)
@@ -177,37 +169,59 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks") print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
# Save chunks to SQLite if doc_files and not all_chunks:
if all_chunks: error = "No document chunks were produced; keeping the existing index"
for chunk in all_chunks: print(f" [Library] {error}")
insert_result = insert_document_chunk( return {
doc_id=chunk["id"], "success": False,
library_id=chunk["library_id"], "library_id": library_id,
path=chunk["path"], "files_processed": processed_files,
title=chunk.get("title"), "chunks_created": 0,
content=chunk["content"], "vectors_added": 0,
chunk_index=chunk["chunk_index"], "error": error,
token_estimate=estimate_tokens(chunk["content"]) }
)
if insert_result.get('success'):
continue
print(f" [Library] Saved {len(all_chunks)} chunks to SQLite")
else:
print(f" [Library] No chunks to save to SQLite")
# Save vectors to Qdrant # Update vectors first. If this fails, the previous SQLite index remains usable.
if all_chunks: if all_chunks:
upsert_result = await upsert_chunks(all_chunks) upsert_result = await upsert_chunks(all_chunks)
print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)") print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
if not upsert_result.get("success"):
return {
"success": False,
"library_id": library_id,
"files_processed": processed_files,
"chunks_created": len(all_chunks),
"vectors_added": 0,
"error": upsert_result.get("error", "Vector store update failed"),
}
else: else:
print(f" [Library] No vectors to add to Qdrant") print(f" [Library] No vectors to add to Qdrant")
upsert_result = {"success": True, "points_added": 0}
# Replace SQLite rows in one transaction only after preparation succeeds.
replace_result = replace_library_documents(library_id, all_chunks)
if not replace_result.get("success"):
print(f" [Library] SQLite replacement failed: {replace_result.get('error')}")
return {
"success": False,
"library_id": library_id,
"files_processed": processed_files,
"chunks_created": len(all_chunks),
"vectors_added": upsert_result.get("points_added", 0),
"error": replace_result.get("error", "SQLite replacement failed"),
}
print(
f" [Library] Replaced {replace_result.get('deleted', 0)} old chunks "
f"with {replace_result.get('inserted', 0)} new chunks"
)
return { return {
"success": True, "success": True,
"library_id": library_id, "library_id": library_id,
"files_processed": processed_files, "files_processed": processed_files,
"chunks_created": len(all_chunks), "chunks_created": len(all_chunks),
"vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks) "vectors_added": upsert_result.get("points_added", 0),
} }
+1
View File
@@ -32,6 +32,7 @@ services:
- VECTOR_STORE_PORT=6333 - VECTOR_STORE_PORT=6333
- DOCS_PATH=/docs - DOCS_PATH=/docs
- DB_PATH=/data/db.sqlite - DB_PATH=/data/db.sqlite
- EMBEDDING_CACHE_DIR=/data/embed_cache
- LOG_LEVEL=INFO - LOG_LEVEL=INFO
- API_KEY_DOCS_API=${DOCS_API_KEY:-} - API_KEY_DOCS_API=${DOCS_API_KEY:-}
volumes: volumes:
+70
View File
@@ -233,6 +233,76 @@ class TestDocumentChunkOperations:
remaining = get_chunks_for_library("/local/cleartest") remaining = get_chunks_for_library("/local/cleartest")
assert len(remaining) == 0 assert len(remaining) == 0
def test_replace_library_documents_is_atomic(self, test_database):
    """A successful replacement drops the old rows and stores only the new set."""
    from backend.app.db import (
        get_chunks_for_library,
        insert_document_chunk,
        replace_library_documents,
        upsert_library,
    )

    lib = "/local/replacetest"
    upsert_library(lib, "Replace test", source_path=lib)

    # Seed one pre-existing chunk that the replacement is expected to remove.
    insert_document_chunk(
        "old-chunk", lib, "old.md", content="old content", chunk_index=0
    )

    new_chunk = {
        "id": "new-chunk",
        "path": "new.md",
        "title": "new",
        "content": "new content",
        "chunk_index": 0,
        "token_estimate": 2,
    }
    result = replace_library_documents(lib, [new_chunk])
    stored_ids = [row["id"] for row in get_chunks_for_library(lib)]

    assert result["success"] is True
    assert result["deleted"] >= 1
    assert result["inserted"] == 1
    assert stored_ids == ["new-chunk"]
def test_failed_replacement_keeps_existing_chunks(self, test_database):
    """A replacement that fails must leave the previously stored chunks intact."""
    from backend.app.db import (
        get_chunks_for_library,
        insert_document_chunk,
        replace_library_documents,
        upsert_library,
    )

    lib = "/local/rollbacktest"
    upsert_library(lib, "Rollback test", source_path=lib)
    insert_document_chunk(
        "old-chunk", lib, "old.md", content="old content", chunk_index=0
    )

    # The same chunk is submitted twice; presumably the repeated id is what
    # makes the insert fail (verify against the documents table schema).
    repeated = {
        "id": "duplicate",
        "path": "new.md",
        "content": "new content",
        "chunk_index": 0,
    }
    result = replace_library_documents(lib, [repeated] * 2)
    surviving_ids = [row["id"] for row in get_chunks_for_library(lib)]

    assert result["success"] is False
    assert surviving_ids == ["old-chunk"]
class TestDatabaseEdgeCases: class TestDatabaseEdgeCases:
"""Tests for edge cases and error handling.""" """Tests for edge cases and error handling."""