Preserve ingestion data across rebuilds
This commit is contained in:
@@ -64,6 +64,21 @@ docker compose exec qdrant sh -c "tar czf /backups/qdrant-backup-$(date +%Y%m%d)
|
|||||||
docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage
|
docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Rebuild Without Losing Sources or Ingestion
|
||||||
|
|
||||||
|
Normal image rebuilds preserve Git source definitions, cloned repositories,
uploaded documents, SQLite metadata, Qdrant vectors, and the embedding model
cache because they are bind-mounted from the host.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git pull
|
||||||
|
docker compose up -d --build
|
||||||
|
```
|
||||||
|
|
||||||
|
Do not delete `data/`, `docs/`, or `docs_sources.yaml`. Do not run the reset
commands below unless you intentionally want to erase the indexed data and
source configuration.
|
||||||
|
|
||||||
### Safe Reset Command
|
### Safe Reset Command
|
||||||
|
|
||||||
To reset both SQLite and Qdrant cleanly:
|
To reset both SQLite and Qdrant cleanly:
|
||||||
|
|||||||
@@ -245,6 +245,57 @@ def clear_library_documents(library_id: str) -> Dict[str, Any]:
|
|||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def replace_library_documents(
    library_id: str,
    chunks: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Swap every stored chunk of a library for a new set in one transaction.

    All ``documents`` rows whose ``library_id`` matches are deleted, then one
    row per entry of ``chunks`` is inserted, and only then is the transaction
    committed. If anything raises along the way the transaction is rolled
    back, so the previously stored rows are left in place.

    Args:
        library_id: Library whose rows are replaced; also written into every
            newly inserted row.
        chunks: Replacement chunk dicts. ``"id"`` and ``"path"`` are required
            keys; ``"title"``, ``"content"`` and ``"chunk_index"`` are read
            with ``.get`` (``None`` when absent) and ``"token_estimate"``
            falls back to ``0``.

    Returns:
        ``{"success": True, "deleted": ..., "inserted": ..., "library_id": ...}``
        after a successful commit, otherwise
        ``{"success": False, "error": <exception text>}``.
    """
    insert_sql = """
                INSERT INTO documents
                (id, library_id, path, title, content, chunk_index,
                 token_estimate, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """
    conn = get_connection()

    try:
        # One timestamp is shared by every row written in this replacement.
        created_at = datetime.utcnow().isoformat()

        conn.execute("BEGIN")
        removed = conn.execute(
            "DELETE FROM documents WHERE library_id = ?", (library_id,)
        ).rowcount

        # Rows are fully materialised before inserting, so a malformed chunk
        # (e.g. missing "id") fails here and triggers the rollback below.
        rows = [
            (
                item["id"],
                library_id,
                item["path"],
                item.get("title"),
                item.get("content"),
                item.get("chunk_index"),
                item.get("token_estimate", 0),
                created_at,
            )
            for item in chunks
        ]
        conn.executemany(insert_sql, rows)

        conn.commit()
        return {
            "success": True,
            "deleted": removed,
            "inserted": len(chunks),
            "library_id": library_id,
        }
    except Exception as exc:
        # Undo the delete (and any partial inserts) so the old rows survive.
        conn.rollback()
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
def delete_library(library_id: str) -> Dict[str, Any]:
|
def delete_library(library_id: str) -> Dict[str, Any]:
|
||||||
"""Delete a library row and its document chunks."""
|
"""Delete a library row and its document chunks."""
|
||||||
conn = get_connection()
|
conn = get_connection()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# Local Embedding Generation using FastEmbed
|
# Local Embedding Generation using FastEmbed
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
@@ -20,7 +21,11 @@ def _load_model():
|
|||||||
print("Loading embedding model (this may take a few minutes on first run)...")
|
print("Loading embedding model (this may take a few minutes on first run)...")
|
||||||
|
|
||||||
# Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
|
# Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
|
||||||
_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache")
|
cache_dir = os.getenv("EMBEDDING_CACHE_DIR", ".embed_cache")
|
||||||
|
_embedding_model = TextEmbedding(
|
||||||
|
model_name="BAAI/bge-small-en-v1.5",
|
||||||
|
cache_dir=cache_dir,
|
||||||
|
)
|
||||||
print("Embedding model loaded successfully.")
|
print("Embedding model loaded successfully.")
|
||||||
|
|
||||||
return _embedding_model
|
return _embedding_model
|
||||||
|
|||||||
+46
-32
@@ -15,7 +15,7 @@ from .config import settings
|
|||||||
from .chunking import chunk_text, estimate_tokens
|
from .chunking import chunk_text, estimate_tokens
|
||||||
from .embeddings import embed_texts
|
from .embeddings import embed_texts
|
||||||
from .vector_store import upsert_chunks
|
from .vector_store import upsert_chunks
|
||||||
from .db import insert_document_chunk, upsert_library, clear_library_documents
|
from .db import replace_library_documents, upsert_library
|
||||||
from .git_source import ingest_git_source
|
from .git_source import ingest_git_source
|
||||||
|
|
||||||
SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json',
|
SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json',
|
||||||
@@ -123,15 +123,7 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
|
|||||||
|
|
||||||
print(f" [Library] Found {len(doc_files)} document(s)")
|
print(f" [Library] Found {len(doc_files)} document(s)")
|
||||||
|
|
||||||
# Clear old chunks for this library
|
# Prepare the complete replacement before touching the existing index.
|
||||||
print(f" [Library] Clearing existing chunks...")
|
|
||||||
clear_result = clear_library_documents(library_id)
|
|
||||||
if not clear_result.get('success'):
|
|
||||||
print(f" Warning: Could not clear library docs: {clear_result}")
|
|
||||||
else:
|
|
||||||
print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")
|
|
||||||
|
|
||||||
# Process documents
|
|
||||||
all_chunks = []
|
all_chunks = []
|
||||||
processed_files = 0
|
processed_files = 0
|
||||||
|
|
||||||
@@ -158,18 +150,18 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
|
|||||||
embeddings = await asyncio.to_thread(embed_texts, chunks)
|
embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||||
|
|
||||||
# Build chunk dicts
|
# Build chunk dicts
|
||||||
chunk_dicts = []
|
|
||||||
base_path = file_path.relative_to(library_dir).as_posix()
|
base_path = file_path.relative_to(library_dir).as_posix()
|
||||||
|
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
chunk_dict = {
|
chunk_dict = {
|
||||||
"id": f"{file_path.stem}-{i}",
|
"id": f"{base_path}:{i}",
|
||||||
"library_id": library_id,
|
"library_id": library_id,
|
||||||
"path": base_path,
|
"path": base_path,
|
||||||
"title": Path(base_path).stem,
|
"title": Path(base_path).stem,
|
||||||
"content": chunk,
|
"content": chunk,
|
||||||
"chunk_index": i,
|
"chunk_index": i,
|
||||||
"embedding": embeddings[i]
|
"embedding": embeddings[i],
|
||||||
|
"token_estimate": estimate_tokens(chunk),
|
||||||
}
|
}
|
||||||
all_chunks.append(chunk_dict)
|
all_chunks.append(chunk_dict)
|
||||||
|
|
||||||
@@ -177,37 +169,59 @@ async def ingest_library(library_id: str, name: str, description: Optional[str]
|
|||||||
|
|
||||||
print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
|
print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
|
||||||
|
|
||||||
# Save chunks to SQLite
|
if doc_files and not all_chunks:
|
||||||
if all_chunks:
|
error = "No document chunks were produced; keeping the existing index"
|
||||||
for chunk in all_chunks:
|
print(f" [Library] {error}")
|
||||||
insert_result = insert_document_chunk(
|
return {
|
||||||
doc_id=chunk["id"],
|
"success": False,
|
||||||
library_id=chunk["library_id"],
|
"library_id": library_id,
|
||||||
path=chunk["path"],
|
"files_processed": processed_files,
|
||||||
title=chunk.get("title"),
|
"chunks_created": 0,
|
||||||
content=chunk["content"],
|
"vectors_added": 0,
|
||||||
chunk_index=chunk["chunk_index"],
|
"error": error,
|
||||||
token_estimate=estimate_tokens(chunk["content"])
|
}
|
||||||
)
|
|
||||||
if insert_result.get('success'):
|
|
||||||
continue
|
|
||||||
print(f" [Library] Saved {len(all_chunks)} chunks to SQLite")
|
|
||||||
else:
|
|
||||||
print(f" [Library] No chunks to save to SQLite")
|
|
||||||
|
|
||||||
# Save vectors to Qdrant
|
# Update vectors first. If this fails, the previous SQLite index remains usable.
|
||||||
if all_chunks:
|
if all_chunks:
|
||||||
upsert_result = await upsert_chunks(all_chunks)
|
upsert_result = await upsert_chunks(all_chunks)
|
||||||
print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
|
print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
|
||||||
|
if not upsert_result.get("success"):
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"library_id": library_id,
|
||||||
|
"files_processed": processed_files,
|
||||||
|
"chunks_created": len(all_chunks),
|
||||||
|
"vectors_added": 0,
|
||||||
|
"error": upsert_result.get("error", "Vector store update failed"),
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
print(f" [Library] No vectors to add to Qdrant")
|
print(f" [Library] No vectors to add to Qdrant")
|
||||||
|
upsert_result = {"success": True, "points_added": 0}
|
||||||
|
|
||||||
|
# Replace SQLite rows in one transaction only after preparation succeeds.
|
||||||
|
replace_result = replace_library_documents(library_id, all_chunks)
|
||||||
|
if not replace_result.get("success"):
|
||||||
|
print(f" [Library] SQLite replacement failed: {replace_result.get('error')}")
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"library_id": library_id,
|
||||||
|
"files_processed": processed_files,
|
||||||
|
"chunks_created": len(all_chunks),
|
||||||
|
"vectors_added": upsert_result.get("points_added", 0),
|
||||||
|
"error": replace_result.get("error", "SQLite replacement failed"),
|
||||||
|
}
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" [Library] Replaced {replace_result.get('deleted', 0)} old chunks "
|
||||||
|
f"with {replace_result.get('inserted', 0)} new chunks"
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"library_id": library_id,
|
"library_id": library_id,
|
||||||
"files_processed": processed_files,
|
"files_processed": processed_files,
|
||||||
"chunks_created": len(all_chunks),
|
"chunks_created": len(all_chunks),
|
||||||
"vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks)
|
"vectors_added": upsert_result.get("points_added", 0),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ services:
|
|||||||
- VECTOR_STORE_PORT=6333
|
- VECTOR_STORE_PORT=6333
|
||||||
- DOCS_PATH=/docs
|
- DOCS_PATH=/docs
|
||||||
- DB_PATH=/data/db.sqlite
|
- DB_PATH=/data/db.sqlite
|
||||||
|
- EMBEDDING_CACHE_DIR=/data/embed_cache
|
||||||
- LOG_LEVEL=INFO
|
- LOG_LEVEL=INFO
|
||||||
- API_KEY_DOCS_API=${DOCS_API_KEY:-}
|
- API_KEY_DOCS_API=${DOCS_API_KEY:-}
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
@@ -233,6 +233,76 @@ class TestDocumentChunkOperations:
|
|||||||
remaining = get_chunks_for_library("/local/cleartest")
|
remaining = get_chunks_for_library("/local/cleartest")
|
||||||
assert len(remaining) == 0
|
assert len(remaining) == 0
|
||||||
|
|
||||||
|
def test_replace_library_documents_is_atomic(self, test_database):
    """A replacement drops the library's old rows and stores only the new set."""
    from backend.app.db import (
        get_chunks_for_library,
        insert_document_chunk,
        replace_library_documents,
        upsert_library,
    )

    library_id = "/local/replacetest"
    upsert_library(library_id, "Replace test", source_path=library_id)

    # Seed one pre-existing chunk that the replacement must remove.
    insert_document_chunk(
        "old-chunk",
        library_id,
        "old.md",
        content="old content",
        chunk_index=0,
    )

    replacement = {
        "id": "new-chunk",
        "path": "new.md",
        "title": "new",
        "content": "new content",
        "chunk_index": 0,
        "token_estimate": 2,
    }
    result = replace_library_documents(library_id, [replacement])

    stored_ids = [row["id"] for row in get_chunks_for_library(library_id)]
    assert result["success"] is True
    assert result["deleted"] >= 1
    assert result["inserted"] == 1
    assert stored_ids == ["new-chunk"]
|
||||||
|
|
||||||
|
def test_failed_replacement_keeps_existing_chunks(self, test_database):
    """When the replacement fails, the previously stored chunk must remain."""
    from backend.app.db import (
        get_chunks_for_library,
        insert_document_chunk,
        replace_library_documents,
        upsert_library,
    )

    library_id = "/local/rollbacktest"
    upsert_library(library_id, "Rollback test", source_path=library_id)
    insert_document_chunk(
        "old-chunk",
        library_id,
        "old.md",
        content="old content",
        chunk_index=0,
    )

    # The same chunk (same "id") is submitted twice; the test expects the
    # replacement to report failure and leave the old row untouched.
    clashing_chunk = {
        "id": "duplicate",
        "path": "new.md",
        "content": "new content",
        "chunk_index": 0,
    }
    result = replace_library_documents(library_id, [clashing_chunk] * 2)

    stored_ids = [row["id"] for row in get_chunks_for_library(library_id)]
    assert result["success"] is False
    assert stored_ids == ["old-chunk"]
|
||||||
|
|
||||||
|
|
||||||
class TestDatabaseEdgeCases:
|
class TestDatabaseEdgeCases:
|
||||||
"""Tests for edge cases and error handling."""
|
"""Tests for edge cases and error handling."""
|
||||||
|
|||||||
Reference in New Issue
Block a user