Initial DocsMCP stack

This commit is contained in:
george
2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
+36
View File
@@ -0,0 +1,36 @@
# Backend API Service (docs-api): FastAPI application served by uvicorn on port 8787
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies for PDF parsing and embeddings.
# curl is also the tool the HEALTHCHECK below uses to probe the API.
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
libgl1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# Create the embedding cache directory; it is intended as the mount point
# for a persistent volume so cached files survive container re-creation
RUN mkdir -p /app/.embed_cache
# Install Python dependencies (copied before the application code so this
# layer is reused when only the code changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ ./app/
# Volumes are expected at these paths (configured in docker-compose):
#   ./docs -> /docs
#   ./data -> /data
# /data holds db.sqlite and the Qdrant storage volume mount defined in docker-compose
# Expose API port
EXPOSE 8787
# Healthcheck: probe /health every 30s (10s timeout, 5s start period, 3 retries)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8787/health || exit 1
# Run the FastAPI application, listening on all interfaces
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8787"]
+30
View File
@@ -0,0 +1,30 @@
# WebUI-specific Dockerfile.
# NOTE(review): this image is based on python:3.12-slim, whereas the docs-api
# image uses python:3.11-slim — confirm whether the version difference is intended.
FROM python:3.12-slim
# Set environment variables: no .pyc files, unbuffered output, the URL of the
# docs-api service, and the port the WebUI listens on
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
DOCS_API_URL=http://docs-api:8787 \
WEBUI_PORT=8790
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first for layer caching (paths are relative to the
# repository root, which must therefore be the build context)
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy backend code
COPY backend/app /app/backend/app
# Create uploads directory
RUN mkdir -p /app/backend/app/webui/uploads
# Expose port (matches WEBUI_PORT above)
EXPOSE 8790
# Serve the backend application module on the WebUI port
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8790"]
+2
View File
@@ -0,0 +1,2 @@
# Backend API package - contains all FastAPI application modules.
# The presence of this file makes the directory an importable Python package.
+304
View File
@@ -0,0 +1,304 @@
# Text Chunking Utilities with heading-aware splitting
import re
from typing import List
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens a piece of text contains.

    The heuristic assumes four characters per token and rounds down, so a
    string shorter than four characters counts as zero tokens.

    Args:
        text: The text to measure

    Returns:
        Approximate token count as an integer
    """
    chars_per_token = 4
    char_count = len(text)
    return char_count // chars_per_token
def _split_at_headings(text: str) -> List[tuple]:
"""
Split text at markdown headings while preserving heading content.
Args:
text: The full text
Returns:
List of (heading_text, remaining_text) tuples or [(text,) if no headings]
"""
# Match markdown headings (##, ###, ####, etc.)
pattern = r'(#{1,6})\s+(.+?)(?=\n#{1,6}|\Z)'
parts = []
remaining = text
while True:
match = re.search(pattern, remaining, re.MULTILINE)
if not match:
break
heading_start = match.start()
heading_content = match.group(0).strip()
# Insert the heading chunk
parts.append((heading_content, None))
remaining = remaining[match.end():]
if remaining and not parts:
return [(text,)]
if remaining:
# Add final non-heading section
last_h_start = sum(len(h) for _, h in parts)
parts.append((remaining[last_h_start:], None))
if not parts and text:
parts = [(text,)]
return parts
def _split_at_paragraphs(text: str, max_tokens: int) -> List[str]:
"""
Split text at paragraph boundaries.
Args:
text: The text to split
max_tokens: Maximum tokens per chunk
Returns:
List of chunks, each respecting max_tokens
"""
# Split by double newlines (paragraphs)
paragraphs = re.split(r'\n\s*\n', text.strip()) if text else []
chunks = []
current_chunk = ""
for para in paragraphs:
para_with_tokens = estimate_tokens(para) + (1 if current_chunk else 0)
if estimate_tokens(current_chunk) + para_with_tokens <= max_tokens:
if current_chunk:
current_chunk += "\n\n" + para
else:
current_chunk = para
else:
if current_chunk:
chunks.append(current_chunk)
# If paragraph alone is too big, try splitting by sentences
if estimate_tokens(para) > max_tokens:
para_chunks = _split_at_sentences(para, max_tokens)
for pchunk in para_chunks:
if estimate_tokens(current_chunk) + 1 <= max_tokens:
current_chunk += "\n\n" + pchunk
else:
if current_chunk:
chunks.append(current_chunk)
current_chunk = pchunk
else:
current_chunk = para
if current_chunk:
chunks.append(current_chunk)
return chunks
def _split_at_sentences(text: str, max_tokens: int) -> List[str]:
"""
Split text at sentence boundaries.
Args:
text: The text to split
max_tokens: Maximum tokens per chunk
Returns:
List of chunks respecting max_tokens
"""
if not text:
return []
# Split on sentence endings but preserve the delimiter
sentences = re.split(r'([.!?]+)', text)
chunks = []
current_chunk = ""
token_count = 0
for part in sentences:
part_tokens = estimate_tokens(part) + (1 if current_chunk else 0)
if token_count + part_tokens <= max_tokens:
if current_chunk:
current_chunk += " " + part
else:
current_chunk = part
token_count = estimate_tokens(current_chunk)
else:
if current_chunk:
chunks.append(current_chunk)
# Try to fit as much of this sentence as possible
start = 0
while start < len(part):
test_chunk = part[start:]
if estimate_tokens(test_chunk) <= max_tokens and not current_chunk:
current_chunk = test_chunk
token_count = estimate_tokens(current_chunk)
break
# Take a smaller piece
test_size = max_tokens - (token_count + 1) if current_chunk else max_tokens
if test_size <= 0:
test_size = 1
small_piece = part[start:start + test_size]
if not current_chunk:
current_chunk = small_piece
else:
chunks.append(current_chunk)
current_chunk = small_piece
token_count = estimate_tokens(current_chunk)
if start + test_size >= len(part):
break
start += test_size
if current_chunk:
chunks.append(current_chunk)
return chunks
def chunk_text(text: str, max_tokens: int = 500, overlap_tokens: int = 80) -> List[str]:
    """
    Cut text into overlapping chunks, preferring natural boundaries.

    The text is walked with a window of max_tokens * 4 characters. Inside
    each window the cut is placed at the last boundary found in the second
    half of the window, trying in order: a markdown heading line, a blank
    line, the whitespace after sentence punctuation, any whitespace. If none
    is found the window is cut at its full width. The next window starts
    overlap_tokens * 4 characters before the cut (capped at half a window).

    Special case: when max_tokens <= 20 and the text has several paragraphs
    that each fit the budget, the paragraphs are returned as-is.

    Args:
        text: The full text to chunk
        max_tokens: Maximum tokens per chunk (default 500)
        overlap_tokens: Number of overlapping tokens between chunks (default 80)

    Returns:
        List of stripped, non-empty chunk strings

    Raises:
        TypeError: If text is None
        ValueError: If max_tokens is not positive (checked only for non-empty text)
    """
    if text is None:
        raise TypeError("text must be a string")
    if not text:
        return []
    if max_tokens <= 0:
        raise ValueError("max_tokens must be greater than 0")

    window_size = max(1, max_tokens * 4)
    overlap = min(max(overlap_tokens, 0) * 4, window_size // 2)
    body = text.strip()

    blocks = [b.strip() for b in re.split(r"\n\s*\n", body) if b.strip()]
    if len(blocks) > 1 and max_tokens <= 20 and all(
        estimate_tokens(b) <= max_tokens for b in blocks
    ):
        return blocks

    # Ordered from most to least preferred split point
    boundary_patterns = (r"\n#{1,6}\s+", r"\n\s*\n", r"(?<=[.!?])\s+", r"\s+")

    pieces = []
    total = len(body)
    cursor = 0
    while cursor < total:
        limit = min(cursor + window_size, total)
        if limit == total:
            # The remainder fits in one window: emit it and stop
            tail = body[cursor:].strip()
            if tail:
                pieces.append(tail)
            break

        window = body[cursor:limit]
        earliest_cut = max(1, len(window) // 2)
        cut = None
        for boundary in boundary_patterns:
            offsets = [
                found.start()
                for found in re.finditer(boundary, window)
                if found.start() >= earliest_cut
            ]
            if offsets:
                cut = max(offsets)
                break
        if cut is None:
            cut = len(window)

        stop = cursor + cut
        piece = body[cursor:stop].strip()
        if piece:
            pieces.append(piece)

        # Step back for the overlap, but always make forward progress
        resume = stop - overlap if overlap else stop
        cursor = resume if resume > cursor else stop

    return [p for p in pieces if p.strip()]
if __name__ == "__main__":
    # Ad-hoc self-test: runs only when this module is executed directly.
    # The checks use plain asserts, so they are skipped under `python -O`.

    # Test estimate_tokens
    test_text_400 = "a" * 400
    assert estimate_tokens(test_text_400) == 100, f"Expected 100 tokens for 400 chars, got {estimate_tokens(test_text_400)}"
    print(f"estimate_tokens test passed: 400 chars -> {estimate_tokens(test_text_400)} tokens")
    # Test with empty text
    assert chunk_text("") == [], "Empty text should return empty list"
    print("chunk_text empty test passed")
    # Test small text (single chunk)
    small = "This is a very short text that should be returned as a single chunk."
    chunks = chunk_text(small)
    assert len(chunks) == 1, f"Short text should be one chunk, got {len(chunks)}"
    assert chunks[0] == small, "Content should match for small text"
    print("chunk_text single chunk test passed")
    # Test chunking with headings
    markdown_with_headings = """# Introduction
This is the introduction section.
## Background
Background information goes here to make this longer and test chunking.
This paragraph has more content about the background topic.
### Details
Specific details about the background are provided in this subsection.
More details follow here to ensure we have enough text to properly test heading preservation.
## Conclusion
The conclusion wraps up everything nicely."""
    chunks = chunk_text(markdown_with_headings, max_tokens=50)
    # Report chunks that begin with a heading. This is informational only:
    # nothing below asserts on the number of heading chunks.
    heading_chunks = [c for c in chunks if c.strip().startswith('#')]
    print(f"\nFound {len(heading_chunks)} heading chunks:")
    for hc in heading_chunks:
        print(f" - {hc.strip()}")
    assert len(chunks) > 1, f"Should have multiple chunks, got {len(chunks)}"
    # Verify no chunk exceeds max_tokens by too much
    all_under = all(estimate_tokens(c) <= 50 + 20 for c in chunks) # Allow some tolerance
    assert all_under, "Some chunks exceed token limit significantly"
    print("All chunks respect token limits")
    print("\nAll tests passed!")
+25
View File
@@ -0,0 +1,25 @@
# Configuration Settings
import os
from dataclasses import dataclass
@dataclass(frozen=True)
class Settings:
    """Application settings loaded from environment variables.

    Every default below is computed by os.getenv() when the class body is
    executed, i.e. once at import time. Environment changes made after
    import are therefore not picked up by later Settings() calls. Instances
    are frozen (immutable).
    """
    # Vector store location (defaults: host "qdrant", port 6333).
    # A non-numeric VECTOR_STORE_PORT raises ValueError at import time.
    vector_store_host: str = os.getenv("VECTOR_STORE_HOST", "qdrant")
    vector_store_port: int = int(os.getenv("VECTOR_STORE_PORT", "6333"))
    # Name of the vector store collection
    collection_name: str = os.getenv("COLLECTION_NAME", "local_context7_docs")
    # NOTE(review): the embedding loader visible elsewhere in this source
    # hard-codes "BAAI/bge-small-en-v1.5" — confirm this setting is consumed.
    embedding_model_name: str = os.getenv("EMBEDDING_MODEL_NAME", "all-MiniLM-L6-v2")
    # Filesystem locations for source documents and the SQLite database
    docs_path: str = os.getenv("DOCS_PATH", "./docs")
    db_path: str = os.getenv("DB_PATH", "./data/db.sqlite")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")
    # API key for the docs API; the empty default disables authentication
    # (see is_auth_enabled)
    api_key_docs_api: str = os.getenv("API_KEY_DOCS_API", "")

    @property
    def is_auth_enabled(self) -> bool:
        """Return True if API key authentication is enabled."""
        return bool(self.api_key_docs_api)


# Shared settings instance, created once at import
settings = Settings()
+384
View File
@@ -0,0 +1,384 @@
# SQLite Database Layer for local-context7
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
from .config import settings
try:
from qdrant_client import QdrantClient
except ImportError:
QdrantClient = None
def get_db_path() -> Path:
    """Return the configured SQLite database location as a Path."""
    configured_location = settings.db_path
    return Path(configured_location)
def ensure_db_dir():
    """Create the directory that holds the SQLite file if it is missing (idempotent)."""
    parent_dir = get_db_path().parent
    parent_dir.mkdir(parents=True, exist_ok=True)


# Runs on import so the directory exists before any connection is opened
# (safe to run multiple times)
ensure_db_dir()
def get_connection():
    """
    Open a new connection to the configured SQLite database.

    Rows come back as sqlite3.Row objects, which allow access by column
    name as well as by index. The caller is responsible for closing the
    connection.

    NOTE(review): this function does not issue PRAGMA foreign_keys = ON, so
    foreign-key actions are presumably not enforced on these connections —
    confirm.

    Returns:
        sqlite3.Connection with row_factory set to sqlite3.Row
    """
    database_file = str(get_db_path())
    connection = sqlite3.connect(database_file)
    connection.row_factory = sqlite3.Row
    return connection
def init_db():
    """
    Create the SQLite schema if it does not exist yet.

    Creates:
        - libraries table (id, name, description, source_path, created_at, updated_at)
        - documents table (id, library_id, path, title, content, chunk_index, token_estimate, created_at)
        - an index on documents(library_id) and one on libraries(updated_at)

    Returns:
        {"success": True} on success, otherwise
        {"success": False, "error": <message>} after rolling back
    """
    # Executed in order inside a single commit
    schema_statements = (
        # Enable legacy mode for easier schema handling
        "PRAGMA legacy_alter_table = ON",
        # Libraries table
        """
        CREATE TABLE IF NOT EXISTS libraries (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        description TEXT,
        source_path TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
        )
        """,
        # Documents table
        """
        CREATE TABLE IF NOT EXISTS documents (
        id TEXT PRIMARY KEY,
        library_id TEXT NOT NULL,
        path TEXT NOT NULL,
        title TEXT,
        content TEXT,
        chunk_index INTEGER,
        token_estimate INTEGER,
        created_at TEXT NOT NULL,
        FOREIGN KEY (library_id) REFERENCES libraries(id) ON DELETE CASCADE
        )
        """,
        # Indexes for better query performance
        """
        CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_libraries_updated_at ON libraries(updated_at)
        """,
    )

    conn = get_connection()
    try:
        for statement in schema_statements:
            conn.execute(statement)
        conn.commit()
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    else:
        return {"success": True}
    finally:
        conn.close()
def upsert_library(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    source_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Insert or update a library record.

    An existing row (matched by id) has its name, description, source_path
    and updated_at replaced; created_at is left untouched. A new row gets
    the same timestamp for created_at and updated_at.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library source files; falls back to library_id
            when empty or None

    Returns:
        {"success": True, "id": library_id, "exists": <bool>} where "exists"
        tells whether the row was already present, or
        {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        # Naive-UTC ISO string, the same format datetime.utcnow() produced,
        # but without the call that is deprecated since Python 3.12.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        source_path = source_path or library_id

        # Check if library exists
        cursor = conn.execute("SELECT id FROM libraries WHERE id = ?", (library_id,))
        exists = cursor.fetchone() is not None

        if exists:
            # Update existing library
            conn.execute("""
                UPDATE libraries SET
                name = ?, description = ?, source_path = ?, updated_at = ?
                WHERE id = ?
                """, (name, description, source_path, now, library_id))
        else:
            # Insert new library
            conn.execute("""
                INSERT INTO libraries (id, name, description, source_path, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                """, (library_id, name, description, source_path, now, now))

        conn.commit()
        return {"success": True, "id": library_id, "exists": exists}
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def insert_document_chunk(
    doc_id: str,
    library_id: str,
    path: str,
    title: Optional[str] = None,
    content: Optional[str] = None,
    chunk_index: Optional[int] = None,
    token_estimate: int = 0,
) -> Dict[str, Any]:
    """
    Insert or update a document chunk record.

    Args:
        doc_id: Unique identifier for this chunk
        library_id: Foreign key to libraries table
        path: Relative file path within the library
        title: Optional document title
        content: Full text content of the chunk
        chunk_index: Index within the full document (NULL if not chunked)
        token_estimate: Estimated token count (None or 0 is stored as 0)

    Returns:
        {"success": True, "id": doc_id, "exists": <bool>} where "exists"
        tells whether the row was already present, or
        {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        # Naive-UTC ISO string, the same format datetime.utcnow() produced,
        # but without the call that is deprecated since Python 3.12.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        # Check if document chunk exists
        cursor = conn.execute(
            "SELECT id FROM documents WHERE id = ?", (doc_id,)
        )
        exists = cursor.fetchone() is not None

        if exists:
            # NOTE(review): an update also overwrites created_at with the
            # current time — confirm this "last ingested" meaning is intended.
            conn.execute(
                """
                UPDATE documents
                SET library_id = ?, path = ?, title = ?, content = ?,
                chunk_index = ?, token_estimate = ?, created_at = ?
                WHERE id = ?
                """,
                (library_id, path, title, content, chunk_index, token_estimate or 0, now, doc_id),
            )
        else:
            conn.execute(
                """
                INSERT INTO documents
                (id, library_id, path, title, content, chunk_index, token_estimate, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (doc_id, library_id, path, title, content, chunk_index, token_estimate or 0, now),
            )

        conn.commit()
        return {"success": True, "id": doc_id, "exists": exists}
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def clear_library_documents(library_id: str) -> Dict[str, Any]:
    """
    Remove every document chunk that belongs to one library.

    Args:
        library_id: The library to clear

    Returns:
        {"success": True, "deleted": <row count>, "library_id": library_id},
        or {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        removed = conn.execute(
            "DELETE FROM documents WHERE library_id = ?", (library_id,)
        ).rowcount
        conn.commit()
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    else:
        return {"success": True, "deleted": removed, "library_id": library_id}
    finally:
        conn.close()
def delete_library(library_id: str) -> Dict[str, Any]:
    """
    Delete a library row and its document chunks.

    The chunks are deleted explicitly before the library row, in the same
    transaction.

    Args:
        library_id: The library to delete

    Returns:
        {"success": True, "deleted": <library rows removed>, "library_id": library_id},
        or {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        key = (library_id,)
        conn.execute("DELETE FROM documents WHERE library_id = ?", key)
        library_cursor = conn.execute("DELETE FROM libraries WHERE id = ?", key)
        conn.commit()
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    else:
        return {"success": True, "deleted": library_cursor.rowcount, "library_id": library_id}
    finally:
        conn.close()
def list_libraries() -> List[Dict[str, Any]]:
    """
    Fetch every library, most recently updated first.

    Returns:
        List of dictionaries, one per library row. Note that on a database
        error this returns a single dict {"success": False, "error": <message>}
        instead of a list.
    """
    conn = get_connection()
    try:
        rows = conn.execute("SELECT * FROM libraries ORDER BY updated_at DESC")
        field_names = [meta[0] for meta in rows.description]
        return [dict(zip(field_names, record)) for record in rows]
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def search_libraries(query: str) -> List[Dict[str, Any]]:
    """
    Search libraries by id, name or description.

    This is a case-insensitive substring match (SQL LIKE), not a full-text
    search. The query is matched literally: '%' and '_' in it are escaped so
    they do not act as wildcards.

    Args:
        query: Search query string

    Returns:
        List of matching library dictionaries, most recently updated first
        (empty if none found). Note that on a database error this returns a
        single dict {"success": False, "error": <message>} instead of a list.
    """
    conn = get_connection()
    try:
        # Escape the escape character first, then the two LIKE wildcards
        literal = (
            str(query)
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        like_query = f"%{literal}%"
        cursor = conn.execute("""
            SELECT * FROM libraries
            WHERE lower(id) LIKE lower(?) ESCAPE '\\'
            OR lower(name) LIKE lower(?) ESCAPE '\\'
            OR lower(coalesce(description, '')) LIKE lower(?) ESCAPE '\\'
            ORDER BY updated_at DESC
            """, (like_query, like_query, like_query))

        # Convert to list of dicts
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor]
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def get_document_by_id(doc_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up one document chunk by its ID.

    Args:
        doc_id: The document ID to fetch

    Returns:
        Dictionary with the row's columns, or None if no row matches. On a
        database error this returns {"success": False, "error": <message>}.
    """
    conn = get_connection()
    try:
        cursor = conn.execute("SELECT * FROM documents WHERE id = ?", (doc_id,))
        record = cursor.fetchone()
        if record is not None:
            # Build the dict manually, consistent with the list queries
            field_names = [meta[0] for meta in cursor.description]
            return dict(zip(field_names, record))
        return None
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def get_chunks_for_library(library_id: str) -> List[Dict[str, Any]]:
    """
    Fetch all document chunks that belong to one library.

    Rows are ordered by chunk_index descending.
    NOTE(review): descending order returns the last chunk first — confirm
    callers do not expect reading order.

    Args:
        library_id: The library ID to fetch chunks for

    Returns:
        List of dictionaries, one per chunk row. Note that on a database
        error this returns a single dict {"success": False, "error": <message>}
        instead of a list.
    """
    conn = get_connection()
    try:
        rows = conn.execute(
            "SELECT * FROM documents WHERE library_id = ? ORDER BY chunk_index DESC",
            (library_id,)
        )
        field_names = [meta[0] for meta in rows.description]
        return [dict(zip(field_names, record)) for record in rows]
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
+181
View File
@@ -0,0 +1,181 @@
# Local Embedding Generation using FastEmbed
import asyncio
from typing import List
from functools import lru_cache
# Module-level singleton for cached model instance
_embedding_model = None
_embedding_size = 384 # BAAI/bge-small-en-v1.5 output dimension
def _load_model():
"""Lazy-load the FastEmbed model on first use."""
global _embedding_model, _embedding_size
try:
from fastembed import TextEmbedding
if _embedding_model is None:
print("Loading embedding model (this may take a few minutes on first run)...")
# Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache")
print("Embedding model loaded successfully.")
return _embedding_model
except ImportError as e:
raise ImportError(
"FastEmbed is not installed. Please install with:\n"
" pip install fastembed\n\n"
f"Import error details: {e}"
) from e
except RuntimeError as e:
# Model download/installation failed
if "No space left" in str(e) or "disk quota exceeded" in str(e):
raise RuntimeError(
"Failed to load embedding model due to disk space constraints.\n\n"
"Please free up space on your system (at least 500MB required).\n"
"Or specify a custom cache directory with available space:\n"
" from fastembed import TextEmbedding\n"
" model = TextEmbedding(model_name='...', cache_dir='/path/to/large/storage')\n\n"
f"Error: {e}"
) from e
raise
def get_embedding_model():
    """
    Return the shared embedding model, loading it on the first call.

    Returns:
        FastEmbed TextEmbedding instance (lazy-loaded on first call)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model download/load failed
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model
    _embedding_model = _load_model()
    return _embedding_model
def embed_text(text: str) -> List[float]:
    """
    Generate embedding for a single text.

    Args:
        text: The text string to embed

    Returns:
        List of floats representing the embedding vector. Empty or
        non-string input yields a zero vector of get_embedding_size()
        elements without loading the model.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not text or not isinstance(text, str):
        return [0.0] * get_embedding_size()

    model = get_embedding_model()
    # embed() yields its vectors lazily, so the result cannot be indexed;
    # pull the single vector out of the iterator instead.
    vector = next(iter(model.embed([text])))
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)
def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Embed a batch of texts in one model call.

    Args:
        texts: List of text strings to embed

    Returns:
        One embedding vector per input text, in input order; an empty list
        when texts is empty (the model is not loaded in that case)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not texts:
        return []

    model = get_embedding_model()
    # Array-like vectors are converted to plain lists; anything else is
    # passed through unchanged
    return [
        vector.tolist() if hasattr(vector, 'tolist') else vector
        for vector in model.embed(texts)
    ]
def get_embedding_size() -> int:
    """
    Report the embedding vector dimension.

    Returns:
        Integer representing vector dimension (384 for bge-small-en-v1.5)

    Note:
        The value is the module-level constant; it is not read back from
        the loaded model.
    """
    dimension = _embedding_size
    return dimension
# Async wrapper for compatibility with existing code
async def generate_embeddings(chunks: List[str]) -> List[List[float]]:
    """
    Coroutine form of embed_texts, kept for callers that await it.

    The embedding work runs synchronously inside the coroutine; nothing is
    awaited, so the event loop is blocked until it finishes.

    Args:
        chunks: List of text strings to embed

    Returns:
        List of embedding vectors
    """
    vectors = embed_texts(chunks)
    return vectors
if __name__ == "__main__":
    # Ad-hoc self-test: runs only when this module is executed directly.
    # Failures are counted so the final message and the exit status reflect
    # what actually happened.
    print("Testing embeddings module...\n")
    failures = 0

    # Test get_embedding_size
    size = get_embedding_size()
    print(f"Embedding dimension: {size}")

    # Test single text embedding
    test_text = "Hello, world! This is a test of the embedding generation."
    try:
        emb = embed_text(test_text)
        print(f"\nSingle text embedding shape: ({len(emb)},)")
        print(f"First 5 values: {emb[:5]}")
        print("✓ Single embedding works")
    except Exception as e:
        failures += 1
        print(f"✗ Single embedding failed: {e}")

    # Test batch embedding
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing enables computers to understand human language."
    ]
    try:
        embeddings = embed_texts(test_texts)
        print(f"\nBatch embedding shape: ({len(embeddings)}, {len(embeddings[0])})")
        print("✓ Batch embeddings work")
    except Exception as e:
        failures += 1
        print(f"✗ Batch embeddings failed: {e}")

    # Test empty inputs (these paths never load the model)
    assert embed_text("") == [0.0] * size, "Empty text should return zero vector"
    assert embed_texts([]) == [], "Empty list should return empty list"
    print("✓ Empty input handling works")

    if failures:
        print(f"\n❌ {failures} check(s) failed")
        raise SystemExit(1)
    print("\n✅ All tests passed!")
+389
View File
@@ -0,0 +1,389 @@
# Git Source Operations for Repository Cloning and File Discovery
import os
import shutil
from pathlib import Path
from typing import List, Optional, Dict, Any
def get_repos_dir() -> Path:
    """Return the base directory under which cloned repositories are stored."""
    # Three levels above this module, then data/repos
    module_file = Path(__file__)
    project_root = module_file.parent.parent.parent
    return project_root / "data" / "repos"
def ensure_repos_dir():
    """Create the repos directory if it is missing and return its path (idempotent)."""
    target = get_repos_dir()
    target.mkdir(parents=True, exist_ok=True)
    return target


# Runs on import so the directory exists before any clone is attempted
# (safe to run multiple times)
ensure_repos_dir()
class GitCloneError(Exception):
    """Exception for git clone/checkout failures."""


def clone_or_update_repo(
    repo_id: str,
    repo_url: str,
    branch: str,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone a git repository or update an existing clone.

    A directory that contains a .git entry is updated in place: the branch
    is fetched from origin and the work tree is hard-reset to it. Anything
    else (missing directory, or a leftover directory without .git) is
    replaced by a fresh single-branch clone.

    Args:
        repo_id: Unique identifier for this repository (used in paths); it
            must resolve to a location inside repos_base
        repo_url: Git URL to clone from
        branch: Branch name to checkout
        repos_base: Base directory for repos (defaults to get_repos_dir())

    Returns:
        Dict with keys "success", "repo_path", "url" and "branch"

    Raises:
        GitCloneError: If the arguments are invalid, git cannot be run, or
            any git command exits with a non-zero status
    """
    # Function-scope import: the module's import block does not provide it
    import subprocess

    repos_base = repos_base or get_repos_dir()
    repo_path = Path(repos_base) / repo_id

    # repo_path may be deleted below, so refuse ids that point outside
    # (or at) the repos directory itself.
    if Path(repos_base).resolve() not in repo_path.resolve().parents:
        raise GitCloneError(f"Invalid repo_id (escapes repos directory): {repo_id!r}")
    # A leading '-' would be parsed by git as an option
    if not branch or str(branch).startswith("-"):
        raise GitCloneError(f"Invalid branch name: {branch!r}")

    def run_git(args, failure_prefix):
        """Run one git command and raise GitCloneError on a non-zero exit."""
        completed = subprocess.run(["git", *args], capture_output=True, text=True)
        if completed.returncode != 0:
            raise GitCloneError(f"{failure_prefix}: {completed.stderr.strip()}")

    try:
        if (repo_path / ".git").exists():
            # Update existing clone
            print(f" [Git] Updating existing clone at {repo_path}")
            # Fetch the branch explicitly: a single-branch clone only tracks
            # the branch it was created with, but FETCH_HEAD is always set.
            run_git(["-C", str(repo_path), "fetch", "origin", branch], "Failed to fetch")
            run_git(["-C", str(repo_path), "reset", "--hard", "FETCH_HEAD"], "Failed to reset")
        else:
            if not repo_url:
                raise GitCloneError("repo_url is required to clone a repository")
            if repo_path.exists():
                # Leftover directory that is not a git work tree: start clean
                shutil.rmtree(repo_path)
            repo_path.parent.mkdir(parents=True, exist_ok=True)

            # Clone new repository
            print(f" [Git] Cloning {repo_url} to {repo_path}")
            run_git(
                ["clone",
                 "--branch", branch,
                 "--single-branch",
                 "--", str(repo_url), str(repo_path)],
                "Git command failed",
            )

        print(f" [Git] Checked out branch: {branch}")
        return {
            "success": True,
            "repo_path": str(repo_path),
            "url": repo_url,
            "branch": branch
        }
    except GitCloneError:
        raise
    except Exception as e:
        # e.g. git executable missing, or a filesystem error
        raise GitCloneError(f"Failed to clone/update repo: {e}") from e
def discover_files(
    repo_path: Path,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Discover files in a git repository respecting include/exclude paths.

    Paths are compared relative to the repository root using '/' as the
    separator. A path "matches" an include/exclude entry when it equals the
    entry or lies underneath it. Excludes win over includes. Directories
    named .git are never entered, and symbolic links are not followed.

    Args:
        repo_path: Path to the cloned repository
        include_paths: Paths relative to repo root to include (None or
            empty means everything is included)
        exclude_paths: Paths relative to repo root to exclude

    Returns:
        List of dicts, sorted by name within each directory, with format:
        {
            "path": "docs/hooks.md",  # Relative to repo root
            "full_path": "/full/path/to/repo/docs/hooks.md",
            "is_binary": False  # Result of the null-byte heuristic
        }

    Raises:
        OSError: If repo_path cannot be listed for a reason other than
            missing permissions (e.g. it does not exist)
    """
    text_extensions = {'.md', '.txt', '.py', '.js', '.ts', '.json',
                       '.yaml', '.yml', '.html', '.css', '.sh', '.sql'}

    def normalize(raw) -> str:
        """Turn a configured path into 'a/b/c' form; the root becomes ''."""
        posix = Path(str(raw).replace("\\", "/")).as_posix().strip("/")
        return "" if posix == "." else posix

    def is_under(rel: str, base: str) -> bool:
        """True when rel equals base or lies below it ('' is the root)."""
        return base == "" or rel == base or rel.startswith(base + "/")

    includes = [normalize(p) for p in include_paths or []]
    # An exclude that normalizes to the root would hide everything; drop it
    excludes = [e for e in (normalize(p) for p in exclude_paths or []) if e]

    def is_excluded(rel: str) -> bool:
        return any(is_under(rel, exc) for exc in excludes)

    def file_included(rel: str) -> bool:
        return not includes or any(is_under(rel, inc) for inc in includes)

    def should_descend(rel: str) -> bool:
        # Enter a directory when it is inside an include OR is an ancestor
        # of one (otherwise nested includes like "docs/api" are unreachable)
        return not includes or any(
            is_under(rel, inc) or is_under(inc, rel) for inc in includes
        )

    def is_probably_binary(filepath: str) -> bool:
        """Simple binary detection based on file extension and first bytes."""
        if Path(filepath).suffix.lower() in text_extensions:
            return False
        # Check for null bytes in first 8KB
        try:
            with open(filepath, 'rb') as f:
                return b'\x00' in f.read(8192)
        except OSError:
            return False

    discovered = []

    def walk_and_collect(current: Path, rel_dir: str):
        """Recursive walk function."""
        try:
            with os.scandir(current) as entries_iter:
                # DirEntry objects are not orderable; sort by name
                entries = sorted(entries_iter, key=lambda entry: entry.name)
        except PermissionError:
            # Skip directories we can't read
            return

        for entry in entries:
            rel_path = f"{rel_dir}/{entry.name}" if rel_dir else entry.name

            # Filter by exclude paths first
            if is_excluded(rel_path):
                continue

            if entry.is_dir(follow_symlinks=False):
                if entry.name == ".git":
                    continue
                if should_descend(rel_path):
                    walk_and_collect(Path(entry.path), rel_path)
            elif entry.is_file(follow_symlinks=False) and file_included(rel_path):
                discovered.append({
                    "path": rel_path,
                    "full_path": entry.path,
                    "is_binary": is_probably_binary(entry.path)
                })

    # Walk the repository starting from repo root
    walk_and_collect(Path(repo_path), "")
    return discovered
async def ingest_git_source(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    repo_url: str = None,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository as a new library.

    Steps, in order: clone the repo (or update it if it exists) into
    "<library_id>-git" under repos_base, discover files honoring the
    include/exclude paths, delete the clone's .git directory, then run the
    library ingestion pipeline on the clone.

    Args:
        library_id: Unique identifier for this library
        name: Library display name
        description: Optional description
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for repos (defaults to get_repos_dir())

    Returns:
        Dict with operation result. When no files are discovered the
        function returns early with success=True and files_discovered=0,
        without running the ingestion pipeline.

    Raises:
        GitCloneError: If git operations fail
    """
    # Imported here rather than at module top (presumably to avoid a
    # circular import — confirm). NOTE(review): upsert_library is imported
    # but not used in this function.
    from .db import upsert_library
    from .ingest import ingest_library
    print(f"\n[Git Ingestion] Processing library: {library_id}")
    print(f" Source: {repo_url or '(local)'}")
    # Ensure repos directory exists
    repos_base = repos_base or get_repos_dir()
    repos_base.mkdir(parents=True, exist_ok=True)
    repo_id = f"{library_id}-git"
    # Clone or update the repo
    clone_result = clone_or_update_repo(
        repo_id=repo_id,
        repo_url=repo_url,
        branch=branch,
        repos_base=repos_base
    )
    repo_path = Path(clone_result["repo_path"])
    print(f" [Git] Found files in {repo_path}")
    # Discover files respecting include/exclude paths
    files = discover_files(
        repo_path=repo_path,
        include_paths=include_paths,
        exclude_paths=exclude_paths
    )
    print(f" [Git] Discovered {len(files)} file(s)")
    if not files:
        return {
            "success": True,
            "library_id": library_id,
            "message": "No files found matching include/exclude criteria",
            "files_discovered": 0
        }
    # Remove .git directory if present (avoid processing it)
    # NOTE(review): after this removal the directory is no longer a git work
    # tree — confirm clone_or_update_repo copes with that on the next sync.
    git_dir = repo_path / ".git"
    if git_dir.exists():
        shutil.rmtree(git_dir)
        print(f" [Git] Removed .git directory")
    # Ingest using existing library ingestion pipeline.
    # NOTE(review): the discovered file list is only used for the count
    # below; it is not passed to ingest_library, so the include/exclude
    # filters presumably do not limit what gets ingested — confirm.
    result = await ingest_library(
        library_id=library_id,
        name=name,
        description=description,
        source_path=repo_id # Use repo_id as the "source path" for tracking
    )
    return {
        "success": result.get("success", False),
        "library_id": library_id,
        "name": name,
        "files_discovered": len(files),
        "chunks_created": result.get("chunks_created", 0),
        "vectors_added": result.get("vectors_added", 0)
    }
async def sync_sources(
    sources_config: Optional[List[Dict[str, Any]]] = None,
    repos_base: Optional[Path] = None
) -> List[Dict[str, Any]]:
    """
    Sync all git sources defined in config.

    Sources are processed one after another; a failure in one source is
    recorded in its result and does not stop the others.

    Args:
        sources_config: List of source configs (same format as the entries
            under "sources" in docs_sources.yaml). A mapping with a
            "sources" key is also accepted. When None, docs_sources.yaml is
            loaded from three directories above this module.
        repos_base: Base directory for repos

    Returns:
        List of results for each source, in input order. If the default
        config file is needed but missing, a single failure entry.
    """
    if sources_config is None:
        # Load from default config file
        import yaml
        config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"
        if not config_path.exists():
            return [{"success": False, "error": f"Config not found: {config_path}"}]
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty file
            sources_config = yaml.safe_load(f) or {}

    if isinstance(sources_config, dict):
        # Whole-file shape: the list lives under "sources" (may be absent/null)
        sources_config = sources_config.get("sources") or []

    results = []
    for source in sources_config:
        if not isinstance(source, dict):
            # Reported like any other per-source failure instead of crashing
            results.append({
                "success": False,
                "library_id": "unknown",
                "error": (
                    "Unexpected error: source entry must be a mapping, "
                    f"got {type(source).__name__}"
                )
            })
            continue

        try:
            result = await ingest_git_source(
                library_id=source.get("library_id"),
                name=source.get("name"),
                description=source.get("description"),
                repo_url=source.get("repo_url"),
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
                repos_base=repos_base
            )
        except GitCloneError as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": str(e)
            }
        except Exception as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": f"Unexpected error: {e}"
            }
        results.append(result)
    return results
+387
View File
@@ -0,0 +1,387 @@
# Document Ingestion Logic
import asyncio
import os
from pathlib import Path
from typing import List, Dict, Any, Optional, BinaryIO
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Import local modules
from .config import settings
from .chunking import chunk_text, estimate_tokens
from .embeddings import embed_texts
from .vector_store import upsert_chunks
from .db import insert_document_chunk, upsert_library, clear_library_documents
from .git_source import ingest_git_source
# Lower-case file suffixes (with leading dot) that the ingester will read.
SUPPORTED_EXTENSIONS = {
    '.md', '.txt',
    '.py', '.js', '.ts',
    '.json', '.yaml', '.yml',
    '.html', '.css',
    '.pdf',
}
# Root folder that holds one sub-folder per library; taken from the DOCS_PATH
# environment variable, falling back to ./docs.
# NOTE(review): main.py resolves its docs folder from settings.docs_path -
# confirm both always point at the same directory.
DOCS_PATH = Path(os.getenv("DOCS_PATH", "./docs"))
def get_file_size(path: Path) -> int:
    """Return the size of *path* in bytes, or -1 when it cannot be stat'ed."""
    try:
        stat_info = path.stat()
    except OSError:
        return -1
    return stat_info.st_size
async def read_document_file(path: Path) -> str:
    """
    Read document content from a file.

    Args:
        path: Path to the file

    Returns:
        Content as string. An empty string is returned when the file does not
        exist, has an unsupported extension, contains only whitespace, or
        cannot be read.

    Raises:
        ImportError: If a PDF is requested but pypdf is not installed
    """
    if not path.exists():
        return ""

    # Check extension
    suffix = path.suffix.lower()
    if suffix == '.pdf':
        # The import lives in its own try so a missing dependency produces
        # the install hint instead of a bare ModuleNotFoundError.
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf") from exc
        try:
            reader = PdfReader(str(path))
            pages = []
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    pages.append(text)
            return "\n\n".join(pages)
        except Exception as e:
            print(f" Warning: Could not read PDF {path}: {e}")
            return ""

    if suffix not in SUPPORTED_EXTENSIONS:
        print(f" Unsupported file type: {suffix}")
        return ""

    # Read text-based files
    try:
        content = path.read_text(encoding='utf-8')
        return content if content.strip() else ""
    except Exception as e:
        print(f" Warning: Could not read {path}: {e}")
        return ""
async def ingest_library(library_id: str, name: str, description: Optional[str] = None, source_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Ingest all documents for a library.

    Scans ``DOCS_PATH / source_path`` recursively for supported files, chunks
    and embeds their text, clears the library's existing rows via
    clear_library_documents, inserts the new chunks into SQLite and upserts
    the vectors.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library folder (relative to DOCS_PATH). Without
            it there is no folder to scan and an error result is returned.

    Returns:
        Summary dict with operation results. ``success`` is False when the
        folder is missing, when a chunk could not be saved to SQLite, or when
        the vector upsert failed; ``error`` then describes the problem.
    """
    if not source_path:
        # DOCS_PATH / None would raise TypeError further down
        return {
            "success": False,
            "library_id": library_id,
            "error": "source_path is required"
        }

    print(f"\n[Library] Processing: {library_id}")
    print(f" Source: {source_path}")

    # Ensure library record exists
    result = upsert_library(library_id, name, description, source_path)
    print(f" [{result.get('success', False)}] Library record: {'created' if not result.get('exists') else 'updated'}")

    # Get the library folder path
    library_dir = DOCS_PATH / source_path
    if not library_dir.exists():
        print(f" Error: Directory does not exist: {library_dir}")
        return {
            "success": False,
            "library_id": library_id,
            "error": f"Directory not found: {library_dir}"
        }

    # Find all supported files (recursive). '.pdf' is part of
    # SUPPORTED_EXTENSIONS; sorting keeps the chunk order reproducible.
    print(f" [Library] Scanning for files in: {library_dir}")
    doc_files = sorted(
        file_path
        for file_path in library_dir.rglob('*')
        if file_path.is_file() and file_path.suffix.lower() in SUPPORTED_EXTENSIONS
    )
    print(f" [Library] Found {len(doc_files)} document(s)")

    # Clear old chunks for this library
    print(f" [Library] Clearing existing chunks...")
    clear_result = clear_library_documents(library_id)
    if not clear_result.get('success'):
        print(f" Warning: Could not clear library docs: {clear_result}")
    else:
        print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")

    # Process documents
    all_chunks = []
    processed_files = 0
    for file_path in doc_files:
        base_path = file_path.relative_to(library_dir).as_posix()
        print(f" [File] Reading: {file_path.relative_to(library_dir)}")
        content = await read_document_file(file_path)
        if not content:
            continue

        # Estimate tokens and chunk
        num_tokens = estimate_tokens(content)
        chunks = chunk_text(content, max_tokens=500, overlap_tokens=80)
        if not chunks:
            print(f" [File] No valid chunks from {file_path.name}")
            continue

        # Embed chunks and prepare for storage
        print(f" Chunked into {len(chunks)} pieces (approx. {num_tokens} tokens)")
        embeddings = embed_texts(chunks)

        # NOTE(review): ids are built from the file stem only, so two files
        # such as a/index.md and b/index.md produce the same ids - confirm
        # whether ids should include the relative path.
        title = Path(base_path).stem
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id": f"{file_path.stem}-{i}",
                "library_id": library_id,
                "path": base_path,
                "title": title,
                "content": chunk,
                "chunk_index": i,
                "embedding": embeddings[i]
            })
        processed_files += 1
    print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")

    # Save chunks to SQLite, counting failures instead of ignoring them
    failed_inserts = 0
    for chunk in all_chunks:
        insert_result = insert_document_chunk(
            doc_id=chunk["id"],
            library_id=chunk["library_id"],
            path=chunk["path"],
            title=chunk.get("title"),
            content=chunk["content"],
            chunk_index=chunk["chunk_index"],
            token_estimate=estimate_tokens(chunk["content"])
        )
        if not insert_result.get('success'):
            failed_inserts += 1
    if all_chunks:
        print(f" [Library] Saved {len(all_chunks) - failed_inserts} chunks to SQLite")
        if failed_inserts:
            print(f" Warning: {failed_inserts} chunk(s) could not be saved to SQLite")
    else:
        print(f" [Library] No chunks to save to SQLite")

    # Save vectors to Qdrant
    vectors_added = 0
    vector_error = None
    if all_chunks:
        upsert_result = await upsert_chunks(all_chunks)
        vectors_added = upsert_result.get('points_added', 0)
        if not upsert_result.get('success', False):
            vector_error = upsert_result.get('error', 'unknown error')
        print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
    else:
        print(f" [Library] No vectors to add to Qdrant")

    errors = []
    if failed_inserts:
        errors.append(f"{failed_inserts} chunk(s) could not be saved to SQLite")
    if vector_error is not None:
        errors.append(f"Vector store: {vector_error}")

    summary = {
        "success": not errors,
        "library_id": library_id,
        "files_processed": processed_files,
        "chunks_created": len(all_chunks),
        "vectors_added": vectors_added
    }
    if errors:
        summary["error"] = "; ".join(errors)
    return summary
async def ingest_git_source_from_config(
    repo_url: str,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository, deriving the library metadata from its URL.

    The library id is the last path segment of the URL with a trailing
    ``.git`` removed; the result of ingest_git_source is returned unchanged.

    Args:
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for cloned repos, forwarded to ingest_git_source

    Returns:
        Dict with operation result
    """
    import urllib.parse

    parsed = urllib.parse.urlparse(repo_url)

    # Remove a trailing ".git" suffix. str.rstrip('.git') strips any run of
    # the characters '.', 'g', 'i', 't', turning "widget.git" into "widge".
    path_part = parsed.path
    if path_part.endswith('.git'):
        path_part = path_part[:-len('.git')]

    library_id = Path(path_part).name or "unknown"
    # NOTE(review): for URLs with a hostname this yields the host's stem
    # (e.g. "github" for github.com), not the repository name - confirm intent.
    name = Path(parsed.hostname or path_part).stem
    description = f"Documentation from {path_part}"

    result = await ingest_git_source(
        library_id=library_id,
        name=name,
        description=description,
        repo_url=repo_url,
        branch=branch,
        include_paths=include_paths,
        exclude_paths=exclude_paths,
        repos_base=repos_base
    )
    return result
async def detect_libraries() -> List[Dict[str, Any]]:
    """
    Detect all top-level folders under DOCS_PATH as libraries.

    Every folder is registered through upsert_library with its lower-cased
    name as the library id and the folder name as both display name and
    source path. A failed registration is reported as a warning; the folder
    is still returned.

    Returns:
        List of dicts with library metadata (``id``, ``name``,
        ``source_path``), ordered by folder name
    """
    print(f"\n[Detection] Scanning for libraries in: {DOCS_PATH}")
    if not DOCS_PATH.is_dir():
        print(f" [Detection] Directory does not exist: {DOCS_PATH}")
        return []

    # Sort so the result does not depend on filesystem enumeration order
    dirs_only = sorted(
        (entry for entry in DOCS_PATH.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name
    )

    libraries = []
    for i, lib_dir in enumerate(dirs_only, 1):
        name = lib_dir.name
        library_id = name.lower()
        # Create library record with defaults
        result = upsert_library(
            library_id=library_id,
            name=name,
            description=None,
            source_path=name
        )
        if not result.get("success", True):
            print(f" Warning: Could not register library {name}: {result.get('error')}")
        libraries.append({
            "id": library_id,
            "name": name,
            "source_path": name
        })
        print(f" [{i}/{len(dirs_only)}] Library detected: {name} (id: {library_id})")
    print(f"\n[Detection] Found {len(libraries)} library(ies)")
    return libraries
async def ingest_all(verbose: bool = True) -> Dict[str, Any]:
    """
    Ingest all discovered libraries.

    Args:
        verbose: Whether to print progress messages

    Returns:
        Summary dict with the keys ``total_libraries``, ``successful``,
        ``failed`` and ``total_chunks`` (all integers). The same shape is
        returned when no library is found.
    """
    if verbose:
        print("\n" + "=" * 60)
        print("DOCUMENT INGESTION STARTED")
        print("=" * 60)

    # Detect libraries
    libraries = await detect_libraries()
    if not libraries:
        if verbose:
            print("\n[Summary] No libraries to ingest")
        # Same keys and value types as the regular summary below
        return {"total_libraries": 0, "successful": 0, "failed": 0, "total_chunks": 0}

    # Ingest each library; one failing library must not stop the others
    results = []
    for lib in libraries:
        lib_id = lib["id"]
        try:
            result = await ingest_library(
                library_id=lib_id,
                name=lib["name"],
                description=None,
                source_path=lib.get("source_path")
            )
        except Exception as e:
            result = {
                "success": False,
                "library_id": lib_id,
                "error": f"Unexpected error: {e}"
            }
        if verbose:
            if result.get('success'):
                print(f" [Library] Done: {result.get('library_id')} - {result.get('chunks_created', 0)} chunks")
            else:
                print(f" [Library] Failed: {lib_id} - {result.get('error', 'unknown error')}")
        results.append(result)

    # Calculate totals
    total_chunks = sum(r.get('chunks_created', 0) for r in results)
    successful = sum(1 for r in results if r.get('success'))
    result = {
        "total_libraries": len(libraries),
        "successful": successful,
        "failed": len(results) - successful,
        "total_chunks": total_chunks
    }
    if verbose:
        print("\n" + "=" * 60)
        print("INGESTION COMPLETE")
        print("=" * 60)
        print(f" Libraries processed: {result['total_libraries']}")
        print(f" Successful: {result['successful']}")
        print(f" Failed: {result['failed']}")
        print(f" Total chunks created: {result['total_chunks']}")
    return result
if __name__ == "__main__":
    # Manual smoke test: detect the libraries and ingest the first one found
    import asyncio

    async def test_run():
        print("Testing ingestion module...\n")

        detected = await detect_libraries()
        print(f"\nDetected libraries: {len(detected)}")

        if detected:
            # May fail if no docs exist, which is acceptable for this check
            first = detected[0]
            print("\nAttempting sample ingestion...")
            outcome = await ingest_library(
                library_id=first["id"],
                name=first["name"],
                source_path=first.get("source_path")
            )
            print(f"Result: {outcome}")

        print("\n✅ Tests completed!")

    asyncio.run(test_run())
+299
View File
@@ -0,0 +1,299 @@
"""Context7 Docs API."""
import asyncio
import shutil
import yaml
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, File, Form, HTTPException, Query, Request, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from .config import settings
from .db import (
clear_library_documents,
delete_library,
init_db,
list_libraries,
search_libraries,
upsert_library,
)
from .git_source import ingest_git_source
from .ingest import ingest_all, ingest_library
from .search import get_library_docs, resolve_library_id, search_docs
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
# ASGI application object; the middleware and route handlers defined below
# register themselves on it through its decorators.
app = FastAPI(
    title="Context7 Docs API",
    description="Document ingestion and semantic search API for local-context7",
    version="1.0.0",
)
class SearchRequest(BaseModel):
    """Request body for POST /search."""

    # Free-text query; at least one character is required
    query: str = Field(..., min_length=1)
    # When given, only this library is searched
    library_id: Optional[str] = None
    # Maximum number of hits to return, between 1 and 50
    limit: int = Field(default=10, ge=1, le=50)
class SyncSourcesRequest(BaseModel):
    """Optional request body for POST /sources/sync."""

    # Read by sync_sources_api but not acted upon there
    override: bool = False
# Lower-case suffixes accepted by the upload endpoint; the same set of
# suffixes the ingester lists in SUPPORTED_EXTENSIONS.
ALLOWED_EXTENSIONS = {
    ".md", ".txt",
    ".py", ".js", ".ts",
    ".json", ".yaml", ".yml",
    ".html", ".css",
    ".pdf",
}
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
    """Require X-API-Key on every request except the public GET routes.

    Nothing is checked when ``settings.is_auth_enabled`` is false. Otherwise
    GET requests whose path starts with /health, /libraries or /docs/ pass
    through, and every other request must send an ``X-API-Key`` header equal
    to ``settings.api_key_docs_api`` or it is answered with a 401.
    """
    import hmac

    if not settings.is_auth_enabled:
        return await call_next(request)

    public_prefixes = ("/health", "/libraries", "/docs/")
    if request.method == "GET" and request.url.path.startswith(public_prefixes):
        return await call_next(request)

    provided = request.headers.get("X-API-Key")
    expected = settings.api_key_docs_api
    # Constant-time comparison so response timing does not reveal how much of
    # the key matched. Bytes are compared because compare_digest rejects
    # non-ASCII str values; a missing header or missing key fails closed.
    authorized = (
        provided is not None
        and expected is not None
        and hmac.compare_digest(provided.encode("utf-8"), str(expected).encode("utf-8"))
    )
    if not authorized:
        return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})
    return await call_next(request)
@app.on_event("startup")
async def startup() -> None:
    """Initialise SQLite, then wait until the Qdrant collection is available.

    Raises:
        RuntimeError: if init_db reports failure, or if ensure_collection is
            still failing after 20 attempts spaced one second apart.
    """
    db_status = init_db()
    if not db_status.get("success"):
        raise RuntimeError(f"Failed to initialize SQLite database: {db_status.get('error')}")

    attempts_left = 20
    last_error = None
    while attempts_left > 0:
        status = await ensure_collection()
        if status.get("success"):
            return
        last_error = status.get("error")
        await asyncio.sleep(1)
        attempts_left -= 1

    raise RuntimeError(f"Failed to initialize Qdrant collection: {last_error}")
def safe_library_id(library_id: str) -> str:
    """Validate a user-provided library ID and return it as one path segment.

    The returned value is the final path component with surrounding
    whitespace removed.

    Raises:
        HTTPException: 400 when the ID is empty, is "." or "..", or contains
            "..", "/" or a backslash anywhere.
    """
    candidate = Path(library_id).name.strip()
    has_separator = "/" in library_id or "\\" in library_id
    is_traversal = candidate in {".", ".."} or ".." in library_id
    if candidate and not is_traversal and not has_separator:
        return candidate
    raise HTTPException(status_code=400, detail="Invalid library ID")
def safe_upload_filename(filename: str) -> str:
    """Return a sanitised file name made of a cleaned stem plus its extension.

    The extension is lower-cased and must be listed in ALLOWED_EXTENSIONS.
    Only alphanumeric characters, "-", "_" and spaces are kept in the stem,
    which is then stripped of surrounding whitespace.

    Raises:
        HTTPException: 400 when the extension is not allowed or no character
            of the stem survives the filtering.
    """
    original = Path(filename)
    ext = original.suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        allowed = ', '.join(sorted(ALLOWED_EXTENSIONS))
        raise HTTPException(
            status_code=400,
            detail=f"Unsafe extension: {ext}. Allowed extensions: {allowed}",
        )

    kept = [c for c in original.stem if c.isalnum() or c in "-_ "]
    stem = "".join(kept).strip()
    if stem:
        return f"{stem}{ext}"
    raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")
def docs_root() -> Path:
    """Return the documents folder configured in ``settings.docs_path``."""
    return Path(settings.docs_path)
def sources_config_path() -> Path:
    """Return the path of docs_sources.yaml, three levels above this file."""
    module_file = Path(__file__).resolve()
    return module_file.parents[2] / "docs_sources.yaml"
@app.get("/health")
async def health_check():
    """Liveness probe: always answers with a static status payload."""
    return {"status": "ok", "service": "docs-api"}
@app.get("/collections")
async def collections():
    """Report the vector count of the configured Qdrant collection.

    Any error is returned as a ``warning`` string next to an empty
    ``collections`` mapping instead of being raised.
    """
    try:
        client = get_client()
        info = client.get_collection(get_collection_name())
        # Prefer vectors_count; fall back to points_count, then to 0
        vectors = getattr(info, "vectors_count", None)
        if not vectors:
            vectors = getattr(info, "points_count", 0) or 0
        return {"collections": {get_collection_name(): {"vectors": vectors}}}
    except Exception as e:
        return {"collections": {}, "warning": str(e)}
@app.get("/libraries")
async def list_libraries_api():
    """List all registered libraries together with their count.

    Raises:
        HTTPException: 500 when list_libraries returns a dict flagged as
            unsuccessful.
    """
    libs = list_libraries()
    lookup_failed = isinstance(libs, dict) and not libs.get("success", True)
    if lookup_failed:
        raise HTTPException(status_code=500, detail=libs.get("error", "Failed to list libraries"))
    return {"libraries": libs, "count": len(libs)}
@app.get("/libraries/search")
async def search_libraries_api(q: str = Query(..., min_length=1)):
    """Return the libraries that resolve_library_id finds for ``q``."""
    matches = resolve_library_id(q)
    response = {"matches": matches, "count": len(matches)}
    return response
@app.post("/search")
async def search_docs_api(payload: SearchRequest):
    """Run a semantic search and echo the query parameters with the hits."""
    query, library_id = payload.query, payload.library_id
    results = search_docs(query, library_id=library_id, limit=payload.limit)
    return {
        "query": query,
        "library_id": library_id,
        "results": results,
        "count": len(results),
    }
@app.get("/docs/{library_id}")
@app.get("/libraries/{library_id}/docs")
async def get_library_docs_api(
    library_id: str,
    topic: Optional[str] = Query(None),
    tokens: int = Query(8000, ge=1),
):
    """Return the combined documentation text of one library.

    ``topic`` and ``tokens`` are forwarded to get_library_docs as the topic
    filter and the token limit.
    """
    content = get_library_docs(library_id=library_id, topic=topic, token_limit=tokens)
    response = {"library_id": library_id, "content": content}
    return response
@app.post("/ingest/all")
async def ingest_all_api():
    """Run ingest_all and return its summary unchanged."""
    return await ingest_all()
@app.post("/ingest/{library_id}")
async def ingest_library_api(library_id: str):
    """Ingest one library; the validated ID doubles as name and folder name."""
    safe_id = safe_library_id(library_id)
    summary = await ingest_library(library_id=safe_id, name=safe_id, source_path=safe_id)
    return summary
@app.post("/api/v1/libraries/{library_id}")
async def api_create_library(
    library_id: str,
    name: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
):
    """Create the library folder and its database record.

    The display name defaults to the library ID when no ``name`` is sent.

    Raises:
        HTTPException: 400 for an invalid ID, 500 when the record cannot be
            written.
    """
    library_id = safe_library_id(library_id)
    display_name = name or library_id

    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)

    result = upsert_library(library_id, display_name, description, library_id)
    if result.get("success"):
        return {
            "success": True,
            "created": not result.get("exists", False),
            "library_id": library_id,
            "name": display_name,
            "description": description,
            "path": str(lib_dir),
        }
    raise HTTPException(status_code=500, detail=result.get("error", "Failed to create library"))
@app.delete("/api/v1/libraries/{library_id}")
async def api_delete_library(library_id: str):
    """Delete a library's folder, document rows, vectors and record.

    Returns:
        Dict with ``success``, ``library_id`` and ``deleted_files`` (number of
        files that were inside the folder before it was removed).

    Raises:
        HTTPException: 400 for an invalid ID, 500 when any of the three
            storage cleanups reports failure.
    """
    library_id = safe_library_id(library_id)
    lib_dir = docs_root() / library_id

    deleted_files = 0
    if lib_dir.exists():
        deleted_files = sum(1 for path in lib_dir.rglob("*") if path.is_file())
        shutil.rmtree(lib_dir)

    docs_result = clear_library_documents(library_id)
    vectors_result = await delete_library_vectors(library_id)
    library_result = delete_library(library_id)

    # A failing result may carry no "error" value; str.join would raise
    # TypeError on None, so substitute a placeholder.
    failures = [
        str(r.get("error") or "unknown error")
        for r in (docs_result, vectors_result, library_result)
        if isinstance(r, dict) and not r.get("success", True)
    ]
    if failures:
        raise HTTPException(status_code=500, detail="; ".join(failures))
    return {"success": True, "library_id": library_id, "deleted_files": deleted_files}
@app.post("/api/v1/upload/{library_id}")
async def api_upload(library_id: str, file: UploadFile = File(...)):
    """Store an uploaded document in the library's folder.

    The library ID and file name are sanitised first, and the size limit is
    checked before anything is written to disk.

    Returns:
        Dict with the stored file name, its path relative to the docs root
        and its size in bytes.

    Raises:
        HTTPException: 400 for an invalid ID, a disallowed file name or a
            file larger than 5MB.
    """
    library_id = safe_library_id(library_id)
    safe_name = safe_upload_filename(file.filename or "upload.txt")

    max_bytes = 5 * 1024 * 1024
    # Read one byte past the limit: enough to detect an oversized upload
    # without loading the whole file into memory.
    contents = await file.read(max_bytes + 1)
    if len(contents) > max_bytes:
        raise HTTPException(status_code=400, detail="File too large (max 5MB)")

    # Only create the folder once the upload has been accepted
    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)
    target = lib_dir / safe_name
    target.write_bytes(contents)

    upsert_library(library_id, library_id, None, library_id)
    return {
        "success": True,
        "library_id": library_id,
        "filename": safe_name,
        "path": str(target.relative_to(docs_root())),
        "size_bytes": len(contents),
    }
@app.get("/api/v1/sources")
@app.get("/sources/config")
async def api_list_sources():
    """Return the git sources listed in docs_sources.yaml.

    The file may hold either a mapping with a ``sources`` list or a bare
    list. A missing file, an empty file or any other layout yields an empty
    list.
    """
    path = sources_config_path()
    if not path.exists():
        return {"success": True, "sources": [], "count": 0}

    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Only a mapping has .get(); a bare list is used as the source list itself
    if isinstance(data, dict):
        sources = data.get("sources")
    else:
        sources = data
    if not isinstance(sources, list):
        sources = []
    return {"success": True, "sources": sources, "count": len(sources)}
@app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
    """Ingest every configured git source and summarise the outcome.

    Each source is processed independently: a source that is malformed or
    whose ingestion raises is recorded as a failed result, and the remaining
    sources are still synced.

    Returns:
        Dict with the overall ``success`` flag, the counters
        ``total_sources`` / ``successful`` / ``failed`` and the per-source
        ``results``.
    """
    source_data = await api_list_sources()
    sources = source_data["sources"]
    # NOTE(review): the override flag is read but nothing below uses it -
    # confirm what it is meant to control.
    override = payload.override if payload else False

    results = []
    for source in sources:
        library_id = source.get("library_id") if isinstance(source, dict) else None
        try:
            result = await ingest_git_source(
                library_id=source["library_id"],
                name=source.get("name") or source["library_id"],
                description=source.get("description"),
                repo_url=source["repo_url"],
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
            )
        except KeyError as e:
            result = {
                "success": False,
                "library_id": library_id or "unknown",
                "error": f"Missing required field: {e}",
            }
        except Exception as e:
            result = {
                "success": False,
                "library_id": library_id or "unknown",
                "error": str(e),
            }
        results.append(result)

    successful = sum(1 for r in results if r.get("success"))
    return {
        "success": successful == len(results),
        "total_sources": len(results),
        "successful": successful,
        "failed": len(results) - successful,
        "results": results,
    }
+47
View File
@@ -0,0 +1,47 @@
# Data Models for document processing and API responses
from typing import Any, Dict, List, Optional
class DocumentChunk:
    """Represents a chunk of text to be embedded."""

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.text = text
        # Keep a caller-supplied dict (even an empty one); only None gets a fresh dict
        self.metadata = metadata if metadata is not None else {}

    @property
    def doc_id(self) -> str:
        """Return a document ID derived from the chunk text.

        The ID is "doc-" followed by the first 16 hex digits of the SHA-256
        of the UTF-8 text. Unlike the built-in hash(), whose value for str is
        salted per process, this is identical across runs.
        """
        import hashlib

        digest = hashlib.sha256(self.text.encode("utf-8")).hexdigest()
        return f"doc-{digest[:16]}"
class IngestResponse:
    """Outcome of a document ingestion run.

    Attributes:
        success: Whether the ingestion succeeded
        chunks_count: Number of chunks reported by the run (default 0)
        error: Error message, or None
    """

    def __init__(
        self,
        success: bool,
        chunks_count: int = 0,
        error: Optional[str] = None
    ):
        self.success, self.chunks_count, self.error = success, chunks_count, error
class SearchResponse:
    """Container for the results of one search.

    Attributes:
        results: The result dicts
        query: The query string the results belong to
        total_results: Result count supplied by the caller
    """

    def __init__(
        self,
        results: List[Dict[str, Any]],
        query: str,
        total_results: int
    ):
        self.results, self.query, self.total_results = results, query, total_results
+235
View File
@@ -0,0 +1,235 @@
# Search Operations for Semantic Query and Library Navigation
from typing import List, Dict, Any, Optional
from pathlib import Path
from .config import settings
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
from .embeddings import embed_text, get_embedding_size
from .db import get_chunks_for_library, list_libraries
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return

    Returns:
        List of dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        Hits with a non-positive score or without payload are dropped. An
        empty list is returned for a blank query and when the search fails
        (the error is printed).
    """
    # Nothing to embed for a blank query
    if not query or not query.strip():
        return []

    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)
        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            try:
                from qdrant_client.models import FieldCondition, Filter, MatchValue
                search_filter = Filter(
                    must=[
                        FieldCondition(
                            key="library_id",
                            match=MatchValue(value=library_id),
                        )
                    ]
                )
            except ImportError:
                search_filter = None

        # Perform vector search. The client's keyword is query_filter; an
        # unknown keyword such as search_filter makes the call fail, which
        # the handler below would turn into an empty result.
        results = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            query_filter=search_filter,
            limit=limit
        )

        # Format and return results
        formatted_results = []
        for result in results:
            payload = result.payload
            if result.score > 0 and payload:
                formatted_results.append({
                    # Fall back to the point id so one odd payload cannot
                    # discard the whole result list via KeyError
                    "id": payload.get("id", str(result.id)),
                    "score": float(result.score),
                    "library_id": payload.get("library_id", ""),
                    "path": payload.get("path", ""),
                    "title": payload.get("title", ""),
                    "chunk_index": payload.get("chunk_index", 0)
                })
        return formatted_results
    except Exception as e:
        print(f"Search error: {e}")
        return []
def _fit_to_token_budget(chunks, token_limit):
    """Return the leading chunks that fit into token_limit and the tokens used."""
    selected = []
    used = 0
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        tokens = len(content) // 4  # Simple token estimate
        if used + tokens <= token_limit:
            selected.append(chunk)
            used += tokens
            continue
        # Fill the remaining space with the start of this chunk, then stop so
        # later chunks cannot push the output past the limit
        remaining = token_limit - used
        if remaining > 0:
            selected.append({
                "content": content[:remaining * 4],
                "title": chunk.get("title", ""),
                "path": chunk.get("path")
            })
            used = token_limit
        break
    return selected, used


def _render_markdown(chunks, library_id):
    """Join chunk contents, inserting a heading whenever the source changes."""
    md_parts = []
    previous_section = None
    for chunk in chunks:
        title = chunk.get("title")
        content = chunk.get("content", "") or ""
        if not content.strip():
            continue
        section = chunk.get("path") or title
        if title and section != previous_section:
            md_parts.append(f"# {title}")
            previous_section = section
        md_parts.append(content)

    result = "\n\n".join(md_parts)
    # If no headings were added, prepend library title
    if not any(part.startswith("#") for part in result.split("\n")[:3]):
        result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result
    return result.rstrip()


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    Without a topic the library's chunks are returned in document order
    (by path, then chunk index). With a topic the chunks matching the topic
    are returned in relevance order. In both cases the output is cut off at
    ``token_limit`` (estimated as four characters per token).

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, searches for topic first
        token_limit: Maximum tokens to include in output

    Returns:
        Combined markdown content as string, or a message when nothing was
        found or an error occurred
    """
    try:
        if topic:
            # Search for the chunks relevant to the topic
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)
            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
            print(f" [Search] Found {len(search_results)} relevant chunks")

            # Search hits carry no text, so the content is looked up in the
            # stored chunks.
            # NOTE(review): assumes rows from get_chunks_for_library expose
            # "path" and "chunk_index" matching the search hits - confirm
            # against db.py.
            rows = get_chunks_for_library(library_id) or []
            by_location = {(row.get("path"), row.get("chunk_index")): row for row in rows}
            candidates = []
            for hit in search_results:
                row = by_location.get((hit.get("path"), hit.get("chunk_index")), {})
                content = hit.get("content") or row.get("content", "")
                if content:
                    candidates.append({
                        "content": content,
                        "title": hit.get("title") or row.get("title", ""),
                        "path": hit.get("path")
                    })
            if not candidates:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
        else:
            # Fetch all chunks for the library
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)
            if not chunks_data:
                return f"No documents found in library '{library_id}'"
            # Document order, so each file reads front to back
            candidates = sorted(
                chunks_data,
                key=lambda c: (str(c.get("path") or ""), c.get("chunk_index") or 0)
            )

        selected_chunks, total_tokens = _fit_to_token_budget(candidates, token_limit)
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")
        return _render_markdown(selected_chunks, library_id)
    except Exception as e:
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    A library matches when the lower-cased search text is contained in its
    lower-cased name or id. At most five candidates are returned.

    Args:
        library_name: Partial or full library name to search for

    Returns:
        List of Context7-style candidate dicts:
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
        An empty list is returned when nothing matches or an error occurs.
    """
    try:
        libraries = list_libraries()
        if not libraries:
            return []

        candidates = []
        for lib in libraries:
            haystacks = (lib.get("name", "").lower(), lib.get("id", "").lower())
            needle = library_name.lower()
            if any(needle in text for text in haystacks):
                candidates.append({
                    "id": f"/local/{lib['id']}",
                    "name": lib["name"],
                    "description": lib.get("description", ""),
                    "source": "local"
                })

        # Keep the first five matches at most
        candidates = candidates[:5]
        print(f" [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")
        return candidates
    except Exception as e:
        print(f"Error resolving library ID: {e}")
        return []
if __name__ == "__main__":
    def test_search():
        """Smoke-test the search helpers.

        resolve_library_id and search_docs are plain functions, so they are
        called directly; awaiting their list results would raise TypeError.
        """
        print("Testing search module...\n")

        # Test 1: library name resolution
        print("1. Testing resolve_library_id()...")
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: search with an empty query
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    test_search()
+361
View File
@@ -0,0 +1,361 @@
# Vector Store Operations for Qdrant
import asyncio
import uuid
from typing import List, Dict, Any, Optional
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
except ImportError:
QdrantClient = None
Distance = VectorParams = PointStruct = Filter = FieldCondition = MatchValue = None
# Qdrant client shared by the whole process; created on first use by get_client()
_client: Optional[Any] = None
# Collection name comes from settings; a fixed name is used when the config
# module cannot be imported or does not provide the attribute
try:
    from .config import settings
    _collection_name = settings.collection_name
except Exception:
    _collection_name = "local_context7_docs"
def get_client() -> Any:
    """Return the shared Qdrant client, creating it on first use.

    The client connects to the QDRANT_URL environment variable when it is
    set, otherwise to the host and port from settings.

    Raises:
        RuntimeError: if qdrant-client is not installed
    """
    global _client
    if _client is not None:
        return _client

    if QdrantClient is None:
        raise RuntimeError("qdrant-client is not installed")

    # python-dotenv is optional; load a .env file only when it is available
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    import os
    qdrant_url = os.getenv("QDRANT_URL")
    if qdrant_url:
        _client = QdrantClient(url=qdrant_url)
        return _client

    from .config import settings
    host = settings.vector_store_host
    port = settings.vector_store_port
    _client = QdrantClient(host=host, port=port)
    return _client
def get_collection_name() -> str:
    """Return the name of the Qdrant collection this module operates on."""
    return _collection_name
def get_embedding_size() -> int:
    """Return the embedding dimension reported by the embeddings module.

    Falls back to 384 when that module cannot be imported or its lookup
    raises RuntimeError.
    """
    fallback_size = 384
    try:
        from .embeddings import get_embedding_size as embedding_dimension
        size = embedding_dimension()
    except (ImportError, RuntimeError):
        # Embeddings module not available yet
        size = fallback_size
    return size
async def ensure_collection(vector_size: Optional[int] = None) -> Dict[str, Any]:
    """
    Ensure the Qdrant collection exists with proper schema.

    A missing collection is created with cosine distance. An existing
    collection whose vector size differs from the expected one is deleted
    and recreated, which discards every vector stored in it.

    Args:
        vector_size: Override embedding dimension (uses get_embedding_size() if not provided)

    Returns:
        Dict with operation result: ``success`` plus ``collection``,
        ``vector_size`` and ``created`` (and ``resized`` after a recreate),
        or ``success: False`` with an ``error`` message
    """
    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}
        client = get_client()
        size = vector_size or get_embedding_size()
        distance = Distance.COSINE

        def create():
            # The client's keyword is vectors_config; unknown keywords such
            # as vectors= or wait= make the call fail.
            client.create_collection(
                collection_name=_collection_name,
                vectors_config=VectorParams(size=size, distance=distance)
            )

        # Check if collection exists
        try:
            existing = client.get_collections().collections
            collection_exists = any(c.name == _collection_name for c in existing)
        except Exception:
            collection_exists = False

        if not collection_exists:
            create()
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": True
            }

        # Read the current vector size. Only this lookup is best-effort: a
        # layout that cannot be interpreted here leaves the collection as is.
        try:
            collection_info = client.get_collection(_collection_name)
            current_size = collection_info.config.params.vectors.size
        except Exception:
            current_size = None

        if current_size is not None and current_size != size:
            # Wrong size - delete and recreate. Errors propagate to the outer
            # handler so a failed recreate is not reported as success.
            client.delete_collection(_collection_name)
            create()
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": False,
                "resized": True
            }

        return {
            "success": True,
            "collection": _collection_name,
            "vector_size": size,
            "created": False
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
async def upsert_chunks(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Upsert chunks into the vector store.

    Each point id is a UUIDv5 of "<library_id>:<id>", so upserting the same
    chunk again replaces its point.

    Args:
        chunks: List of chunk dicts with format:
            {
                "id": "...",
                "library_id": "...",
                "path": "...",
                "title": "...",
                "chunk_index": 0,
                "content": "...",
                "embedding": [...]
            }

    Returns:
        Dict with operation result: ``success`` and ``points_added``, or
        ``success: False`` with an ``error`` message
    """
    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}
        if not chunks:
            return {"success": True, "points_added": 0}

        client = get_client()

        # Build one point per chunk dict
        points = [
            PointStruct(
                id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{chunk['library_id']}:{chunk['id']}")),
                vector=chunk["embedding"],
                payload={
                    "id": chunk["id"],
                    "library_id": chunk["library_id"],
                    "path": chunk.get("path", ""),
                    "title": chunk.get("title", ""),
                    "chunk_index": chunk.get("chunk_index", 0),
                    "content": chunk.get("content", "")
                }
            )
            for chunk in chunks
        ]

        client.upsert(_collection_name, points=points)
        return {"success": True, "points_added": len(points)}
    except Exception as e:
        return {"success": False, "error": str(e)}
async def search_vectors(
    query_vector: List[float],
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search for semantically similar vectors.

    Args:
        query_vector: The embedding vector to search against
        library_id: Optional filter by library ID
        limit: Maximum results to return

    Returns:
        List of result dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        Hits with a non-positive score or without payload are dropped. An
        empty list is returned when qdrant-client is missing or the search
        fails (the error is printed).
    """
    try:
        if QdrantClient is None:
            return []
        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # Perform vector search; the client's keyword is query_filter
        results = client.search(
            collection_name=_collection_name,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=limit
        )

        # Format results
        formatted_results = []
        for result in results:
            if result.score > 0 and result.payload:
                formatted_results.append({
                    "id": result.payload["id"],
                    "score": float(result.score),
                    "library_id": result.payload["library_id"],
                    "path": result.payload.get("path", ""),
                    "title": result.payload.get("title", ""),
                    "chunk_index": result.payload.get("chunk_index", 0)
                })
        return formatted_results
    except Exception as e:
        # Still best-effort, but leave a trace instead of failing silently
        print(f"Vector search error: {e}")
        return []
async def delete_library_vectors(library_id: str) -> Dict[str, Any]:
    """
    Delete all vectors for a given library.

    The points are removed with a single filtered delete on the payload
    field ``library_id`` instead of scrolling and deleting ID batches.

    Args:
        library_id: The library ID to delete vectors for.

    Returns:
        ``{"success": True, "library_id": ...}`` on success (with an extra
        ``"skipped"`` note when qdrant-client is not installed), or
        ``{"success": False, "error": "..."}`` when the delete fails.
    """
    try:
        if QdrantClient is None:
            return {"success": True, "library_id": library_id, "skipped": "qdrant-client is not installed"}
        client = get_client()
        # Select only the points that belong to this library.
        library_filter = Filter(
            must=[
                FieldCondition(
                    key="library_id",
                    match=MatchValue(value=library_id),
                )
            ]
        )
        # A Filter is a valid points selector, so no scroll loop is needed.
        # Errors are no longer swallowed mid-way: a failed delete now reaches
        # the handler below instead of being reported as a success.
        client.delete(
            collection_name=_collection_name,
            points_selector=library_filter,
        )
        return {
            "success": True,
            "library_id": library_id
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
if __name__ == "__main__":
    # Manual smoke test: runs each coroutine of this module once and prints
    # what it returned. Executed only when the file is run as a script.
    import os

    print("Testing vector store module...\n")

    # Step 1: ensure_collection
    print("1. Testing ensure_collection()...")
    ensure_outcome = asyncio.run(ensure_collection())
    print(f"   Result: {ensure_outcome}\n")

    # Step 2: search with a constant 384-element probe vector
    print("2. Testing search_vectors() with dummy vector...")
    probe_vector = [0.1 for _ in range(384)]
    hits = asyncio.run(search_vectors(probe_vector, limit=5))
    print(f"   Results count: {len(hits)}\n")

    # Step 3: delete the vectors of a throwaway library ID
    print("3. Testing delete_library_vectors()...")
    delete_outcome = asyncio.run(delete_library_vectors("test-library"))
    print(f"   Result: {delete_outcome}\n")

    print("✅ All tests completed!")
+1
View File
@@ -0,0 +1 @@
"""WebUI module for Context7 Docs."""
+166
View File
@@ -0,0 +1,166 @@
/* Page shell: centered container, header with navigation, footer */
.container {
    margin: 0 auto;
    max-width: 1000px;
    padding: 20px;
}

header {
    margin-bottom: 20px;
    padding-bottom: 15px;
    border-bottom: 1px solid #cccccc;
}

header h1 {
    font-size: 1.5rem;
    margin: 0 0 10px;
}

/* Navigation links are laid out in a single row */
nav {
    display: flex;
    gap: 15px;
}

nav a {
    color: #0066cc;
    font-size: 0.9rem;
    text-decoration: none;
}

/* Link carrying the "active" class */
nav a.active {
    text-decoration: underline;
    font-weight: 700;
}

main h2 {
    margin-bottom: 15px;
}

footer {
    margin-top: 40px;
    padding-top: 15px;
    border-top: 1px solid #cccccc;
    color: #666666;
    font-size: 0.8rem;
}
/* Status cards and inline status messages */
.status-card {
    padding: 20px;
    background: #f5f5f5;
    border-left: 4px solid #00c467;
    border-radius: 8px;
}

.status-message {
    margin: 5px 0;
    padding: 10px;
    background: #e8f4fd;
    border-radius: 4px;
}

/* Library listing table */
.library-table {
    margin-top: 10px;
    width: 100%;
    border-collapse: collapse;
}

.library-table th,
.library-table td {
    padding: 10px;
    border-bottom: 1px solid #dddddd;
    text-align: left;
}

.library-table th {
    font-weight: 700;
    background: #f5f5f5;
}

/* Form controls */
form input[type="text"],
form textarea,
form select {
    margin-right: 10px;
    margin-bottom: 10px;
    padding: 8px;
    border: 1px solid #cccccc;
    border-radius: 4px;
}

button {
    padding: 10px 20px;
    border: none;
    border-radius: 4px;
    background: #0066cc;
    color: #ffffff;
    cursor: pointer;
}

button:hover {
    background: #0055aa;
}

/* Preformatted blocks: wrap long lines, scroll horizontally if still too wide */
pre {
    padding: 15px;
    border-radius: 4px;
    background: #f5f5f5;
    overflow-x: auto;
    white-space: pre-wrap;
    word-break: break-word;
}
/* Search results */
.result-card {
    background: #fff;
    border: 1px solid #ddd;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.result-card h3 {
    margin: 0 0 8px 0;
}

.hint {
    color: #666;
    font-size: 0.85rem;
    margin-top: 15px;
}

/* Status colors */
.status-ok {
    color: #00c467;
    font-weight: bold;
}

/* Scrollable preview of uploaded content */
.content-preview {
    max-height: 300px;
    overflow-y: auto;
}

.results-count {
    background: #e8f4fd;
    padding: 10px;
    border-radius: 4px;
    margin-bottom: 15px;
}

.source-card {
    background: #f5f5f5;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.actions-bar {
    margin-top: 15px;
}

.actions-bar form {
    display: inline-flex;
}

/* The docs page renders <pre class="docs-content">; the original selector
   (.doc-content) never matched it. Both spellings are covered. */
.doc-content,
.docs-content {
    max-height: 600px;
    overflow-y: auto;
}
+568
View File
@@ -0,0 +1,568 @@
"""WebUI Views for Context7 Docs using Jinja2 templates."""
import html
import json
import os
from pathlib import Path
from typing import Any, Optional

import requests
from fastapi import Request
# fastapi.responses exposes HTMLResponse (there is no ``HTML`` class); the
# alias keeps every existing ``HTML(...)`` call and annotation working.
from fastapi.responses import HTMLResponse as HTML, JSONResponse
# Internal API base URL
DOCS_API_URL = os.environ.get("DOCS_API_URL", "http://docs-api:8787")
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    params: Optional[dict] = None,
    timeout: Optional[float] = 300.0,
) -> dict:
    """Make an internal API request to docs-api and return the decoded JSON.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        endpoint: Path appended to ``DOCS_API_URL`` (should start with ``/``).
        data: Optional JSON body.
        params: Optional query-string parameters.
        timeout: Seconds to wait for docs-api before giving up; ``None``
            waits indefinitely (the previous behaviour).

    Returns:
        The response body parsed as JSON. The HTTP status code is not
        checked, so error bodies are returned like any other payload.

    Raises:
        requests.RequestException: On connection problems or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = f"{DOCS_API_URL}{endpoint}"
    headers = {}
    # Forward the API key only when one is configured.
    api_key = os.environ.get("WEBUI_API_KEY")
    if api_key:
        headers["X-API-Key"] = api_key
    resp = requests.request(
        method,
        url,
        headers=headers,
        json=data,
        params=params,
        timeout=timeout,
    )
    return resp.json()
def navbar_html(current: str) -> str:
    """Build the ``<nav>`` element shared by all pages.

    The link whose path equals ``current`` gets ``class="active"``; every
    other link gets an empty class attribute.
    """
    nav_entries = (
        ("/health", "Health"),
        ("/libraries", "Libraries"),
        ("/upload", "Upload"),
        ("/ingest/all", "Ingest All"),
        ("/sources/git", "Git Sources"),
        ("/search", "Search"),
    )
    anchors = " ".join(
        '<a href="{0}" class="{1}">{2}</a>'.format(
            href, "active" if href == current else "", text
        )
        for href, text in nav_entries
    )
    return ("<nav>\n" + anchors + "\n</nav>").strip()
def footer_html() -> str:
    """Return the static ``<footer>`` element used on every page."""
    footer_text = "Context7 Docs WebUI"
    return "<footer>" + footer_text + "</footer>"
def health(request: Request) -> HTML:
    """Render the system health dashboard.

    Queries ``GET /health`` on docs-api and shows the reported service name
    and status. If the call fails, the status is ``error`` and the exception
    text is shown in place of the service name.
    """
    try:
        data = api_request("GET", "/health")
        status = data.get("status", "unknown")
        service = data.get("service", "Service")
    except Exception as e:
        status = "error"
        service = str(e)
    # Values come from the API response or an exception message; escape them
    # before placing them in element and attribute context.
    safe_status = html.escape(str(status), quote=True)
    safe_service = html.escape(str(service))
    nav = navbar_html("/health")
    footer = footer_html()
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Health</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main><h2>System Health</h2>
<div class="status-card" data-status="{safe_status}"><h3>{safe_service}</h3>
<p>Status: <span class="status-ok">{safe_status}</span></p></div>
</main>{footer}</div>
</body></html>""", media_type="text/html")
def libraries(request: Request) -> HTML:
    """Render the table of libraries reported by ``GET /libraries``.

    If the API call fails, the error text is shown above an empty table
    instead of being silently dropped. Entries whose id is ``"error"`` are
    not listed.
    """
    error_message = ""
    try:
        data = api_request("GET", "/libraries")
        libs = data.get("libraries", [])
    except Exception as e:
        libs = []
        error_message = str(e)

    table_rows = []
    for lib in libs:
        if lib.get("id") == "error":
            continue
        # Library fields originate from ingested data; escape all of them.
        lib_id = html.escape(str(lib.get("id")), quote=True)
        lib_name = html.escape(str(lib.get("name", "")))
        lib_desc = html.escape(str(lib.get("description", "") or "(no description)"))
        table_rows.append(
            f"""<tr><td>{lib_id}</td>
<td>{lib_name}</td>
<td>{lib_desc}</td>
<td><a href="/docs/{lib_id}">View Docs</a></td></tr>"""
        )

    error_block = ""
    if error_message:
        error_block = (
            '<div class="status-message">Could not load libraries: '
            + html.escape(error_message)
            + "</div>"
        )
    nav = navbar_html("/libraries")
    footer = footer_html()
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Libraries</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Libraries ({len(table_rows)})</h2>
{error_block}
<div class="actions-bar">
<form action="/folders/create" method="post" style="display:inline;">
<input type="text" name="name" placeholder="New library folder name" required>
<button type="submit">Create Folder</button>
</form>
</div>
<table class="library-table">
<thead><tr><th>ID</th><th>Name</th><th>Description</th><th>Actions</th></tr></thead>
<tbody>{"".join(table_rows)}</tbody>
</table>
</main>{footer}</div>
</body></html>""", media_type="text/html")
async def upload(request: Request) -> HTML:
    """Show the upload form, or preview an uploaded text file.

    On a POST carrying a ``file`` part, the file is decoded as UTF-8, the
    first 5000 characters are kept, and a preview page is rendered whose
    hidden ``content`` field is posted on to ``/ingest/uploaded``. Any other
    request gets the upload form.

    The handler is a coroutine because Starlette exposes multipart data only
    through ``await request.form()``; the former ``request.files`` access is
    not part of the Starlette ``Request`` API and raised on every call.
    """
    nav = navbar_html("/upload")
    footer = footer_html()

    uploaded_file = None
    if request.method == "POST":
        form = await request.form()
        candidate = form.get("file")
        # Plain (non-file) fields arrive as str and have no read() method.
        if hasattr(candidate, "read"):
            uploaded_file = candidate

    if uploaded_file is None:
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Upload Documentation Files</h2>
<form method="post" enctype="multipart/form-data">
<label for="file">Select file:</label>
<input type="file" name="file" id="file" accept=".txt,.md,.json,.py,.js,.html,.css,.yaml,.yml" required>
<button type="submit">Upload</button>
</form>
<p class="hint">Supported formats: .txt, .md, .json, .py, .js, .html, .css, .yaml</p>
</main>{footer}</div>
</body></html>""", media_type="text/html")

    try:
        raw = await uploaded_file.read()
        content = raw.decode("utf-8")[:5000]
    except Exception:
        # No size limit is enforced here, so the old "File too large" text
        # was misleading; the usual cause is a file that is not UTF-8 text.
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Could not read file</h2>
<p>Please upload a UTF-8 encoded text file.</p>
</main>{footer}</div>
</body></html>""", media_type="text/html")

    # Truncate first, escape second, so an entity is never cut in half.
    preview_text = content[:1000] + "..." if len(content) > 1000 else content
    safe_preview = html.escape(preview_text)
    # quote=True is required: this value sits inside a double-quoted attribute.
    safe_value = html.escape(content, quote=True)
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Upload Complete!</h2>
<pre class="content-preview">{safe_preview}</pre>
<form method="post" action="/ingest/uploaded">
<input type="hidden" name="content" value="{safe_value}">
<label for="library_id">Library (optional):</label>
<input type="text" id="library_id" name="library_id" placeholder="e.g., my-docs">
<button type="submit">Ingest</button>
</form>
</main>{footer}</div>
</body></html>""", media_type="text/html")
def ingest_all(request: Request) -> JSONResponse:
    """Ask docs-api to ingest everything and report the chunk count.

    Sends ``POST /ingest``. On success the JSON reply carries a message built
    from the ``chunks`` field of the API result (0 when absent); any failure
    yields a 500 response containing the error text.
    """
    try:
        api_result = api_request("POST", "/ingest")
        chunk_total = api_result.get('chunks', 0)
        body = {"status": "ok", "message": f"Processed {chunk_total} chunks"}
        return JSONResponse(content=body)
    except Exception as exc:
        failure = {"error": str(exc)}
        return JSONResponse(status_code=500, content=failure)
async def ingest_library(request: Request, library_id: str) -> HTML:
    """Ingest page for a specific library.

    When the request carries a ``content`` form field, the content-entry form
    for the library is rendered. Otherwise ``POST /ingest/{library_id}`` is
    sent to docs-api and its result (or the error) is shown.

    The handler is a coroutine because form data is only available through
    ``await request.form()``; the former ``"content" in request.form`` tested
    a bound method and raised on every call.

    NOTE(review): as in the original, submitted ``content`` is never forwarded
    to docs-api -- the form is simply shown again. Confirm the intended flow.
    """
    nav = navbar_html(f"/ingest/{library_id}")
    footer = footer_html()
    # library_id comes from the URL; escape it for element/attribute context.
    safe_library = html.escape(library_id, quote=True)

    has_content = False
    if request.method == "POST":
        form = await request.form()
        has_content = "content" in form

    if has_content:
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Ingest</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Ingest for Library: {safe_library}</h2>
<form method="post" action="/ingest/{safe_library}">
<label for="content">Content (text):</label>
<textarea id="content" name="content" rows="10" maxlength="10000"></textarea>
<button type="submit">Ingest</button>
</form>
</main>{footer}</div>
</body></html>""", media_type="text/html")

    try:
        result = api_request("POST", f"/ingest/{library_id}")
        safe_msg = html.escape(str(result.get('message', '') or ''))
        safe_json = html.escape(json.dumps(result, indent=2))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Ingest Result</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Ingestion Complete!</h2>
<p>{safe_msg}</p>
<pre>{safe_json}</pre>
<a href="/libraries">← Back to Libraries</a>
</main>{footer}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Error</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Error</h2>
<pre>{safe_error}</pre>
</main>{footer}</div>
</body></html>""", media_type="text/html")
async def folders_create(request: Request) -> JSONResponse:
    """Create a new library folder from the posted ``name`` form field.

    Returns a 400 response when the name is missing or looks like a path
    (contains a separator or is ``.``/``..``), because the name is used both
    as the library id and as the last component of ``/docs/<name>``.
    Returns 500 with the error text if the database call fails.
    """
    # request.form is a coroutine method in Starlette; it must be awaited.
    form = await request.form()
    raw_name = form.get("name", "")
    name = raw_name.strip() if isinstance(raw_name, str) else ""
    if not name or name in {".", ".."} or "/" in name or "\\" in name:
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid library folder name"},
        )
    try:
        from backend.app.db import upsert_library
        await upsert_library(library_id=name, name=name, description=None, source_path=f"/docs/{name}")
        return JSONResponse(content={"status": "ok", "message": f"Created folder '{name}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def folders_delete(request: Request) -> JSONResponse:
    """Delete the library named by the ``id`` query parameter.

    Returns 400 when no id is supplied, 500 with the error text if the
    database call fails, and a confirmation message otherwise.
    """
    library_id = request.query_params.get("id", "").strip()
    if not library_id:
        # Never hand an empty id to the delete call.
        return JSONResponse(
            status_code=400,
            content={"error": "Missing library id"},
        )
    try:
        from backend.app.db import delete_library
        await delete_library(library_id)
        return JSONResponse(content={"status": "ok", "message": f"Deleted library '{library_id}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def ingest_uploaded(request: Request) -> HTML:
    """Send uploaded file content to docs-api for ingestion.

    Reads the ``content`` (capped at 10000 characters) and ``library_id``
    form fields, posts the content to ``/ingest/{library_id}`` and renders
    the API result. A missing or blank ``library_id`` falls back to
    ``"uploaded"``.
    """
    # request.form is a coroutine method in Starlette; it must be awaited.
    form = await request.form()
    raw_content = form.get("content", "")
    content = raw_content[:10000] if isinstance(raw_content, str) else ""
    raw_library = form.get("library_id", "")
    # The upload form marks this field optional, so it is often submitted
    # empty; an empty id would produce the endpoint "/ingest/".
    library_id = (raw_library.strip() if isinstance(raw_library, str) else "") or "uploaded"
    try:
        result = api_request("POST", f"/ingest/{library_id}", data={"content": content})
        safe_msg = html.escape(str(result.get('message', '') or ''))
        safe_json = html.escape(json.dumps(result, indent=2))
        nav = navbar_html("/upload")
        footer = footer_html()
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload Result</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Ingestion Complete!</h2>
<p>{safe_msg}</p>
<pre>{safe_json}</pre>
<a href="/upload">← Upload Another</a>
</main>{footer}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Upload Ingest Error</h1><pre>{safe_error}</pre><a href="/upload">← Try Again</a></body>
</html>""", media_type="text/html")
def docs(request: Request, library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> HTML:
    """Render the documentation text of one library.

    Calls ``GET /libraries/{library_id}/docs`` with the ``tokens`` budget and,
    when given, the ``topic`` filter. At most the first 10000 characters of
    the returned ``content`` are shown. If the call fails, the error text is
    shown in place of the content.
    """
    from urllib.parse import quote, urlencode

    # The query string is built into the endpoint so this works regardless of
    # whether api_request accepts a ``params`` argument (passing one used to
    # raise TypeError, so the page only ever showed that error).
    query = {"tokens": tokens}
    if topic is not None:
        query["topic"] = topic
    endpoint = f"/libraries/{quote(library_id, safe='')}/docs?{urlencode(query)}"
    try:
        data = api_request("GET", endpoint)
        content = data.get("content", "")
    except Exception as e:
        content = str(e)

    # Truncate first, escape second, so an entity is never cut in half.
    safe_content = html.escape(str(content or "")[:10000])
    safe_library = html.escape(library_id, quote=True)
    safe_topic = html.escape(topic) if topic else "(all)"
    safe_tokens = html.escape(str(tokens))
    nav = navbar_html("/docs/{}".format(library_id))
    footer = footer_html()
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Library: {safe_library}</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Library: {safe_library}</h2>
<p><strong>Topic:</strong> {safe_topic} | <strong>Tokens:</strong> {safe_tokens}</p>
<pre class="docs-content">{safe_content}</pre>
</main>{footer}</div>
</body></html>""", media_type="text/html")
def search_redirect(request: Request) -> JSONResponse:
    """Point the client at the search form.

    No HTTP redirect is issued; the reply is a JSON body whose ``redirect``
    key holds the form's path.
    """
    target = {"redirect": "/search/form"}
    return JSONResponse(content=target)
def search_form(request: Request) -> HTML:
    """Render the search form (query, optional library, result limit).

    NOTE(review): the form POSTs a field named ``query`` to ``/search``;
    confirm that the handler behind that route reads POSTed form fields.
    """
    nav = navbar_html("/search")
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Search Docs</h2>
<form method="post" action="/search">
<label for="query">Query:</label>
<input type="text" id="query" name="query" required placeholder="Enter your search query...">
<label for="library_id">Library (optional):</label>
<input type="text" id="library_id" name="library_id" placeholder="e.g., foundryvtt">
<label for="limit">Limit results:</label>
<select id="limit" name="limit">
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="20">20</option>
<option value="50">50</option>
</select>
<button type="submit">Search</button>
</form>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
def search_results(request: Request) -> HTML:
    """Run a search against docs-api and render the hits.

    Reads the query from the ``q`` (or ``query``) query parameter, plus the
    optional ``library_id`` and ``limit`` (default 10) parameters, and posts
    them to ``/search``. Any failure, including a non-numeric ``limit``,
    renders an error page.

    NOTE(review): only URL query parameters are read here; fields submitted
    in a POST body are not visible to this synchronous handler.
    """
    params = request.query_params
    query = params.get("q") or params.get("query") or ""
    library_id = params.get("library_id") or None
    try:
        limit = int(params.get("limit", "10"))
        payload = {"query": query, "library_id": library_id, "limit": limit}
        result = api_request("POST", "/search", data=payload)
        results = result.get("results", [])
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Error</h1><pre>{safe_error}</pre><a href="/search/form">← Try Again</a></body>
</html>""", media_type="text/html")

    cards = []
    for r in results:
        # Either key may be missing or None; normalise to a string first.
        body = str(r.get("content") or r.get("chunk") or "")
        title = r.get("title", "Untitled") or (body[:100] + "...")
        # Result fields come from ingested documents: escape every one.
        safe_title = html.escape(str(title))
        safe_body = html.escape(body[:500])
        safe_id = html.escape(str(r.get("id")), quote=True)
        safe_library = html.escape(str(r.get("library_id")), quote=True)
        cards.append(f"""<div class="result-card" data-id="{safe_id}"><h3>{safe_title}</h3>
<p>{safe_body}...</p><a href="/docs/{safe_library}">View Full</a></div>""")

    safe_query = html.escape(query)
    nav = navbar_html("/search")
    footer = footer_html()
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search Results</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Search Results for "{safe_query}"</h2>
<div class="results-count">{len(results)} results found</div>
{''.join(cards)}
<a href="/search/form">← New Search</a>
</main>{footer}</div>
</body></html>""", media_type="text/html")
def sync_sources(request: Request) -> HTML:
    """Git sync page.

    POST: sends ``POST /sources/sync`` to docs-api and renders the JSON
    result, or an error page if the call fails.
    Any other method: renders the sync form together with the ids of the
    libraries currently reported by ``GET /libraries``.

    NOTE(review): the form's ``override`` checkbox is not read or forwarded
    anywhere in this handler.
    """
    if request.method == "POST":
        try:
            data = api_request("POST", "/sources/sync")
            safe_json = html.escape(json.dumps(data, indent=2))
            nav = navbar_html("/sync/sources")
            footer = footer_html()
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Sync Result</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main><h2>Git Sync Complete!</h2><pre>{safe_json}</pre>
<form method="post"><button type="submit">Sync Again</button></form>
</main>{footer}</div>
</body></html>""", media_type="text/html")
        except Exception as e:
            safe_error = html.escape(str(e))
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Sync Error</h1><pre>{safe_error}</pre><a href="/sources/git">← Try Again</a></body>
</html>""", media_type="text/html")

    try:
        data = api_request("GET", "/libraries")
        # Skip missing ids and stringify the rest: str.join raises on None,
        # and that used to happen outside any error handling.
        libs = [
            str(lib.get("id"))
            for lib in data.get("libraries", [])
            if lib.get("id") is not None and lib.get("id") != "error"
        ]
    except Exception:
        libs = []
    lib_list = html.escape(", ".join(libs)) if libs else "(none)"
    nav = navbar_html("/sources/git")
    footer = footer_html()
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sync</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Sync Git Repositories</h2>
<p>Syncs all git repositories configured in <code>docs_sources.yaml</code>.</p>
<form method="post" action="/sync/sources">
<label for="override">Override existing repos:</label>
<input type="checkbox" id="override" name="override">
<button type="submit">Sync All Repositories</button>
</form>
<h3>Libraries Found: {lib_list}</h3>
</main>{footer}</div>
</body></html>""", media_type="text/html")
def git_sources(request: Request) -> HTML:
    """List the git sources configured in ``docs_sources.yaml``.

    The file is looked up three directories above this module. Each entry of
    its ``sources`` list is rendered as a card showing library id, repository
    URL (shortened to 50 characters), branch and include/exclude paths. An
    empty file is treated as "no sources"; any other failure renders an
    error page.
    """
    import yaml
    config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"

    def _joined(value: Any) -> str:
        # Accept a single string as well as a list, and never let a
        # non-string item break str.join.
        items = [value] if isinstance(value, str) else list(value or [])
        return ", ".join(str(item) for item in items)

    try:
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document.
            data = yaml.safe_load(f) or {}
        sources = data.get("sources") or []
        source_blocks = []
        for src in sources:
            repo_url = str(src.get("repo_url", "") or "")
            url = repo_url[:50] + "..." if len(repo_url) > 50 else repo_url
            include = _joined(src.get("include_paths", ["*"]))
            exclude = _joined(src.get("exclude_paths", []))
            exclude_part = " | Exclude: " + html.escape(exclude) if exclude else ""
            # Config values are written straight into the page: escape them.
            source_blocks.append(f"""<div class="source-card">
<strong>{html.escape(str(src.get('library_id', 'unknown')))}</strong><br>
URL: {html.escape(url)}<br>
Branch: {html.escape(str(src.get("branch", "main")))}<br>
Include: {html.escape(include)}{exclude_part}
</div>""")
        nav = navbar_html("/sources/git")
        footer = footer_html()
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sources</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Configured Git Sources ({len(sources)})</h2>
{''.join(source_blocks)}
</main>{footer}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Git Sources Error</h1><pre>{safe_error}</pre></body>
</html>""", media_type="text/html")
def logs(request: Request) -> HTML:
    """Render the logs/status page.

    Shows the configured docs-api URL. The "Qdrant Health / MCP OK" line is
    fixed text in the template; no check is performed by this function.
    """
    nav = navbar_html("/logs")
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Logs</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Status Messages</h2>
<div class="status-message">Docs API: {DOCS_API_URL}</div>
<div class="status-message">Qdrant Health: healthy | MCP OK: yes</div>
<p class="hint">Logs are printed to container stdout/stderr. For full logs, inspect Docker containers directly.</p>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
# Public view handlers exported by this module. This list only controls what
# ``from ... import *`` exposes; it does not register any routes by itself.
# ``ingest_uploaded`` is included because the upload preview form posts to it.
__all__ = [
    "health", "libraries", "upload", "ingest_all", "ingest_library",
    "ingest_uploaded", "folders_create", "folders_delete", "docs",
    "search_redirect", "search_form", "search_results", "sync_sources",
    "git_sources", "logs"
]
+37
View File
@@ -0,0 +1,37 @@
# Backend API Dependencies
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
python-dotenv==1.0.0
python-multipart==0.0.6
# Qdrant Vector Store Client
qdrant-client==1.7.0
# Text Processing for token estimation
tiktoken==0.7.0
# Local Embeddings using FastEmbed
fastembed==0.3.0
# PDF support for document ingestion
pypdf==5.0.0
# HTTP client for MCP server communication
httpx==0.26.0
# HTTP client for WebUI (used to call docs-api from WebUI)
requests==2.31.0
# FastMCP for MCP server integration (also used by backend)
fastmcp==0.6.0
# YAML parser for sources configuration
PyYAML==6.0.1
# =============================================================================
# TEST DEPENDENCIES
# =============================================================================
pytest==8.3.2
pytest-mock==3.14.0
pytest-asyncio==0.23.7