Initial DocsMCP stack

This commit is contained in:
george
2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
# Context7 Docs API Configuration
# Copy this file to .env and configure for your environment
# === Service Ports (optional - use if you need custom ports) ===
HOST_PORT=8787
MCP_HOST_PORT=8788
# === API Keys (optional - uncomment to enable auth) ===
# Docs API key for protecting endpoints like /search, /ingest, etc.
# DOCS_API_KEY=your-secret-docs-api-key
# MCP Server API key for protecting MCP tools via HTTP
# MCP_API_KEY=your-secret-mcp-server-key
# === Application Configuration ===
# Path to documentation files (absolute path inside the service container)
DOCS_PATH=/docs
# SQLite database path
DB_PATH=/data/db.sqlite
# Logging level: DEBUG, INFO, WARNING, ERROR
LOG_LEVEL=INFO
# === Vector Store ===
# Qdrant host and port (internal Docker network)
VECTOR_STORE_HOST=qdrant
VECTOR_STORE_PORT=6333
# === Git Sources (if using) ===
# See docs_sources.yaml for git source configuration
+10
View File
@@ -0,0 +1,10 @@
__pycache__/
*.py[cod]
.pytest_cache/
.env
data/*
!data/.gitkeep
backend/data/*
.DS_Store
+106
View File
@@ -0,0 +1,106 @@
# Makefile for local-context7
# Common development and deployment commands
# Every target in this file is a command, not a file, so all of them are
# declared phony; otherwise a file or directory with the same name (for
# example a stray "test" or "logs") would make the target a silent no-op.
.PHONY: help install install-dev deps test test-unit test-coverage test-file \
        lint docker-up docker-down clean backup-db reset logs log-backend health
.DEFAULT_GOAL := help
## Help - Show available commands (keep in sync with the targets below)
help:
	@echo "Available commands:"
	@echo "  make install        - Install all Python dependencies (backend + tests)"
	@echo "  make install-dev    - Install dependencies plus linting/typing tools"
	@echo "  make deps           - Upgrade all dependencies to latest versions"
	@echo "  make test           - Run all tests with pytest"
	@echo "  make test-unit      - Run only unit tests (no external dependencies)"
	@echo "  make test-coverage  - Run tests with a coverage report"
	@echo "  make test-file      - Run one test file (file=path/to/test.py)"
	@echo "  make lint           - Run linters (if configured)"
	@echo "  make docker-up      - Start Docker containers for development"
	@echo "  make docker-down    - Stop Docker containers"
	@echo "  make logs           - Follow logs for all services"
	@echo "  make log-backend    - Follow logs for the docs-api service"
	@echo "  make health         - Show container status"
	@echo "  make backup-db      - Back up the SQLite database (BACKUP_PATH=... optional)"
	@echo "  make reset          - Delete all Qdrant and SQLite data and restart"
	@echo "  make clean          - Remove generated files, databases, and caches"
## Install runtime requirements for the backend, then the pytest tooling
install:
	pip install -r backend/requirements.txt
	pip install pytest pytest-mock pytest-asyncio
## Upgrade packaging tools first, then backend requirements and test tooling
deps:
	pip install --upgrade pip setuptools wheel
	pip install -U -r backend/requirements.txt
	pip install -U pytest pytest-mock pytest-asyncio
## Run the full pytest suite (verbose, short tracebacks)
test:
	@echo "Running all tests..."
	pytest -v --tb=short
## Run only tests marked "unit", skipping tests/test_search.py
# These need no external services (Qdrant, FastEmbed), so Docker
# containers do not have to be running.
test-unit:
	@echo "Running unit tests only..."
	pytest -v --tb=short -m unit --ignore=tests/test_search.py
## Lint the backend, then the tests, with flake8 (must be installed, e.g. via install-dev)
lint:
	flake8 backend/
	flake8 tests/
## Start Docker containers for full development environment
# Uses the "docker compose" plugin, matching every other target in this file.
docker-up:
	docker compose up -d
## Stop Docker containers
# Uses the "docker compose" plugin, matching every other target in this file.
docker-down:
	docker compose down
## Clean generated files, databases, and caches
# Best effort: the find commands never fail the target.
clean:
	@echo "Cleaning up..."
	rm -rf backend/data/*.sqlite
	rm -rf .embed_cache
	# -prune stops find from descending into a directory it is about to delete
	find . -type d -name "__pycache__" -prune -exec rm -rf {} + 2>/dev/null || true
	find . -type f \( -name "*.pyc" -o -name "*.pyo" \) -delete 2>/dev/null || true
## Install everything from "install" plus optional linting/typing tools
install-dev: install
	pip install flake8 mypy black
## Run tests with coverage of backend/app (HTML report plus missing lines in the terminal)
test-coverage:
	pytest -v --cov=backend/app --cov-report=html --cov-report=term-missing
## Run specific test file: make test-file file=tests/test_example.py
test-file:
	# Without this guard an empty "file" silently ran the whole suite
	@test -n "$(strip $(file))" || { echo "Usage: make test-file file=path/to/test_file.py" >&2; exit 2; }
	pytest -v $(file)
## Backup SQLite database to the host as a gzipped SQL dump
# Destination defaults to backups/db-<timestamp>.sql.gz; override with
#   make backup-db BACKUP_PATH=some/dir/name.sql.gz
# Notes:
#  - "$$" passes a literal "$" to the shell; a single "$(date ...)" would be
#    expanded (to nothing) by make before the shell ever saw it.
#  - The dump is compressed inside the container but redirected on the host,
#    so the file lands on the host. -T disables the pseudo-TTY, which would
#    otherwise corrupt the binary stream.
#  - The timestamp is computed once, so the reported path is the file written.
#  - Output goes to a .tmp file first so a failed dump never leaves a
#    truncated file that looks like a valid backup.
backup-db:
	@echo "Backing up SQLite database..."
	@set -e; \
	out="$(if $(strip $(BACKUP_PATH)),$(BACKUP_PATH),backups/db-$$(date +%Y%m%d-%H%M%S).sql.gz)"; \
	mkdir -p "$$(dirname "$$out")"; \
	docker compose exec -T docs-api sh -c "sqlite3 /data/db.sqlite .dump | gzip -c" > "$$out.tmp" \
		|| { rm -f "$$out.tmp"; echo "Backup failed." >&2; exit 1; }; \
	mv -f "$$out.tmp" "$$out"; \
	echo "Backup complete: $$out"
## Reset all data (Qdrant and SQLite)
# Asks for confirmation, then removes volumes and data files and rebuilds.
#  - printf + plain "read" is used because "read -p" is a bashism that fails
#    under /bin/sh (e.g. dash), which made every run report "cancelled".
#  - rm -f tolerates a database file that does not exist yet.
#  - Declining exits 0 with "Reset cancelled."; a real failure in any step
#    now fails the target instead of being reported as a cancellation.
reset:
	@echo "WARNING: This will delete all data in Qdrant and the SQLite database!"
	@printf "Type 'yes' to confirm: "; \
	read confirm; \
	if [ "$$confirm" != "yes" ]; then echo "Reset cancelled."; exit 0; fi; \
	docker compose down -v && \
	rm -f ./data/db.sqlite && \
	rm -rf ./data/qdrant && \
	docker compose up -d --build && \
	echo "Reset complete. Services restarted."
## Follow (stream) logs from every compose service
logs:
	docker compose logs -f
## Follow (stream) logs from the docs-api service only
log-backend:
	docker compose logs -f docs-api
## List compose services and their current status
health:
	docker compose ps
+431
View File
@@ -0,0 +1,431 @@
# Context7-style Docs MCP System
A self-hosted, local-compatible documentation retrieval and search system using Docker. This project uses Qdrant for vector embeddings and SQLite for metadata storage, exposing a FastAPI docs backend and an MCP server for IDE/tool integration.
## 🏠 Home Server / Production Use
This section covers hardening recommendations for running this system on a home server or in production.
### Environment Variables (`.env`)
Copy `.env.example` to `.env` and configure:
```bash
cp .env.example .env
```
| Variable | Description | Example |
|----------|-------------|---------|
| `HOST_PORT` | Docs API host port (default: 8787) | `8787` |
| `MCP_HOST_PORT` | MCP server host port (default: 8788) | `8788` |
| `DOCS_API_KEY` | API key for docs-api authentication (optional) | `my-secret-key-123` |
| `MCP_API_KEY` | API key for MCP server authentication (optional, FastMCP handles via --key flag conceptually) | `mcp-secret-key` |
| `DOCS_PATH` | Path to documentation files inside container | `/docs` |
| `DB_PATH` | SQLite database path inside container | `/data/db.sqlite` |
| `LOG_LEVEL` | Logging level: DEBUG, INFO, WARNING, ERROR | `INFO` |
> **Security Note:** API keys are optional. Leave empty in `.env` if you don't need authentication (backward compatible with existing setups). If set, the docs-api requires an `X-API-Key` header matching `DOCS_API_KEY` for protected endpoints.
### Port Configuration
For firewall or network setup:
```bash
# Example: Run docs-api on port 9000 instead of 8787
HOST_PORT=9000 MCP_HOST_PORT=9001 docker compose up -d --build
```
### Backup Instructions
#### SQLite Database (`data/db.sqlite`)
Regular SQLite backups prevent data loss. Example cron job:
```bash
# Add to crontab (run daily at 2am)
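# Notes: cron treats an unescaped % as a newline, so each % is written as \%;
# -T disables the pseudo-TTY (cron has none); cd into the project first so
# docker compose can find its compose file.
0 2 * * * cd /path/to/local-context7 && docker compose exec -T docs-api sqlite3 /data/db.sqlite ".backup '/backups/db_$(date +\%Y\%m\%d).sqlite'"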
```
Or one-off backup:
```bash
docker compose exec docs-api sh -c "sqlite3 /data/db.sqlite '.dump' | gzip > /backups/db-$(date +%Y%m%d-%H%M%S).sql.gz"
```
#### Qdrant Vector Store
Qdrant stores vectors in `./data/qdrant`. For backup:
```bash
# Backup entire Qdrant data directory
docker compose exec qdrant sh -c "tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage"
# Or pull full export to host (requires volume mount)
docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage
```
### Safe Reset Command
To reset both SQLite and Qdrant cleanly:
```bash
docker compose down -v # Removes volumes and stops services
rm ./data/db.sqlite # Remove database file
rm -rf ./data/qdrant # Remove Qdrant data
docker compose up -d --build
```
Or use the `make reset` command below.
### Makefile Commands
The included `Makefile` provides convenient commands:
```bash
# Start services
make docker-up
# Stop services
make docker-down
# Rebuild and restart (no dedicated target; run compose directly)
docker compose up -d --build
# Backup database (gzipped SQL dump)
make backup-db BACKUP_PATH=backups/db-$(date +%Y%m%d).sql.gz
# Reset everything (delete volumes)
make reset
```
---
## Architecture
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Client │────▶│ docs-api │◀────│ docs-mcp │
│ (IDE/Tool) │ │ (FastAPI) │ │ (MCP Server)│
└─────────────┘ └─────────────┘ └─────────────┘
┌─────────────┐
│ Qdrant │
│ (Vector DB) │
└─────────────┘
```
**Components:**
- `qdrant` — Vector database storing document embeddings
- `docs-api` — FastAPI backend exposing ingestion, search, and library endpoints
- `docs-mcp` — MCP server providing tools for Context7-style AI interactions
## Prerequisites
- Docker Engine v20.10+
- Docker Compose
- ~500MB free disk space (Qdrant + embedding model)
## Setup
1. **Download the project** and change into its directory:
```bash
cd local-context7
```
2. **Copy environment file:**
```bash
cp .env.example .env
```
3. **(Optional) Create sample docs:**
```bash
mkdir -p docs/foundryvtt docs/fastapi docs/my-msfs-copilot
```
4. **Start services:**
```bash
docker compose up -d --build
```
5. **Verify they're running:**
```bash
docker compose ps
```
You should see all three services (`qdrant`, `docs-api`, `docs-mcp`) in "Up" status.
6. **Wait for startup completion** (embedding model loads on first API call):
```bash
docker compose logs -f docs-api # Watch for "Initialization complete."
```
## Add Docs
Place your documentation folders under the root directory:
```bash
mkdir -p docs/foundryvtt/docs
cp /path/to/foundryvtt/*.md docs/foundryvtt/docs/
mkdir -p docs/fastapi
```
Supported file types: `.md`, `.txt`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.yml`, `.html`, `.css`, `.pdf` (via pypdf).
To add new documents to the vector store after adding them, run:
```bash
docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())"
```
Or from another terminal:
```bash
curl -X POST http://localhost:8787/api/v1/ingest/all \
-H "Content-Type: application/json"
```
## Index Docs (Run Ingestion)
After adding documents, index them into the vector store:
```bash
docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())"
```
Expected output shows progress like:
```
[Detection] Scanning for libraries in: /docs
[Detection] Found 3 library(ies)
[Library] Processing: foundryvtt
[Library] Scanning for files in: /docs/foundryvtt
[Library] Found 5 document(s)
...
```
## Search Docs
### Via API (POST to `/search`)
Request body:
```json
{
"query": "how do hooks work",
"library_id": "foundryvtt",
"limit": 10
}
```
Response example:
```json
{
"query": "how do hooks work",
"library_id": "foundryvtt",
"results": [
{
"id": "...",
"score": 0.854,
"library_id": "foundryvtt",
"path": "core-docs.md",
"title": "Core Hooks",
"chunk_index": 2
}
],
"count": 1
}
```
### Via MCP (resolve-library-id, search-docs tools)
## Connect MCP Clients
To use this system with an MCP-enabled client (e.g., Claude Desktop), configure the MCP server endpoint.
### Example: Claude Desktop Config
Add to your `claude_desktop_config.json`:
```json
{
"mcpServers": {
"context7": {
"command": "npx",
"args": [
"@modelcontextprotocol/server-local-context7",
"--url", "http://localhost:8788"
],
"env": {
"DOCS_API_URL": "http://localhost:8787"
}
}
}
}
```
If the client runs outside Docker and can't reach the API, expose them on host ports or run the MCP server outside Docker (see below).
## Example: Cline/Cursor MCP Config
For Cursor or similar editors using Cline:
```json
// ~/.cursor/mcp.json
{
"context7": {
"type": "stdio",
"command": "docker",
"args": [
"exec",
"-it",
"docs-mcp",
"uvicorn",
"server:app",
"--host",
"0.0.0.0",
"--port",
"8788"
]
}
}
```
Or if exposing MCP on host port:
```json
{
"context7": {
"type": "stdio",
"command": "docker",
"args": [
"run",
"-it",
"--rm",
"-p",
"8788:8788",
"--name",
"context7-mcp-standalone",
"-e",
"DOCS_API_URL=http://host.docker.internal:8787",
"local-context7/docs-mcp"
]
}
}
```
## Troubleshooting
### Services won't start or restart loops
Check logs:
```bash
docker compose logs -f
```
Common issues:
- Port already in use on host → adjust mapping or free the port
- Embedding model failing to load → verify disk space, check for GPU constraints if applicable
### Vector search returns empty results
Ensure you've run ingestion after adding docs:
```bash
docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())"
```
### Can't connect to docs-api from client outside Docker
Set environment variable for host access in docker-compose.yml or .env:
```yaml
docs-api:
environment:
- DOCS_API_URL=http://host.docker.internal:8787
```
For MCP server specifically:
```yaml
docs-mcp:
environment:
- DOCS_API_URL=http://host.docker.internal:8787
```
## Reset Qdrant and SQLite
To clear all data (vector store and database):
```bash
# Stop services
docker compose down
# Remove volumes (delete Qdrant and db.sqlite)
rm -rf ./data/qdrant ./data/db.sqlite
# Restart fresh
docker compose up -d --build
```
## Expose Through Caddy Reverse Proxy
To add HTTPS and serve under a subdomain, configure Caddy:
**Example `Caddyfile`:**
```caddyfile
docs.yourdomain.com {
reverse_proxy docs-api:8787
handle_path /mcp/* {
reverse_proxy docs-mcp:8788
}
# Enable basic auth (optional, see below)
}
api.yourdomain.com {
reverse_proxy docs-api:8787
}
mcp.yourdomain.com {
reverse_proxy docs-mcp:8788
}
```
## Protect It with Basic Auth
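Add authentication using Caddy's built-in `basic_auth` directive (named `basicauth` before Caddy 2.8). Generate the password hash with `caddy hash-password`:
**Caddy example with basic auth:**
```caddyfile
docs.yourdomain.com {
    basic_auth {
        # username, then the hash printed by `caddy hash-password`
        admin <hashed-password>
    }
    reverse_proxy docs-api:8787
}
```
Alternatively (or additionally), set `DOCS_API_KEY` in `.env` so the docs-api itself requires a matching `X-API-Key` header.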
For Docker-based deployment, consider using an authentication middleware or a dedicated reverse proxy with JWT/HTTP Basic configured externally.
## Future Improvements
- Add rate limiting to API endpoints
- Support for streaming responses for large document retrieval
- Chunk overlap configuration via environment variables
- Batch index endpoint improvements
- Metrics/logging aggregation (e.g., Prometheus + Grafana)
- Plugin system for additional data sources
+36
View File
@@ -0,0 +1,36 @@
# Backend API Service (docs-api): FastAPI app served by uvicorn on port 8787
FROM python:3.11-slim
WORKDIR /app
# Install system packages. curl is used by the HEALTHCHECK below; libgl1 and
# libglib2.0-0 are shared libraries, presumably required by a Python
# dependency (PDF parsing / embeddings) - TODO confirm against requirements.txt.
# The apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
libgl1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# Pre-create the embedding cache directory under WORKDIR so it can serve as
# a volume mount point and downloaded models can persist across restarts
RUN mkdir -p /app/.embed_cache
# Install Python dependencies before copying the code so this layer stays
# cached when only application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ ./app/
# Volumes are expected at these paths (configured in docker-compose):
# ./docs -> /docs
# ./data -> /data
# /data holds db.sqlite
# NOTE(review): the original comment also listed Qdrant storage under /data - confirm against docker-compose
# Expose API port
EXPOSE 8787
# Healthcheck: probe /health every 30s (10s timeout, 5s start period);
# the container is marked unhealthy after 3 consecutive failures
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8787/health || exit 1
# Run the FastAPI application, listening on all interfaces
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8787"]
+30
View File
@@ -0,0 +1,30 @@
# WebUI-specific Dockerfile; serves the backend app with uvicorn on port 8790.
# NOTE(review): the docs-api image is built FROM python:3.11-slim while this
# one uses 3.12-slim - confirm the version difference is intentional.
FROM python:3.12-slim
# Set environment variables: no .pyc files, unbuffered stdout/stderr,
# docs-api URL (compose service name), and the WebUI port
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
DOCS_API_URL=http://docs-api:8787 \
WEBUI_PORT=8790
# Install system dependencies (curl only); apt lists are removed in the
# same layer to keep the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first for layer caching.
# Paths are relative to the repository root, which must be the build context.
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy backend code, keeping the backend/app layout so the module path
# "backend.app.main" used in CMD resolves from /app
COPY backend/app /app/backend/app
# Create uploads directory
RUN mkdir -p /app/backend/app/webui/uploads
# Expose port
EXPOSE 8790
# Start the application on all interfaces, port 8790
CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8790"]
+2
View File
@@ -0,0 +1,2 @@
# Backend API Package - Contains all FastAPI application modules
# This file marks the directory as a Python package
+304
View File
@@ -0,0 +1,304 @@
# Text Chunking Utilities with heading-aware splitting
import re
from typing import List
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens ``text`` contains.

    The heuristic assumes four characters per token and rounds down.

    Args:
        text: The text to measure

    Returns:
        ``len(text)`` floor-divided by 4
    """
    chars_per_token = 4
    char_count = len(text)
    return char_count // chars_per_token
def _split_at_headings(text: str) -> List[tuple]:
    """
    Split text into sections at markdown heading lines.

    A heading is a line that starts with 1-6 ``#`` characters followed by
    spaces/tabs and some text. The previous implementation raised
    ``TypeError`` (``len(None)``) whenever text followed a matched heading,
    and its unanchored pattern skipped headings that were followed by body
    text; this version scans heading lines directly.

    Args:
        text: The full text

    Returns:
        - ``[]`` for empty text
        - ``[(text,)]`` (a single 1-tuple) when the text has no headings
        - otherwise a list of ``(heading, body)`` tuples in document order,
          where ``heading`` is the stripped heading line and ``body`` is the
          stripped text up to the next heading, or ``None`` when there is
          none. Text before the first heading is returned as
          ``(None, preamble)``.
    """
    if not text:
        return []

    # Anchored to line starts so a '#' in the middle of a line is not a heading
    heading_re = re.compile(r'^#{1,6}[ \t]+\S.*$', re.MULTILINE)
    headings = list(heading_re.finditer(text))
    if not headings:
        return [(text,)]

    sections = []
    preamble = text[:headings[0].start()].strip()
    if preamble:
        sections.append((None, preamble))

    for position, match in enumerate(headings):
        # A section body runs until the next heading (or the end of the text)
        if position + 1 < len(headings):
            body_end = headings[position + 1].start()
        else:
            body_end = len(text)
        body = text[match.end():body_end].strip()
        sections.append((match.group(0).strip(), body or None))

    return sections
def _split_at_paragraphs(text: str, max_tokens: int) -> List[str]:
    """
    Pack paragraphs (blocks separated by blank lines) into chunks.

    Consecutive paragraphs are joined with a blank line while the joined
    text stays within ``max_tokens``. A paragraph that is too large on its
    own is handed to ``_split_at_sentences``.

    The previous version did not reset the running chunk after flushing it,
    so already-emitted text was emitted again, and it appended sentence
    pieces without checking their size.

    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk

    Returns:
        List of non-empty chunks, in document order
    """
    paragraphs = re.split(r'\n\s*\n', text.strip()) if text else []
    chunks = []
    current = ""

    for para in paragraphs:
        if not para:
            continue

        if estimate_tokens(para) > max_tokens:
            # Flush what we have, then fall back to sentence-level splitting
            if current:
                chunks.append(current)
                current = ""
            pieces = _split_at_sentences(para, max_tokens)
            if pieces:
                # Keep the last piece open so a short tail can absorb the
                # paragraphs that follow
                chunks.extend(pieces[:-1])
                current = pieces[-1]
            continue

        candidate = f"{current}\n\n{para}" if current else para
        if estimate_tokens(candidate) <= max_tokens:
            current = candidate
        else:
            # current is non-empty here: a lone paragraph that fits was
            # accepted by the branch above
            chunks.append(current)
            current = para

    if current:
        chunks.append(current)
    return chunks
def _split_at_sentences(text: str, max_tokens: int) -> List[str]:
    """
    Pack sentences into chunks of at most ``max_tokens`` estimated tokens.

    Sentences end at runs of ``.``, ``!`` or ``?``; the punctuation stays
    attached to its sentence. Sentences are joined with a single space, so
    whitespace between sentences is normalised. A sentence that is too
    large on its own is cut into fixed windows of ``max_tokens * 4``
    characters.

    The previous version joined punctuation as a separate piece (producing
    ``"Hello ."``), re-emitted chunks it had already flushed, and sliced
    oversized sentences using a token count as a character count.

    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk

    Returns:
        List of non-empty chunks, in document order ([] for empty text)
    """
    if not text:
        return []

    # Character budget matching estimate_tokens' 4-chars-per-token heuristic;
    # at least 1 so slicing always makes progress
    window_chars = max(1, max_tokens * 4)

    # Either "text plus its trailing punctuation" or a bare punctuation run
    raw_sentences = re.findall(r'[^.!?]+[.!?]*|[.!?]+', text)

    chunks = []
    current = ""
    for raw in raw_sentences:
        sentence = raw.strip()
        if not sentence:
            continue

        candidate = f"{current} {sentence}" if current else sentence
        if estimate_tokens(candidate) <= max_tokens:
            current = candidate
            continue

        if current:
            chunks.append(current)
            current = ""

        if estimate_tokens(sentence) <= max_tokens:
            current = sentence
        else:
            # Hard split: no sentence boundary is available inside the sentence
            pieces = [
                sentence[offset:offset + window_chars]
                for offset in range(0, len(sentence), window_chars)
            ]
            chunks.extend(pieces[:-1])
            current = pieces[-1]

    if current:
        chunks.append(current)
    return chunks
def chunk_text(text: str, max_tokens: int = 500, overlap_tokens: int = 80) -> List[str]:
    """
    Cut text into overlapping chunks, preferring natural boundaries.

    The text is walked with a window of ``max_tokens * 4`` characters. Within
    the second half of each window the last occurrence of the first matching
    boundary type is used as the cut point, tried in this order: line before
    a markdown heading, blank line, whitespace after ``.``/``!``/``?``, any
    whitespace. If none matches, the window is cut at its end. The next
    window starts ``overlap_tokens * 4`` characters before the cut (capped at
    half a window), so consecutive chunks share text.

    Special case: when ``max_tokens <= 20`` and the text has several
    paragraphs that each fit, the stripped paragraphs are returned as-is.

    Args:
        text: The full text to chunk
        max_tokens: Maximum tokens per chunk (default 500)
        overlap_tokens: Number of overlapping tokens between chunks (default 80)

    Returns:
        List of non-empty, stripped chunk strings ([] for empty text)

    Raises:
        TypeError: If text is None
        ValueError: If max_tokens is not positive (checked only for non-empty text)
    """
    if text is None:
        raise TypeError("text must be a string")
    if not text:
        return []
    if max_tokens <= 0:
        raise ValueError("max_tokens must be greater than 0")

    window_chars = max(1, max_tokens * 4)
    overlap_chars = min(max(overlap_tokens, 0) * 4, window_chars // 2)
    body = text.strip()

    paragraphs = [piece.strip() for piece in re.split(r"\n\s*\n", body) if piece.strip()]
    if (
        len(paragraphs) > 1
        and max_tokens <= 20
        and all(estimate_tokens(piece) <= max_tokens for piece in paragraphs)
    ):
        return paragraphs

    # Boundary kinds from most to least preferred
    boundary_patterns = (r"\n#{1,6}\s+", r"\n\s*\n", r"(?<=[.!?])\s+", r"\s+")
    total = len(body)
    pieces = []
    cursor = 0

    while cursor < total:
        limit = min(cursor + window_chars, total)
        if limit == total:
            # Everything left fits in one window: emit it and stop
            tail = body[cursor:].strip()
            if tail:
                pieces.append(tail)
            break

        window = body[cursor:limit]
        # Only cut in the second half so chunks do not become tiny
        earliest = max(1, len(window) // 2)
        cut = None
        for pattern in boundary_patterns:
            offsets = [
                found.start()
                for found in re.finditer(pattern, window)
                if found.start() >= earliest
            ]
            if offsets:
                cut = max(offsets)
                break
        if cut is None:
            cut = len(window)

        stop = cursor + cut
        piece = body[cursor:stop].strip()
        if piece:
            pieces.append(piece)

        # Step back by the overlap, but always move forward
        resume = stop - overlap_chars if overlap_chars else stop
        cursor = resume if resume > cursor else stop

    return [piece for piece in pieces if piece.strip()]
if __name__ == "__main__":
    # Smoke tests; they run only when this module is executed directly.

    # estimate_tokens: 400 characters count as 100 tokens
    test_text_400 = "a" * 400
    assert estimate_tokens(test_text_400) == 100, f"Expected 100 tokens for 400 chars, got {estimate_tokens(test_text_400)}"
    print(f"estimate_tokens test passed: 400 chars -> {estimate_tokens(test_text_400)} tokens")

    # Empty input produces no chunks
    assert chunk_text("") == [], "Empty text should return empty list"
    print("chunk_text empty test passed")

    # Text well under the limit comes back unchanged as one chunk
    small = "This is a very short text that should be returned as a single chunk."
    chunks = chunk_text(small)
    assert len(chunks) == 1, f"Short text should be one chunk, got {len(chunks)}"
    assert chunks[0] == small, "Content should match for small text"
    print("chunk_text single chunk test passed")

    # A markdown document with several headings must be split
    markdown_with_headings = """# Introduction
This is the introduction section.
## Background
Background information goes here to make this longer and test chunking.
This paragraph has more content about the background topic.
### Details
Specific details about the background are provided in this subsection.
More details follow here to ensure we have enough text to properly test heading preservation.
## Conclusion
The conclusion wraps up everything nicely."""
    chunks = chunk_text(markdown_with_headings, max_tokens=50)

    # Report which chunks begin with a heading
    heading_chunks = [c for c in chunks if c.strip().startswith('#')]
    print(f"\nFound {len(heading_chunks)} heading chunks:")
    for hc in heading_chunks:
        print(f" - {hc.strip()}")
    assert len(chunks) > 1, f"Should have multiple chunks, got {len(chunks)}"

    # No chunk may exceed the limit by more than the 20-token tolerance
    all_under = all(estimate_tokens(c) <= 50 + 20 for c in chunks)
    assert all_under, "Some chunks exceed token limit significantly"
    print("All chunks respect token limits")
    print("\nAll tests passed!")
+25
View File
@@ -0,0 +1,25 @@
# Configuration Settings
import os
from dataclasses import dataclass, field
def _docs_api_key_from_env() -> str:
    """Read the docs-api key: DOCS_API_KEY first, then legacy API_KEY_DOCS_API."""
    # .env.example and the README document DOCS_API_KEY; API_KEY_DOCS_API is
    # the name this module originally read and is kept as a fallback.
    return os.getenv("DOCS_API_KEY") or os.getenv("API_KEY_DOCS_API", "")


@dataclass(frozen=True)
class Settings:
    """
    Application settings loaded from environment variables.

    Most defaults are read from the environment once, when this module is
    imported. ``api_key_docs_api`` is read each time a ``Settings`` object
    is created.
    """
    # Qdrant connection
    vector_store_host: str = os.getenv("VECTOR_STORE_HOST", "qdrant")
    vector_store_port: int = int(os.getenv("VECTOR_STORE_PORT", "6333"))
    collection_name: str = os.getenv("COLLECTION_NAME", "local_context7_docs")
    embedding_model_name: str = os.getenv("EMBEDDING_MODEL_NAME", "all-MiniLM-L6-v2")
    # Filesystem locations
    docs_path: str = os.getenv("DOCS_PATH", "./docs")
    db_path: str = os.getenv("DB_PATH", "./data/db.sqlite")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")
    # Empty string means authentication is disabled
    api_key_docs_api: str = field(default_factory=_docs_api_key_from_env)

    @property
    def is_auth_enabled(self) -> bool:
        """Return True if API key authentication is enabled."""
        return bool(self.api_key_docs_api)


settings = Settings()
+384
View File
@@ -0,0 +1,384 @@
# SQLite Database Layer for local-context7
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
from .config import settings
try:
from qdrant_client import QdrantClient
except ImportError:
QdrantClient = None
def get_db_path() -> Path:
    """Return the SQLite file location (``settings.db_path``) as a ``Path``."""
    configured = settings.db_path
    return Path(configured)
def ensure_db_dir():
    """Create the SQLite file's parent directory if it is missing (safe to repeat)."""
    parent_dir = get_db_path().parent
    parent_dir.mkdir(parents=True, exist_ok=True)


# Runs at import time so the directory exists before any connection is opened
ensure_db_dir()
def get_connection():
    """
    Open a new SQLite connection to the configured database file.

    Returns:
        sqlite3.Connection whose rows are ``sqlite3.Row`` objects
        (readable by column name or by index)
    """
    db_file = str(get_db_path())
    connection = sqlite3.connect(db_file)
    connection.row_factory = sqlite3.Row
    return connection
def init_db():
    """
    Create the SQLite schema if it does not exist yet.

    Tables:
        - libraries (id, name, description, source_path, created_at, updated_at)
        - documents (id, library_id, path, title, content, chunk_index,
          token_estimate, created_at)

    Indexes are created on documents.library_id and libraries.updated_at.

    Returns:
        {"success": True} on success, otherwise
        {"success": False, "error": <message>} after rolling back
    """
    # Executed in order; the PRAGMA stays first, as in the original sequence
    schema_statements = (
        "PRAGMA legacy_alter_table = ON",
        """
            CREATE TABLE IF NOT EXISTS libraries (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                description TEXT,
                source_path TEXT NOT NULL,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        """,
        """
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT PRIMARY KEY,
                library_id TEXT NOT NULL,
                path TEXT NOT NULL,
                title TEXT,
                content TEXT,
                chunk_index INTEGER,
                token_estimate INTEGER,
                created_at TEXT NOT NULL,
                FOREIGN KEY (library_id) REFERENCES libraries(id) ON DELETE CASCADE
            )
        """,
        """
            CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id)
        """,
        """
            CREATE INDEX IF NOT EXISTS idx_libraries_updated_at ON libraries(updated_at)
        """,
    )

    conn = get_connection()
    try:
        for statement in schema_statements:
            conn.execute(statement)
        conn.commit()
        return {"success": True}
    except Exception as exc:
        conn.rollback()
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def upsert_library(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    source_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Insert or update a library record.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library source files; falls back to
            ``library_id`` when omitted or empty

    Returns:
        On success: {"success": True, "id": library_id, "exists": bool},
        where "exists" says whether the row was already present (update)
        or not (insert). On failure: {"success": False, "error": <message>}
        after rolling back.
    """
    conn = get_connection()
    try:
        # Naive UTC timestamp in the same text format that the deprecated
        # datetime.utcnow().isoformat() produced, so stored values stay
        # comparable with existing rows.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        source_path = source_path or library_id

        # Check if library exists
        cursor = conn.execute("SELECT id FROM libraries WHERE id = ?", (library_id,))
        exists = cursor.fetchone() is not None

        if exists:
            # Update existing library (created_at is left untouched)
            conn.execute("""
                UPDATE libraries SET
                    name = ?, description = ?, source_path = ?, updated_at = ?
                WHERE id = ?
            """, (name, description, source_path, now, library_id))
        else:
            # Insert new library
            conn.execute("""
                INSERT INTO libraries (id, name, description, source_path, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (library_id, name, description, source_path, now, now))

        conn.commit()
        return {"success": True, "id": library_id, "exists": exists}
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def insert_document_chunk(
    doc_id: str,
    library_id: str,
    path: str,
    title: Optional[str] = None,
    content: Optional[str] = None,
    chunk_index: Optional[int] = None,
    token_estimate: int = 0,
) -> Dict[str, Any]:
    """
    Insert or update a document chunk record.

    Args:
        doc_id: Unique identifier for this chunk
        library_id: Foreign key to libraries table
        path: Relative file path within the library
        title: Optional document title
        content: Full text content of the chunk
        chunk_index: Index within the full document (NULL if not chunked)
        token_estimate: Estimated token count (None/0 is stored as 0)

    Returns:
        On success: {"success": True, "id": doc_id, "exists": bool}, where
        "exists" says whether the row was already present. On failure:
        {"success": False, "error": <message>} after rolling back.
    """
    conn = get_connection()
    try:
        # Naive UTC timestamp in the same text format that the deprecated
        # datetime.utcnow().isoformat() produced.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        # Check if document chunk exists
        cursor = conn.execute(
            "SELECT id FROM documents WHERE id = ?", (doc_id,)
        )
        exists = cursor.fetchone() is not None

        if exists:
            # The table has no updated_at column, so created_at is refreshed
            # on every update (existing behavior, kept as-is).
            conn.execute(
                """
                UPDATE documents
                SET library_id = ?, path = ?, title = ?, content = ?,
                    chunk_index = ?, token_estimate = ?, created_at = ?
                WHERE id = ?
                """,
                (library_id, path, title, content, chunk_index, token_estimate or 0, now, doc_id),
            )
        else:
            conn.execute(
                """
                INSERT INTO documents
                    (id, library_id, path, title, content, chunk_index, token_estimate, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (doc_id, library_id, path, title, content, chunk_index, token_estimate or 0, now),
            )

        conn.commit()
        return {"success": True, "id": doc_id, "exists": exists}
    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def clear_library_documents(library_id: str) -> Dict[str, Any]:
    """
    Remove every document chunk that belongs to one library.

    Args:
        library_id: The library whose chunks are deleted

    Returns:
        {"success": True, "deleted": <row count>, "library_id": library_id},
        or {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        removed = conn.execute(
            "DELETE FROM documents WHERE library_id = ?", (library_id,)
        ).rowcount
        conn.commit()
        return {"success": True, "deleted": removed, "library_id": library_id}
    except Exception as exc:
        conn.rollback()
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def delete_library(library_id: str) -> Dict[str, Any]:
    """
    Delete a library row together with its document chunks.

    The chunks are deleted explicitly before the library row.

    Returns:
        {"success": True, "deleted": <library rows removed>, "library_id": ...},
        or {"success": False, "error": <message>} after rolling back
    """
    conn = get_connection()
    try:
        conn.execute("DELETE FROM documents WHERE library_id = ?", (library_id,))
        library_delete = conn.execute("DELETE FROM libraries WHERE id = ?", (library_id,))
        conn.commit()
        removed = library_delete.rowcount
        return {"success": True, "deleted": removed, "library_id": library_id}
    except Exception as exc:
        conn.rollback()
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def list_libraries() -> List[Dict[str, Any]]:
    """
    Fetch every library row, most recently updated first.

    Returns:
        List of dicts keyed by column name.
        NOTE(review): on a database error this returns
        {"success": False, "error": ...} (a dict, not a list) - callers
        should check for that shape.
    """
    conn = get_connection()
    try:
        cursor = conn.execute("SELECT * FROM libraries ORDER BY updated_at DESC")
        column_names = [meta[0] for meta in cursor.description]
        return [dict(zip(column_names, record)) for record in cursor]
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def search_libraries(query: str) -> List[Dict[str, Any]]:
    """
    Search libraries by id, name or description.

    This is a case-insensitive substring match (SQL LIKE), not full-text
    search. ``%`` and ``_`` in the query are matched literally; previously
    they acted as wildcards, so a query such as "my_lib" also matched
    "my-lib" and "%" matched every row.

    Args:
        query: Search query string

    Returns:
        List of matching library dictionaries, most recently updated first
        (empty if none found).
        NOTE(review): on a database error this returns
        {"success": False, "error": ...} (a dict, not a list).
    """
    conn = get_connection()
    try:
        # Escape LIKE metacharacters (the escape character itself first)
        escaped = (
            str(query)
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        like_query = f"%{escaped}%"
        cursor = conn.execute("""
            SELECT * FROM libraries
            WHERE lower(id) LIKE lower(?) ESCAPE '\\'
               OR lower(name) LIKE lower(?) ESCAPE '\\'
               OR lower(coalesce(description, '')) LIKE lower(?) ESCAPE '\\'
            ORDER BY updated_at DESC
        """, (like_query, like_query, like_query))

        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor]
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def get_document_by_id(doc_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up one document chunk by primary key.

    Args:
        doc_id: The document ID to fetch

    Returns:
        Dict of column name -> value, or None when no row matches.
        NOTE(review): on a database error this returns
        {"success": False, "error": ...} instead of raising.
    """
    conn = get_connection()
    try:
        cursor = conn.execute("SELECT * FROM documents WHERE id = ?", (doc_id,))
        record = cursor.fetchone()
        if record is not None:
            column_names = [meta[0] for meta in cursor.description]
            return dict(zip(column_names, record))
        return None
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
def get_chunks_for_library(library_id: str) -> List[Dict[str, Any]]:
    """
    Fetch all document chunks of one library.

    Rows are ordered by chunk_index, highest first.
    NOTE(review): descending order means a document's last chunk comes
    first, and chunks of different files are interleaved - confirm this
    is what callers expect.

    Args:
        library_id: The library ID to fetch chunks for

    Returns:
        List of dicts keyed by column name.
        NOTE(review): on a database error this returns
        {"success": False, "error": ...} (a dict, not a list).
    """
    conn = get_connection()
    try:
        cursor = conn.execute(
            "SELECT * FROM documents WHERE library_id = ? ORDER BY chunk_index DESC",
            (library_id,)
        )
        column_names = [meta[0] for meta in cursor.description]
        return [dict(zip(column_names, record)) for record in cursor]
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
+181
View File
@@ -0,0 +1,181 @@
# Local Embedding Generation using FastEmbed
import asyncio
from typing import List
from functools import lru_cache
# Module-level singleton for cached model instance
_embedding_model = None
_embedding_size = 384 # BAAI/bge-small-en-v1.5 output dimension
def _load_model():
    """
    Create the FastEmbed model on first use and return the cached instance.

    The instance is stored in the module-level ``_embedding_model``; later
    calls return it without constructing a new one.

    Raises:
        ImportError: If ``fastembed`` cannot be imported (re-raised with
            install instructions).
        RuntimeError: If model construction fails; disk-space failures are
            re-raised with a longer explanation, others propagate unchanged.
    """
    global _embedding_model, _embedding_size

    try:
        from fastembed import TextEmbedding

        if _embedding_model is None:
            print("Loading embedding model (this may take a few minutes on first run)...")
            # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
            _embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache")
            print("Embedding model loaded successfully.")
        return _embedding_model
    except ImportError as import_err:
        raise ImportError(
            "FastEmbed is not installed. Please install with:\n"
            " pip install fastembed\n\n"
            f"Import error details: {import_err}"
        ) from import_err
    except RuntimeError as runtime_err:
        # Model download/installation failed
        detail = str(runtime_err)
        out_of_space = any(
            marker in detail for marker in ("No space left", "disk quota exceeded")
        )
        if not out_of_space:
            raise
        raise RuntimeError(
            "Failed to load embedding model due to disk space constraints.\n\n"
            "Please free up space on your system (at least 500MB required).\n"
            "Or specify a custom cache directory with available space:\n"
            " from fastembed import TextEmbedding\n"
            " model = TextEmbedding(model_name='...', cache_dir='/path/to/large/storage')\n\n"
            f"Error: {runtime_err}"
        ) from runtime_err
def get_embedding_model():
    """
    Return the shared embedding model, loading it on the first call.

    Returns:
        The object stored in the module-level ``_embedding_model``; when that
        is still None it is populated from ``_load_model()`` first.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model download/load failed
    """
    global _embedding_model

    if _embedding_model is not None:
        return _embedding_model

    _embedding_model = _load_model()
    return _embedding_model
def embed_text(text: str) -> List[float]:
    """
    Generate embedding for a single text.

    Args:
        text: The text string to embed

    Returns:
        List of floats representing the embedding vector. For an empty or
        non-string ``text`` a zero vector of ``get_embedding_size()`` entries
        is returned without touching the model.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not text or not isinstance(text, str):
        return [0.0] * get_embedding_size()

    model = get_embedding_model()
    # embed() hands back an iterable of vectors (a generator in FastEmbed),
    # which cannot be indexed with [0]; pull the single result with next().
    vector = next(iter(model.embed([text])))
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)
def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Embed a batch of texts with the shared model.

    Args:
        texts: List of text strings to embed

    Returns:
        One vector per item produced by the model, in the order produced.
        Items exposing ``tolist()`` are converted with it; anything else is
        passed through unchanged. An empty/falsy ``texts`` yields ``[]``
        without loading the model.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not texts:
        return []

    vectors = get_embedding_model().embed(texts)
    return [
        vec.tolist() if hasattr(vec, 'tolist') else vec
        for vec in vectors
    ]
def get_embedding_size() -> int:
    """
    Report the embedding vector dimension.

    Returns:
        The module-level ``_embedding_size`` value (initialised to 384, the
        bge-small-en-v1.5 output dimension).

    Note:
        This is a configured default; it is not read back from a loaded model.
    """
    dimension = _embedding_size
    return dimension
# Async wrapper for compatibility with existing code
async def generate_embeddings(chunks: List[str]) -> List[List[float]]:
    """
    Awaitable facade over ``embed_texts``.

    The embedding call itself is made synchronously inside the coroutine, so
    it runs to completion before control returns to the caller.

    Args:
        chunks: List of text strings to embed

    Returns:
        List of embedding vectors, exactly as returned by ``embed_texts``
    """
    vectors = embed_texts(chunks)
    return vectors
if __name__ == "__main__":
    # Manual smoke test for the embeddings module. Each check is counted so
    # the script only reports success (and exits 0) when every check passed;
    # previously the success banner was printed even after a failed check.
    import sys

    failures = 0
    print("Testing embeddings module...\n")

    # Test get_embedding_size
    size = get_embedding_size()
    print(f"Embedding dimension: {size}")

    # Test single text embedding
    test_text = "Hello, world! This is a test of the embedding generation."
    try:
        emb = embed_text(test_text)
        print(f"\nSingle text embedding shape: ({len(emb)},)")
        print(f"First 5 values: {emb[:5]}")
        print("✓ Single embedding works")
    except Exception as e:
        failures += 1
        print(f"✗ Single embedding failed: {e}")

    # Test batch embedding
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing enables computers to understand human language."
    ]
    try:
        embeddings = embed_texts(test_texts)
        print(f"\nBatch embedding shape: ({len(embeddings)}, {len(embeddings[0])})")
        print("✓ Batch embeddings work")
    except Exception as e:
        failures += 1
        print(f"✗ Batch embeddings failed: {e}")

    # Test empty inputs (these paths never load the model)
    assert embed_text("") == [0.0] * size, "Empty text should return zero vector"
    assert embed_texts([]) == [], "Empty list should return empty list"
    print("✓ Empty input handling works")

    if failures:
        print(f"\n❌ {failures} check(s) failed")
        sys.exit(1)
    print("\n✅ All tests passed!")
+389
View File
@@ -0,0 +1,389 @@
# Git Source Operations for Repository Cloning and File Discovery
import os
import shutil
from pathlib import Path
from typing import List, Optional, Dict, Any
def get_repos_dir() -> Path:
    """
    Return the directory under which repositories are cloned.

    Returns:
        ``<root>/data/repos`` where ``<root>`` is three directory levels above
        this source file.
    """
    # Default to ./data/repos in project root
    project_root = Path(__file__).parent.parent.parent
    return project_root / "data" / "repos"
def ensure_repos_dir():
    """
    Create the repos directory (and any missing parents) if needed.

    Returns:
        The Path returned by ``get_repos_dir()``; calling this repeatedly is
        harmless because an existing directory is accepted.
    """
    target = get_repos_dir()
    target.mkdir(parents=True, exist_ok=True)
    return target


# Initialize repos directory at module load time (safe to run multiple times)
ensure_repos_dir()
class GitCloneError(Exception):
    """Raised when cloning, fetching, or checking out a git repository fails."""
def clone_or_update_repo(
    repo_id: str,
    repo_url: str,
    branch: str,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone a git repository or update an existing clone.

    If ``<repos_base>/<repo_id>`` already contains a ``.git`` directory the
    clone is refreshed (fetch of ``branch`` from ``origin`` followed by a hard
    reset to the fetched commit). Otherwise any leftover directory at that
    path is removed and the repository is cloned fresh.

    Args:
        repo_id: Unique identifier for this repository; used as a single
            directory name under ``repos_base``
        repo_url: Git URL to clone from (only needed for a fresh clone)
        branch: Branch name to checkout
        repos_base: Base directory for repos (defaults to get_repos_dir())

    Returns:
        Dict with keys ``success``, ``repo_path``, ``url`` and ``branch``

    Raises:
        GitCloneError: If ``repo_id`` is not a plain directory name, or if any
            git command fails or cannot be started
    """
    # Imported at function scope so that both branches below can use it.
    import subprocess

    base_dir = Path(repos_base or get_repos_dir())

    # repo_id is joined onto base_dir and the resulting directory may be
    # deleted below, so refuse anything that is empty or not a single segment.
    if not repo_id or repo_id in {".", ".."} or Path(repo_id).name != repo_id:
        raise GitCloneError(f"Invalid repo_id: {repo_id!r}")

    repo_path = base_dir / repo_id

    def _git(action: str, *args: str) -> None:
        """Run one git command; raise GitCloneError carrying stderr on failure."""
        completed = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True
        )
        if completed.returncode != 0:
            detail = (completed.stderr or completed.stdout or "").strip()
            raise GitCloneError(f"Failed to {action}: {detail}")

    try:
        if (repo_path / ".git").exists():
            # Update existing clone
            print(f" [Git] Updating existing clone at {repo_path}")
            # Fetch the branch explicitly so this also works when the clone
            # was made with --single-branch for a different branch.
            _git("fetch", "-C", str(repo_path), "fetch", "origin", branch)
            _git("reset", "-C", str(repo_path), "reset", "--hard", "FETCH_HEAD")
        else:
            # A directory without .git is not a usable clone. Running
            # `git -C` inside it would operate on whatever enclosing
            # repository git finds, so start over instead.
            if repo_path.exists():
                shutil.rmtree(repo_path)
            base_dir.mkdir(parents=True, exist_ok=True)

            # Clone new repository
            print(f" [Git] Cloning {repo_url} to {repo_path}")
            # "--" stops a URL that begins with "-" being parsed as an option.
            _git(
                "clone",
                "clone",
                "--branch", branch,
                "--single-branch",
                "--", repo_url, str(repo_path)
            )

        print(f" [Git] Checked out branch: {branch}")
        return {
            "success": True,
            "repo_path": str(repo_path),
            "url": repo_url,
            "branch": branch
        }
    except GitCloneError:
        raise
    except Exception as e:
        # e.g. git executable missing, or a non-string argument
        raise GitCloneError(f"Failed to clone/update repo: {e}") from e
def discover_files(
    repo_path: Path,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Discover files in a git repository respecting include/exclude paths.

    Matching is by path prefix on whole path segments: an include or exclude
    entry selects the file or directory with exactly that repo-relative path
    and everything beneath it. Excludes take precedence over includes.
    Entries named ``.git`` are always skipped, and symlinked directories are
    not followed. Results are ordered by name within each directory.

    Args:
        repo_path: Path to the cloned repository
        include_paths: Paths relative to repo root to include (if None or
            empty, everything is considered)
        exclude_paths: Paths relative to repo root to exclude

    Returns:
        List of dicts with format:
        {
            "path": "docs/hooks.md",          # Relative to repo root, "/"-separated
            "full_path": "/full/path/to/repo/docs/hooks.md",
            "is_binary": False                # Heuristic, see is_probably_binary
        }
    """
    repo_root = Path(repo_path)

    def _normalize(raw) -> str:
        """Return a '/'-separated repo-relative path; '' means the repo root."""
        text = str(raw).replace("\\", "/")
        while text.startswith("./"):
            text = text[2:]
        text = text.strip("/")
        return "" if text == "." else text

    includes = [_normalize(p) for p in (include_paths or [])]
    # Empty entries would match nothing useful as an exclude; drop them.
    excludes = [p for p in (_normalize(p) for p in (exclude_paths or [])) if p]
    # No include list, or an include that names the root, selects everything.
    include_all = not includes or "" in includes

    def _within(rel: str, roots: List[str]) -> bool:
        """True if rel equals one of roots or lies beneath one of them."""
        return any(rel == root or rel.startswith(root + "/") for root in roots)

    def _leads_to_include(rel: str) -> bool:
        """True if directory rel is an ancestor of some include path."""
        return any(root.startswith(rel + "/") for root in includes)

    def is_probably_binary(filepath: str) -> bool:
        """Simple binary detection based on file extension and first bytes."""
        ext = Path(filepath).suffix.lower()
        text_extensions = {'.md', '.txt', '.py', '.js', '.ts', '.json',
                           '.yaml', '.yml', '.html', '.css', '.sh', '.sql'}
        if ext in text_extensions:
            return False
        # Unknown extension: look for a NUL byte in the first 8KB
        try:
            with open(filepath, 'rb') as f:
                return b'\x00' in f.read(8192)
        except OSError:
            # Unreadable files are reported as non-binary, as before.
            return False

    discovered: List[Dict[str, Any]] = []

    def _walk(directory: Path, rel_dir: str) -> None:
        """Collect matching files under directory (rel_dir is '' at the root)."""
        try:
            # DirEntry objects are not orderable, so sort by name explicitly.
            with os.scandir(directory) as scan:
                entries = sorted(scan, key=lambda item: item.name)
        except PermissionError:
            # Skip directories we can't read
            return

        for entry in entries:
            if entry.name == ".git":
                continue
            rel = f"{rel_dir}/{entry.name}" if rel_dir else entry.name

            # Filter by exclude paths first
            if _within(rel, excludes):
                continue

            selected = include_all or _within(rel, includes)
            if entry.is_dir(follow_symlinks=False):
                # Descend when the directory itself is selected, or when an
                # include path lives somewhere below it.
                if selected or _leads_to_include(rel):
                    _walk(directory / entry.name, rel)
            elif selected and entry.is_file():
                full_path = str(directory / entry.name)
                discovered.append({
                    "path": rel,
                    "full_path": full_path,
                    "is_binary": is_probably_binary(full_path)
                })

    # Walk the repository starting from repo root
    _walk(repo_root, "")
    return discovered
async def ingest_git_source(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    repo_url: str = None,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository as a new library.

    Clones the repo (or updates it if a clone exists), discovers files using
    the include/exclude paths, and hands the checkout to ``ingest_library``.

    Args:
        library_id: Unique identifier for this library
        name: Library display name
        description: Optional description
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for clones (defaults to get_repos_dir())

    Returns:
        Dict with ``success``, ``library_id``, ``name``, ``files_discovered``,
        ``chunks_created`` and ``vectors_added``; ``message`` is added when no
        files matched and ``error`` when ingestion reported one.

    Raises:
        GitCloneError: If git operations fail
    """
    # Imported at call time rather than module load. NOTE(review): presumably
    # this avoids a circular import with the ingestion module, which imports
    # ingest_git_source from here - confirm.
    from .ingest import ingest_library

    print(f"\n[Git Ingestion] Processing library: {library_id}")
    print(f" Source: {repo_url or '(local)'}")

    # Ensure repos directory exists
    repos_base = repos_base or get_repos_dir()
    repos_base.mkdir(parents=True, exist_ok=True)

    repo_id = f"{library_id}-git"

    # Clone or update the repo
    clone_result = clone_or_update_repo(
        repo_id=repo_id,
        repo_url=repo_url,
        branch=branch,
        repos_base=repos_base
    )
    repo_path = Path(clone_result["repo_path"])
    print(f" [Git] Found files in {repo_path}")

    # Discover files respecting include/exclude paths
    files = discover_files(
        repo_path=repo_path,
        include_paths=include_paths,
        exclude_paths=exclude_paths
    )
    print(f" [Git] Discovered {len(files)} file(s)")

    if not files:
        return {
            "success": True,
            "library_id": library_id,
            "name": name,
            "message": "No files found matching include/exclude criteria",
            "files_discovered": 0,
            "chunks_created": 0,
            "vectors_added": 0
        }

    # The .git directory is deliberately left in place: the next sync runs
    # `git fetch` / `git reset` inside this checkout, which needs it.

    # Pass the checkout as an absolute path: ingest_library resolves
    # source_path by joining it onto its docs root, and joining an absolute
    # path yields that absolute path, so the clone under repos_base is found.
    # NOTE(review): ingest_library scans the whole directory, so
    # include_paths/exclude_paths only affect the discovery count above -
    # confirm whether filtering should also apply to what gets ingested.
    result = await ingest_library(
        library_id=library_id,
        name=name,
        description=description,
        source_path=str(repo_path.resolve())
    )

    summary = {
        "success": result.get("success", False),
        "library_id": library_id,
        "name": name,
        "files_discovered": len(files),
        "chunks_created": result.get("chunks_created", 0),
        "vectors_added": result.get("vectors_added", 0)
    }
    if not summary["success"] and result.get("error"):
        # Surface the reason instead of a bare success=False
        summary["error"] = result["error"]
    return summary
async def sync_sources(
    sources_config: Dict[str, Any] = None,
    repos_base: Optional[Path] = None
) -> List[Dict[str, Any]]:
    """
    Sync all git sources defined in config.

    Each source is ingested independently; a failure is recorded in that
    source's result and does not stop the remaining sources.

    Args:
        sources_config: List of source mappings, or a mapping with a
            ``sources`` list (same format as docs_sources.yaml). When None,
            ``docs_sources.yaml`` three directory levels above this file is
            loaded.
        repos_base: Base directory for repos

    Returns:
        List of results for each source. If the default config file is
        missing, a single ``{"success": False, "error": ...}`` entry.
    """
    if sources_config is None:
        # Load from default config file
        import yaml
        config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"
        if not config_path.exists():
            return [{"success": False, "error": f"Config not found: {config_path}"}]

        with open(config_path) as f:
            # safe_load returns None for an empty file
            sources_config = yaml.safe_load(f) or {}

    # Accept both the bare list and the {"sources": [...]} document form.
    if isinstance(sources_config, dict):
        sources_config = sources_config.get("sources", [])

    results = []
    for source in sources_config or []:
        is_mapping = isinstance(source, dict)
        library_id = source.get("library_id", "unknown") if is_mapping else "unknown"
        try:
            if not is_mapping:
                raise TypeError(
                    f"source entry must be a mapping, got {type(source).__name__}"
                )
            result = await ingest_git_source(
                library_id=source.get("library_id"),
                name=source.get("name"),
                description=source.get("description"),
                repo_url=source.get("repo_url"),
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
                repos_base=repos_base
            )
        except GitCloneError as e:
            result = {
                "success": False,
                "library_id": library_id,
                "error": str(e)
            }
        except Exception as e:
            result = {
                "success": False,
                "library_id": library_id,
                "error": f"Unexpected error: {e}"
            }
        results.append(result)

    return results
+387
View File
@@ -0,0 +1,387 @@
# Document Ingestion Logic
import asyncio
import os
from pathlib import Path
from typing import List, Dict, Any, Optional, BinaryIO
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Import local modules
from .config import settings
from .chunking import chunk_text, estimate_tokens
from .embeddings import embed_texts
from .vector_store import upsert_chunks
from .db import insert_document_chunk, upsert_library, clear_library_documents
from .git_source import ingest_git_source
SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json',
'.yaml', '.yml', '.html', '.css', '.pdf'}
# Default documents path from environment or fallback
DOCS_PATH = Path(os.getenv("DOCS_PATH", "./docs"))
def get_file_size(path: Path) -> int:
    """
    Return the size of ``path`` in bytes.

    Returns:
        The ``st_size`` reported by ``path.stat()``, or -1 when the stat call
        raises ``OSError`` (for example because the file does not exist).
    """
    try:
        info = path.stat()
    except OSError:
        return -1
    return info.st_size
async def read_document_file(path: Path) -> str:
    """
    Read document content from a file.

    PDF files are read page by page with ``pypdf`` and the non-empty page
    texts are joined with blank lines. Other supported files are read as
    UTF-8 text.

    Args:
        path: Path to the file

    Returns:
        Content as string. An empty string is returned when the file does not
        exist, has an unsupported extension, cannot be read or decoded, or
        contains only whitespace.

    Raises:
        ImportError: If a PDF is requested and ``pypdf`` is not installed
    """
    if not path.exists():
        return ""

    # Check extension
    suffix = path.suffix.lower()

    if suffix == '.pdf':
        # Import inside its own try: previously the import sat outside the
        # try block, so the friendly install hint could never be raised.
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf") from e

        try:
            reader = PdfReader(str(path))
            page_texts = (page.extract_text() for page in reader.pages)
            return "\n\n".join(text for text in page_texts if text)
        except Exception as e:
            print(f" Warning: Could not read PDF {path}: {e}")
            return ""

    elif suffix not in SUPPORTED_EXTENSIONS:
        print(f" Unsupported file type: {suffix}")
        return ""

    # Read text-based files
    try:
        content = path.read_text(encoding='utf-8')
        return content if content.strip() else ""
    except Exception as e:
        print(f" Warning: Could not read {path}: {e}")
        return ""
async def ingest_library(library_id: str, name: str, description: Optional[str] = None, source_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Ingest all documents for a library.

    Steps: upsert the library record, scan the library directory recursively
    for supported files, drop the library's previous chunks and vectors, then
    chunk, embed and store every readable file.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library folder relative to DOCS_PATH (an
            absolute path is used as-is). Defaults to ``library_id``.

    Returns:
        Summary dict with ``success``, ``library_id``, ``files_processed``,
        ``chunks_created`` and ``vectors_added``; ``error`` is added when a
        storage step failed. If the directory is missing, only ``success``
        (False) and ``error`` are returned.
    """
    import hashlib
    from .vector_store import delete_library_vectors

    print(f"\n[Library] Processing: {library_id}")
    if source_path:
        print(f" Source: {source_path}")
    else:
        # Without a source path there is nothing to join onto DOCS_PATH;
        # fall back to the folder named after the library.
        source_path = library_id

    # Ensure library record exists
    result = upsert_library(library_id, name, description, source_path)
    print(f" [{result.get('success', False)}] Library record: {'created' if not result.get('exists') else 'updated'}")

    # Get the library folder path
    library_dir = DOCS_PATH / source_path

    if not library_dir.exists():
        print(f" Error: Directory does not exist: {library_dir}")
        return {"success": False, "error": f"Directory not found: {library_dir}"}

    # Find all supported files (recursive), in a stable order, ignoring
    # anything inside a .git directory.
    print(f" [Library] Scanning for files in: {library_dir}")
    doc_files = sorted(
        candidate
        for candidate in library_dir.rglob('*')
        if candidate.is_file()
        and ".git" not in candidate.relative_to(library_dir).parts
        and (candidate.suffix.lower() == '.pdf'
             or candidate.suffix.lower() in SUPPORTED_EXTENSIONS)
    )
    print(f" [Library] Found {len(doc_files)} document(s)")

    # Clear old chunks for this library
    print(f" [Library] Clearing existing chunks...")
    clear_result = clear_library_documents(library_id)
    if not clear_result.get('success'):
        print(f" Warning: Could not clear library docs: {clear_result}")
    else:
        print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")

    # Clear old vectors too, so files that were removed or whose chunk ids
    # changed do not linger in search results.
    purge_result = await delete_library_vectors(library_id)
    if isinstance(purge_result, dict) and not purge_result.get('success', True):
        print(f" Warning: Could not clear library vectors: {purge_result}")

    # Process documents
    all_chunks = []
    processed_files = 0

    for file_path in doc_files:
        rel_path = file_path.relative_to(library_dir).as_posix()

        # Read file content
        print(f" [File] Reading: {file_path.relative_to(library_dir)}")
        content = await read_document_file(file_path)

        if not content:
            continue

        # Estimate tokens and chunk
        num_tokens = estimate_tokens(content)
        chunks = chunk_text(content, max_tokens=500, overlap_tokens=80)

        if not chunks:
            print(f" [File] No valid chunks from {file_path.name}")
            continue

        # Embed chunks and prepare for storage
        print(f" Chunked into {len(chunks)} pieces (approx. {num_tokens} tokens)")
        embeddings = embed_texts(chunks)

        # The file stem alone is not unique (every folder can hold a
        # README.md, and several libraries can share file names), so the id
        # also carries a short digest of "<library_id>/<relative path>".
        path_digest = hashlib.sha1(
            f"{library_id}/{rel_path}".encode("utf-8")
        ).hexdigest()[:12]

        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "id": f"{file_path.stem}-{path_digest}-{i}",
                "library_id": library_id,
                "path": rel_path,
                "title": Path(rel_path).stem,
                "content": chunk,
                "chunk_index": i,
                "embedding": embeddings[i]
            })

        processed_files += 1

    print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")

    errors = []

    # Save chunks to SQLite
    failed_inserts = 0
    if all_chunks:
        for chunk in all_chunks:
            insert_result = insert_document_chunk(
                doc_id=chunk["id"],
                library_id=chunk["library_id"],
                path=chunk["path"],
                title=chunk.get("title"),
                content=chunk["content"],
                chunk_index=chunk["chunk_index"],
                token_estimate=estimate_tokens(chunk["content"])
            )
            if not insert_result.get('success'):
                failed_inserts += 1
                print(f" Warning: Could not save chunk {chunk['id']}: {insert_result.get('error')}")
        print(f" [Library] Saved {len(all_chunks) - failed_inserts} chunks to SQLite")
        if failed_inserts:
            errors.append(f"{failed_inserts} chunk(s) could not be saved to SQLite")
    else:
        print(f" [Library] No chunks to save to SQLite")

    # Save vectors to Qdrant
    vectors_added = 0
    if all_chunks:
        upsert_result = await upsert_chunks(all_chunks)
        print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
        vectors_added = upsert_result.get('points_added', 0)
        if not upsert_result.get('success', False):
            errors.append(f"Vector store upsert failed: {upsert_result.get('error')}")
    else:
        print(f" [Library] No vectors to add to Qdrant")

    summary = {
        "success": not errors,
        "library_id": library_id,
        "files_processed": processed_files,
        "chunks_created": len(all_chunks),
        "vectors_added": vectors_added
    }
    if errors:
        summary["error"] = "; ".join(errors)
    return summary
async def ingest_git_source_from_config(
    repo_url: str,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository defined in sources configuration.

    The library id and display name are both derived from the last path
    segment of ``repo_url`` with a trailing ``.git`` removed, e.g.
    ``https://host/org/my-docs.git`` becomes ``my-docs``.

    Args:
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for cloned repos (defaults to ./data/repos)

    Returns:
        Dict with operation result, as returned by ``ingest_git_source``

    Raises:
        GitCloneError: If git operations fail
    """
    import urllib.parse

    parsed = urllib.parse.urlparse(repo_url)
    path_part = parsed.path.rstrip("/")
    # Remove the ".git" suffix as a whole. str.rstrip('.git') strips any
    # trailing run of the characters '.', 'g', 'i', 't', which mangles names
    # such as "digit.git" or "toolkit".
    if path_part.endswith(".git"):
        path_part = path_part[:-len(".git")]

    library_id = Path(path_part).name or "unknown"
    # Use the repository name for display; the previous value was the host
    # name's stem, which is identical for every repository on that host.
    name = library_id
    description = f"Documentation from {path_part}"

    result = await ingest_git_source(
        library_id=library_id,
        name=name,
        description=description,
        repo_url=repo_url,
        branch=branch,
        include_paths=include_paths,
        exclude_paths=exclude_paths,
        repos_base=repos_base
    )
    return result
async def detect_libraries() -> List[Dict[str, Any]]:
    """
    Register every immediate sub-directory of DOCS_PATH as a library.

    For each directory the library record is upserted with the lower-cased
    directory name as id, the directory name as display name and source path,
    and no description.

    Returns:
        List of ``{"id", "name", "source_path"}`` dicts, one per directory
        (empty when DOCS_PATH does not exist or has no sub-directories)
    """
    print(f"\n[Detection] Scanning for libraries in: {DOCS_PATH}")

    if not DOCS_PATH.exists():
        print(f" [Detection] Directory does not exist: {DOCS_PATH}")
        return []

    # Keep only the top-level directories
    lib_dirs = [entry for entry in list(DOCS_PATH.iterdir()) if entry.is_dir()]
    total = len(lib_dirs)

    found = []
    for position, lib_dir in enumerate(lib_dirs, start=1):
        display_name = lib_dir.name
        lib_id = lib_dir.name.lower()

        # Create library record with defaults
        upsert_library(
            library_id=lib_id,
            name=display_name,
            description=None,
            source_path=lib_dir.name
        )

        found.append({
            "id": lib_id,
            "name": display_name,
            "source_path": lib_dir.name
        })
        print(f" [{position}/{total}] Library detected: {display_name} (id: {lib_id})")

    print(f"\n[Detection] Found {len(found)} library(ies)")
    return found
async def ingest_all(verbose: bool = True) -> Dict[str, Any]:
    """
    Ingest all discovered libraries.

    Libraries are ingested one after another. An exception raised while
    ingesting one library is recorded as a failure for that library and the
    remaining libraries are still processed.

    Args:
        verbose: Whether to print progress messages

    Returns:
        Summary dict with the keys ``total_libraries``, ``successful``,
        ``failed`` and ``total_chunks`` (all integers, also when no library
        was found)
    """
    if verbose:
        print("\n" + "=" * 60)
        print("DOCUMENT INGESTION STARTED")
        print("=" * 60)

    # Detect libraries
    libraries = await detect_libraries()

    if not libraries:
        # Same keys and value types as the regular summary below.
        result = {"total_libraries": 0, "successful": 0, "failed": 0, "total_chunks": 0}
        if verbose:
            print("\n[Summary] No libraries to ingest")
        return result

    # Ingest each library
    results = []
    for lib in libraries:
        lib_id = lib["id"]
        try:
            result = await ingest_library(
                library_id=lib_id,
                name=lib["name"],
                description=None,
                source_path=lib.get("source_path")
            )
        except Exception as e:
            result = {"success": False, "library_id": lib_id, "error": str(e)}

        if verbose:
            if result.get('success'):
                print(f" [Library] Done: {result.get('library_id')} - {result.get('chunks_created', 0)} chunks")
            else:
                print(f" [Library] Failed: {lib_id} - {result.get('error', 'unknown error')}")

        results.append(result)

    # Calculate totals
    total_chunks = sum(r.get('chunks_created', 0) for r in results)
    successful = sum(1 for r in results if r.get('success'))

    result = {
        "total_libraries": len(libraries),
        "successful": successful,
        "failed": len(results) - successful,
        "total_chunks": total_chunks
    }

    if verbose:
        print("\n" + "=" * 60)
        print("INGESTION COMPLETE")
        print("=" * 60)
        print(f" Libraries processed: {result['total_libraries']}")
        print(f" Successful: {result['successful']}")
        print(f" Failed: {result['failed']}")
        print(f" Total chunks created: {result['total_chunks']}")

    return result
if __name__ == "__main__":
    # Manual smoke run: detect libraries, then ingest the first one found.
    import asyncio

    async def test_run():
        """Detect libraries and attempt to ingest the first, printing results."""
        print("Testing ingestion module...\n")

        # Test detect_libraries
        detected = await detect_libraries()
        print(f"\nDetected libraries: {len(detected)}")

        if detected:
            # Try to ingest the first library (may fail if no docs exist, which is ok for test)
            first = detected[0]
            print("\nAttempting sample ingestion...")
            outcome = await ingest_library(
                library_id=first["id"],
                name=first["name"],
                source_path=first.get("source_path")
            )
            print(f"Result: {outcome}")

        print("\n✅ Tests completed!")

    asyncio.run(test_run())
+299
View File
@@ -0,0 +1,299 @@
"""Context7 Docs API."""
import asyncio
import shutil
import yaml
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, File, Form, HTTPException, Query, Request, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from .config import settings
from .db import (
clear_library_documents,
delete_library,
init_db,
list_libraries,
search_libraries,
upsert_library,
)
from .git_source import ingest_git_source
from .ingest import ingest_all, ingest_library
from .search import get_library_docs, resolve_library_id, search_docs
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
app = FastAPI(
title="Context7 Docs API",
description="Document ingestion and semantic search API for local-context7",
version="1.0.0",
)
class SearchRequest(BaseModel):
    # Request body accepted by the POST /search endpoint.
    # query: required search text; must contain at least one character.
    query: str = Field(..., min_length=1)
    # library_id: optional library to restrict the search to; None searches all.
    library_id: Optional[str] = None
    # limit: maximum number of results, between 1 and 50 inclusive (default 10).
    limit: int = Field(10, ge=1, le=50)
class SyncSourcesRequest(BaseModel):
    # Optional request body accepted by the POST /sources/sync endpoint.
    # override: boolean flag, False by default.
    # NOTE(review): the sync handler reads this flag but does not appear to
    # act on it - confirm the intended effect.
    override: bool = False
# File extensions accepted for uploads. safe_upload_filename() compares the
# lower-cased suffix of the uploaded file name against this set and rejects
# anything else with HTTP 400.
ALLOWED_EXTENSIONS = {
    ".md",
    ".txt",
    ".py",
    ".js",
    ".ts",
    ".json",
    ".yaml",
    ".yml",
    ".html",
    ".css",
    ".pdf",
}
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
    """
    Require a valid ``X-API-Key`` header when authentication is enabled.

    When ``settings.is_auth_enabled`` is false every request passes through.
    Otherwise only GET requests whose path starts with ``/health``,
    ``/libraries`` or ``/docs/`` are public; every other request (including
    other GET endpoints) must carry a header equal to
    ``settings.api_key_docs_api`` or it receives a 401 JSON response.
    """
    import secrets

    if not settings.is_auth_enabled:
        return await call_next(request)

    public_prefixes = ("/health", "/libraries", "/docs/")
    if request.method == "GET" and request.url.path.startswith(public_prefixes):
        return await call_next(request)

    supplied = request.headers.get("X-API-Key") or ""
    expected = settings.api_key_docs_api or ""
    # Compare as bytes in constant time so response timing does not reveal
    # how much of the key matched; an unset expected key never authenticates.
    authorized = bool(expected) and secrets.compare_digest(
        supplied.encode("utf-8"), expected.encode("utf-8")
    )
    if not authorized:
        return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})

    return await call_next(request)
@app.on_event("startup")
async def startup() -> None:
    """
    Prepare storage backends when the application starts.

    Initialises the SQLite database, then tries up to 20 times, one second
    apart, to ensure the Qdrant collection exists.

    Raises:
        RuntimeError: If database initialisation reports failure, or if the
            collection could not be ensured after all attempts.
    """
    db_status = init_db()
    if not db_status.get("success"):
        raise RuntimeError(f"Failed to initialize SQLite database: {db_status.get('error')}")

    last_error = None
    attempts_left = 20
    while attempts_left > 0:
        status = await ensure_collection()
        if status.get("success"):
            return
        # Remember the most recent failure for the final error message
        last_error = status.get("error")
        await asyncio.sleep(1)
        attempts_left -= 1

    raise RuntimeError(f"Failed to initialize Qdrant collection: {last_error}")
def safe_library_id(library_id: str) -> str:
    """
    Validate a user-provided library ID as a single path segment.

    Returns:
        The final path component of ``library_id`` with surrounding
        whitespace removed.

    Raises:
        HTTPException: 400 when the result is empty, ``.`` or ``..``, or when
            the input contains ``..``, ``/`` or a backslash.
    """
    base = Path(library_id).name.strip()
    reserved = not base or base in {".", ".."}
    has_traversal = any(token in library_id for token in ("..", "/", "\\"))
    if reserved or has_traversal:
        raise HTTPException(status_code=400, detail="Invalid library ID")
    return base
def safe_upload_filename(filename: str) -> str:
    """
    Build a safe on-disk name for an uploaded file.

    The extension is lower-cased and must be in ``ALLOWED_EXTENSIONS``. The
    stem keeps only alphanumeric characters, ``-``, ``_`` and spaces, and is
    stripped of surrounding whitespace; directory components are discarded.

    Returns:
        ``<sanitised stem><lower-cased extension>``

    Raises:
        HTTPException: 400 for a disallowed extension, or when nothing is
            left of the stem after sanitising.
    """
    source = Path(filename)
    ext = source.suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsafe extension: {ext}. Allowed extensions: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
        )

    kept = [ch for ch in source.stem if ch.isalnum() or ch in "-_ "]
    stem = "".join(kept).strip()
    if not stem:
        raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")

    return f"{stem}{ext}"
def docs_root() -> Path:
    """Return ``settings.docs_path`` wrapped in a ``Path``."""
    configured = settings.docs_path
    return Path(configured)
def sources_config_path() -> Path:
    """Return the path of ``docs_sources.yaml`` two directories above this file's directory."""
    this_file = Path(__file__).resolve()
    return this_file.parents[2] / "docs_sources.yaml"
@app.get("/health")
async def health_check():
    """Liveness probe: always reports status ``ok`` for the docs API service."""
    payload = {"status": "ok", "service": "docs-api"}
    return payload
@app.get("/collections")
async def collections():
    """
    Report the vector count of the configured collection.

    Returns:
        ``{"collections": {<name>: {"vectors": <count>}}}``. The count is the
        collection info's ``vectors_count`` when truthy, else its
        ``points_count`` when truthy, else 0. If anything raises, an empty
        ``collections`` mapping plus a ``warning`` string is returned.
    """
    try:
        client = get_client()
        info = client.get_collection(get_collection_name())

        count = getattr(info, "vectors_count", None)
        if not count:
            count = getattr(info, "points_count", 0)
        if not count:
            count = 0

        return {"collections": {get_collection_name(): {"vectors": count}}}
    except Exception as e:
        return {"collections": {}, "warning": str(e)}
@app.get("/libraries")
async def list_libraries_api():
    """
    List all registered libraries.

    Returns:
        ``{"libraries": [...], "count": n}``

    Raises:
        HTTPException: 500 when the database layer returns a failure dict
            instead of a list.
    """
    libs = list_libraries()
    failed = isinstance(libs, dict) and not libs.get("success", True)
    if failed:
        raise HTTPException(status_code=500, detail=libs.get("error", "Failed to list libraries"))
    return {"libraries": libs, "count": len(libs)}
@app.get("/libraries/search")
async def search_libraries_api(q: str = Query(..., min_length=1)):
    """Resolve the non-empty query ``q`` to libraries and return the matches with their count."""
    found = resolve_library_id(q)
    return {"matches": found, "count": len(found)}
@app.post("/search")
async def search_docs_api(payload: SearchRequest):
    """
    Run a document search for the request payload.

    Returns:
        The query and library filter echoed back, the result list from
        ``search_docs`` and its length under ``count``.
    """
    hits = search_docs(payload.query, library_id=payload.library_id, limit=payload.limit)
    response = {
        "query": payload.query,
        "library_id": payload.library_id,
        "results": hits,
        "count": len(hits),
    }
    return response
@app.get("/docs/{library_id}")
@app.get("/libraries/{library_id}/docs")
async def get_library_docs_api(
    library_id: str,
    topic: Optional[str] = Query(None),
    tokens: int = Query(8000, ge=1),
):
    """
    Return documentation content for one library.

    Args:
        library_id: Library identifier taken from the URL path
        topic: Optional topic passed through to ``get_library_docs``
        tokens: Token limit passed through as ``token_limit`` (at least 1,
            default 8000)

    Returns:
        ``{"library_id": ..., "content": ...}``
    """
    content = get_library_docs(library_id=library_id, topic=topic, token_limit=tokens)
    return {"library_id": library_id, "content": content}
@app.post("/ingest/all")
async def ingest_all_api():
    """Ingest every detected library and return the summary from ``ingest_all``."""
    summary = await ingest_all()
    return summary
@app.post("/ingest/{library_id}")
async def ingest_library_api(library_id: str):
    """
    Ingest a single library.

    The validated library id is used as the library's name and as its source
    path. Invalid ids are rejected by ``safe_library_id`` with HTTP 400.
    """
    safe_id = safe_library_id(library_id)
    return await ingest_library(library_id=safe_id, name=safe_id, source_path=safe_id)
@app.post("/api/v1/libraries/{library_id}")
async def api_create_library(
    library_id: str,
    name: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
):
    """
    Create (or update) a library and its folder under the docs root.

    Args:
        library_id: Library identifier from the URL path; validated by
            ``safe_library_id``
        name: Optional display name form field; defaults to the library id
        description: Optional description form field

    Returns:
        Dict describing the library, including whether it was newly created
        and the folder path.

    Raises:
        HTTPException: 400 for an invalid id, 500 when the upsert fails.
    """
    library_id = safe_library_id(library_id)
    display_name = name or library_id

    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)

    outcome = upsert_library(library_id, display_name, description, library_id)
    if not outcome.get("success"):
        raise HTTPException(status_code=500, detail=outcome.get("error", "Failed to create library"))

    return {
        "success": True,
        "created": not outcome.get("exists", False),
        "library_id": library_id,
        "name": display_name,
        "description": description,
        "path": str(lib_dir),
    }
@app.delete("/api/v1/libraries/{library_id}")
async def api_delete_library(library_id: str):
    """
    Delete a library: its folder, stored chunks, vectors and library record.

    Args:
        library_id: Library identifier from the URL path; validated by
            ``safe_library_id``

    Returns:
        ``{"success": True, "library_id": ..., "deleted_files": n}`` where
        ``n`` counts the files that were in the library folder.

    Raises:
        HTTPException: 400 for an invalid id, 500 when any of the three
            storage deletions reports failure (details joined with "; ").
    """
    library_id = safe_library_id(library_id)
    lib_dir = docs_root() / library_id

    deleted_files = 0
    if lib_dir.exists():
        # Count before removing so the response can report it
        deleted_files = sum(1 for path in lib_dir.rglob("*") if path.is_file())
        shutil.rmtree(lib_dir)

    docs_result = clear_library_documents(library_id)
    vectors_result = await delete_library_vectors(library_id)
    library_result = delete_library(library_id)

    # A failing result may lack an "error" entry; always produce a string so
    # the join below cannot fail on None.
    failures = [
        str(r.get("error") or "unknown error")
        for r in (docs_result, vectors_result, library_result)
        if isinstance(r, dict) and not r.get("success", True)
    ]
    if failures:
        raise HTTPException(status_code=500, detail="; ".join(failures))

    return {"success": True, "library_id": library_id, "deleted_files": deleted_files}
@app.post("/api/v1/upload/{library_id}")
async def api_upload(library_id: str, file: UploadFile = File(...)):
    """
    Store one uploaded file in a library folder and register the library.

    Args:
        library_id: Library identifier from the URL path; validated by
            ``safe_library_id``
        file: The uploaded file; its name is sanitised by
            ``safe_upload_filename`` (``upload.txt`` when no name was sent)

    Returns:
        Dict with the stored file name, its path relative to the docs root
        and its size in bytes.

    Raises:
        HTTPException: 400 for an invalid id, a rejected file name, or a file
            larger than 5MB; 500 when the library record cannot be saved.
    """
    library_id = safe_library_id(library_id)
    safe_name = safe_upload_filename(file.filename or "upload.txt")

    max_bytes = 5 * 1024 * 1024
    # Read at most one byte more than the limit: enough to detect an
    # oversized upload without loading an arbitrarily large body into memory.
    contents = await file.read(max_bytes + 1)
    if len(contents) > max_bytes:
        raise HTTPException(status_code=400, detail="File too large (max 5MB)")

    # Only touch the filesystem once the upload has been accepted
    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)
    target = lib_dir / safe_name
    target.write_bytes(contents)

    # NOTE(review): this passes the id as name and None as description; if
    # upsert_library overwrites those fields on an existing library, an
    # upload resets them - confirm.
    result = upsert_library(library_id, library_id, None, library_id)
    if isinstance(result, dict) and not result.get("success", True):
        raise HTTPException(status_code=500, detail=result.get("error", "Failed to register library"))

    return {
        "success": True,
        "library_id": library_id,
        "filename": safe_name,
        "path": str(target.relative_to(docs_root())),
        "size_bytes": len(contents),
    }
@app.get("/api/v1/sources")
@app.get("/sources/config")
async def api_list_sources():
    """
    Return the git sources declared in ``docs_sources.yaml``.

    The file may contain either a mapping with a ``sources`` list or a bare
    list of sources at the top level. A missing file, an empty file, or any
    other shape yields an empty list.

    Returns:
        ``{"success": True, "sources": [...], "count": n}``
    """
    path = sources_config_path()
    if not path.exists():
        return {"success": True, "sources": [], "count": 0}

    with path.open() as f:
        data = yaml.safe_load(f)

    # Only a mapping has .get(); a top-level list is itself the source list.
    if isinstance(data, dict):
        sources = data.get("sources")
    else:
        sources = data
    if not isinstance(sources, list):
        sources = []

    return {"success": True, "sources": sources, "count": len(sources)}
@app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
    """
    Ingest every configured git source and report per-source results.

    Sources are read via ``api_list_sources``. Each source is ingested
    independently: an exception for one source is captured in its result
    entry and the remaining sources are still processed.

    Args:
        payload: Optional request body. NOTE(review): its ``override`` flag
            is accepted but has no effect here - confirm intended behaviour.

    Returns:
        Dict with overall ``success`` (true only when every source
        succeeded), the totals, and the list of per-source ``results``.
    """
    source_data = await api_list_sources()
    sources = source_data["sources"]

    results = []
    for source in sources:
        try:
            result = await ingest_git_source(
                library_id=source["library_id"],
                name=source.get("name") or source["library_id"],
                description=source.get("description"),
                repo_url=source["repo_url"],
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
            )
        except Exception as e:
            # Covers git failures as well as malformed entries (missing
            # library_id / repo_url, or an entry that is not a mapping).
            known_id = source.get("library_id") if isinstance(source, dict) else None
            result = {
                "success": False,
                "library_id": known_id or "unknown",
                "error": str(e),
            }
        results.append(result)

    successful = sum(1 for r in results if r.get("success"))
    return {
        "success": successful == len(results),
        "total_sources": len(results),
        "successful": successful,
        "failed": len(results) - successful,
        "results": results,
    }
+47
View File
@@ -0,0 +1,47 @@
# Data Models for document processing and API responses
from typing import Any, Dict, List, Optional
class DocumentChunk:
    """A chunk of text to be embedded, plus free-form metadata."""

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            text: The chunk's text content.
            metadata: Optional metadata mapping; a new empty dict is used when
                omitted (or when an empty mapping is passed).
        """
        self.text = text
        self.metadata = metadata or {}

    @property
    def doc_id(self) -> str:
        """Return a content-derived ID that is stable across processes.

        The built-in ``hash()`` of a string is salted per interpreter run, so
        it cannot serve as a persistent identifier. A truncated SHA-256 digest
        of the text is used instead.
        """
        import hashlib

        digest = hashlib.sha256(self.text.encode("utf-8", "replace")).hexdigest()
        return f"doc-{digest[:16]}"
class IngestResponse:
    """Outcome of a document ingestion run."""

    def __init__(
        self,
        success: bool,
        chunks_count: int = 0,
        error: Optional[str] = None
    ):
        """Record whether ingestion succeeded, how many chunks it produced, and any error text."""
        self.success, self.chunks_count, self.error = success, chunks_count, error
class SearchResponse:
    """Container for the results of a search query."""

    def __init__(
        self,
        results: List[Dict[str, Any]],
        query: str,
        total_results: int
    ):
        """Keep the result rows, the query that produced them, and the reported total."""
        self.results, self.query, self.total_results = results, query, total_results
+235
View File
@@ -0,0 +1,235 @@
# Search Operations for Semantic Query and Library Navigation
from typing import List, Dict, Any, Optional
from pathlib import Path
from .config import settings
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
from .embeddings import embed_text, get_embedding_size
from .db import get_chunks_for_library, list_libraries
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string. An empty or whitespace-only query
            returns an empty list without contacting the vector store.
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return

    Returns:
        List of dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0,
            "content": "..."
        }
        Any failure is logged with ``print`` and yields an empty list.
    """
    if not query or not query.strip():
        return []

    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)
        client = get_client()

        # Build filter if library_id is specified. An import failure is left
        # to the outer handler: silently dropping the filter would return
        # results from other libraries.
        search_filter = None
        if library_id:
            from qdrant_client.models import FieldCondition, Filter, MatchValue

            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # qdrant-client names this argument ``query_filter``; the previous
        # ``search_filter=`` keyword was rejected, and the swallowed error
        # made every search come back empty.
        results = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            query_filter=search_filter,
            limit=limit,
        )

        formatted_results = []
        for result in results:
            payload = result.payload
            if result.score > 0 and payload:
                formatted_results.append({
                    "id": payload.get("id", str(result.id)),
                    "score": float(result.score),
                    "library_id": payload.get("library_id", ""),
                    "path": payload.get("path", ""),
                    "title": payload.get("title", ""),
                    "chunk_index": payload.get("chunk_index", 0),
                    "content": payload.get("content", ""),
                })
        return formatted_results
    except Exception as e:
        print(f"Search error: {e}")
        return []
def _hydrate_search_hits(hits: List[Dict[str, Any]], library_id: str) -> List[Dict[str, Any]]:
    """Return the search hits that have text, filling missing ``content`` from stored chunks."""
    if all(hit.get("content") for hit in hits):
        return list(hits)

    # NOTE(review): assumes stored chunk rows expose "id" and/or
    # "path" + "chunk_index" like the search results do — confirm against db.py.
    by_id: Dict[Any, Dict[str, Any]] = {}
    by_position: Dict[Any, Dict[str, Any]] = {}
    for row in get_chunks_for_library(library_id) or []:
        if row.get("id") is not None:
            by_id[row["id"]] = row
        by_position[(row.get("path"), row.get("chunk_index"))] = row

    hydrated = []
    for hit in hits:
        if hit.get("content"):
            hydrated.append(hit)
            continue
        row = by_id.get(hit.get("id")) or by_position.get((hit.get("path"), hit.get("chunk_index")))
        if row and row.get("content"):
            merged = dict(hit)
            merged["content"] = row["content"]
            merged["title"] = hit.get("title") or row.get("title", "")
            hydrated.append(merged)
    return hydrated


def _select_chunks_within_budget(chunks: List[Dict[str, Any]], token_limit: int):
    """Take chunks in order until the estimated token budget is spent; returns (chunks, tokens)."""
    selected: List[Dict[str, Any]] = []
    used = 0
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        tokens = len(content) // 4  # Simple token estimate
        if used + tokens <= token_limit:
            selected.append(chunk)
            used += tokens
            continue
        # Fill the remaining space with a prefix of this chunk, then stop so
        # the budget cannot be exceeded by later chunks.
        remaining = token_limit - used
        if remaining > 0:
            selected.append({"content": content[:remaining * 4], "title": chunk.get("title", "")})
            used = token_limit
        break
    return selected, used


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, only chunks returned by a
            semantic search for the topic are used (in relevance order)
        token_limit: Maximum tokens to include in output (estimated as
            ``len(text) // 4``)

    Returns:
        Combined markdown content as string, or a human-readable message when
        nothing is found or an error occurs.
    """
    try:
        if topic:
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)
            # Previously the hits were fetched but never used, which left
            # ``selected_chunks`` undefined and turned every topic request
            # into an error message.
            candidates = _hydrate_search_hits(search_results, library_id) if search_results else []
            if not candidates:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
            print(f" [Search] Found {len(candidates)} relevant chunks")
        else:
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)
            if not chunks_data:
                return f"No documents found in library '{library_id}'"
            # Sort by chunk_index descending (unchanged from the original).
            candidates = sorted(chunks_data, key=lambda x: x.get("chunk_index", 0), reverse=True)

        selected_chunks, total_tokens = _select_chunks_within_budget(candidates, token_limit)
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown
        md_parts: List[str] = []
        for chunk in selected_chunks:
            title = chunk.get("title")
            content = chunk.get("content", "") or ""
            if title and content.strip():
                # Heading before the first chunk, while no blank line has been
                # emitted yet, or when none of the last five parts is a heading.
                if not md_parts or "\n\n" not in "".join(md_parts):
                    md_parts.append(f"# {title}")
                elif not any(part.startswith("#") for part in md_parts[-5:]):
                    md_parts.append(f"\n# {title}\n")
            md_parts.append(content)

        result = "\n\n".join(md_parts)

        # If no headings were added, prepend library title
        if not any(part.startswith("#") for part in result.split("\n")[:3]):
            result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result
        return result.rstrip()
    except Exception as e:
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Look up libraries whose name or ID contains ``library_name`` (Context7-style).

    Args:
        library_name: Partial or full library name; matched case-insensitively
            as a substring of each library's name and ID

    Returns:
        At most five candidate dicts shaped like:
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
        An empty list is returned when nothing matches or an error occurs.
    """
    try:
        known = list_libraries()
        if not known:
            return []

        matches = []
        for entry in known:
            haystacks = (entry.get("name", "").lower(), entry.get("id", "").lower())
            needle = library_name.lower()
            if any(needle in text for text in haystacks):
                matches.append({
                    "id": f"/local/{entry['id']}",
                    "name": entry["name"],
                    "description": entry.get("description", ""),
                    "source": "local"
                })

        # Cap the list at five candidates.
        top = matches[:5]
        print(f" [Resolve] Found {len(top)} candidate(s) for: {library_name}")
        return top
    except Exception as e:
        print(f"Error resolving library ID: {e}")
        return []
if __name__ == "__main__":
    def test_search():
        """Smoke-test the search helpers against the configured backends."""
        print("Testing search module...\n")

        # resolve_library_id() and search_docs() are plain functions; awaiting
        # their list results raised TypeError, so they are called directly.
        print("1. Testing resolve_library_id()...")
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: Empty query should return empty list
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")
        print("✅ All tests completed!")

    test_search()
+361
View File
@@ -0,0 +1,361 @@
# Vector Store Operations for Qdrant
import asyncio
import uuid
from typing import List, Dict, Any, Optional
try:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
except ImportError:
QdrantClient = None
Distance = VectorParams = PointStruct = Filter = FieldCondition = MatchValue = None
# Singleton client instance
_client: Optional[Any] = None
try:
from .config import settings
_collection_name = settings.collection_name
except Exception:
_collection_name = "local_context7_docs"
def get_client() -> Any:
    """Return the shared Qdrant client, creating it on first use.

    Connection settings come from the ``QDRANT_URL`` environment variable when
    set, otherwise from ``settings.vector_store_host`` / ``vector_store_port``.

    Raises:
        RuntimeError: If qdrant-client is not installed.
    """
    global _client
    if _client is not None:
        return _client

    if QdrantClient is None:
        raise RuntimeError("qdrant-client is not installed")

    # python-dotenv is optional; load a .env file only when it is available.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    import os
    url_from_env = os.getenv("QDRANT_URL")
    if url_from_env:
        _client = QdrantClient(url=url_from_env)
        return _client

    from .config import settings
    _client = QdrantClient(
        host=settings.vector_store_host,
        port=settings.vector_store_port,
    )
    return _client
def get_collection_name() -> str:
    """Return the name of the Qdrant collection this module reads and writes."""
    name = _collection_name
    return name
def get_embedding_size() -> int:
    """Return the embedding dimension reported by the embeddings module.

    Falls back to 384 when that module cannot be imported or raises
    ``RuntimeError``.
    """
    fallback_dim = 384
    try:
        from .embeddings import get_embedding_size as _embeddings_dim
        dim = _embeddings_dim()
    except (ImportError, RuntimeError):
        return fallback_dim
    return dim
def _create_docs_collection(client: Any, size: int) -> None:
    """Create the module's collection with cosine distance and the given vector size."""
    # qdrant-client expects ``vectors_config``; the previous ``vectors=`` and
    # ``wait=`` keywords are not accepted by ``create_collection``.
    client.create_collection(
        collection_name=_collection_name,
        vectors_config=VectorParams(size=size, distance=Distance.COSINE),
    )


async def ensure_collection(vector_size: Optional[int] = None) -> Dict[str, Any]:
    """
    Ensure the Qdrant collection exists with proper schema.

    A missing collection is created. An existing collection whose vector size
    differs from the expected one is deleted and recreated (its points are
    lost).

    Args:
        vector_size: Override embedding dimension (uses get_embedding_size() if not provided)

    Returns:
        Dict with ``success`` plus, on success, ``collection``, ``vector_size``,
        ``created`` and (after a recreate) ``resized``; on failure, ``error``.
    """
    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}

        client = get_client()
        size = vector_size or get_embedding_size()

        # A failed listing is treated as "not found"; the create call below
        # then reports the real problem.
        try:
            collections = client.get_collections().collections
            collection_exists = any(c.name == _collection_name for c in collections)
        except Exception:
            collection_exists = False

        if not collection_exists:
            _create_docs_collection(client, size)
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": True
            }

        # Only the size inspection is best-effort (the collection config may
        # not expose a single ``vectors.size``). Delete/recreate errors must
        # surface, otherwise a half-finished resize would be reported as success.
        current_size = None
        try:
            collection_info = client.get_collection(_collection_name)
            current_size = collection_info.config.params.vectors.size
        except Exception:
            current_size = None

        if current_size is not None and current_size != size:
            client.delete_collection(_collection_name)
            _create_docs_collection(client, size)
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": False,
                "resized": True
            }

        return {
            "success": True,
            "collection": _collection_name,
            "vector_size": size,
            "created": False
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
async def upsert_chunks(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Write embedded chunks to the vector store, replacing points with the same ID.

    Args:
        chunks: Chunk dicts shaped like:
            {
                "id": "...",
                "library_id": "...",
                "path": "...",
                "title": "...",
                "chunk_index": 0,
                "content": "...",
                "embedding": [...]
            }

    Returns:
        ``{"success": True, "points_added": n}`` on success, otherwise
        ``{"success": False, "error": "..."}``.
    """

    def _as_point(chunk: Dict[str, Any]) -> Any:
        # Point IDs are UUIDv5 values derived from "<library_id>:<chunk id>".
        key = f"{chunk['library_id']}:{chunk['id']}"
        return PointStruct(
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, key)),
            vector=chunk["embedding"],
            payload={
                "id": chunk["id"],
                "library_id": chunk["library_id"],
                "path": chunk.get("path", ""),
                "title": chunk.get("title", ""),
                "chunk_index": chunk.get("chunk_index", 0),
                "content": chunk.get("content", "")
            }
        )

    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}
        if not chunks:
            return {"success": True, "points_added": 0}

        client = get_client()
        batch = [_as_point(item) for item in chunks]
        client.upsert(_collection_name, points=batch)
        return {"success": True, "points_added": len(batch)}
    except Exception as e:
        return {"success": False, "error": str(e)}
async def search_vectors(
    query_vector: List[float],
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search for semantically similar vectors.

    Args:
        query_vector: The embedding vector to search against
        library_id: Optional filter by library ID
        limit: Maximum results to return

    Returns:
        List of result dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        An empty list is returned when qdrant-client is missing or the search
        fails (the error is printed).
    """
    try:
        if QdrantClient is None:
            return []

        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # qdrant-client names this argument ``query_filter``; the previous
        # ``search_filter=`` keyword was rejected and the swallowed error made
        # every search return nothing.
        results = client.search(
            collection_name=_collection_name,
            query_vector=query_vector,
            query_filter=search_filter,
            limit=limit,
        )

        formatted_results = []
        for result in results:
            payload = result.payload
            if result.score > 0 and payload:
                formatted_results.append({
                    "id": payload.get("id", str(result.id)),
                    "score": float(result.score),
                    "library_id": payload.get("library_id", ""),
                    "path": payload.get("path", ""),
                    "title": payload.get("title", ""),
                    "chunk_index": payload.get("chunk_index", 0)
                })
        return formatted_results
    except Exception as e:
        # Keep the "empty list on failure" contract but leave a trace.
        print(f"Vector search error: {e}")
        return []
async def delete_library_vectors(library_id: str) -> Dict[str, Any]:
    """
    Delete all vectors for a given library.

    Points whose payload ``library_id`` matches are paged through with
    ``scroll`` and deleted batch by batch.

    Args:
        library_id: The library ID to delete vectors for

    Returns:
        ``{"success": True, "library_id": ..., "deleted": n}`` on success
        (``"skipped"`` replaces ``"deleted"`` when qdrant-client is missing),
        or ``{"success": False, "error": "..."}`` if any scroll/delete call fails.
    """
    try:
        if QdrantClient is None:
            return {"success": True, "library_id": library_id, "skipped": "qdrant-client is not installed"}

        client = get_client()
        filter_condition = Filter(
            must=[
                FieldCondition(
                    key="library_id",
                    match=MatchValue(value=library_id),
                )
            ]
        )

        batch_size = 100
        offset = None
        deleted = 0
        while True:
            # Errors are no longer swallowed here: a failed scroll or delete
            # used to end the loop and still report success.
            points, next_offset = client.scroll(
                collection_name=_collection_name,
                scroll_filter=filter_condition,
                limit=batch_size,
                offset=offset,
                with_payload=False,  # only the IDs are needed
                with_vectors=False
            )
            if points:
                client.delete(
                    collection_name=_collection_name,
                    points_selector=[p.id for p in points]
                )
                deleted += len(points)
            # ``scroll`` returns the offset of the next page, or None at the end.
            if next_offset is None:
                break
            offset = next_offset

        return {
            "success": True,
            "library_id": library_id,
            "deleted": deleted
        }
    except Exception as e:
        return {"success": False, "error": str(e)}
if __name__ == "__main__":
    # Manual smoke test for the vector store module.
    print("Testing vector store module...\n")

    # Test ensure_collection
    print("1. Testing ensure_collection()...")
    result = asyncio.run(ensure_collection())
    print(f" Result: {result}\n")

    # Size the dummy vector from the configured embedding dimension so it
    # matches the collection created above instead of assuming 384.
    print("2. Testing search_vectors() with dummy vector...")
    dummy_vector = [0.1] * get_embedding_size()
    results = asyncio.run(search_vectors(dummy_vector, limit=5))
    print(f" Results count: {len(results)}\n")

    # Test delete_library_vectors (will succeed even if no vectors exist)
    print("3. Testing delete_library_vectors()...")
    result = asyncio.run(delete_library_vectors("test-library"))
    print(f" Result: {result}\n")
    print("✅ All tests completed!")
+1
View File
@@ -0,0 +1 @@
"""WebUI module for Context7 Docs."""
+166
View File
@@ -0,0 +1,166 @@
/* Page layout */
.container {
    max-width: 1000px;
    margin: 0 auto;
    padding: 20px;
}

header {
    border-bottom: 1px solid #ccc;
    padding-bottom: 15px;
    margin-bottom: 20px;
}

header h1 {
    margin: 0 0 10px 0;
    font-size: 1.5rem;
}

/* Navigation bar */
nav {
    display: flex;
    gap: 15px;
}

nav a {
    text-decoration: none;
    color: #0066cc;
    font-size: 0.9rem;
}

nav a.active {
    font-weight: bold;
    text-decoration: underline;
}

main h2 {
    margin-bottom: 15px;
}

footer {
    margin-top: 40px;
    padding-top: 15px;
    border-top: 1px solid #ccc;
    font-size: 0.8rem;
    color: #666;
}

/* Status cards */
.status-card {
    background: #f5f5f5;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #00c467;
}

.status-message {
    background: #e8f4fd;
    padding: 10px;
    border-radius: 4px;
    margin: 5px 0;
}

/* Tables */
.library-table {
    width: 100%;
    border-collapse: collapse;
    margin-top: 10px;
}

.library-table th, .library-table td {
    padding: 10px;
    text-align: left;
    border-bottom: 1px solid #ddd;
}

.library-table th {
    background: #f5f5f5;
    font-weight: bold;
}

/* Forms */
form input[type="text"], form textarea, form select {
    padding: 8px;
    border: 1px solid #ccc;
    border-radius: 4px;
    margin-right: 10px;
    margin-bottom: 10px;
}

button {
    background: #0066cc;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 4px;
    cursor: pointer;
}

button:hover {
    background: #0055aa;
}

/* Pre formatting */
pre {
    background: #f5f5f5;
    padding: 15px;
    border-radius: 4px;
    overflow-x: auto;
    white-space: pre-wrap;
    word-break: break-word;
}

/* Search results */
.result-card {
    background: #fff;
    border: 1px solid #ddd;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.result-card h3 {
    margin: 0 0 8px 0;
}

.hint {
    color: #666;
    font-size: 0.85rem;
    margin-top: 15px;
}

/* Status colors */
.status-ok {
    color: #00c467;
    font-weight: bold;
}

/* Scrollable preview of uploaded content */
.content-preview {
    max-height: 300px;
    overflow-y: auto;
}

.results-count {
    background: #e8f4fd;
    padding: 10px;
    border-radius: 4px;
    margin-bottom: 15px;
}

.source-card {
    background: #f5f5f5;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.actions-bar {
    margin-top: 15px;
}

.actions-bar form {
    display: inline-flex;
}

/* Scrollable library document body. The docs view renders
   class="docs-content", so both spellings are matched. */
.doc-content, .docs-content {
    max-height: 600px;
    overflow-y: auto;
}
+568
View File
@@ -0,0 +1,568 @@
"""WebUI views for Context7 Docs, rendered as inline HTML strings."""
import html
import json
import os
from pathlib import Path
from typing import Any, Optional

import requests
from fastapi import Request
from fastapi.responses import HTML, JSONResponse
# Internal API base URL
DOCS_API_URL = os.environ.get("DOCS_API_URL", "http://docs-api:8787")
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    params: Optional[dict] = None,
    timeout: float = 300.0,
) -> dict:
    """Make an internal HTTP request to docs-api and return the decoded JSON body.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        endpoint: Path appended to ``DOCS_API_URL``.
        data: Optional JSON request body.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, ``requests`` can block forever on an unresponsive server.

    Returns:
        The parsed JSON response.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    url = f"{DOCS_API_URL}{endpoint}"
    headers = {}
    # Forward the WebUI's API key when one is configured.
    api_key = os.environ.get("WEBUI_API_KEY")
    if api_key:
        headers["X-API-Key"] = api_key
    resp = requests.request(
        method, url, headers=headers, json=data, params=params, timeout=timeout
    )
    return resp.json()
def navbar_html(current: str) -> str:
    """Build the ``<nav>`` element, marking the link whose path equals ``current`` as active."""
    nav_targets = (
        ("/health", "Health"),
        ("/libraries", "Libraries"),
        ("/upload", "Upload"),
        ("/ingest/all", "Ingest All"),
        ("/sources/git", "Git Sources"),
        ("/search", "Search"),
    )
    anchors = ' '.join(
        f'<a href="{href}" class="{"active" if current == href else ""}">{text}</a>'
        for href, text in nav_targets
    )
    markup = f"""<nav>
{anchors}
</nav>"""
    return markup.strip()
def footer_html() -> str:
    """Return the static page footer markup."""
    footer_text = "Context7 Docs WebUI"
    return "<footer>" + footer_text + "</footer>"
def health(request: Request) -> HTML:
    """Render the system health dashboard.

    Queries ``GET /health`` on docs-api. When the call fails, the status is
    shown as ``error`` and the exception text takes the place of the service
    name. All dynamic values are HTML-escaped before being embedded.
    """
    try:
        data = api_request("GET", "/health")
        status = data.get("status", "unknown")
        service = data.get("service", "Service")
    except Exception as e:
        status = "error"
        service = str(e)

    # API values and exception messages are untrusted text.
    safe_status = html.escape(str(status), quote=True)
    safe_service = html.escape(str(service))
    nav = navbar_html("/health")
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Health</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main><h2>System Health</h2>
<div class="status-card" data-status="{safe_status}"><h3>{safe_service}</h3>
<p>Status: <span class="status-ok">{safe_status}</span></p></div>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
def libraries(request: Request) -> HTML:
    """Render the table of libraries returned by ``GET /libraries``.

    A failed API call is shown as an error message above an empty table
    instead of being hidden behind a placeholder row that was still counted.
    All library fields are HTML-escaped.
    """
    load_error = None
    try:
        data = api_request("GET", "/libraries")
        libs = data.get("libraries", [])
    except Exception as e:
        libs = []
        load_error = str(e)

    table_rows = []
    for lib in libs:
        lib_id = html.escape(str(lib.get("id")), quote=True)
        name = html.escape(str(lib.get("name", "") or ""))
        description = html.escape(str(lib.get("description", "") or "(no description)"))
        table_rows.append(
            f"""<tr><td>{lib_id}</td>
<td>{name}</td>
<td>{description}</td>
<td><a href="/docs/{lib_id}">View Docs</a></td></tr>"""
        )

    error_html = ""
    if load_error is not None:
        error_html = f'<p class="status-message">Failed to load libraries: {html.escape(load_error)}</p>'

    nav = navbar_html("/libraries")
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Libraries</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Libraries ({len(libs)})</h2>
{error_html}
<div class="actions-bar">
<form action="/folders/create" method="post" style="display:inline;">
<input type="text" name="name" placeholder="New library folder name" required>
<button type="submit">Create Folder</button>
</form>
</div>
<table class="library-table">
<thead><tr><th>ID</th><th>Name</th><th>Description</th><th>Actions</th></tr></thead>
<tbody>{"".join(table_rows)}</tbody>
</table>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
def _upload_page(main_html: str) -> HTML:
    """Wrap the given ``<main>`` content in the shared Upload page skeleton."""
    nav = navbar_html("/upload")
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
{main_html}
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")


async def upload(request: Request) -> HTML:
    """Show the upload form, or preview an uploaded text file.

    A Starlette/FastAPI ``Request`` has no ``files`` attribute; uploaded files
    arrive through the awaitable ``request.form()``. The view is therefore a
    coroutine, which FastAPI routes accept in place of a plain function.

    On POST with a ``file`` field, the first 5000 characters are decoded as
    UTF-8, a preview of up to 1000 characters is shown, and the content is
    placed in a hidden field of a follow-up form posting to
    ``/ingest/uploaded``. Any failure while reading or decoding renders the
    "File too large!" page, as before.
    """
    form = await request.form() if request.method == "POST" else {}
    uploaded_file = form.get("file")
    if uploaded_file is None or isinstance(uploaded_file, str):
        return _upload_page("""<h2>Upload Documentation Files</h2>
<form method="post" enctype="multipart/form-data">
<label for="file">Select file:</label>
<input type="file" name="file" id="file" accept=".txt,.md,.json,.py,.js,.html,.css,.yaml,.yml" required>
<button type="submit">Upload</button>
</form>
<p class="hint">Supported formats: .txt, .md, .json, .py, .js, .html, .css, .yaml</p>""")

    try:
        raw = await uploaded_file.read()
        content = raw.decode("utf-8")[:5000]
    except Exception:
        return _upload_page("""<h2>File too large!</h2>
<p>Please upload smaller text files (limit: ~5MB).</p>""")

    # Truncate before escaping so an entity such as "&amp;" is never cut in
    # half; quote=True keeps the hidden field's attribute value intact.
    preview = content[:1000] + "..." if len(content) > 1000 else content
    safe_preview = html.escape(preview)
    safe_content = html.escape(content, quote=True)
    return _upload_page(f"""<h2>Upload Complete!</h2>
<pre class="content-preview">{safe_preview}</pre>
<form method="post" action="/ingest/uploaded">
<input type="hidden" name="content" value="{safe_content}">
<label for="library_id">Library (optional):</label>
<input type="text" id="library_id" name="library_id" placeholder="e.g., my-docs">
<button type="submit">Ingest</button>
</form>""")
def ingest_all(request: Request) -> JSONResponse:
    """Kick off ingestion of every library via ``POST /ingest`` and report the chunk count."""
    try:
        outcome = api_request("POST", "/ingest")
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    try:
        summary = f"Processed {outcome.get('chunks', 0)} chunks"
        return JSONResponse(content={"status": "ok", "message": summary})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
def _ingest_page(title: str, library_id: str, main_html: str) -> HTML:
    """Wrap the given ``<main>`` content in the shared Ingest page skeleton."""
    nav = navbar_html(f"/ingest/{library_id}")
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
{main_html}
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")


async def ingest_library(request: Request, library_id: str) -> HTML:
    """Ingest a specific library, or show its content form.

    Form data must be obtained with ``await request.form()``; testing
    ``"content" in request.form`` checked a bound method and raised
    ``TypeError``. The view is therefore a coroutine, which FastAPI routes
    accept in place of a plain function.

    NOTE(review): as in the original, a request that carries a ``content``
    field only re-renders the form, while a request without it triggers
    ``POST /ingest/{library_id}``. This looks inverted — confirm the intent.
    """
    safe_library = html.escape(library_id, quote=True)
    form = await request.form() if request.method == "POST" else {}

    if "content" in form:
        return _ingest_page("Context7 Docs - Ingest", library_id, f"""<h2>Ingest for Library: {safe_library}</h2>
<form method="post" action="/ingest/{safe_library}">
<label for="content">Content (text):</label>
<textarea id="content" name="content" rows="10" maxlength="10000"></textarea>
<button type="submit">Ingest</button>
</form>""")

    try:
        result = api_request("POST", f"/ingest/{library_id}")
        safe_msg = html.escape(str(result.get('message', '') or ''))
        safe_json = html.escape(json.dumps(result, indent=2))
        return _ingest_page("Context7 Docs - Ingest Result", library_id, f"""<h2>Ingestion Complete!</h2>
<p>{safe_msg}</p>
<pre>{safe_json}</pre>
<a href="/libraries">← Back to Libraries</a>""")
    except Exception as e:
        safe_error = html.escape(str(e))
        return _ingest_page("Context7 Docs - Error", library_id, f"""<h2>Error</h2>
<pre>{safe_error}</pre>""")
async def folders_create(request: Request) -> JSONResponse:
    """Create a new library folder from the posted ``name`` form field.

    Returns:
        JSON ``{"status": "ok", ...}`` on success, a 400 response when the
        name is empty, or a 500 response with the error text on failure.
    """
    try:
        # ``request.form`` is a coroutine method; the parsed form has to be awaited.
        form = await request.form()
        name = str(form.get("name") or "").strip()
        if not name:
            return JSONResponse(status_code=400, content={"error": "Folder name is required"})

        import inspect
        from backend.app.db import upsert_library

        # The backend calls upsert_library without awaiting it, so await the
        # result only if it actually is awaitable.
        outcome = upsert_library(library_id=name, name=name, description=None, source_path=f"/docs/{name}")
        if inspect.isawaitable(outcome):
            await outcome
        return JSONResponse(content={"status": "ok", "message": f"Created folder '{name}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def folders_delete(request: Request) -> JSONResponse:
    """Delete the library named by the ``id`` query parameter.

    Returns:
        JSON ``{"status": "ok", ...}`` on success, a 400 response when no id
        is given, or a 500 response with the error text on failure.
    """
    library_id = request.query_params.get("id", "").strip()
    if not library_id:
        return JSONResponse(status_code=400, content={"error": "Library id is required"})
    try:
        import inspect
        from backend.app.db import delete_library

        # NOTE(review): delete_library's definition is not visible here; it is
        # awaited only if it returns an awaitable, so both a plain function
        # and a coroutine work.
        outcome = delete_library(library_id)
        if inspect.isawaitable(outcome):
            await outcome
        return JSONResponse(content={"status": "ok", "message": f"Deleted library '{library_id}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
async def ingest_uploaded(request: Request) -> HTML:
    """Ingest content posted from the upload preview form.

    Reads ``content`` (truncated to 10000 characters) and ``library_id``
    (defaulting to ``"uploaded"`` when missing or blank) from the form, sends
    them to ``POST /ingest/{library_id}`` and renders the API response. All
    dynamic values are HTML-escaped.
    """
    try:
        # ``request.form`` is a coroutine method; the parsed form has to be awaited.
        form = await request.form()
        content = str(form.get("content") or "")[:10000]
        # A blank text field arrives as "", which would produce "/ingest/".
        library_id = str(form.get("library_id") or "").strip() or "uploaded"

        result = api_request("POST", f"/ingest/{library_id}", data={"content": content})
        safe_msg = html.escape(str(result.get('message', '') or ''))
        safe_json = html.escape(json.dumps(result, indent=2))
        nav = navbar_html("/upload")
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload Result</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Ingestion Complete!</h2>
<p>{safe_msg}</p>
<pre>{safe_json}</pre>
<a href="/upload">← Upload Another</a>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Upload Ingest Error</h1><pre>{safe_error}</pre><a href="/upload">← Try Again</a></body>
</html>""", media_type="text/html")
def docs(request: Request, library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> HTML:
    """Render documentation text for one library.

    Args:
        request: Incoming request (unused).
        library_id: Library whose docs are fetched from
            ``GET /libraries/{library_id}/docs``.
        topic: Optional topic, sent as the ``topic`` query parameter.
        tokens: Token budget, sent as the ``tokens`` query parameter.

    Returns:
        An HTML page showing up to 10000 characters of the content, or the
        error text when the API call fails.
    """
    from urllib.parse import quote, urlencode

    try:
        # The query string is built here because api_request() in this file
        # takes no ``params`` argument; passing one raised TypeError, so the
        # page always showed that error instead of the docs.
        query = {"tokens": tokens}
        if topic:
            query["topic"] = topic
        endpoint = f"/libraries/{quote(library_id, safe='')}/docs?{urlencode(query)}"
        data = api_request("GET", endpoint)
        content = data.get("content", "")
    except Exception as e:
        content = str(e)

    # Truncate before escaping so an entity is never cut in half.
    safe_content = html.escape(str(content or "")[:10000])
    safe_library = html.escape(library_id)
    safe_topic = html.escape(topic) if topic else '(all)'
    nav = navbar_html("/docs/{}".format(library_id))
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Library: {safe_library}</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Library: {safe_library}</h2>
<p><strong>Topic:</strong> {safe_topic} | <strong>Tokens:</strong> {tokens}</p>
<pre class="docs-content">{safe_content}</pre>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
def search_redirect(request: Request) -> JSONResponse:
    """Return a JSON body that points the client at the search form."""
    body = {"redirect": "/search/form"}
    return JSONResponse(content=body)
def search_form(request: Request) -> HTML:
    """Render the search page: query text, optional library ID and result limit."""
    nav = navbar_html("/search")
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Search Docs</h2>
<form method="post" action="/search">
<label for="query">Query:</label>
<input type="text" id="query" name="query" required placeholder="Enter your search query...">
<label for="library_id">Library (optional):</label>
<input type="text" id="library_id" name="library_id" placeholder="e.g., foundryvtt">
<label for="limit">Limit results:</label>
<select id="limit" name="limit">
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="20">20</option>
<option value="50">50</option>
</select>
<button type="submit">Search</button>
</form>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
def search_results(request: Request) -> HTML:
    """Run a search and render the result cards.

    Reads ``q`` and ``limit`` from the query string, posts them to ``/search``
    on docs-api and renders one card per result. The query, titles, snippets
    and error text are HTML-escaped, since they come from the user or from
    indexed documents.
    """
    try:
        query = request.query_params.get("q", "")
        limit = int(request.query_params.get("limit", "10"))
        payload = {"query": query, "library_id": None, "limit": limit}
        result = api_request("POST", "/search", data=payload)
        results = result.get("results", [])
    except Exception as e:
        safe_error = html.escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Error</h1><pre>{safe_error}</pre><a href="/search/form">← Try Again</a></body>
</html>""", media_type="text/html")

    cards = []
    for r in results:
        # ``content`` may be missing or None; fall back to ``chunk``, then "".
        body_text = str(r.get("content") or r.get("chunk") or "")
        title = r.get("title", "Untitled") or (body_text[:100] + "...")
        safe_title = html.escape(str(title))
        safe_snippet = html.escape(body_text[:500])
        safe_id = html.escape(str(r.get('id')), quote=True)
        safe_library = html.escape(str(r.get('library_id')), quote=True)
        cards.append(f"""<div class="result-card" data-id="{safe_id}"><h3>{safe_title}</h3>
<p>{safe_snippet}...</p><a href="/docs/{safe_library}">View Full</a></div>""")

    safe_query = html.escape(query)
    nav = navbar_html("/search")
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search Results</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Search Results for "{safe_query}"</h2>
<div class="results-count">{len(results)} results found</div>
{''.join(cards)}
<a href="/search/form">← New Search</a>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
def sync_sources(request: Request) -> HTML:
    """Sync git sources.

    POST: calls the docs API ``/sources/sync`` endpoint and renders its JSON
    response, or an error page if the call raises.
    GET (any other method): renders the sync form together with the ids
    returned by the ``/libraries`` endpoint.

    All dynamic text is HTML-escaped before it is placed in the page.
    """
    from html import escape  # stdlib; local import keeps this handler self-contained

    if request.method == "POST":
        # NOTE(review): the form posts an "override" checkbox, but its value is
        # not forwarded to the API here — confirm whether that is intended.
        try:
            data = api_request("POST", "/sources/sync")
            safe_json = escape(json.dumps(data, indent=2))
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Sync Result</title></head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sync/sources")}</header>
<main><h2>Git Sync Complete!</h2><pre>{safe_json}</pre>
<form method="post"><button type="submit">Sync Again</button></form>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
        except Exception as e:
            safe_error = escape(str(e))
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Sync Error</h1><pre>{safe_error}</pre><a href="/sources/git">← Try Again</a></body>
</html>""", media_type="text/html")
    else:
        try:
            data = api_request("GET", "/libraries")
            # Skip the "error" placeholder id and entries without an id, so the
            # join below always receives strings.
            libs = [
                str(lib.get("id"))
                for lib in data.get("libraries", [])
                if lib.get("id") not in (None, "error")
            ]
        except Exception:
            # Best effort: the form is still usable when the library list is unavailable.
            libs = []
        lib_list = escape(", ".join(libs)) if libs else "(none)"
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sync</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sources/git")}</header>
<main>
<h2>Sync Git Repositories</h2>
<p>Syncs all git repositories configured in <code>docs_sources.yaml</code>.</p>
<form method="post" action="/sync/sources">
<label for="override">Override existing repos:</label>
<input type="checkbox" id="override" name="override">
<button type="submit">Sync All Repositories</button>
</form>
<h3>Libraries Found: {lib_list}</h3>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
def git_sources(request: Request) -> HTML:
    """List configured git sources.

    Loads ``docs_sources.yaml`` (resolved three directory levels above this
    file) and renders one card per entry under its ``sources`` key. Any
    failure while reading or rendering the file produces an error page.

    Values taken from the YAML file and exception text are HTML-escaped
    before being written into the page.
    """
    import yaml
    from html import escape  # stdlib; local import, like the yaml import above

    config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"
    try:
        with open(config_path) as f:
            # safe_load returns None for an empty file; treat that as "no sources".
            data = yaml.safe_load(f) or {}
        sources = data.get("sources") or []
        source_blocks = []
        for src in sources:
            repo_url = src.get("repo_url") or ""
            # Long URLs are shortened to 50 characters plus an ellipsis.
            url = repo_url[:50] + "..." if len(repo_url) > 50 else repo_url
            branch = src.get("branch", "main")
            include = src.get("include_paths", ["*"])
            exclude = src.get("exclude_paths", [])
            include_text = ", ".join(map(str, include))
            exclude_text = " | Exclude: " + ", ".join(map(str, exclude)) if exclude else ""
            safe_library = escape(str(src.get("library_id", "unknown")))
            source_blocks.append(f"""<div class="source-card">
<strong>{safe_library}</strong><br>
URL: {escape(str(url))}<br>
Branch: {escape(str(branch))}<br>
Include: {escape(include_text)}{escape(exclude_text)}
</div>""")
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sources</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sources/git")}</header>
<main>
<h2>Configured Git Sources ({len(sources)})</h2>
{''.join(source_blocks)}
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = escape(str(e))
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Git Sources Error</h1><pre>{safe_error}</pre></body>
</html>""", media_type="text/html")
def logs(request: Request) -> HTML:
    """Render the static logs/status page.

    The page shows the configured docs API URL. The Qdrant/MCP status line is
    fixed text in the template; no health check is performed here.
    """
    nav = navbar_html("/logs")
    api_url = DOCS_API_URL
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Logs</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Status Messages</h2>
<div class="status-message">Docs API: {api_url}</div>
<div class="status-message">Qdrant Health: healthy | MCP OK: yes</div>
<p class="hint">Logs are printed to container stdout/stderr. For full logs, inspect Docker containers directly.</p>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
# Public names of this module. __all__ only controls what `from <module> import *`
# exports; it does not register anything by itself.
# NOTE(review): presumably these are the route handlers wired up elsewhere — confirm.
__all__ = [
    "health", "libraries", "upload", "ingest_all", "ingest_library",
    "folders_create", "folders_delete", "docs", "search_redirect",
    "search_form", "search_results", "sync_sources", "git_sources", "logs"
]
+37
View File
@@ -0,0 +1,37 @@
# Backend API Dependencies
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
python-dotenv==1.0.0
python-multipart==0.0.6
# Qdrant Vector Store Client
qdrant-client==1.7.0
# Text Processing for token estimation
tiktoken==0.7.0
# Local Embeddings using FastEmbed
fastembed==0.3.0
# PDF support for document ingestion
pypdf==5.0.0
# HTTP client for MCP server communication
httpx==0.26.0
# HTTP client for WebUI (used to call docs-api from WebUI)
requests==2.31.0
# FastMCP for MCP server integration (also used by backend)
fastmcp==0.6.0
# YAML parser for sources configuration
PyYAML==6.0.1
# =============================================================================
# TEST DEPENDENCIES
# =============================================================================
pytest==8.3.2
pytest-mock==3.14.0
pytest-asyncio==0.23.7
+2
View File
@@ -0,0 +1,2 @@
# This directory is intentionally left empty to preserve the folder structure for Docker volumes.
# Data from Qdrant will be mounted here via docker-compose.yml.
+99
View File
@@ -0,0 +1,99 @@
# Context7-style MCP System - Docker Compose (Production/Home Server Hardened)
services:
# Qdrant Vector Database Service
qdrant:
image: qdrant/qdrant:latest
container_name: qdrant
ports:
- "${QDRANT_PORT:-6333}:6333"
volumes:
- ./data/qdrant:/qdrant/storage
environment:
- QDRANT__MEMORY_MAPPED_INDEXES=1
restart: unless-stopped
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
networks:
- context7-network
# Docs API Backend Service (FastAPI)
docs-api:
build:
context: ./backend
dockerfile: Dockerfile
container_name: docs-api
ports:
- "${HOST_PORT:-8787}:8787"
environment:
- VECTOR_STORE_HOST=qdrant
- VECTOR_STORE_PORT=6333
- DOCS_PATH=/docs
- DB_PATH=/data/db.sqlite
- LOG_LEVEL=INFO
- API_KEY_DOCS_API=${DOCS_API_KEY:-}
volumes:
- ./docs:/docs
- ./data:/data
depends_on:
- qdrant
networks:
- context7-network
healthcheck:
  # The probe runs inside the container, where the API listens on 8787
  # (see the "…:8787" port mapping above). HOST_PORT only selects the
  # host-side port, so it must not be used in this URL.
  test: ["CMD", "curl", "-f", "http://localhost:8787/health"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 15s
# MCP Server Service (FastMCP via streamable HTTP)
docs-mcp:
build:
context: ./mcp-server
dockerfile: Dockerfile
container_name: docs-mcp
ports:
- "${MCP_HOST_PORT:-8788}:8788"
environment:
- DOCS_API_URL=http://docs-api:8787
- MCP_API_KEY=${MCP_API_KEY:-}
volumes:
- ./docs:/docs:ro
- ./data:/data
restart: unless-stopped
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
depends_on:
docs-api:
condition: service_healthy
networks:
- context7-network
# WebUI Service (HTML interface)
webui:
build:
context: ./webui
dockerfile: Dockerfile
container_name: webui
ports:
- "${WEBUI_PORT:-8790}:8790"
environment:
- DOCS_API_URL=http://docs-api:8787
- WEBUI_API_KEY=${DOCS_WEBUI_API_KEY:-}
volumes:
- ./docs:/docs
- ./data:/data
depends_on:
docs-api:
condition: service_healthy
networks:
- context7-network
networks:
context7-network:
driver: bridge
+143
View File
@@ -0,0 +1,143 @@
# Getting Started
Welcome to the Context7-style MCP System documentation!
## Overview
This system provides a self-hosted, local context7-compatible MCP (Model Context Protocol) solution using Docker containers. It enables you to:
- Ingest and index your own documents
- Perform semantic search on vector embeddings
- Integrate with MCP-enabled IDEs for intelligent tool interactions
## Architecture
```
┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   Client    │────▶│  docs-api   │◀────│  docs-mcp   │
│ (IDE/Tool)  │     │  (FastAPI)  │     │ (MCP Server)│
└─────────────┘     └──────┬──────┘     └─────────────┘
                           │
                           ▼
                    ┌─────────────┐
                    │   Qdrant    │
                    │ (Vector DB) │
                    └─────────────┘
## Quick Start
### 1. Start All Services
```bash
docker compose up -d --build
```
### 2. Verify Services Are Running
```bash
docker compose ps
```
You should see all four services in "Up" status:
- `qdrant` (port 6333)
- `docs-api` (port 8787)
- `docs-mcp` (port 8788)
- `webui` (port 8790)
### 3. Access the API
Open your browser and navigate to:
```
http://localhost:8787/docs
```
You should see the FastAPI documentation page.
## API Endpoints
### Health Check
```bash
curl http://localhost:8787/health
```
Expected response:
```json
{"status":"ok"}
```
### Ingest Document
Upload a text document to be processed and indexed:
```bash
curl -X POST "http://localhost:8787/api/v1/ingest" \
-H "Content-Type: application/json" \
-d '{
"content": "This is sample document content for semantic search testing.",
"metadata": {"source": "example", "type": "text"}
}'
```
### Search Documents
Perform a similarity search on ingested documents (add an optional `"library_id"` field to restrict the search to one library):
```bash
curl -X POST "http://localhost:8787/search" \
  -H "Content-Type: application/json" \
  -d '{
    "query": "semantic search",
    "limit": 5
  }'
```
## Configuration
### Environment Variables
Copy the example environment file and customize:
```bash
cp .env.example .env
```
Key variables:
- `VECTOR_STORE_HOST`: Internal hostname of Qdrant (default: qdrant)
- `VECTOR_STORE_PORT`: Qdrant port (default: 6333)
### Docker Compose
All services are defined in `docker-compose.yml`. Key networking details:
- Services communicate internally via `context7-network`
- Qdrant uses service name `qdrant` for internal connections
- Vector store is exposed externally on port 6333 for debugging
## Next Steps
1. Review the project structure to understand component roles
2. Customize the backend API endpoints in `backend/app/main.py`
3. Implement MCP tools in `mcp-server/server.py`
4. Add more example documents in the `docs/` directory
## Troubleshooting
### Check Logs
```bash
docker compose logs -f docs-api
docker compose logs -f qdrant
docker compose logs -f docs-mcp
```
### Reset All Services
```bash
docker compose down -v
docker compose up -d --build
```
## Support
For issues, refer to the `README.md` or check the Qdrant documentation.
+27
View File
@@ -0,0 +1,27 @@
# Git Repository Sources Configuration
# Each source defines a library to ingest from a git repository
# Paths are relative to the cloned repo root
sources:
- library_id: foundryvtt
name: Foundry VTT
description: Foundry Virtual Tabletop system documentation
repo_url: https://github.com/foundryvtt/foundryvtt.git
branch: main
include_paths:
- docs
- src
exclude_paths:
- node_modules
- .git
# Add more sources here following the same structure:
# - library_id: my-repo
# name: My Repository
# description: My documentation
# repo_url: https://github.com/user/my-repo.git
# branch: main
# include_paths:
# - docs
# exclude_paths:
# - node_modules
+30
View File
@@ -0,0 +1,30 @@
# MCP Server Service
FROM python:3.11-slim
WORKDIR /app
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies cleanly
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy server code
COPY server.py .
# Volumes are mounted at these paths (configured in docker-compose.yml):
# ./docs -> /docs
# ./data -> /data
# /data holds db.sqlite, the SQLite database file
# Expose MCP port
EXPOSE 8788
# Healthcheck
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import socket; s=socket.create_connection(('127.0.0.1', 8788), 5); s.close()"
# Run the MCP server using streamable HTTP transport
CMD ["python", "server.py"]
+21
View File
@@ -0,0 +1,21 @@
# MCP Server Dependencies
fastmcp==0.6.0
httpx==0.26.0
# For Qdrant vector store operations
qdrant-client==1.7.0
# Text processing for token estimation
tiktoken==0.7.0
# Local embeddings using FastEmbed
fastembed==0.3.0
# PDF support for document ingestion
pypdf==5.0.0
# Environment variables loader
python-dotenv==1.0.0
# YAML parser for sources configuration
PyYAML==6.0.1
+337
View File
@@ -0,0 +1,337 @@
# MCP Server for local-context7 Docs API with Git Sources Support
"""
MCP server providing Context7-style tools for interacting with the local docs API.
This server exposes 6 tools:
- resolve-library-id: Find libraries matching a name (with /local/ prefix)
- get-library-docs: Retrieve documentation from a library
- list-libraries: List all discovered libraries
- search-docs: Semantic search across documents
- refresh-library: Re-ingest documents for a library or all libraries
- sync-sources: Sync git repositories from configuration file
"""
import asyncio
import os
from typing import Optional, List, Dict, Any
try:
import httpx
except ImportError:
httpx = None
try:
    from fastmcp import FastMCP
except ImportError:
    class _Tool:
        """Record of a registered tool; only its name is stored."""

        def __init__(self, name: str):
            self.name = name

    class FastMCP:
        """Import-time fallback used by tests when fastmcp is not installed."""

        def __init__(self, *args, **kwargs):
            # Constructor arguments are accepted and ignored.
            self.tools = []

        def tool(self):
            # Decorator factory: records the function's name and hands the
            # function back unchanged.
            def register(fn):
                self.tools.append(_Tool(fn.__name__))
                return fn

            return register

        def run(self, *args, **kwargs):
            raise RuntimeError("fastmcp is not installed")
# Environment configuration
# The default must be a literal URL: Python does not expand shell-style
# "${VAR:-default}" placeholders, so the previous default was an unusable host
# string. 8787 is the port the docs API is addressed on inside the Docker network.
DOCS_API_URL = os.getenv("DOCS_API_URL", "http://docs-api:8787")
# NOTE(review): MCP_API_KEY is read here but not referenced by any tool in this
# file — confirm where (or whether) it is meant to be enforced.
MCP_API_KEY = os.getenv("MCP_API_KEY", "")
def strip_local_prefix(lib_id: str) -> str:
    """Return ``lib_id`` without a leading "/local/"; other ids pass through unchanged."""
    prefix = "/local/"
    if not lib_id.startswith(prefix):
        return lib_id
    return lib_id[len(prefix):]
# Create the FastMCP instance; the @mcp.tool() decorators below register tools on it.
# NOTE(review): root_path is passed through as a keyword argument — confirm the
# installed fastmcp version accepts it (the import-time fallback ignores all arguments).
mcp = FastMCP("context7-docs", root_path="/app")
@mcp.tool()
async def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to Context7-style candidates.

    Queries the docs API ``/libraries/search`` endpoint with the given name and
    returns the ``matches`` list from its JSON response.

    Args:
        library_name: The library name to search for (e.g., "foundryvtt")

    Returns:
        List of candidate libraries as returned by the API, e.g.:
        [
            {
                "id": "/local/foundryvtt",
                "name": "Foundry VTT",
                "description": "Fantasy tabletop virtual table...",
                "source": "local"
            },
            ...
        ]
        An empty list is returned when the request fails for any reason.
    """
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as http:
            resp = await http.get("/libraries/search", params={"q": library_name})
            # Anything other than a plain 200 is treated as a failure.
            if resp.status_code != 200:
                raise Exception(f"API error: {resp.status_code} - {resp.text}")
            body = resp.json()
            return body.get("matches", [])
    except Exception as e:
        # Failures are logged to stdout and reported to the caller as "no matches".
        print(f"Error resolving library '{library_name}': {e}")
        return []
@mcp.tool()
async def get_library_docs(context7_compatible_library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> str:
    """
    Retrieve documentation content from a library.

    Calls ``GET /libraries/{library_id}/docs`` on the docs API and returns the
    ``content`` field of its JSON response.

    Args:
        context7_compatible_library_id: The Context7-style library ID; a leading
            "/local/" prefix is stripped before the API call
        topic: Optional topic forwarded to the API (omitted from the request when empty)
        tokens: Token budget forwarded to the API (default: 8000)

    Returns:
        The documentation content string, or a string starting with
        "Error retrieving documentation:" when the request fails.

    Example:
        get_library_docs("/local/foundryvtt", topic="hooks", tokens=8000)
    """
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        # The API expects the bare id, without the /local/ prefix.
        bare_id = strip_local_prefix(context7_compatible_library_id)
        query_params = {"tokens": tokens}
        if topic:
            query_params["topic"] = topic
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as http:
            resp = await http.get(f"/libraries/{bare_id}/docs", params=query_params)
            if resp.status_code != 200:
                raise Exception(f"API error: {resp.status_code} - {resp.text}")
            body = resp.json()
            return body.get("content", "")
    except Exception as e:
        print(f"Error getting library docs for '{context7_compatible_library_id}': {e}")
        return f"Error retrieving documentation: {str(e)}"
@mcp.tool()
async def list_libraries() -> List[Dict[str, Any]]:
    """
    List all discovered libraries in the system.

    Calls ``GET /libraries`` on the docs API and returns the ``libraries`` list
    from its JSON response.

    Returns:
        List of library objects as returned by the API, e.g.:
        [
            {
                "id": "/local/foundryvtt",
                "name": "Foundry VTT",
                "description": "...",
                "source": "local"
            },
            ...
        ]
        An empty list is returned when the request fails for any reason.
    """
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as http:
            resp = await http.get("/libraries")
            if resp.status_code != 200:
                raise Exception(f"API error: {resp.status_code} - {resp.text}")
            body = resp.json()
            return body.get("libraries", [])
    except Exception as e:
        print(f"Error listing libraries: {e}")
        return []
@mcp.tool()
async def search_docs(query: str, library_id: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
    """
    Perform semantic search across documents.

    POSTs the query to the docs API ``/search`` endpoint and returns the
    ``results`` list from its JSON response.

    Args:
        query: The search query string
        library_id: Optional library ID filter (a leading "/local/" prefix is
            stripped). When empty or None, no filter is sent.
        limit: Maximum number of results to request (default: 10)

    Returns:
        List of search results as returned by the API, e.g.:
        [
            {
                "id": "...",
                "score": 0.123,
                "library_id": "...",
                "path": "...",
                "title": "...",
                "chunk_index": 0
            },
            ...
        ]
        An empty list is returned when the request fails for any reason.
    """
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        request_body = {"query": query, "limit": limit}
        if library_id:
            request_body["library_id"] = strip_local_prefix(library_id)
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as http:
            resp = await http.post("/search", json=request_body)
            if resp.status_code != 200:
                raise Exception(f"API error: {resp.status_code} - {resp.text}")
            body = resp.json()
            return body.get("results", [])
    except Exception as e:
        print(f"Error searching for query '{query}': {e}")
        return []
@mcp.tool()
async def refresh_library(library_id: Optional[str] = None) -> Dict[str, Any]:
    """
    Re-ingest documents by calling the docs API ``/ingest/all`` endpoint.

    Args:
        library_id: Accepted for interface compatibility. It is NOT forwarded:
            this tool always triggers a re-ingest of all libraries. When a
            value is supplied, the result carries a "warning" entry saying so.

    Returns:
        On success:
            {
                "success": True,
                "total_libraries": 2,
                "successful": 2,
                "failed": 0,
                "total_chunks": 150
            }
        plus "warning" when ``library_id`` was supplied.
        On failure:
            {"success": False, "error": "<message>"}
    """
    warning = None
    if library_id:
        # Only the bulk endpoint is used below, so tell the caller that the
        # requested scope was not applied instead of silently widening it.
        warning = (
            f"library_id '{library_id}' was not applied: "
            "all libraries were re-ingested"
        )
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client:
            response = await client.post("/ingest/all")
            if response.status_code != 200:
                raise Exception(f"API error: {response.status_code} - {response.text}")
            data = response.json()
            summary = {
                "success": True,
                "total_libraries": data.get("total_libraries", 0),
                "successful": data.get("successful", 0),
                "failed": data.get("failed", 0),
                "total_chunks": data.get("total_chunks", 0)
            }
            if warning:
                print(f"Warning: {warning}")
                summary["warning"] = warning
            return summary
    except Exception as e:
        print(f"Error refreshing library '{library_id or 'all'}': {e}")
        return {"success": False, "error": str(e)}
@mcp.tool()
async def sync_sources(override: bool = False) -> Dict[str, Any]:
    """
    Sync all git repositories defined in the sources configuration file.

    POSTs to the docs API ``/sources/sync`` endpoint and summarizes its JSON
    response. The cloning/updating and ingestion themselves are performed by
    the API, not by this tool.

    Args:
        override: If true, ``{"override": true}`` is sent in the request body
            (described by the API as clearing an existing repo before cloning).
            When false, an empty body is sent. Default: false

    Returns:
        On success:
            {
                "success": true,
                "total_sources": 2,
                "successful": 1,
                "failed": 1,
                "results": [
                    {
                        "library_id": "foundryvtt",
                        "success": true,
                        "message": "...",
                        "files_discovered": 450,
                        "chunks_created": 2340,
                        "vectors_added": 2340
                    },
                    ...
                ]
            }
        On failure:
            {"success": false, "error": "<message>"}
    """
    try:
        if httpx is None:
            raise RuntimeError("httpx is not installed")
        # The flag is only included when it is truthy.
        request_body = {}
        if override:
            request_body["override"] = override
        async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as http:
            resp = await http.post("/sources/sync", json=request_body)
            if resp.status_code != 200:
                raise Exception(f"API error: {resp.status_code} - {resp.text}")
            body = resp.json()
            return {
                "success": True,
                "total_sources": body.get("total_sources", 0),
                "successful": body.get("successful", 0),
                "failed": body.get("failed", 0),
                "results": body.get("results", [])
            }
    except Exception as e:
        print(f"Error syncing git sources: {e}")
        return {"success": False, "error": str(e)}
if __name__ == "__main__":
    # Run MCP server using streamable HTTP transport.
    # Host and port come from MCP_HOST / MCP_PORT, defaulting to 0.0.0.0:8788.
    host = os.getenv("MCP_HOST", "0.0.0.0")
    port = int(os.getenv("MCP_PORT", "8788"))
    print(f"Starting MCP server on http://{host}:{port}")
    # The banner lists the tools under the function names and parameter names
    # defined in this file, so it matches the signatures above.
    print("Tools available:")
    print(" - resolve_library_id(library_name)")
    print(" - get_library_docs(context7_compatible_library_id, topic=None, tokens=8000)")
    print(" - list_libraries()")
    print(" - search_docs(query, library_id=None, limit=10)")
    print(" - refresh_library(library_id=None)")
    print(" - sync_sources(override=False)")
    if hasattr(mcp, "run"):
        mcp.run(transport="streamable-http", host=host, port=port)
    else:
        # Fallback for an object without run(): serve it with uvicorn.
        import uvicorn
        uvicorn.run(mcp, host=host, port=port)
+1
View File
@@ -0,0 +1 @@
"""Compatibility package for importing the mcp-server source tree in tests."""
+13
View File
@@ -0,0 +1,13 @@
"""Import wrapper for ../mcp-server/server.py."""
import importlib.util
from pathlib import Path
# Load ../mcp-server/server.py by file path ("mcp-server" contains a hyphen, so it
# cannot be imported as a regular package) and re-export its names from this module.
_source = Path(__file__).resolve().parents[1] / "mcp-server" / "server.py"
_spec = importlib.util.spec_from_file_location("_local_context7_mcp_server", _source)
# Validate the spec BEFORE using it, and with a real exception rather than an
# assert (asserts are removed when Python runs with -O).
if _spec is None or _spec.loader is None:
    raise ImportError(f"Cannot load MCP server module from {_source}")
_module = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_module)
# Copy everything except dunder names into this module's namespace.
for _name, _value in vars(_module).items():
    if not _name.startswith("__"):
        globals()[_name] = _value
+35
View File
@@ -0,0 +1,35 @@
# Pytest configuration for local-context7 tests
[pytest]
# Test discovery pattern (where to look for tests)
testpaths = tests
# Pattern of test files to discover
python_files = test_*.py
# Pattern of test functions to run
python_functions = test_*
# Markers for test categorization
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
integration: marks tests as integration tests requiring external services
unit: marks tests as pure unit tests
# Console output style used while running tests (classic, progress or count)
console_output_style = classic
# Test execution options
asyncio_mode = auto
# "testsessionstartfixturesscope" was removed: it is not an ini option that
# pytest recognizes, so it had no effect beyond an "unknown config option" warning.
# Logging configuration
log_cli = true
log_cli_level = INFO
log_cli_format = %(asctime)s [%(levelname)s] %(name)s: %(message)s
log_cli_date_format = %Y-%m-%d %H:%M:%S
# Ignore specific warnings during tests
filterwarnings =
ignore::DeprecationWarning
ignore::PendingDeprecationWarning
+2
View File
@@ -0,0 +1,2 @@
# Tests package for local-context7
# Contains unit tests for chunking, database operations, search, and MCP server modules
+191
View File
@@ -0,0 +1,191 @@
"""
Pytest configuration and fixtures for local-context7 tests.
This module provides:
- Mocks for external dependencies (Qdrant, FastEmbed)
- Database fixtures for SQLite operations
- Common test utilities
"""
from unittest.mock import MagicMock, patch
import pytest
import os
import json
from pathlib import Path
from backend.app.db import init_db, upsert_library, insert_document_chunk, get_chunks_for_library, list_libraries, clear_library_documents, get_connection
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture(scope="function")
def test_database():
    """
    Prepare the database for a test and clean up afterwards.

    Setup removes any leftover ``backend/data/test_db.sqlite`` and calls
    ``init_db()``, failing the test if it does not report success. Teardown
    removes that file again.

    Yields:
        Nothing; the bare ``yield`` only separates setup from teardown.

    NOTE(review): init_db() is called without the path computed here, so
    whether it actually uses test_db.sqlite depends on how backend.app.db
    resolves its database path — confirm.
    """
    db_file = Path(__file__).parent.parent / "backend" / "data" / "test_db.sqlite"
    # Make sure the data directory exists before touching the file.
    db_file.parent.mkdir(parents=True, exist_ok=True)

    def _remove_db_file():
        if db_file.exists():
            db_file.unlink()

    # Start from a clean slate, then create the tables.
    _remove_db_file()
    outcome = init_db()
    assert outcome["success"], f"Failed to initialize test DB: {outcome.get('error')}"
    yield
    # Cleanup: remove test database after the test.
    _remove_db_file()
@pytest.fixture(scope="function")
def sample_text():
    """Sample markdown text for chunking tests.

    Returns a fixed document with level-1, level-2 and level-3 headings and
    short paragraphs under each of them.
    """
    return """# Introduction
This is the introduction section.
## Background
Background information goes here to make this longer and test chunking.
This paragraph has more content about the background topic.
### Details
Specific details about the background are provided in this subsection.
More details follow here to ensure we have enough text to properly test heading preservation.
## Conclusion
The conclusion wraps up everything nicely."""
# =============================================================================
# MOCKS
# =============================================================================
@pytest.fixture
def mock_embedding_model():
    """
    Stand-in for the embedding model that needs no model download.

    The returned MagicMock has an ``embed(texts)`` attribute that produces one
    384-element list of zeros for every item in ``texts``.
    """
    model = MagicMock()
    # One zero-vector placeholder per input text.
    model.embed = lambda texts: [[0.0] * 384 for _ in texts]
    return model
@pytest.fixture
def mock_qdrant_client():
    """
    Stand-in for the Qdrant client so search logic can run without a server.

    ``search`` always returns an empty list; ``delete_collection`` is a
    MagicMock that returns True.
    """
    client = MagicMock()

    def _no_results(collection_name, query_vector, limit=10, search_filter=None):
        # Simulates a search that matches nothing.
        return []

    client.search = _no_results
    # Used by cleanup code paths.
    client.delete_collection = MagicMock(return_value=True)
    return client
@pytest.fixture
def mock_embedding_model_batch():
    """
    Batch embedding model mock that returns deterministic fake vectors.

    The returned MagicMock has an ``embed(texts)`` attribute producing one
    384-element vector per text. Each vector is derived from a CRC32 checksum
    of the text, so equal texts always get equal vectors (also across test
    runs) and different texts usually get different ones.
    """
    import zlib  # stdlib; local import keeps the fixture self-contained

    def hash_text(text):
        # CRC32 is stable across processes, unlike the built-in hash() for
        # strings, which is salted per interpreter run.
        text_hash = zlib.crc32(text.encode("utf-8")) % 1000000
        base = text_hash / 1000000
        # A small per-dimension offset keeps the components distinct.
        return [base + (i * 0.001) for i in range(384)]

    mock_model = MagicMock()
    mock_model.embed = lambda texts: [hash_text(t) for t in texts]
    return mock_model
# =============================================================================
# SETUP TEARDOWN FIXTURES
# =============================================================================
@pytest.fixture(autouse=True)
def clear_test_database(test_database):
    """
    Apply the ``test_database`` fixture to every test automatically.

    This fixture does no work of its own: because it is autouse and depends on
    ``test_database``, that fixture's setup and teardown run around each test
    function.
    """
    pass  # Cleanup handled in test_database fixture
@pytest.fixture
def empty_vector():
    """A 384-element list of zeros, usable as a dummy embedding vector."""
    return [0.0 for _ in range(384)]
@pytest.fixture
def fake_embeddings(sample_text):
    """One fake 384-element vector per non-blank ``\\n\\n``-separated section of ``sample_text``.

    All components of a vector share a single value computed from the
    section's ``hash()`` and length.
    """
    def hash_text(text):
        component = (hash(text) + len(text)) % 1000 / 10000
        return [component for _ in range(384)]

    sections = [part for part in sample_text.split("\n\n") if part.strip()]
    return [hash_text(section) for section in sections]
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
@pytest.fixture
def temp_file(tmp_path):
    """Return the path ``tmp_path / "test.txt"``; the file itself is not created."""
    return tmp_path / "test.txt"
# Register the custom "slow" marker so tests can use @pytest.mark.slow
def pytest_configure(config):
    """pytest hook: add the ``slow`` marker description to the ``markers`` ini setting."""
    config.addinivalue_line("markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')")
def pytest_runtest_setup(item):
    """pytest per-test setup hook; currently a no-op placeholder."""
    pass
+238
View File
@@ -0,0 +1,238 @@
"""
Tests for backend/app/chunking.py
These are pure unit tests that don't require any external dependencies.
They test text chunking logic, token estimation, and heading-aware splitting.
"""
import pytest
class TestEstimateTokens:
    """Tests for the estimate_tokens() function.

    The expectations below pin a "characters // 4" estimate: the token count
    is the text length divided by four, rounded down.
    """

    def test_empty_text(self):
        """Empty text should return 0 tokens."""
        from backend.app.chunking import estimate_tokens
        assert estimate_tokens("") == 0

    def test_single_char(self):
        """A single character rounds down to 0 tokens (4 chars per token)."""
        from backend.app.chunking import estimate_tokens
        assert estimate_tokens("a") == 0  # 1 char // 4 = 0 tokens

    def test_4_chars(self):
        """4 characters = 1 token."""
        from backend.app.chunking import estimate_tokens
        assert estimate_tokens("abcd") == 1

    def test_400_chars(self):
        """400 characters = 100 tokens."""
        from backend.app.chunking import estimate_tokens
        text = "a" * 400
        assert estimate_tokens(text) == 100

    def test_whitespace_only(self):
        """Whitespace counts like any other character; three spaces are still under one token."""
        from backend.app.chunking import estimate_tokens
        assert estimate_tokens(" " * 3) == 0  # 3 chars // 4 = 0
class TestChunkText:
    """Tests for the chunk_text() function.

    NOTE(review): several tests request the ``sample_text`` fixture without
    using it — confirm whether the parameter can be dropped.
    """

    def test_empty_input(self, sample_text):
        """Empty input should return empty list."""
        from backend.app.chunking import chunk_text
        assert chunk_text("") == []

    def test_small_text_single_chunk(self, sample_text):
        """Small text under limit should be single chunk."""
        from backend.app.chunking import chunk_text
        small = "This is a very short text that should be returned as a single chunk."
        chunks = chunk_text(small, max_tokens=500)
        assert len(chunks) == 1
        assert chunks[0] == small

    def test_exact_token_limit(self, sample_text):
        """Text exactly at limit should be one chunk."""
        from backend.app.chunking import chunk_text, estimate_tokens
        # Create text that is exactly 500 tokens (2000 chars)
        text = "a" * 2000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) == 1
        assert estimate_tokens(chunks[0]) == 500

    def test_over_limit_splits(self, sample_text):
        """Text over limit should be split into multiple chunks."""
        from backend.app.chunking import chunk_text, estimate_tokens
        # Create text that is 2500 tokens (10000 chars)
        text = "b" * 10000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) >= 2  # Should be split

    def test_preserves_content(self, sample_text):
        """All content should be preserved in chunks (combined)."""
        from backend.app.chunking import chunk_text
        original = "Hello world! This is a test of chunking functionality."
        chunks = chunk_text(original, max_tokens=100)
        combined = "".join(chunks)
        # The text is far below the limit, so exactly one chunk equal to the input is expected.
        assert len(chunks) == 1
        assert combined == original

    def test_headings_split(self, sample_text):
        """Heading-aware splitting should preserve heading boundaries."""
        from backend.app.chunking import chunk_text
        markdown_with_headings = """# Introduction
This is the introduction section.
## Background
Background information goes here."""
        # With very small token limit, headings should cause splits
        chunks = chunk_text(markdown_with_headings, max_tokens=20)
        # Only requires that at least one chunk begins with a heading marker.
        heading_chunks = [c for c in chunks if c.strip().startswith('#')]
        assert len(heading_chunks) >= 1  # At least some heading preserved

    def test_paragraph_split(self):
        """Paragraph splitting should respect paragraph boundaries."""
        from backend.app.chunking import chunk_text
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        chunks = chunk_text(text, max_tokens=15)  # Small limit forces splits
        assert len(chunks) >= 3  # At least as many paragraphs

    def test_no_empty_chunks(self):
        """Should not return empty chunks."""
        from backend.app.chunking import chunk_text
        text = "Hello world"
        chunks = chunk_text(text, max_tokens=10)
        for chunk in chunks:
            assert chunk.strip() != ""
class TestTokenEstimationBoundaries:
    """Tests for token estimation boundaries."""

    def test_boundary_precision(self):
        """Check the estimate on both sides of the 4- and 8-character boundaries."""
        from backend.app.chunking import estimate_tokens
        expectations = [
            ("abcd", 1),      # exactly 4 chars
            ("abcde", 1),     # 5 chars still 1 token
            ("abcdef", 1),    # 6 chars still 1 token
            ("abcdefg", 1),   # 7 chars still 1 token
            ("abcdefgh", 2),  # 8 chars = 2 tokens
        ]
        for text, expected in expectations:
            assert estimate_tokens(text) == expected

    def test_various_languages_chars(self):
        """Non-ASCII text is estimated by character count as well."""
        from backend.app.chunking import estimate_tokens
        # Four Chinese characters are expected to count as one token.
        four_han_chars = "你好世界"
        assert estimate_tokens(four_han_chars) == 1
        # For mixed text with an emoji only the result type is checked.
        with_emoji = "Hello 🎉 world"
        assert isinstance(estimate_tokens(with_emoji), int)
class TestChunkOverlapBehavior:
    """Checks on how much text consecutive chunks may share."""

    def test_overlap_not_exceeded(self):
        """When the text is split, the first chunk is at most half of the joined chunk length."""
        from backend.app.chunking import chunk_text

        sentence = "The quick brown fox jumps over the lazy dog. "
        pieces = chunk_text(sentence * 10, max_tokens=30, overlap_tokens=5)

        # Nothing to compare when the chunker returned zero or one chunk.
        if len(pieces) <= 1:
            return

        first_length = len(pieces[0])
        joined_length = len("".join(pieces))
        # Rough heuristic, not an exact overlap measurement.
        assert first_length <= joined_length // 2
class TestChunkEdgeCases:
    """Edge cases and error conditions for chunk_text()."""

    def test_whitespace_only_text(self):
        """Whitespace-only input must not crash and must return a list."""
        from backend.app.chunking import chunk_text

        # The list may be empty or hold a whitespace chunk; only the type is checked.
        outcome = chunk_text("   \n\n   ", max_tokens=100)
        assert isinstance(outcome, list)

    def test_very_long_paragraph(self):
        """A long paragraph with no blank lines is split into more than one chunk."""
        from backend.app.chunking import chunk_text

        sentence = "The quick brown fox jumps over the lazy dog. "
        pieces = chunk_text(sentence * 100, max_tokens=50)
        assert len(pieces) > 1

    def test_none_input_raises(self):
        """Passing None is expected to raise TypeError or AssertionError."""
        from backend.app.chunking import chunk_text

        accepted_errors = (TypeError, AssertionError)
        with pytest.raises(accepted_errors):
            chunk_text(None, max_tokens=100)

    def test_unicode_text(self):
        """Short mixed-script text fits in a single chunk."""
        from backend.app.chunking import chunk_text

        mixed_scripts = "Hello 世界 مرحبا 🎉"
        pieces = chunk_text(mixed_scripts, max_tokens=50)
        assert len(pieces) == 1
# =============================================================================
# SAMPLE TEXT FIXTURE
# =============================================================================
@pytest.fixture
def heading_markdown():
    """Markdown sample with level-1, level-2 and level-3 headings.

    Returns:
        A string with an "Introduction" section, a "Background" section that
        has a "Details" subsection, and a "Conclusion" section. Used by the
        heading-preservation tests.
    """
    return """# Introduction
This is the introduction section. It contains some introductory text here.
## Background
Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context.
### Details
Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation.
## Conclusion
The conclusion wraps up everything nicely."""
class TestHeadingPreservation:
    """Heading-aware chunking checks driven by the heading_markdown fixture."""

    def test_headings_in_separate_chunks(self, heading_markdown):
        """With a 30-token budget at least one chunk starts with a heading marker."""
        from backend.app.chunking import chunk_text

        pieces = chunk_text(heading_markdown, max_tokens=30)
        heading_led = [piece for piece in pieces if piece.strip().startswith('#')]
        assert len(heading_led) >= 1

    def test_all_content_present(self, heading_markdown):
        """Joining the chunks keeps the section titles of the source document."""
        from backend.app.chunking import chunk_text

        pieces = chunk_text(heading_markdown, max_tokens=500)
        rejoined = "".join(pieces)
        # Spot-check section titles rather than comparing the full text.
        for section_title in ("Introduction", "Background", "Conclusion"):
            assert section_title in rejoined
+316
View File
@@ -0,0 +1,316 @@
"""
Tests for backend/app/db.py
These tests verify SQLite database operations including:
- Table creation (init_db)
- Library CRUD operations
- Document chunk storage and retrieval
- Full-text search functionality
All tests use a temporary test database file.
"""
import pytest
from datetime import datetime
class TestInitDatabase:
    """Tests for init_db() - table creation."""

    def test_init_db_creates_tables(self, test_database):
        """After init, sqlite_master lists a table whose name contains "libraries"."""
        from contextlib import closing

        from backend.app.db import get_connection

        # closing() releases the connection even when the query or a later
        # assertion raises, so a failing run does not leak an open handle.
        with closing(get_connection()) as conn:
            rows = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
            ).fetchall()

        table_names = [row[0] for row in rows]
        # Case-insensitive substring match; this also covers an exact "libraries" entry.
        assert any("libraries" in name.lower() for name in table_names)

    def test_init_db_returns_success(self, test_database):
        """init_db() reports success through the "success" key of its result."""
        from backend.app.db import init_db

        result = init_db()
        assert result["success"] is True
class TestLibraryOperations:
    """Tests for library CRUD operations."""

    def test_upsert_library_new(self, test_database):
        """Upserting an unseen ID reports success and echoes the ID back."""
        from backend.app.db import upsert_library

        result = upsert_library(
            library_id="/local/testlib",
            name="Test Library",
            description="A test library for unit tests",
        )
        assert result["success"] is True
        assert result["id"] == "/local/testlib"

    def test_upsert_library_update(self, test_database):
        """Upserting the same ID twice reports success on the second call."""
        from backend.app.db import upsert_library

        # First write creates the row.
        upsert_library(
            library_id="/local/upsertlib",
            name="Original Name",
            description="Original description",
        )
        # Second write targets the same ID with new values.
        result = upsert_library(
            library_id="/local/upsertlib",
            name="Updated Name",
            description="Updated description",
        )
        assert result["success"] is True

    def test_upsert_library_id_normalization(self, test_database):
        """IDs with and without the /local/ prefix are all accepted by upsert_library."""
        from backend.app.db import upsert_library

        candidate_ids = [
            "/local/foundryvtt",
            "foundryvtt",
            "/local/mydocs",
        ]
        # Only the success flag is checked; no read-back is performed here.
        for lib_id in candidate_ids:
            result = upsert_library(library_id=lib_id, name="Test", description="Desc")
            assert result["success"] is True

    def test_list_libraries(self, test_database):
        """list_libraries() returns a list holding at least the libraries just created."""
        from backend.app.db import upsert_library, list_libraries

        for i in range(3):
            upsert_library(
                library_id=f"/local/lib{i}",
                name=f"Library {i}",
                description=f"Description {i}",
            )
        libs = list_libraries()
        assert isinstance(libs, list)
        assert len(libs) >= 3

    def test_search_libraries(self, test_database):
        """search_libraries() returns a list for a query matching a library name."""
        from backend.app.db import upsert_library, search_libraries

        upsert_library(library_id="/local/foo1", name="Foo Library", description="Bar baz")
        upsert_library(library_id="/local/foo2", name="Other Library", description="Different content")
        results = search_libraries("foo")
        assert isinstance(results, list)
class TestDocumentChunkOperations:
    """Storage and retrieval of document chunks."""

    def test_insert_document_chunk_new(self, test_database):
        """Inserting a chunk under a new doc_id reports success."""
        from backend.app.db import insert_document_chunk

        outcome = insert_document_chunk(
            doc_id="doc-1",
            library_id="/local/testlib",
            path="docs/example.md",
            title="Example Document",
            content="# Example\n\nThis is the content.",
            chunk_index=0,
            token_estimate=100,
        )
        assert outcome["success"] is True

    def test_insert_document_chunk_update(self, test_database):
        """Inserting twice under the same doc_id reports success on the second call."""
        from backend.app.db import insert_document_chunk

        # Both writes share the document identity and differ in every other field.
        identity = {"doc_id": "doc-update-test", "library_id": "/local/uplib"}

        insert_document_chunk(
            path="old-path.md",
            title="Old Title",
            content="# Old\nContent here.",
            chunk_index=0,
            token_estimate=50,
            **identity,
        )
        outcome = insert_document_chunk(
            path="new-path.md",
            title="New Title",
            content="# New\nUpdated content.",
            chunk_index=1,
            token_estimate=75,
            **identity,
        )
        assert outcome["success"] is True

    def test_get_document_by_id(self, test_database):
        """A stored document can be fetched by ID and carries that ID in its "id" field."""
        from backend.app.db import insert_document_chunk, get_document_by_id

        target_id = "unique-doc-id-12345"
        insert_document_chunk(
            doc_id=target_id,
            library_id="/local/testlib",
            path="docs/test.md",
            title="Test Document",
            content="# Test\n\nTest content here.",
            chunk_index=None,
            token_estimate=200,
        )

        fetched = get_document_by_id(target_id)
        assert fetched is not None
        assert fetched["id"] == target_id

    def test_get_chunks_for_library(self, test_database):
        """get_chunks_for_library() returns a list with at least the inserted chunks."""
        from backend.app.db import upsert_library, insert_document_chunk, get_chunks_for_library

        library = "/local/chunktest"
        upsert_library(library_id=library, name="Chunk Test", description="Test")

        for index in range(3):
            insert_document_chunk(
                doc_id=f"chunk-{index}",
                library_id=library,
                path=f"path{index}.md",
                title=f"Section {index}",
                content=f"Content section {index}.",
                chunk_index=index,
                token_estimate=50,
            )

        stored = get_chunks_for_library(library)
        assert isinstance(stored, list)
        assert len(stored) >= 3

    def test_clear_library_documents(self, test_database):
        """clear_library_documents() reports success and leaves no chunks for the library."""
        from backend.app.db import upsert_library, insert_document_chunk, clear_library_documents, get_chunks_for_library

        library = "/local/cleartest"
        upsert_library(library_id=library, name="Clear Test", description="Test")

        # title and chunk_index are left out on purpose, as in the other minimal inserts.
        for index in range(5):
            insert_document_chunk(
                doc_id=f"clear-{index}",
                library_id=library,
                path=f"path{index}.md",
                content=f"Content {index}.",
                token_estimate=20,
            )

        outcome = clear_library_documents(library)
        assert outcome["success"] is True

        leftover = get_chunks_for_library(library)
        assert len(leftover) == 0
class TestDatabaseEdgeCases:
    """Edge cases and error handling for the database layer."""

    def test_empty_library_id(self, test_database):
        """Upserting with an empty ID must not raise; the result itself is not inspected."""
        from backend.app.db import upsert_library

        # Smoke test only: the call completing without an exception is the check.
        upsert_library(library_id="", name="Test", description="Desc")

    def test_special_characters_in_content(self, test_database):
        """Content containing quotes, angle brackets and ampersands is stored successfully."""
        from backend.app.db import insert_document_chunk

        tricky_content = "Hello \"world\" <tag /> & amp; 'apostrophe'"
        outcome = insert_document_chunk(
            doc_id="special-test",
            library_id="/local/speciallib",
            path="special.md",
            content=tricky_content,
            token_estimate=100,
        )
        assert outcome["success"] is True

    def test_very_long_content(self, test_database):
        """A 5000-character content string is stored successfully."""
        from backend.app.db import insert_document_chunk

        outcome = insert_document_chunk(
            doc_id="long-test",
            library_id="/local/longlib",
            path="long.md",
            content="a" * 5000,
            token_estimate=1000,
        )
        assert outcome["success"] is True

    def test_none_description(self, test_database):
        """A library whose description is None is upserted successfully."""
        from backend.app.db import upsert_library

        outcome = upsert_library(
            library_id="/local/nonedesc",
            name="No Description Lib",
            description=None,
        )
        assert outcome["success"] is True
class TestDatabaseInitialization:
    """Checks on the database state right after initialization."""

    def test_database_is_empty_after_init(self, test_database):
        """list_libraries() returns a list on an initialized database.

        Only the return type is asserted; the list length is not checked.
        """
        from backend.app.db import list_libraries

        assert isinstance(list_libraries(), list)
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def sample_doc():
    """Keyword arguments describing one sample document chunk."""
    markdown_body = "# Getting Started\n\nWelcome to the guide. This is a sample document for testing.\n\n## Installation\n\nInstall with pip."
    return dict(
        doc_id="sample-doc-1",
        library_id="/local/samplelib",
        path="docs/guide.md",
        title="Getting Started Guide",
        content=markdown_body,
        chunk_index=0,
        token_estimate=500,
    )
+262
View File
@@ -0,0 +1,262 @@
"""
Tests for mcp-server/server.py
These are pure unit tests that don't require any external dependencies.
They test:
- The strip_local_prefix() function directly (no network)
- MCP server tool definitions and structure
"""
import pytest
class TestStripLocalPrefix:
    """Unit tests for strip_local_prefix()."""

    def test_strips_prefix_from_full_id(self):
        """A leading /local/ is removed from a full library ID."""
        from mcp_server.server import strip_local_prefix

        assert strip_local_prefix("/local/foundryvtt") == "foundryvtt"

    def test_preserves_id_without_prefix(self):
        """An ID without the /local/ prefix comes back unchanged."""
        from mcp_server.server import strip_local_prefix

        bare_id = "foundryvtt"
        assert strip_local_prefix(bare_id) == bare_id

    def test_strips_from_multiple_local_prefixes(self):
        """Only the first /local/ is removed when the prefix is repeated."""
        from mcp_server.server import strip_local_prefix

        doubled = "/local//local/foundryvtt"
        assert strip_local_prefix(doubled) == "/local/foundryvtt"

    def test_empty_string(self):
        """The empty string comes back unchanged."""
        from mcp_server.server import strip_local_prefix

        empty = ""
        assert strip_local_prefix(empty) == empty

    def test_whitespace_only(self):
        """Whitespace-only input has no prefix to strip and comes back unchanged."""
        from mcp_server.server import strip_local_prefix

        blank = "   \t\n"
        assert strip_local_prefix(blank) == blank

    def test_case_sensitive_prefix(self):
        """The prefix match is case-sensitive: /local/ is stripped, /LOCAL/ is not."""
        from mcp_server.server import strip_local_prefix

        lowercase_result = strip_local_prefix("/local/test")
        assert lowercase_result == "test"

        uppercase_result = strip_local_prefix("/LOCAL/test")
        assert uppercase_result == "/LOCAL/test"

    def test_partial_match_does_not_strip(self):
        """Look-alike prefixes such as /local-docs/ or /localdocs/ are left alone."""
        from mcp_server.server import strip_local_prefix

        for near_miss in ("/local-docs/test", "/localdocs/test"):
            assert strip_local_prefix(near_miss) == near_miss

    def test_prefix_with_trailing_slash(self):
        """A trailing slash after the library name survives prefix stripping."""
        from mcp_server.server import strip_local_prefix

        assert strip_local_prefix("/local/foundryvtt/") == "foundryvtt/"
class TestMcpServerStructure:
    """Structural checks that do not start the MCP server."""

    def test_import_fastmcp(self):
        """FastMCP is importable; the test is skipped when the package is missing."""
        try:
            from fastmcp import FastMCP
        except ImportError as exc:
            # A missing optional dependency is reported as a skip, not a failure.
            pytest.skip(f"fastmcp not installed: {exc}")
class TestMcpServerToolsExistence:
    """Tests to verify MCP server has expected tools defined."""

    @staticmethod
    def _registered_tool_names():
        """Return the names of the tools exposed through ``mcp.tools``.

        Skips the calling test when the ``mcp`` object has no ``tools``
        attribute. Without the skip, the test would pass without checking
        anything and a missing tool would go unnoticed.
        """
        from mcp_server.server import mcp

        if not hasattr(mcp, 'tools'):
            pytest.skip("mcp object exposes no 'tools' attribute; tool registration cannot be inspected")
        return [tool.name for tool in mcp.tools]

    def test_mcp_instance_created(self):
        """The module-level MCP instance exists."""
        from mcp_server.server import mcp

        assert mcp is not None

    def test_resolve_library_id_tool_exists(self):
        """resolve_library_id tool should be registered."""
        assert "resolve_library_id" in self._registered_tool_names()

    def test_get_library_docs_tool_exists(self):
        """get_library_docs tool should be registered."""
        assert "get_library_docs" in self._registered_tool_names()

    def test_list_libraries_tool_exists(self):
        """list_libraries tool should be registered."""
        assert "list_libraries" in self._registered_tool_names()

    def test_search_docs_tool_exists(self):
        """search_docs tool should be registered."""
        assert "search_docs" in self._registered_tool_names()

    def test_refresh_library_tool_exists(self):
        """refresh_library tool should be registered."""
        assert "refresh_library" in self._registered_tool_names()

    def test_sync_sources_tool_exists(self):
        """sync_sources tool should be registered."""
        assert "sync_sources" in self._registered_tool_names()
class TestMcpServerStripPrefixIntegration:
    """Checks that strip_local_prefix is importable from the server module and usable."""

    def test_resolve_library_id_calls_strip_prefix(self):
        """strip_local_prefix is callable and removes /local/ from typical library IDs.

        Only the helper is exercised; resolve_library_id itself is not invoked.
        """
        from mcp_server.server import strip_local_prefix

        assert callable(strip_local_prefix)

        typical_ids = (
            "/local/foundryvtt",
            "/local/pytest",
            "/local/mydocs/reference",
        )
        for full_id in typical_ids:
            assert not strip_local_prefix(full_id).startswith("/local/")
class TestMcpServerPrefixHandlingVariations:
    """More strip_local_prefix cases covering different kinds of library names."""

    def test_long_library_id(self):
        """A long name is returned intact once the prefix is removed."""
        from mcp_server.server import strip_local_prefix

        long_name = "very-long-library-id-with-many-chars-in-name"
        assert strip_local_prefix("/local/very-long-library-id-with-many-chars-in-name") == long_name

    def test_special_characters_in_id(self):
        """Underscores, dashes and digits in the name are preserved."""
        from mcp_server.server import strip_local_prefix

        assert strip_local_prefix("/local/my-doc_v2-3_test") == "my-doc_v2-3_test"

    def test_unicode_in_stripped_name(self):
        """Non-ASCII characters in the name are preserved."""
        from mcp_server.server import strip_local_prefix

        assert strip_local_prefix("/local/世界文档") == "世界文档"

    def test_mixed_case_stripped_name(self):
        """The letter case of the name is preserved."""
        from mcp_server.server import strip_local_prefix

        assert strip_local_prefix("/local/FoundryVTT") == "FoundryVTT"
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def sample_library_ids():
    """Library IDs carrying the /local/ prefix, for prefix-stripping tests."""
    ids = []
    ids.append("/local/foundryvtt")
    ids.append("/local/pytest")
    ids.append("/local/mydocs/reference/guide.md")
    ids.append("/local/my-app")
    ids.append("/local/documentation/tutorial/getting-started")
    return ids
@pytest.fixture
def expected_stripped_ids(sample_library_ids):
    """Hard-coded prefix-free counterparts of the sample_library_ids entries, in the same order.

    The values are literals; they are not computed from sample_library_ids.
    """
    stripped = (
        "foundryvtt",
        "pytest",
        "mydocs/reference/guide.md",
        "my-app",
        "documentation/tutorial/getting-started",
    )
    return list(stripped)
+368
View File
@@ -0,0 +1,368 @@
"""
Tests for backend/app/search.py
These tests verify search functionality without requiring:
- A running Qdrant vector database (mocked)
- Loaded embedding models (mocked)
The tests focus on:
- Response shape validation
- Library filtering
- Error handling
- Async function behavior
"""
import pytest
class TestResolveLibraryId:
    """Tests for resolve_library_id(), the Context7-style library lookup."""

    def test_returns_candidates_list(self, test_database):
        """The lookup returns a list when several libraries match the query."""
        from backend.app.search import resolve_library_id
        from backend.app.db import upsert_library

        for index in range(3):
            upsert_library(
                library_id=f"/local/searchtest{index}",
                name=f"Search Test Library {index}",
                description=f"Description for search test {index}",
            )

        found = resolve_library_id("search")
        assert isinstance(found, list)

    def test_captures_matching_names(self, test_database):
        """The lookup returns a list when the query appears in a library name."""
        from backend.app.db import upsert_library
        from backend.app.search import resolve_library_id

        upsert_library(
            library_id="/local/searchlib",
            name="Search Library",
            description="Main search documentation",
        )

        found = resolve_library_id("search")
        assert isinstance(found, list)

    def test_context7_style_prefix(self, test_database):
        """Every candidate returned for a prefix-less library reports source "local"."""
        from backend.app.db import upsert_library
        from backend.app.search import resolve_library_id

        # The library is stored without the /local/ prefix.
        upsert_library(
            library_id="foundryvtt",
            name="Foundry VTT",
            description="Fantasy tabletop virtual table",
        )

        for entry in resolve_library_id("foundry"):
            assert entry.get("source") == "local"

    def test_partial_name_match(self, test_database):
        """The lookup returns a list for a query that is a fragment of a library name."""
        from backend.app.db import upsert_library
        from backend.app.search import resolve_library_id

        upsert_library(
            library_id="/local/gamefoundry",
            name="Foundry Game Module",
            description="Module for foundry games",
        )

        found = resolve_library_id("game")
        assert isinstance(found, list)

    def test_empty_result_on_no_matches(self, test_database):
        """The lookup still returns a list when nothing is expected to match."""
        from backend.app.search import resolve_library_id

        found = resolve_library_id("xyznonexistent123")
        assert isinstance(found, list)
class TestSearchDocs:
    """Tests for search_docs() - semantic search with mocked vector store."""

    def test_returns_results_list(self, mock_qdrant_client, test_database):
        """search_docs should return a list of results."""
        from backend.app.search import search_docs
        from backend.app.db import upsert_library, insert_document_chunk

        # Seed one library with five chunks before searching.
        upsert_library(library_id="/local/searchdocslib", name="Search Docs Lib", description="Test")
        for i in range(5):
            insert_document_chunk(
                doc_id=f"searchdoc-{i}",
                library_id="/local/searchdocslib",
                path=f"path{i}.md",
                title=f"Section {i}",
                content=f"# Section {i}\n\nContent about section {i} that matches search queries.",
                chunk_index=i,
                token_estimate=100,
            )
        results = search_docs("section")
        assert isinstance(results, list)

    def test_empty_query_returns_empty_list(self):
        """An empty query returns a list (only the type is asserted)."""
        from backend.app.search import search_docs

        results = search_docs("")
        assert isinstance(results, list)

    def test_limit_parameter(self, mock_qdrant_client):
        """search_docs accepts a limit argument and returns a list for each value."""
        from backend.app.search import search_docs

        results_10 = search_docs("test", limit=10)
        results_5 = search_docs("test", limit=5)
        assert isinstance(results_10, list)
        assert isinstance(results_5, list)

    def test_response_shape_matches_spec(self):
        """When the mocked vector store yields a hit, each result exposes the expected fields."""
        from unittest.mock import patch
        from backend.app.search import search_docs

        # Payload carried by the single fake scored point.
        payload = {
            "id": "test-id-1",
            "library_id": "/local/testlib",
            "path": "docs/example.md",
            "title": "Example Document",
            "chunk_index": 0,
        }
        with patch('backend.app.vector_store.get_client') as mock_get_client:
            mock_client = mock_get_client.return_value
            # Minimal stand-in for a scored point: only score and payload are set.
            mock_point = type('ScoredPoint', (), {'score': 0.95, 'payload': payload})()
            mock_client.search.return_value = [mock_point]
            results = search_docs("test query")

        assert isinstance(results, list)
        # The field check applies only when the search produced results.
        if results:
            first = results[0]
            for field in ("id", "score", "library_id", "path", "title", "chunk_index"):
                assert field in first
class TestGetLibraryDocs:
    """Tests for get_library_docs(), which renders a library's documents as text."""

    def test_returns_empty_string_when_no_documents(self, mock_qdrant_client):
        """An unknown library still yields a string (empty or an error message)."""
        from backend.app.search import get_library_docs

        rendered = get_library_docs("/local/nonexistent")
        assert isinstance(rendered, str)

    def test_returns_content_when_documents_exist(self, mock_qdrant_client):
        """A library with one stored chunk yields a string."""
        from backend.app.db import upsert_library, insert_document_chunk
        from backend.app.search import get_library_docs

        library = "/local/docretrievetest"
        upsert_library(library_id=library, name="Doc Retrieve", description="Test")
        insert_document_chunk(
            doc_id="doc-retrieve-1",
            library_id=library,
            path="docs/getting-started.md",
            title="Getting Started",
            content="# Getting Started\n\nWelcome to the documentation. This is a test document.",
            chunk_index=0,
            token_estimate=200,
        )

        # Only the return type is asserted; the content is not inspected.
        rendered = get_library_docs(library)
        assert isinstance(rendered, str)

    def test_topic_filter_searches(self, mock_qdrant_client):
        """Passing a topic yields a string for a library holding two different documents."""
        from backend.app.db import upsert_library, insert_document_chunk
        from backend.app.search import get_library_docs

        library = "/local/topicsearchlib"
        upsert_library(library_id=library, name="Topic Search", description="Test")

        # One installation document and one usage document.
        insert_document_chunk(
            doc_id="topic-install",
            library_id=library,
            path="docs/install.md",
            title="Installation Guide",
            content="# Installation\n\nInstall with pip install mypackage.",
            chunk_index=0,
            token_estimate=150,
        )
        insert_document_chunk(
            doc_id="topic-usage",
            library_id=library,
            path="docs/usage.md",
            title="Usage Guide",
            content="# Usage\n\nUse mycommand --help for help.",
            chunk_index=0,
            token_estimate=150,
        )

        rendered = get_library_docs(library, topic="install")
        assert isinstance(rendered, str)

    def test_token_limit_respected(self):
        """Passing a small token_limit yields a string for a library with a long document."""
        from backend.app.search import get_library_docs
        from backend.app.db import upsert_library, insert_document_chunk

        library = "/local/tokenlimittest"
        upsert_library(library_id=library, name="Token Limit", description="Test")

        # A heading followed by 500 space-separated words.
        filler_words = " ".join(["word"] * 500)
        insert_document_chunk(
            doc_id="long-doc",
            library_id=library,
            path="docs/long.md",
            title="Long Document",
            content="# Long Content\n\n" + filler_words,
            chunk_index=0,
            token_estimate=2000,
        )

        rendered = get_library_docs(library, token_limit=100)
        assert isinstance(rendered, str)
class TestGetLibraryDocsWithMock:
    """Content retrieval checks that run against the mocked Qdrant client."""

    def test_retrieves_chunks_by_library_id(self, mock_qdrant_client):
        """Without a topic, get_library_docs yields a string for a library with three chunks."""
        from backend.app.db import upsert_library, insert_document_chunk
        from backend.app.search import get_library_docs

        library = "/local/mockretrievetest"
        upsert_library(library_id=library, name="Mock Retrieve", description="Test")

        for index in range(3):
            insert_document_chunk(
                doc_id=f"mock-retrieve-{index}",
                library_id=library,
                path=f"path{index}.md",
                title=f"Path {index}",
                content=f"Content for path {index}.",
                chunk_index=index,
                token_estimate=50,
            )

        rendered = get_library_docs(library)
        assert isinstance(rendered, str)
class TestSearchErrorHandling:
    """Tests for error handling in search functions."""

    def test_search_handles_missing_library(self):
        """Searching within an unknown library returns a list instead of raising."""
        from backend.app.search import search_docs

        results = search_docs("test", library_id="/local/missing_lib_xyz123")
        assert isinstance(results, list)

    def test_resolve_handles_no_libraries_in_db(self, test_database):
        """Resolving against a database with no seeded libraries returns a list.

        The test_database fixture supplies the initialized database; this test
        adds no libraries of its own before calling resolve_library_id.
        """
        from backend.app.search import resolve_library_id

        candidates = resolve_library_id("no-such-library")
        assert isinstance(candidates, list)

    def test_get_library_docs_handles_empty_library(self):
        """Requesting docs for a library with no chunks returns a string."""
        from backend.app.search import get_library_docs

        result = get_library_docs("/local/emptylib")
        assert isinstance(result, str)
# =============================================================================
# FIXTURES FOR SEARCH TESTS
# =============================================================================
@pytest.fixture
def search_sample_text():
    """Markdown sample with headings, bullet lists and fenced code blocks.

    Returns:
        A string with "Installation Guide", "Configuration", "Usage Examples"
        and "Troubleshooting" sections, including bash and python code fences.
    """
    return """# Installation Guide
To install the package:
```bash
pip install mypackage
```
## Configuration
Configure your environment by setting these variables:
- MY_VAR=123
- DEBUG=true
## Usage Examples
Example 1: Basic usage
```python
import mymodule
module = mymodule.Module()
result = module.run()
print(result)
```
Example 2: Advanced usage with options
```python
options = {"verbose": True, "output": "stdout"}
result = module.run(options=options)
```
## Troubleshooting
Common issues and their solutions:
- ImportError: Ensure package is installed
- AttributeError: Check that attributes exist on object"""
+29
View File
@@ -0,0 +1,29 @@
# Context7 Docs WebUI Configuration
# Copy this file to .env and configure for your environment
# === Ports (optional - use if you need custom ports) ===
# docs-api port (default: 8787)
HOST_PORT=8787
# docs-mcp port (default: 8788)
MCP_HOST_PORT=8788
# WebUI port (default: 8790)
WEBUI_PORT=8790
# === API Keys (optional - uncomment to enable auth) ===
# Docs API key for protecting endpoints like /search, /ingest, etc.
# DOCS_API_KEY=your-secret-docs-api-key
# WebUI API key (optional - separate from docs-api for UI authentication)
# DOCS_WEBUI_API_KEY=your-webui-api-key
# === Application Configuration ===
# Path to documentation files (relative to service container)
DOCS_PATH=/docs
# SQLite database path
DB_PATH=/data/db.sqlite
# Logging level: DEBUG, INFO, WARNING, ERROR
LOG_LEVEL=INFO
# === Vector Store ===
# Qdrant host and port (internal Docker network)
VECTOR_STORE_HOST=qdrant
VECTOR_STORE_PORT=6333
+19
View File
@@ -0,0 +1,19 @@
# WebUI Dockerfile
# Builds the image that serves the WebUI FastAPI app with uvicorn on port 8790.
FROM python:3.12-slim

# PYTHONDONTWRITEBYTECODE=1 stops Python from writing .pyc files.
# PYTHONUNBUFFERED=1 keeps stdout/stderr unbuffered so logs appear immediately.
# DOCS_API_URL is the default address of the docs-api backend.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    DOCS_API_URL=http://docs-api:8787

WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# The app directory is copied as /app/webui so that, with WORKDIR /app, it is
# importable as the `webui` package named in CMD.
COPY app /app/webui
# Create the uploads directory under templates ahead of time.
RUN mkdir -p /app/webui/templates/uploads

EXPOSE 8790

# Bind to all interfaces so the published container port is reachable.
CMD ["uvicorn", "webui.main:app", "--host", "0.0.0.0", "--port", "8790"]
+72
View File
@@ -0,0 +1,72 @@
"""Async docs-api client for the WebUI."""
import os
from typing import Any, Dict, Optional
from httpx import AsyncClient, Timeout
class DocsAPIClient:
    """Thin asynchronous wrapper around the docs-api HTTP endpoints.

    A single ``AsyncClient`` is created on first use and reused until it is
    closed; a closed client is replaced on the next request.
    """

    def __init__(self, base_url: Optional[str] = None, api_key: Optional[str] = None):
        """Resolve the backend URL and API key, falling back to the environment.

        Args:
            base_url: Root URL of docs-api. When falsy, ``DOCS_API_URL`` is read
                from the environment (default ``http://docs-api:8787``).
                Trailing slashes are removed.
            api_key: Value sent in the ``X-API-Key`` header. When ``None``,
                ``WEBUI_API_KEY`` is read from the environment.
        """
        root = base_url or os.environ.get("DOCS_API_URL", "http://docs-api:8787")
        self.base_url = root.rstrip("/")

        if api_key is None:
            self.api_key = os.environ.get("WEBUI_API_KEY")
        else:
            self.api_key = api_key

        # No auth header is sent when the key is missing or empty.
        self.headers = {"X-API-Key": self.api_key} if self.api_key else {}
        self._client: Optional[AsyncClient] = None

    async def _get_client(self) -> AsyncClient:
        """Return the shared HTTP client, creating a new one if none is open."""
        current = self._client
        if current is not None and not current.is_closed:
            return current

        self._client = AsyncClient(
            base_url=self.base_url,
            headers=self.headers,
            timeout=Timeout(120.0),
        )
        return self._client

    async def request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Send a request and normalise the response into a dict.

        Args:
            method: HTTP method name.
            path: Request path, resolved against ``base_url``.
            **kwargs: Passed through to ``AsyncClient.request``.

        Returns:
            The decoded JSON body when it is a dict; otherwise the decoded JSON
            value or the raw response text wrapped as ``{"data": ...}``.

        Raises:
            RuntimeError: If the response status code is 400 or higher.
        """
        http = await self._get_client()
        response = await http.request(method, path, **kwargs)

        if response.status_code >= 400:
            raise RuntimeError(f"{method} {path} failed: {response.status_code} {response.text}")

        content_type = response.headers.get("content-type", "")
        if not content_type.startswith("application/json"):
            return {"data": response.text}

        parsed = response.json()
        if isinstance(parsed, dict):
            return parsed
        return {"data": parsed}

    async def get(self, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Issue a GET request; see ``request`` for the return shape."""
        return await self.request("GET", path, **kwargs)

    async def post(self, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Issue a POST request; see ``request`` for the return shape."""
        return await self.request("POST", path, **kwargs)

    async def delete(self, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Issue a DELETE request; see ``request`` for the return shape."""
        return await self.request("DELETE", path, **kwargs)

    async def health(self) -> Dict[str, Any]:
        """Query ``/health``; any failure is reported as an error dict instead of raising."""
        try:
            return await self.get("/health")
        except Exception as exc:
            return {"status": "error", "message": str(exc)}

    async def upload_file(self, library_id: str, filename: str, content: bytes) -> Dict[str, Any]:
        """POST one file as multipart form data to the library's upload endpoint."""
        multipart = {"file": (filename, content)}
        return await self.post(f"/api/v1/upload/{library_id}", files=multipart)

    async def close(self) -> None:
        """Close the underlying HTTP client if one exists and is still open."""
        http = self._client
        if http is None or http.is_closed:
            return
        await http.aclose()
_client_instance: Optional[DocsAPIClient] = None


async def get_client() -> DocsAPIClient:
    """Return the module-wide DocsAPIClient, constructing it on first call."""
    global _client_instance
    if _client_instance is not None:
        return _client_instance
    _client_instance = DocsAPIClient()
    return _client_instance
async def close_client() -> None:
    """Close the module-wide client; does nothing when it was never created."""
    if _client_instance is None:
        return
    await _client_instance.close()
+17
View File
@@ -0,0 +1,17 @@
"""WebUI configuration."""
import os
from typing import Optional
class Settings:
    """WebUI settings.

    The connection settings are read from the environment once, when this
    class is defined, using the same variable names and defaults as the
    docs-api client. The remaining values are fixed defaults.
    """

    # Core API connection (environment-driven)
    # Base URL of the docs-api backend.
    DOCS_API_URL: str = os.environ.get("DOCS_API_URL", "http://docs-api:8787")
    # API key sent to the backend; an unset or empty variable means "no key".
    WEBUI_API_KEY: Optional[str] = os.environ.get("WEBUI_API_KEY") or None

    # Default parameters for common operations (constants)
    DEFAULT_SEARCH_LIMIT: int = 10
    DEFAULT_RESULT_TOKENS: int = 8000
settings = Settings()
+259
View File
@@ -0,0 +1,259 @@
"""WebUI FastAPI application."""
import html
import os
from pathlib import Path
from typing import List, Optional
from fastapi import FastAPI, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from .api_client import DocsAPIClient
# FastAPI application object for the dashboard.
app = FastAPI(
    title="Context7 Docs WebUI",
    description="Web dashboard for managing documentation system",
    version="1.0.0",
)
# Templates are loaded from the "templates" directory next to this module.
templates = Jinja2Templates(directory=os.path.join(os.path.dirname(__file__), "templates"))
# Expose an `escapeHtml(value)` helper to every template; falsy values (None, 0, "",
# empty containers) render as an empty string.
# NOTE(review): Starlette's Jinja2Templates presumably enables autoescape by default, in
# which case this helper's output is escaped a second time when rendered — confirm.
templates.env.globals["escapeHtml"] = lambda value: html.escape(str(value or ""))
# Serve files from the sibling "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory=os.path.join(os.path.dirname(__file__), "static")), name="static")
# Lazily created docs-api client shared by all request handlers (see get_client).
_client: Optional[DocsAPIClient] = None
def get_client() -> DocsAPIClient:
    """Return the shared DocsAPIClient, creating it from the environment on first use."""
    global _client
    if _client is not None:
        return _client
    api_url = os.environ.get("DOCS_API_URL", "http://docs-api:8787")
    api_key = os.environ.get("WEBUI_API_KEY")
    _client = DocsAPIClient(api_url, api_key)
    return _client
@app.on_event("shutdown")
async def shutdown() -> None:
    """On application shutdown, close the shared docs-api client if one was created."""
    if _client is None:
        return
    await _client.close()
def page(title: str, body: str) -> HTMLResponse:
    """Wrap an HTML fragment in a minimal standalone document.

    Args:
        title: Plain-text page title; it is HTML-escaped here.
        body: HTML markup inserted verbatim, so callers must escape any
            untrusted content themselves.
    """
    safe_title = html.escape(title)
    document = f"""<!DOCTYPE html>
<html><head><meta charset="UTF-8"><title>{safe_title}</title></head>
<body style="font-family:sans-serif;padding:20px;">{body}</body></html>"""
    return HTMLResponse(document)
@app.get("/")
async def dashboard(request: Request):
    """Render the dashboard with backend health, total vector count and libraries.

    Each backend lookup is best-effort: a failure falls back to 0 vectors or
    an empty library list so the page still renders.
    """
    client = get_client()
    health = await client.health()

    total_vectors = 0
    try:
        collections_data = await client.get("/collections")
        per_collection = collections_data.get("collections", {}).values()
        # Entries that are not dicts are skipped rather than treated as errors.
        total_vectors = sum(
            entry.get("vectors", 0) for entry in per_collection if isinstance(entry, dict)
        )
    except Exception:
        total_vectors = 0

    libraries = []
    try:
        libs_data = await client.get("/libraries")
        libraries = libs_data.get("libraries", [])
    except Exception:
        libraries = []

    context = {
        "request": request,
        "health": health,
        "vectors": total_vectors,
        "libraries": libraries,
    }
    return templates.TemplateResponse("dashboard.html", context)
@app.post("/actions/ingest-all")
async def ingest_all():
    """Trigger ingestion of every library and show the backend's answer or error."""
    client = get_client()
    try:
        outcome = await client.post("/ingest/all")
        heading, detail = "Ingestion Complete", html.escape(str(outcome))
    except Exception as exc:
        heading, detail = "Ingestion Failed", html.escape(str(exc))
    return page("Ingestion", f"<h1>{heading}</h1><pre>{detail}</pre><a href='/'>Back</a>")
@app.post("/actions/sync-sources")
async def sync_sources_action():
    """Dashboard shortcut: sync git sources without override and report the outcome."""
    client = get_client()
    try:
        outcome = await client.post("/sources/sync", json={"override": False})
        heading, detail = "Git Sync Complete", html.escape(str(outcome))
    except Exception as exc:
        heading, detail = "Git Sync Failed", html.escape(str(exc))
    return page("Git Sync", f"<h1>{heading}</h1><pre>{detail}</pre><a href='/'>Back</a>")
@app.get("/libraries")
async def libraries(request: Request):
    """Render the libraries page; a backend failure yields an empty table."""
    client = get_client()
    libraries_data = []
    try:
        payload = await client.get("/libraries")
        libraries_data = payload.get("libraries", [])
    except Exception:
        libraries_data = []
    context = {"request": request, "data": libraries_data}
    return templates.TemplateResponse("libraries.html", context)
@app.post("/libraries/create")
async def create_library(
    library_id: str = Form(...),
    name: str = Form(...),
    description: Optional[str] = Form(None),
):
    """Create a library on the backend from the submitted form.

    Args:
        library_id: Identifier for the new library; surrounding whitespace is removed.
        name: Display name.
        description: Optional description; sent as an empty string when missing.

    Returns:
        A result page whose title and heading both reflect success or failure.
    """
    back_link = "<a href='/libraries'>Back</a>"
    clean_id = library_id.strip()
    if not clean_id:
        # A blank id would post to the collection URL itself, so reject it up front.
        return page(
            "Create Failed",
            f"<h1>Create Failed</h1><pre>Library ID must not be blank.</pre>{back_link}",
        )

    client = get_client()
    try:
        result = await client.post(
            f"/api/v1/libraries/{clean_id}",
            data={"name": name, "description": description or ""},
        )
        title = "Library Created"
        body = f"<h1>Library Created</h1><pre>{html.escape(str(result))}</pre>{back_link}"
    except Exception as e:
        title = "Create Failed"
        body = f"<h1>Create Failed</h1><pre>{html.escape(str(e))}</pre>{back_link}"
    return page(title, body)
@app.post("/libraries/{library_id}/ingest")
async def ingest_library(library_id: str):
    """Trigger ingestion for a single library and report the outcome."""
    client = get_client()
    try:
        outcome = await client.post(f"/ingest/{library_id}")
        heading, detail = "Ingestion Complete", html.escape(str(outcome))
    except Exception as exc:
        heading, detail = "Ingestion Failed", html.escape(str(exc))
    return page("Ingest Library", f"<h1>{heading}</h1><pre>{detail}</pre><a href='/libraries'>Back</a>")
@app.post("/libraries/{library_id}/delete")
async def delete_library(library_id: str):
    """Ask the backend to delete a library and report the outcome."""
    client = get_client()
    try:
        outcome = await client.delete(f"/api/v1/libraries/{library_id}")
        heading, detail = "Library Deleted", html.escape(str(outcome))
    except Exception as exc:
        heading, detail = "Delete Failed", html.escape(str(exc))
    return page("Delete Library", f"<h1>{heading}</h1><pre>{detail}</pre><a href='/libraries'>Back</a>")
@app.get("/libraries/{library_id}/docs")
async def view_library_docs(library_id: str):
    """Show the backend's documentation content for one library.

    The ``content`` field of the backend response is displayed inside a
    ``<pre>`` element. If the request fails, the error text is shown instead.
    """
    client = get_client()
    try:
        result = await client.get(f"/docs/{library_id}")
        content = result.get("content", "")
    except Exception as e:
        content = str(e)
    # html.escape() only accepts strings; the backend may return None or a
    # structured value (e.g. a list), which previously caused a 500 here.
    if content is None:
        content = ""
    elif not isinstance(content, str):
        content = str(content)
    return page(
        f"Docs: {library_id}",
        f"<h1>{html.escape(library_id)}</h1><pre>{html.escape(content)}</pre><a href='/libraries'>Back</a>",
    )
@app.get("/upload")
async def upload_form(request: Request):
    """Render the upload form with the known libraries (empty list on backend failure)."""
    client = get_client()
    libraries = []
    try:
        libs_data = await client.get("/libraries")
        libraries = libs_data.get("libraries", [])
    except Exception:
        libraries = []
    context = {"request": request, "libraries": libraries}
    return templates.TemplateResponse("upload.html", context)
@app.post("/upload")
async def upload_file(
    request: Request,
    library_id: str = Form(""),
    ingest_after_upload: Optional[str] = Form(None),
    files: List[UploadFile] = File(...),
):
    """Forward uploaded files to the backend and optionally trigger ingestion.

    Args:
        library_id: Target library; when blank, each file goes to a library
            named after its own filename stem.
        ingest_after_upload: ``"on"`` (checkbox ticked) to ingest afterwards.
        files: The uploaded files.

    Returns:
        The upload page with one result entry per file, plus one
        ``__INGEST__`` entry per library that was ingested.
    """
    client = get_client()
    results = []
    total_size = 0
    # Libraries that received at least one file, in first-seen order.
    # A dict is used as an ordered set so each library is ingested only once.
    uploaded_libraries = {}
    chosen_library = library_id.strip()

    for upload in files:
        filename = upload.filename or "upload.txt"
        target_library = chosen_library
        if not target_library:
            target_library = Path(filename).stem.lower().replace(" ", "-") or "uploaded"
        try:
            contents = await upload.read()
            total_size += len(contents)
            result = await client.upload_file(target_library, filename, contents)
            results.append({"filename": filename, "status": "success", "message": result})
            # Prefer the id echoed by the backend; fall back to the one we sent
            # so a response without "library_id" no longer raises KeyError.
            confirmed_library = result.get("library_id") or target_library
            uploaded_libraries[confirmed_library] = None
        except Exception as e:
            results.append({"filename": filename, "status": "error", "message": str(e)})

    if ingest_after_upload == "on":
        for target_library in uploaded_libraries:
            try:
                ingest_result = await client.post(f"/ingest/{target_library}")
                results.append({"filename": "__INGEST__", "status": "success", "message": ingest_result})
            except Exception as e:
                results.append({"filename": "__INGEST__", "status": "error", "message": str(e)})

    # Reload the library list so the re-rendered form keeps a usable dropdown.
    try:
        libraries = (await client.get("/libraries")).get("libraries", [])
    except Exception:
        libraries = []

    return templates.TemplateResponse(
        "upload.html",
        {"request": request, "libraries": libraries, "results": results, "total_size_bytes": total_size},
    )
@app.get("/search")
async def search_form(request: Request):
    """Render the search page with an empty query and no results."""
    context = {"request": request, "query": "", "results": []}
    return templates.TemplateResponse("search.html", context)
@app.get("/search/results")
async def search_results(request: Request, q: str = "", limit: int = 10):
    """Run a search against the backend and render the search page.

    Args:
        q: Query text; an empty query renders the page without searching.
        limit: Maximum number of results requested from the backend.

    The template context carries ``error`` (``None`` on success) so a backend
    failure can be told apart from a search that found nothing.
    """
    client = get_client()
    results = []
    error = None
    if q:
        try:
            data = await client.post("/search", json={"query": q, "library_id": None, "limit": limit})
            results = data.get("results", [])
        except Exception as e:
            results = []
            error = str(e)
    return templates.TemplateResponse(
        "search.html",
        {"request": request, "query": q, "results": results, "limit": limit, "error": error},
    )
@app.get("/sources")
async def sources_page(request: Request):
    """Render the git sources page; a backend failure yields an empty source list."""
    client = get_client()
    sources = []
    try:
        payload = await client.get("/api/v1/sources")
        sources = payload.get("sources", [])
    except Exception:
        sources = []
    context = {"request": request, "sources": sources}
    return templates.TemplateResponse("sources.html", context)
@app.post("/sources/sync")
async def sync_sources(override: bool = Form(False)):
    """Sync git sources, passing the form's override flag through, and report the outcome."""
    client = get_client()
    try:
        outcome = await client.post("/sources/sync", json={"override": override})
        heading, detail = "Git Sync Complete", html.escape(str(outcome))
    except Exception as exc:
        heading, detail = "Git Sync Failed", html.escape(str(exc))
    return page("Git Sync", f"<h1>{heading}</h1><pre>{detail}</pre><a href='/sources'>Back</a>")
+159
View File
@@ -0,0 +1,159 @@
// WebUI Static JavaScript Utilities
// Simple helper functions shared across templates
/**
 * Escape HTML to prevent XSS attacks when displaying user content.
 *
 * Escapes & < > " and ' so the result is safe in both element content and
 * quoted attribute values. Non-string input yields an empty string.
 *
 * @param {*} text - Value to escape.
 * @returns {string} The escaped text, or "" when text is not a string.
 */
function escapeHtml(text) {
    if (typeof text !== 'string') return "";
    // One code path for every environment: the previous DOM-based path left
    // quotes unescaped, unlike its own fallback.
    var entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#x27;'
    };
    return text.replace(/[&<>"']/g, function(ch) {
        return entities[ch];
    });
}
/**
 * Format a number with the runtime locale's thousands separators.
 * The value is rounded down to a whole number first; null and undefined
 * are rendered as "N/A".
 */
function formatNumber(num) {
    var isMissing = (num === null || num === undefined);
    if (isMissing) {
        return "N/A";
    }
    var whole = Math.floor(num);
    var formatter = new Intl.NumberFormat();
    return formatter.format(whole);
}
/**
 * Replace the content of the element with the given id by a loading
 * placeholder. Does nothing when no such element exists.
 */
function showLoading(elementId) {
    var target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    target.innerHTML = '<div class="loading-spinner">Loading...</div>';
}
/**
 * Clear the content of the element with the given id (removing any loading
 * placeholder). Does nothing when no such element exists.
 */
function hideLoading(elementId) {
    var target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    target.innerHTML = "";
}
/**
 * Show a toast notification in the bottom-right corner.
 * The toast is faded after 3 seconds and removed from the DOM 200 ms later.
 *
 * @param {string} message - Text to display (set as textContent, not HTML).
 * @param {string} [type] - Extra CSS class; defaults to "info".
 */
function showToast(message, type) {
    var VISIBLE_MS = 3000;
    var REMOVE_DELAY_MS = 200;
    var inlineStyle = [
        'position:fixed', 'bottom:20px', 'right:20px',
        'padding:12px 20px', 'border-radius:4px', 'margin-bottom:10px',
        'background:#333', 'color:white', 'font-size:0.9rem', 'z-index:1000'
    ].join(';');

    var toast = document.createElement('div');
    toast.className = 'toast ' + (type || 'info');
    toast.textContent = message;
    toast.style.cssText = inlineStyle;
    document.body.appendChild(toast);

    function removeToast() {
        toast.remove();
    }
    function fadeOut() {
        toast.style.opacity = '0';
        setTimeout(removeToast, REMOVE_DELAY_MS);
    }
    setTimeout(fadeOut, VISIBLE_MS);
}
/**
 * Show an error toast; the message is prefixed with "Error: ".
 */
function showError(message) {
    var text = "Error: " + message;
    showToast(text, "error");
}
/**
 * Show a success toast; the message is prefixed with "Success: ".
 */
function showSuccess(message) {
    var text = "Success: " + message;
    showToast(text, "success");
}
/**
 * Make an API request with error handling.
 *
 * Reads the base URL and optional API key from window.webuiConfig.
 *
 * @param {string} endpoint - Path relative to the API base URL; leading
 *     slashes are ignored so "search" and "/search" hit the same URL.
 * @param {string} [method='GET'] - HTTP method.
 * @param {*} [data=null] - JSON-serialisable payload. Sent as the request
 *     body (with a JSON Content-Type) for every method except GET and HEAD.
 * @returns {Promise<*>} Parsed JSON for JSON responses, otherwise the text.
 * @throws {Error} When the configuration is missing, the request fails, or
 *     the response status is not OK.
 */
async function apiRequest(endpoint, method = 'GET', data = null) {
    const config = window.webuiConfig;
    if (!config || !config.apiUrl) {
        throw new Error('window.webuiConfig.apiUrl is not configured');
    }

    // Join base and endpoint with exactly one slash between them.
    const base = config.apiUrl.endsWith('/') ? config.apiUrl : config.apiUrl + '/';
    const url = base + String(endpoint).replace(/^\/+/, '');

    const headers = {};
    if (config.apiKey) {
        headers['X-API-Key'] = config.apiKey;
    }

    const options = { method: method, headers: headers };
    const upperMethod = String(method).toUpperCase();
    const bodyAllowed = upperMethod !== 'GET' && upperMethod !== 'HEAD';
    if (bodyAllowed && data !== null && data !== undefined) {
        // Without this header the server cannot tell the body is JSON.
        headers['Content-Type'] = 'application/json';
        options.body = JSON.stringify(data);
    }

    try {
        const response = await fetch(url, options);
        if (!response.ok) {
            // statusText can be empty, so always include the numeric status.
            throw new Error((response.status + ' ' + response.statusText).trim());
        }
        const contentType = response.headers.get('content-type');
        if (contentType && contentType.includes('application/json')) {
            return await response.json();
        }
        return await response.text();
    } catch (err) {
        console.error('API request failed:', err);
        throw err;
    }
}
/**
 * Initialize tooltips if using them.
 * Currently an intentional no-op placeholder: the body contains no code.
 */
function initTooltips() {
    // Add tooltip functionality here if needed
}
/**
 * Debounce function for input handling.
 *
 * Returns a wrapper that postpones calling func until `wait` milliseconds
 * have passed without another call. Only the last call in a burst runs, with
 * that call's arguments and `this`.
 *
 * @param {Function} func - Function to debounce.
 * @param {number} wait - Quiet period in milliseconds.
 * @returns {Function} The debounced wrapper.
 */
function debounce(func, wait) {
    var timeout;
    return function executedFunction(...args) {
        var context = this;
        // Cancel the pending call; without this every invocation would fire.
        clearTimeout(timeout);
        timeout = setTimeout(function() {
            timeout = null;
            func.apply(context, args);
        }, wait);
    };
}
// Export to window for use in templates
// Only these five helpers are attached explicitly; apiRequest, debounce,
// showLoading, hideLoading and initTooltips are not assigned here.
window.escapeHtml = escapeHtml;
window.formatNumber = formatNumber;
window.showToast = showToast;
window.showError = showError;
window.showSuccess = showSuccess;
+395
View File
@@ -0,0 +1,395 @@
.container {
max-width: 1000px;
margin: 0 auto;
padding: 20px;
}
header {
border-bottom: 1px solid #ccc;
padding-bottom: 15px;
margin-bottom: 20px;
}
header h1 {
margin: 0 0 10px 0;
font-size: 1.5rem;
}
nav {
display: flex;
gap: 15px;
}
nav a {
text-decoration: none;
color: #0066cc;
font-size: 0.9rem;
}
nav a.active {
font-weight: bold;
text-decoration: underline;
}
main h2 {
margin-bottom: 15px;
}
footer {
margin-top: 40px;
padding-top: 15px;
border-top: 1px solid #ccc;
font-size: 0.8rem;
color: #666;
}
.status-card {
background: #f5f5f5;
padding: 20px;
border-radius: 8px;
border-left: 4px solid #00c467;
margin-bottom: 15px;
}
.status-message {
background: #e8f4fd;
padding: 10px;
border-radius: 4px;
margin: 5px 0;
}
pre.code-block {
background: #f5f5f5;
padding: 15px;
border-radius: 4px;
overflow-x: auto;
white-space: pre-wrap;
word-break: break-word;
}
/* Tables */
.library-table {
width: 100%;
border-collapse: collapse;
margin-top: 10px;
}
.library-table th, .library-table td {
padding: 10px;
text-align: left;
border-bottom: 1px solid #ddd;
}
.library-table th {
background: #f5f5f5;
font-weight: bold;
}
/* Forms */
form input[type="text"], form textarea, form select {
padding: 8px;
border: 1px solid #ccc;
border-radius: 4px;
margin-right: 10px;
margin-bottom: 10px;
}
button {
background: #0066cc;
color: white;
border: none;
padding: 10px 20px;
border-radius: 4px;
cursor: pointer;
}
button:hover {
background: #0055aa;
}
/* Upload form */
.upload-form, .search-form, .sync-form {
max-width: 600px;
}
/* Search results */
.results-count {
background: #e8f4fd;
padding: 10px;
border-radius: 4px;
margin-bottom: 15px;
}
.result-card {
background: #fff;
border: 1px solid #ddd;
padding: 15px;
margin: 10px 0;
border-radius: 4px;
}
.result-card h3 {
margin: 0 0 8px 0;
}
/* Results box */
.results-box {
max-height: 600px;
overflow-y: auto;
}
.results-box .new-search-link {
display: block;
text-align: center;
margin-top: 15px;
}
/* Source cards */
.source-cards {
display: grid;
gap: 10px;
}
.source-card {
background: #f5f5f5;
padding: 15px;
border-radius: 4px;
border-left: 4px solid #666;
}
.status-message code {
background: #333;
color: #fff;
padding: 2px 6px;
border-radius: 3px;
}
.hint {
color: #666;
font-size: 0.85rem;
margin-top: 15px;
}
.results-box .error {
color: #cc0000;
font-weight: bold;
}
/* Source lists wrap normally. Bare <pre> is deliberately not included here:
   forcing white-space: normal on it would collapse preformatted text. */
.source-list, .source-cards {
    white-space: normal;
}
/* Status cards grid */
.status-cards {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
margin-bottom: 20px;
}
.status-card h3 {
margin: 0 0 8px 0;
font-size: 0.9rem;
color: #555;
}
.status-card p {
margin: 0;
font-size: 1.2rem;
font-weight: bold;
}
/* Message box */
.message-box {
background: #e8f4fd;
padding: 12px;
border-radius: 6px;
margin-bottom: 20px;
border-left: 4px solid #3b82f6;
}
/* Action buttons */
.action-buttons {
display: flex;
gap: 15px;
margin-bottom: 20px;
}
.btn {
padding: 10px 20px;
border: none;
border-radius: 4px;
cursor: pointer;
text-decoration: none;
display: inline-block;
font-size: 0.9rem;
}
/* .btn-primary and .btn-primary:hover are defined once, further down with the
   other action button colours; the duplicate rules that used to sit here were
   always overridden by that later definition. */
.btn-secondary {
background: #2563eb;
color: white;
}
.btn-secondary:hover {
background: #1d4ed8;
}
/* Links section */
.links-section h2 {
font-size: 1rem;
margin-bottom: 10px;
}
.links-section a {
color: #0066cc;
text-decoration: none;
padding: 5px 10px;
}
.links-section a:hover {
text-decoration: underline;
}
/* Create library form */
.create-form {
background: #f9f9f9;
padding: 15px;
border-radius: 6px;
margin-bottom: 20px;
border-left: 4px solid #00c467;
}
.create-form label {
display: block;
margin-bottom: 8px;
font-weight: bold;
color: #333;
}
.create-form input[type="text"] {
width: 100%;
padding: 8px;
margin-bottom: 12px;
border: 1px solid #ccc;
border-radius: 4px;
box-sizing: border-box;
}
/* Table actions column */
.actions {
white-space: nowrap;
}
/* Button sizes */
.btn-sm {
padding: 5px 12px;
font-size: 0.8rem;
}
/* Additional action button colors */
.btn-info {
background: #17a2b8;
color: white;
}
.btn-info:hover {
background: #138496;
}
.btn-warning {
background: #ffc107;
color: black;
}
.btn-warning:hover {
background: #ffa000;
}
.btn-danger {
background: #dc3545;
color: white;
}
.btn-danger:hover {
background: #c82333;
}
.btn-primary {
background: #007bff;
color: white;
}
.btn-primary:hover {
background: #0056b3;
}
/* Highlight row for popular libraries */
tr.highlight {
background: #f0fdf4;
}
/* Upload form specific styles */
#library_id, #files {
width: 100%;
padding: 8px;
border: 1px solid #ccc;
border-radius: 4px;
margin-bottom: 12px;
box-sizing: border-box;
}
#files {
font-family: sans-serif;
}
/* Results box for upload */
.result-box {
background: #fff;
border: 1px solid #ddd;
border-radius: 4px;
padding: 10px;
margin-top: 20px;
min-height: 100px;
}
.result-box.error {
border-color: #dc3545;
background: #fff5f5;
}
/* Result items */
.result-item {
padding: 6px;
margin: 4px 0;
border-radius: 3px;
font-family: monospace;
font-size: 0.85rem;
word-break: break-word;
}
.result-item.success {
background: #d4edda;
border-left: 3px solid #28a745;
color: #155724;
}
.result-item.error {
background: #f8d7da;
border-left: 3px solid #dc3545;
color: #721c24;
}
.result-item.info {
background: #d1ecf1;
border-left: 3px solid #17a2b8;
color: #0c5460;
}
+32
View File
@@ -0,0 +1,32 @@
<!DOCTYPE html>
{# Base layout: every page template extends this file and fills the
   `title`, `content` and (optionally) `scripts` blocks. #}
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}Context7 Docs{% endblock %}</title>
<link rel="stylesheet" href="{{ url_for('static', path='style.css') }}">
</head>
<body>
<div class="container">
<header>
<h1>Context7 Docs UI</h1>
{# The link whose path matches the current request gets class="active";
   the dashboard link requires an exact match, the others a prefix match. #}
<nav>
<a href="/" {% if request.url.path == '/' %}class="active"{% endif %}>Dashboard</a>
<a href="/libraries" {% if request.url.path.startswith('/libraries') %}class="active"{% endif %}>Libraries</a>
<a href="/upload" {% if request.url.path.startswith('/upload') %}class="active"{% endif %}>Upload</a>
<a href="/search" {% if request.url.path.startswith('/search') %}class="active"{% endif %}>Search</a>
<a href="/sources" {% if request.url.path.startswith('/sources') %}class="active"{% endif %}>Sources</a>
</nav>
</header>
<main>
{% block content %}{% endblock %}
</main>
<footer>Context7 Docs WebUI</footer>
</div>
{# Shared helpers (app.js) load before any page-specific scripts. #}
<script src="{{ url_for('static', path='app.js') }}"></script>
{% block scripts %}{% endblock %}
</body>
</html>
+83
View File
@@ -0,0 +1,83 @@
{% extends "base.html" %}
{% block title %}Dashboard - Context7 Docs{% endblock %}
{% block content %}
{# Dashboard: backend health, vector and library counts, and bulk actions.
   All values are rendered server-side from the context built by the
   dashboard handler (health, vectors, libraries). #}
<h1>Dashboard</h1>
<!-- Status Cards -->
<div class="status-cards">
    <div class="status-card" style="{% if health.status == 'ok' %}border-left-color: #00c467{% else %}border-left-color: #f53800{% endif %}">
        <h3>Docs API Service</h3>
        {% if health.status and health.status == 'ok' %}
        <p style="color: #00c467;"><strong>Status:</strong> Online ✓</p>
        {% else %}
        <p style="color: #f53800;"><strong>Status:</strong> {% if health.status == 'error' %}Error{% else %}Offline{% endif %}</p>
        {% endif %}
    </div>
    <div class="status-card">
        <h3>Vectors Stored</h3>
        <p>{{ vectors|default(0) }}</p>
    </div>
    <div class="status-card">
        <h3>Libraries Registered</h3>
        <p>{{ libraries|length }}</p>
    </div>
</div>
<!-- Registered libraries -->
{% if libraries and libraries|length > 0 %}
<div class="message-box" style="background: #e8f4fd;">
    {# Show a readable name per library instead of dumping the raw list;
       the `e` filter escapes each value exactly once. #}
    <strong>Libraries:</strong>
    {% for lib in libraries %}{{ (lib.name or lib.id or lib) | e }}{% if not loop.last %}, {% endif %}{% endfor %}
</div>
{% endif %}
<!-- Action Buttons -->
<div class="action-buttons">
    <form method="post" action="/actions/ingest-all" style="display: inline;">
        <button type="submit" name="ingest-all" class="btn btn-primary">
            🔄 Ingest All Libraries
        </button>
    </form>
    <form method="post" action="/actions/sync-sources" style="display: inline;">
        <input type="hidden" name="override" value="false">
        <button type="submit" name="sync-sources" class="btn btn-secondary">
            📦 Sync Git Sources
        </button>
    </form>
</div>
<!-- Links -->
<div class="links-section">
    <h2>Navigate to Other Pages</h2>
    <a href="/libraries" style="display: inline-block; margin-right: 15px;">View Libraries →</a>
    <a href="/upload" style="display: inline-block; margin-right: 15px;">Upload Files →</a>
    <a href="/search" style="display: inline-block; margin-right: 15px;">Search Docs →</a>
    <a href="/sources" style="display: inline-block;">Git Sources →</a>
</div>
{# The former client-side "health refresh" script was removed: it relied on
   window.docsApiClient, which no script defines, so it always failed and the
   server-rendered status above was the only one ever shown. #}
{% endblock %}
+74
View File
@@ -0,0 +1,74 @@
{% extends "base.html" %}
{% block title %}Libraries - Context7 Docs{% endblock %}
{% block content %}
{# Libraries page: a creation form plus a table of the libraries passed in
   as `data`, with per-library view / ingest / delete actions. #}
<h1>Libraries</h1>
<!-- Create Library Form -->
<div class="create-form">
    <form method="post" action="/libraries/create">
        <label for="new_library_id">Library ID:</label>
        <input type="text" id="new_library_id" name="library_id" placeholder="e.g., foundryvtt" required>
        <label for="new_name">Name:</label>
        <input type="text" id="new_name" name="name" placeholder="Display name for this library" required>
        <label for="new_description">Description (optional):</label>
        <input type="text" id="new_description" name="description" placeholder="Brief description...">
        <button type="submit" class="btn btn-primary">Create Library</button>
    </form>
</div>
<hr>
<!-- Libraries Table -->
<table class="library-table">
    <thead>
        <tr>
            <th>ID</th>
            <th>Name</th>
            <th>Description</th>
            <th>Source Path</th>
            <th>Updated At</th>
            <th>Actions</th>
        </tr>
    </thead>
    <tbody id="libraries-body">
        {% if data|length > 0 %}
        {% for lib in data %}
        {# Values use the `e` filter so they are escaped exactly once;
           ids placed in URLs are additionally percent-encoded. #}
        <tr class="{% if lib.source_path and 'foundry' in (lib.source_path or '').lower() %}highlight{% endif %}">
            <td><code>{{ lib.id | e }}</code></td>
            <td><strong>{{ lib.name | e }}</strong></td>
            <td>{{ (lib.description or '-') | e }}</td>
            <td><small>{{ (lib.source_path or '-') | e }}</small></td>
            <td><small>{{ lib.updated_at|default('N/A') }}</small></td>
            <td class="actions">
                <a href="/libraries/{{ lib.id | urlencode }}/docs" class="btn btn-sm btn-info">View Docs</a> |
                <form method="post" action="/libraries/{{ lib.id | urlencode }}/ingest" style="display:inline;"
                      onsubmit="return confirm('Trigger ingestion for this library?');">
                    <button type="submit" class="btn btn-sm btn-warning">Ingest</button>
                </form> |
                <form method="post" action="/libraries/{{ lib.id | urlencode }}/delete"
                      onsubmit="return confirm('Delete this library and all its contents? This cannot be undone.');">
                    <button type="submit" class="btn btn-sm btn-danger">Delete</button>
                </form>
            </td>
        </tr>
        {% endfor %}
        {% else %}
        <tr>
            <td colspan="6" style="text-align:center;">No libraries found. Create one above.</td>
        </tr>
        {% endif %}
    </tbody>
</table>
{# The former "docs view mode" block was removed: it called data.get(...) on
   the libraries list, which fails at render time. The "View Docs" link above
   points to the page that shows a library's documentation. #}
{% endblock %}
+71
View File
@@ -0,0 +1,71 @@
{% extends "base.html" %}
{% block title %}Search - Context7 Docs{% endblock %}
{% block content %}
{# Search page. Results are rendered server-side from the `results` list in
   the template context; `query`, `limit` and `error` are optional. #}
<h2>Search Documentation</h2>
<form method="get" action="/search/results" class="search-form">
    <label for="query">Query:</label>
    <input type="text" id="query" name="q" required placeholder="Enter your search query..." value="{{ query or '' }}">
    <label for="limit">Limit results:</label>
    {# Keep the previously chosen limit selected; default to 10 on the blank form. #}
    {% set current_limit = limit | default(10) %}
    <select id="limit" name="limit">
        {% for option in [5, 10, 20, 50] %}
        <option value="{{ option }}"{% if option == current_limit %} selected{% endif %}>{{ option }}</option>
        {% endfor %}
    </select>
    <button type="submit">Search</button>
</form>
<div id="search-results" class="results-box">
    {% if error %}
    <p class="error">Error loading results: {{ error | e }}</p>
    {% endif %}
    {% if results %}
    <div class="results-count">{{ results|length }} results found</div>
    {% for r in results %}
    <div class="result-card">
        {# Title falls back to the first 100 characters of the content. #}
        <h3>{{ (r.title or (r.content or '')[:100]) | e }}</h3>
        <p>{{ ((r.content or '')[:500]) | e }}...</p>
        {% if r.library_id %}
        <a href="/libraries/{{ r.library_id | urlencode }}/docs">View Full</a>
        {% endif %}
    </div>
    {% endfor %}
    <a href="/search" class="new-search-link">← New Search</a>
    {% elif query and not error %}
    <div class="results-count">0 results found</div>
    {% endif %}
</div>
{# The former inline script was removed: it re-ran the search through
   window.docsApiClient, which no script defines, so it always ended in an
   error message while the server-side results were never displayed. #}
{% endblock %}
+34
View File
@@ -0,0 +1,34 @@
{% extends "base.html" %}
{% block title %}Sources - Context7 Docs{% endblock %}
{% block content %}
{# Git sources page: a sync form plus one card per entry in `sources`. #}
<h2>Git Repository Sync</h2>
<div class="status-message">Syncs all git repositories configured in <code>docs_sources.yaml</code>.</div>
<form method="post" action="/sources/sync" class="sync-form">
    <label for="override">Override existing repos:</label>
    <input type="checkbox" id="override" name="override">
    <button type="submit">Sync All Repositories</button>
</form>
<div id="source-list"></div>
{% if sources %}
<h3>Configured Sources</h3>
<div class="source-cards">
    {% for src in sources %}
    <div class="source-card">
        <strong>{{ src.library_id | default('unknown') }}</strong><br>
        {# Parentheses are required: a filter result cannot be sliced directly. #}
        URL: {{ (src.repo_url | default('N/A'))[:60] }}<br>
        Branch: {{ src.branch | default('main') }}<br>
        Include: {{ (src.include_paths | default(['*']) | join(', ')) }}
    </div>
    {% endfor %}
</div>
{% else %}
<p>No git sources configured. Add repositories to <code>docs_sources.yaml</code>.</p>
{% endif %}
{% endblock %}
+48
View File
@@ -0,0 +1,48 @@
{% extends "base.html" %}
{% block title %}Upload - Context7 Docs{% endblock %}
{% block content %}
{# Upload page: pick a target library (or none), choose files, and optionally
   trigger ingestion. `results` is present only after a submission. #}
<h2>Upload Documentation Files</h2>
<form method="post" enctype="multipart/form-data" class="upload-form">
    <!-- Library Selector -->
    {# The empty option is a valid choice (the server derives the library id
       from the filename), so this select must not be `required`. #}
    <label for="library_id">Select Library:</label>
    <select id="library_id" name="library_id">
        <option value="">(New library - will be created from filename)</option>
        {% for lib in libraries %}
        <option value="{{ lib.id }}" data-name="{{ lib.name or lib.id }}">{{ lib.name or lib.id }}</option>
        {% endfor %}
    </select>
    <!-- File Input (multiple files allowed) -->
    <label for="files">Select Files:</label>
    <input type="file" name="files" id="files" multiple accept=".md,.txt,.py,.js,.ts,.json,.yaml,.yml,.html,.css,.pdf" required>
    <!-- Ingest Checkbox -->
    <div style="margin-top: 10px;">
        <label>
            <input type="checkbox" name="ingest_after_upload" value="on">
            Trigger ingestion after upload
        </label>
    </div>
    <button type="submit" class="btn btn-primary">Upload Files</button>
</form>
<!-- Allowed extensions hint -->
<p class="hint">Allowed: .md, .txt, .py, .js, .ts, .json, .yaml, .yml, .html, .css, .pdf (max 5MB each)</p>
<!-- Results Display -->
<div id="upload-result" class="result-box"></div>
{% if results %}
<h3>Upload Results</h3>
<ul>
    {% for result in results %}
    {# The `e` filter escapes the message exactly once. #}
    <li><strong>{{ result.filename }}</strong>: {{ result.status }} - {{ result.message | e }}</li>
    {% endfor %}
</ul>
{% endif %}
{% endblock %}
+7
View File
@@ -0,0 +1,7 @@
# WebUI Dependencies
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
python-multipart==0.0.6
httpx==0.26.0
PyYAML==6.0.1
# Required by fastapi.templating.Jinja2Templates; not installed by plain `fastapi`.
jinja2==3.1.3