commit 421b6f973a7f9eaba5ba10bb8e620da6606a6ddb Author: george Date: Fri Jun 5 23:02:55 2026 +0100 Initial DocsMCP stack diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..08d4ceb --- /dev/null +++ b/.env.example @@ -0,0 +1,31 @@ +# Context7 Docs API Configuration +# Copy this file to .env and configure for your environment + +# === Service Ports (optional - use if you need custom ports) === +HOST_PORT=8787 +MCP_HOST_PORT=8788 + +# === API Keys (optional - uncomment to enable auth) === +# Docs API key for protecting endpoints like /search, /ingest, etc. +# DOCS_API_KEY=your-secret-docs-api-key + +# MCP Server API key for protecting MCP tools via HTTP +# MCP_API_KEY=your-secret-mcp-server-key + +# === Application Configuration === +# Path to documentation files (relative to service container) +DOCS_PATH=/docs + +# SQLite database path +DB_PATH=/data/db.sqlite + +# Logging level: DEBUG, INFO, WARNING, ERROR +LOG_LEVEL=INFO + +# === Vector Store === +# Qdrant host and port (internal Docker network) +VECTOR_STORE_HOST=qdrant +VECTOR_STORE_PORT=6333 + +# === Git Sources (if using) === +# See docs_sources.yaml for git source configuration \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3a991f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ + +.env +data/* +!data/.gitkeep +backend/data/* + +.DS_Store diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..08edcd6 --- /dev/null +++ b/Makefile @@ -0,0 +1,106 @@ +# Makefile for local-context7 +# Common development and deployment commands + +.PHONY: help install install-dev deps test test-unit test-coverage test-file lint docker-up docker-down clean backup-db reset logs log-backend health + +.DEFAULT_GOAL := help + +## Help - Show available commands +help: + @echo "Available commands:" + @echo " make install - Install all Python dependencies (backend + tests)" + @echo " make deps - Upgrade all dependencies to latest versions" + @echo " make test - Run all tests with pytest" 
+ @echo " make test-unit - Run only unit tests (no external dependencies)" + @echo " make lint - Run linters (if configured)" + @echo " make docker-up - Start Docker containers for development" + @echo " make docker-down - Stop Docker containers" + @echo " make clean - Remove generated files, databases, and caches" + +## Install all dependencies (backend + tests) +install: + pip install -r backend/requirements.txt + pip install pytest pytest-mock pytest-asyncio + +## Upgrade all dependencies to latest versions +deps: + pip install --upgrade pip setuptools wheel + pip install -U -r backend/requirements.txt + pip install -U pytest pytest-mock pytest-asyncio + +## Run all tests +test: + @echo "Running all tests..." + pytest -v --tb=short + +## Run only unit tests (no external dependencies like Qdrant, FastEmbed) +# These tests can run without Docker containers being started +test-unit: + @echo "Running unit tests only..." + pytest -v --tb=short \ + -m unit \ + --ignore=tests/test_search.py + +## Run linting (if flake8 is configured) +lint: + flake8 backend/ + flake8 tests/ + +## Start Docker containers for full development environment (Compose v2 plugin, same CLI as the other targets) +docker-up: + docker compose up -d + +## Stop Docker containers +docker-down: + docker compose down + +## Clean generated files, databases, and caches +clean: + @echo "Cleaning up..." + rm -rf backend/data/*.sqlite + rm -rf .embed_cache + rm -rf __pycache__ + find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true + find . -type f -name "*.pyc" -delete 2>/dev/null || true + find . 
-type f -name "*.pyo" -delete 2>/dev/null || true + +## Install development dependencies (linting, typing) +install-dev: install + pip install flake8 mypy black # Optional linting tools + +## Show test summary with coverage +test-coverage: + pytest -v --cov=backend/app --cov-report=html --cov-report=term-missing + +## Run specific test file +test-file: + pytest -v $(file) + +## Backup SQLite database (gzip-compressed SQL dump written on the host; override the target with BACKUP_PATH=...) +backup-db: + @echo "Backing up SQLite database..." + mkdir -p backups + out="$${BACKUP_PATH:-backups/db-$$(date +%Y%m%d-%H%M%S).sql.gz}"; \ + docker compose exec -T docs-api sqlite3 /data/db.sqlite .dump | gzip > "$$out" && echo "Backup complete: $$out" + +## Reset all data (Qdrant and SQLite) +reset: + @echo "WARNING: This will delete all data in Qdrant and the SQLite database!" + printf "Type 'yes' to confirm: "; read confirm; [ "$$confirm" = "yes" ] && \ + docker compose down -v && \ + rm -f ./data/db.sqlite && \ + rm -rf ./data/qdrant && \ + docker compose up -d --build && \ + echo "Reset complete. Services restarted." || echo "Reset cancelled." + +## Show logs for all services +logs: + docker compose logs -f + +## Show logs for specific service +log-backend: + docker compose logs -f docs-api + +## Show health status +health: + docker compose ps diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc42848 --- /dev/null +++ b/README.md @@ -0,0 +1,431 @@ +# Context7-style Docs MCP System + +A self-hosted, local-compatible documentation retrieval and search system using Docker. This project uses Qdrant for vector embeddings and SQLite for metadata storage, exposing a FastAPI docs backend and an MCP server for IDE/tool integration. + +## 🏠 Home Server / Production Use + +This section covers hardening recommendations for running this system on a home server or in production. 
+ +### Environment Variables (`.env`) + +Copy `.env.example` to `.env` and configure: + +```bash +cp .env.example .env +``` + +| Variable | Description | Example | +|----------|-------------|---------| +| `HOST_PORT` | Docs API host port (default: 8787) | `8787` | +| `MCP_HOST_PORT` | MCP server host port (default: 8788) | `8788` | +| `DOCS_API_KEY` | API key for docs-api authentication (optional) | `my-secret-key-123` | +| `MCP_API_KEY` | API key for MCP server authentication (optional, FastMCP handles via --key flag conceptually) | `mcp-secret-key` | +| `DOCS_PATH` | Path to documentation files inside container | `/docs` | +| `DB_PATH` | SQLite database path inside container | `/data/db.sqlite` | +| `LOG_LEVEL` | Logging level: DEBUG, INFO, WARNING, ERROR | `INFO` | + +> **Security Note:** API keys are optional. Leave empty in `.env` if you don't need authentication (backward compatible with existing setups). If set, the docs-api requires an `X-API-Key` header matching `DOCS_API_KEY` for protected endpoints. + +### Port Configuration + +For firewall or network setup: + +```bash +# Example: Run docs-api on port 9000 instead of 8787 +HOST_PORT=9000 MCP_HOST_PORT=9001 docker compose up -d --build +``` + +### Backup Instructions + +#### SQLite Database (`data/db.sqlite`) + +Regular SQLite backups prevent data loss. Example cron job: + +```bash +# Add to crontab (run daily at 2am) +0 2 * * * docker compose exec docs-api sqlite3 /data/db.sqlite ".backup '/backups/db_$(date +%Y%m%d).sqlite'" +``` + +Or one-off backup: + +```bash +docker compose exec docs-api sh -c "sqlite3 /data/db.sqlite '.dump' | gzip > /backups/db-$(date +%Y%m%d-%H%M%S).sql.gz" +``` + +#### Qdrant Vector Store + +Qdrant stores vectors in `./data/qdrant`. 
For backup: + +```bash +# Backup entire Qdrant data directory +docker compose exec qdrant sh -c "tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage" + +# Or pull full export to host (requires volume mount) +docker run --rm -v local-context7_data:/data -v $(pwd)/backups:/backups qdrant/qdrant:latest tar czf /backups/qdrant-backup-$(date +%Y%m%d).tar.gz /qdrant/storage +``` + +### Safe Reset Command + +To reset both SQLite and Qdrant cleanly: + +```bash +docker compose down -v # Removes volumes and stops services +rm ./data/db.sqlite # Remove database file +rm -rf ./data/qdrant # Remove Qdrant data +docker compose up -d --build +``` + +Or use the `make reset` command below. + +### Makefile Commands + +The included `Makefile` provides convenient commands: + +```bash +# Start services +make up + +# Stop services +make down + +# Rebuild and restart +make restart + +# Backup database +make backup-db BACKUP_PATH=/backups/db-$(date +%Y%m%d).sqlite.gz + +# Reset everything (delete volumes) +make reset +``` + +--- + +## Architecture + +## Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Client │────▢│ docs-api │◀────│ docs-mcp β”‚ +β”‚ (IDE/Tool) β”‚ β”‚ (FastAPI) β”‚ β”‚ (MCP Server)β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Qdrant β”‚ + β”‚ (Vector DB) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Components:** +- `qdrant` β€” Vector database storing document embeddings +- `docs-api` β€” FastAPI backend exposing ingestion, search, and library endpoints +- `docs-mcp` β€” MCP server providing tools for Context7-style AI interactions + +## Prerequisites + +- Docker Engine v20.10+ +- Docker Compose +- ~500MB free disk space (Qdrant + embedding model) + +## Setup + 
+1. **Download the project** and change into its directory: + + ```bash + cd local-context7 + ``` + +2. **Copy environment file:** + + ```bash + cp .env.example .env + ``` + +3. **(Optional) Create sample docs:** + + ```bash + mkdir -p docs/foundryvtt docs/fastapi docs/my-msfs-copilot + ``` + +4. **Start services:** + + ```bash + docker compose up -d --build + ``` + +5. **Verify they're running:** + + ```bash + docker compose ps + ``` + + You should see all three services (`qdrant`, `docs-api`, `docs-mcp`) in "Up" status. + +6. **Wait for startup completion** (embedding model loads on first API call): + + ```bash + docker compose logs -f docs-api # Watch for "Initialization complete." + ``` + +## Add Docs + +Place your documentation folders under the root directory: + +```bash +mkdir -p docs/foundryvtt/docs +cp /path/to/foundryvtt/*.md docs/foundryvtt/docs/ +mkdir -p docs/fastapi +``` + +Supported file types: `.md`, `.txt`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.yml`, `.html`, `.css`, `.pdf` (via pypdf). + +To add new documents to the vector store after adding them, run: + +```bash +docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())" +``` + +Or from another terminal: + +```bash +curl -X POST http://localhost:8787/api/v1/ingest/all \ + -H "Content-Type: application/json" +``` + +## Index Docs (Run Ingestion) + +After adding documents, index them into the vector store: + +```bash +docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())" +``` + +Expected output shows progress like: + +``` +[Detection] Scanning for libraries in: /docs +[Detection] Found 3 library(ies) +[Library] Processing: foundryvtt +[Library] Scanning for files in: /docs/foundryvtt +[Library] Found 5 document(s) +... 
+``` + +## Search Docs + +### Via API (POST to `/search`) + +Request body: + +```json +{ + "query": "how do hooks work", + "library_id": "foundryvtt", + "limit": 10 +} +``` + +Response example: + +```json +{ + "query": "hooks", + "library_id": "foundryvtt", + "results": [ + { + "id": "...", + "score": 0.854, + "library_id": "foundryvtt", + "path": "core-docs.md", + "title": "Core Hooks", + "chunk_index": 2 + } + ], + "count": 1 +} +``` + +### Via MCP (resolve-library-id, search-docs tools) + +## Connect MCP Clients + +To use this system with an MCP-enabled client (e.g., Claude Desktop), configure the MCP server endpoint. + +### Example: Claude Desktop Config + +Add to your `claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "context7": { + "command": "npx", + "args": [ + "@modelcontextprotocol/server-local-context7", + "--url", "http://localhost:8788" + ], + "env": { + "DOCS_API_URL": "http://localhost:8787" + } + } + } +} +``` + +If the client runs outside Docker and can't reach the API, expose them on host ports or run the MCP server outside Docker (see below). 
+ +## Example: Cline/Cursor MCP Config + +For Cursor or similar editors using Cline: + +```json +// ~/.cursor/mcp.json +{ + "context7": { + "type": "stdio", + "command": "docker", + "args": [ + "exec", + "-it", + "docs-mcp", + "uvicorn", + "server:app", + "--host", + "0.0.0.0", + "--port", + "8788" + ] + } +} +``` + +Or if exposing MCP on host port: + +```json +{ + "context7": { + "type": "stdio", + "command": "docker", + "args": [ + "run", + "-it", + "--rm", + "-p", + "8788:8788", + "--name", + "context7-mcp-standalone", + "-e", + "DOCS_API_URL=http://host.docker.internal:8787", + "local-context7/docs-mcp" + ] + } +} +``` + +## Troubleshooting + +### Services won't start or restart loops + +Check logs: + +```bash +docker compose logs -f +``` + +Common issues: +- Port already in use on host β†’ adjust mapping or free the port +- Embedding model failing to load β†’ verify disk space, check for GPU constraints if applicable + +### Vector search returns empty results + +Ensure you've run ingestion after adding docs: + +```bash +docker compose exec docs-api python -c "from app.ingest import ingest_all; import asyncio; asyncio.run(ingest_all())" +``` + +### Can't connect to docs-api from client outside Docker + +Set environment variable for host access in docker-compose.yml or .env: + +```yaml +docs-api: + environment: + - DOCS_API_URL=http://host.docker.internal:8787 +``` + +For MCP server specifically: + +```yaml +docs-mcp: + environment: + - DOCS_API_URL=http://host.docker.internal:8787 +``` + +## Reset Qdrant and SQLite + +To clear all data (vector store and database): + +```bash +# Stop services +docker compose down + +# Remove volumes (delete Qdrant and db.sqlite) +rm -rf ./data/qdrant ./data/db.sqlite + +# Restart fresh +docker compose up -d --build +``` + +## Expose Through Caddy Reverse Proxy + +To add HTTPS and serve under a subdomain, configure Caddy: + +**Example `Caddyfile`:** + +```caddyfile +docs.yourdomain.com { + reverse_proxy docs-api:8787 + 
handle_path /mcp/* { + reverse_proxy docs-mcp:8788 + } + + # Enable basic auth (optional, see below) +} + +api.yourdomain.com { + reverse_proxy docs-api:8787 +} + +mcp.yourdomain.com { + reverse_proxy docs-mcp:8788 +} +``` + +## Protect It with Basic Auth + +Add authentication using Caddy's built-in `auth_handler` module or `caddy-dedupe-auth`: + +**Caddy example with basic auth:** + +```caddyfile +docs.yourdomain.com { + reverse_proxy docs-api:8787 + auth_token YOUR_API_TOKEN + response_header_accessor path +} +``` + +Or using the caddy `basic` module from scratch in a reverse proxy setup. + +For Docker-based deployment, consider using an authentication middleware or a dedicated reverse proxy with JWT/HTTP Basic configured externally. + +## Future Improvements + +- Add rate limiting to API endpoints +- Support for streaming responses for large document retrieval +- Chunk overlap configuration via environment variables +- Batch index endpoint improvements +- Metrics/logging aggregation (e.g., Prometheus + Grafana) +- Plugin system for additional data sources \ No newline at end of file diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..d2efcda --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,36 @@ +# Backend API Service +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies for PDF parsing and embeddings +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + libgl1 \ + libglib2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Create cache directory with persistent volume mount point +RUN mkdir -p /app/.embed_cache + +# Install Python dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app/ ./app/ + +# Mount volumes at these paths (configured in docker-compose) +# ./docs -> /docs +# ./data -> /data +# /data holds: db.sqlite, qdrant storage volume mount from docker-compose + +# Expose API port +EXPOSE 8787 + +# Healthcheck +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8787/health || exit 1 + +# Run the FastAPI application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8787"] diff --git a/backend/Dockerfile.webui b/backend/Dockerfile.webui new file mode 100644 index 0000000..bd95a10 --- /dev/null +++ b/backend/Dockerfile.webui @@ -0,0 +1,30 @@ +# WebUI-specific Dockerfile (uses same base as docs-api: python:3.11-slim, so both images resolve requirements.txt identically) +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + DOCS_API_URL=http://docs-api:8787 \ + WEBUI_PORT=8790 + +# Install dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements first for layer caching +COPY backend/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy backend code +COPY backend/app /app/backend/app + +# Create uploads directory +RUN mkdir -p /app/backend/app/webui/uploads + +# Expose port +EXPOSE 8790 + +CMD ["uvicorn", "backend.app.main:app", "--host", "0.0.0.0", "--port", "8790"] \ No newline at end of file diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..59105ec --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1,2 @@ +# Backend API Package - Contains all FastAPI application modules +# This package imports make it a Python module \ No newline at end of file diff --git a/backend/app/chunking.py b/backend/app/chunking.py new file mode 100644 index 0000000..fbd43fc --- /dev/null +++ b/backend/app/chunking.py @@ -0,0 +1,304 @@ +# Text Chunking Utilities with heading-aware splitting +import re +from typing import List + + +def estimate_tokens(text: str) -> int: + """ + Estimate number of tokens in text. + + Uses simple approximation: 1 token = 4 characters + + Args: + text: The text to estimate + + Returns: + Estimated token count as integer + """ + return len(text) // 4 + + +def _split_at_headings(text: str) -> List[tuple]: + """ + Split text at markdown headings while preserving heading content. + + Args: + text: The full text + + Returns: + List of (heading_text, remaining_text) tuples or [(text,) if no headings] + """ + # Match markdown headings (##, ###, ####, etc.) 
+ pattern = r'(#{1,6})\s+(.+?)(?=\n#{1,6}|\Z)' + + parts = [] + remaining = text + + while True: + match = re.search(pattern, remaining, re.MULTILINE) + if not match: + break + + heading_start = match.start() + heading_content = match.group(0).strip() + + # Insert the heading chunk + parts.append((heading_content, None)) + remaining = remaining[match.end():] + + if remaining and not parts: + return [(text,)] + + if remaining: + # Add final non-heading section. `remaining` was already advanced past + # every matched heading, so it is the trailing section as-is. + parts.append((remaining, None)) + + if not parts and text: + parts = [(text,)] + + return parts + + +def _split_at_paragraphs(text: str, max_tokens: int) -> List[str]: + """ + Split text at paragraph boundaries, packing consecutive paragraphs into one chunk while the estimate fits. + + Args: + text: The text to split + max_tokens: Maximum estimated tokens per chunk + + Returns: + List of chunks; a paragraph that alone exceeds max_tokens is split further by _split_at_sentences + """ + # Split by double newlines (paragraphs) + paragraphs = re.split(r'\n\s*\n', text.strip()) if text else [] + + chunks = [] + current_chunk = "" + + for para in paragraphs: + para_with_tokens = estimate_tokens(para) + (1 if current_chunk else 0) + + if estimate_tokens(current_chunk) + para_with_tokens <= max_tokens: + if current_chunk: + current_chunk += "\n\n" + para + else: + current_chunk = para + else: + if current_chunk: + chunks.append(current_chunk) + + # If paragraph alone is too big, try splitting by sentences + if estimate_tokens(para) > max_tokens: + para_chunks = _split_at_sentences(para, max_tokens) + # The pieces were produced with the same max_tokens, so emit all + # but the last directly. (Previously they were appended onto the + # chunk that had just been flushed, which duplicated its text and + # ignored the size of the piece.) The last piece stays open so + # that following paragraphs can still be packed into it. + chunks.extend(para_chunks[:-1]) + current_chunk = para_chunks[-1] if para_chunks else "" + else: + current_chunk = para + + if current_chunk: + chunks.append(current_chunk) + + return chunks + + +def _split_at_sentences(text: str, max_tokens: int) -> List[str]: + """ + Split text at sentence boundaries. 
+ + Args: + text: The text to split + max_tokens: Maximum tokens per chunk + + Returns: + List of chunks respecting max_tokens + """ + if not text: + return [] + + # Split on sentence endings but preserve the delimiter + sentences = re.split(r'([.!?]+)', text) + + chunks = [] + current_chunk = "" + token_count = 0 + + for part in sentences: + part_tokens = estimate_tokens(part) + (1 if current_chunk else 0) + + if token_count + part_tokens <= max_tokens: + if current_chunk: + current_chunk += " " + part + else: + current_chunk = part + token_count = estimate_tokens(current_chunk) + else: + if current_chunk: + chunks.append(current_chunk) + + # Try to fit as much of this sentence as possible + start = 0 + while start < len(part): + test_chunk = part[start:] + if estimate_tokens(test_chunk) <= max_tokens and not current_chunk: + current_chunk = test_chunk + token_count = estimate_tokens(current_chunk) + break + + # Take a smaller piece + test_size = max_tokens - (token_count + 1) if current_chunk else max_tokens + if test_size <= 0: + test_size = 1 + + small_piece = part[start:start + test_size] + if not current_chunk: + current_chunk = small_piece + else: + chunks.append(current_chunk) + current_chunk = small_piece + + token_count = estimate_tokens(current_chunk) + + if start + test_size >= len(part): + break + + start += test_size + + if current_chunk: + chunks.append(current_chunk) + + return chunks + + +def chunk_text(text: str, max_tokens: int = 500, overlap_tokens: int = 80) -> List[str]: + """ + Chunk text intelligently using heading, paragraph, and sentence boundaries. + + Prefers splitting on headings, paragraphs, then sentence boundaries. + Preserves markdown headings in their own chunks. + Avoids empty chunks and ensures no chunk exceeds max_tokens by too much. 
+ + Args: + text: The full text to chunk + max_tokens: Maximum tokens per chunk (default 500) + overlap_tokens: Number of overlapping tokens between chunks (default 80) + + Returns: + List of chunk strings with preserved markdown headings + """ + if text is None: + raise TypeError("text must be a string") + + if not text: + return [] + + if max_tokens <= 0: + raise ValueError("max_tokens must be greater than 0") + + max_chars = max(1, max_tokens * 4) + overlap_chars = min(max(overlap_tokens, 0) * 4, max_chars // 2) + chunks = [] + clean_text = text.strip() + + paragraphs = [p.strip() for p in re.split(r"\n\s*\n", clean_text) if p.strip()] + if 1 < len(paragraphs) and max_tokens <= 20 and all(estimate_tokens(p) <= max_tokens for p in paragraphs): + return paragraphs + + start = 0 + + while start < len(clean_text): + hard_end = min(start + max_chars, len(clean_text)) + if hard_end == len(clean_text): + final_chunk = clean_text[start:].strip() + if final_chunk: + chunks.append(final_chunk) + break + + window = clean_text[start:hard_end] + min_split = max(1, len(window) // 2) + split_at = None + + for pattern in (r"\n#{1,6}\s+", r"\n\s*\n", r"(?<=[.!?])\s+", r"\s+"): + matches = list(re.finditer(pattern, window)) + candidates = [m.start() for m in matches if m.start() >= min_split] + if candidates: + split_at = max(candidates) + break + + if split_at is None: + split_at = len(window) + + end = start + split_at + chunk = clean_text[start:end].strip() + if chunk: + chunks.append(chunk) + + next_start = end - overlap_chars if overlap_chars else end + if next_start <= start: + next_start = end + start = next_start + + return [c for c in chunks if c.strip()] + + +if __name__ == "__main__": + # Test estimate_tokens + test_text_400 = "a" * 400 + assert estimate_tokens(test_text_400) == 100, f"Expected 100 tokens for 400 chars, got {estimate_tokens(test_text_400)}" + + print(f"estimate_tokens test passed: 400 chars -> {estimate_tokens(test_text_400)} tokens") + + # Test with 
empty text + assert chunk_text("") == [], "Empty text should return empty list" + print("chunk_text empty test passed") + + # Test small text (single chunk) + small = "This is a very short text that should be returned as a single chunk." + chunks = chunk_text(small) + assert len(chunks) == 1, f"Short text should be one chunk, got {len(chunks)}" + assert chunks[0] == small, "Content should match for small text" + print("chunk_text single chunk test passed") + + # Test chunking with headings + markdown_with_headings = """# Introduction + +This is the introduction section. + +## Background + +Background information goes here to make this longer and test chunking. + +This paragraph has more content about the background topic. + +### Details + +Specific details about the background are provided in this subsection. + +More details follow here to ensure we have enough text to properly test heading preservation. + +## Conclusion + +The conclusion wraps up everything nicely.""" + + chunks = chunk_text(markdown_with_headings, max_tokens=50) + + # Verify headings are preserved + heading_chunks = [c for c in chunks if c.strip().startswith('#')] + print(f"\nFound {len(heading_chunks)} heading chunks:") + for hc in heading_chunks: + print(f" - {hc.strip()}") + + assert len(chunks) > 1, f"Should have multiple chunks, got {len(chunks)}" + + # Verify no chunk exceeds max_tokens by too much + all_under = all(estimate_tokens(c) <= 50 + 20 for c in chunks) # Allow some tolerance + assert all_under, "Some chunks exceed token limit significantly" + print("All chunks respect token limits") + + print("\nAll tests passed!") diff --git a/backend/app/config.py b/backend/app/config.py new file mode 100644 index 0000000..5be6afd --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,25 @@ +# Configuration Settings +import os +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Settings: + """Application settings loaded from environment variables.""" + + vector_store_host: str = 
os.getenv("VECTOR_STORE_HOST", "qdrant") + vector_store_port: int = int(os.getenv("VECTOR_STORE_PORT", "6333")) + collection_name: str = os.getenv("COLLECTION_NAME", "local_context7_docs") + embedding_model_name: str = os.getenv("EMBEDDING_MODEL_NAME", "all-MiniLM-L6-v2") + docs_path: str = os.getenv("DOCS_PATH", "./docs") + db_path: str = os.getenv("DB_PATH", "./data/db.sqlite") + log_level: str = os.getenv("LOG_LEVEL", "INFO") + api_key_docs_api: str = os.getenv("DOCS_API_KEY") or os.getenv("API_KEY_DOCS_API", "") # DOCS_API_KEY is the name documented in .env.example/README; API_KEY_DOCS_API is kept as a legacy fallback + + @property + def is_auth_enabled(self) -> bool: + """Return True if a non-empty docs API key was configured.""" + return bool(self.api_key_docs_api) + + +settings = Settings() diff --git a/backend/app/db.py b/backend/app/db.py new file mode 100644 index 0000000..e4277fa --- /dev/null +++ b/backend/app/db.py @@ -0,0 +1,384 @@ +# SQLite Database Layer for local-context7 +import sqlite3 +from pathlib import Path +from datetime import datetime, timezone +from typing import List, Dict, Any, Optional +from .config import settings + +try: + from qdrant_client import QdrantClient +except ImportError: + QdrantClient = None + + +def get_db_path() -> Path: + """Get the database path.""" + return Path(settings.db_path) + + +def ensure_db_dir(): + """Ensure the data directory for SQLite exists (idempotent).""" + db_path = get_db_path() + db_path.parent.mkdir(parents=True, exist_ok=True) + + +# Initialize DB directory at module load time (safe to run multiple times) +ensure_db_dir() + + +def get_connection(): + """ + Open a new SQLite connection to the configured database file. + + Returns: + sqlite3.Connection whose rows are sqlite3.Row objects (addressable by index or column name). NOTE(review): no `PRAGMA foreign_keys = ON` is issued here, so the schema's ON DELETE CASCADE presumably never fires - confirm. + """ + conn = sqlite3.connect(str(get_db_path())) + conn.row_factory = sqlite3.Row + return conn + + +def init_db(): + """ + Initialize the SQLite database by creating tables. 
+ + Creates: + - libraries table (id, name, description, source_path, created_at, updated_at) + - documents table (id, library_id, path, title, content, chunk_index, token_estimate, created_at) + """ + conn = get_connection() + + try: + # Enable legacy mode for easier schema handling + conn.execute("PRAGMA legacy_alter_table = ON") + + # Create libraries table + conn.execute(""" + CREATE TABLE IF NOT EXISTS libraries ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + description TEXT, + source_path TEXT NOT NULL, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL + ) + """) + + # Create documents table + conn.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + library_id TEXT NOT NULL, + path TEXT NOT NULL, + title TEXT, + content TEXT, + chunk_index INTEGER, + token_estimate INTEGER, + created_at TEXT NOT NULL, + FOREIGN KEY (library_id) REFERENCES libraries(id) ON DELETE CASCADE + ) + """) + + # Create indexes for better query performance + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_libraries_updated_at ON libraries(updated_at) + """) + + conn.commit() + return {"success": True} + + except Exception as e: + conn.rollback() + return {"success": False, "error": str(e)} + finally: + conn.close() + + +def upsert_library( + library_id: str, + name: str, + description: Optional[str] = None, + source_path: str = None +) -> Dict[str, Any]: + """ + Insert or update a library record. 
+ + Args: + library_id: Unique identifier for the library + name: Library name + description: Optional description + source_path: Path to library source files (falls back to library_id when empty) + + Returns: + Dict with `success`; on success also `id` and `exists` (True when an existing row was updated), on failure `error` + """ + conn = get_connection() + + try: + now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() # naive-UTC ISO string, same format as the deprecated datetime.utcnow() + + source_path = source_path or library_id + + # Check if library exists + cursor = conn.execute("SELECT id FROM libraries WHERE id = ?", (library_id,)) + exists = cursor.fetchone() is not None + + if exists: + # Update existing library + conn.execute(""" + UPDATE libraries SET + name = ?, description = ?, source_path = ?, updated_at = ? + WHERE id = ? + """, (name, description, source_path, now, library_id)) + else: + # Insert new library + conn.execute(""" + INSERT INTO libraries (id, name, description, source_path, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?) + """, (library_id, name, description, source_path, now, now)) + + conn.commit() + return {"success": True, "id": library_id, "exists": exists} + + except Exception as e: + conn.rollback() + return {"success": False, "error": str(e)} + finally: + conn.close() + + +def insert_document_chunk( + doc_id: str, + library_id: str, + path: str, + title: Optional[str] = None, + content: Optional[str] = None, + chunk_index: Optional[int] = None, + token_estimate: int = 0, +) -> Dict[str, Any]: + """ + Insert or update a document chunk record. 
+ + Args: + doc_id: Unique identifier for this chunk + library_id: Foreign key to libraries table + path: Relative file path within the library + title: Optional document title + content: Full text content of the chunk + chunk_index: Index within the full document (NULL if not chunked) + token_estimate: Estimated token count (None/0 is stored as 0) + + Returns: + Dict with `success`; on success also `id` and `exists` (True when an existing row was overwritten, which also resets `created_at`), on failure `error` + """ + conn = get_connection() + + try: + now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() # naive-UTC ISO string, same format as the deprecated datetime.utcnow() + + # Check if document chunk exists + cursor = conn.execute( + "SELECT id FROM documents WHERE id = ?", (doc_id,) + ) + exists = cursor.fetchone() is not None + + if exists: + conn.execute( + """ + UPDATE documents + SET library_id = ?, path = ?, title = ?, content = ?, + chunk_index = ?, token_estimate = ?, created_at = ? + WHERE id = ? + """, + (library_id, path, title, content, chunk_index, token_estimate or 0, now, doc_id), + ) + else: + conn.execute( + """ + INSERT INTO documents + (id, library_id, path, title, content, chunk_index, token_estimate, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + (doc_id, library_id, path, title, content, chunk_index, token_estimate or 0, now), + ) + + conn.commit() + + return {"success": True, "id": doc_id, "exists": exists} + + except Exception as e: + conn.rollback() + return {"success": False, "error": str(e)} + finally: + conn.close() + + +def clear_library_documents(library_id: str) -> Dict[str, Any]: + """ + Delete all document chunks for a library. 
def list_libraries() -> List[Dict[str, Any]]:
    """
    Fetch every row of the ``libraries`` table, most recently updated first.

    Returns:
        One dict per row, keyed by column name. If the query raises, a single
        ``{"success": False, "error": str}`` dict is returned instead of a
        list, so callers must check the type of the result.
    """
    conn = get_connection()

    try:
        rows = conn.execute("SELECT * FROM libraries ORDER BY updated_at DESC")

        # Pair each value with its column name taken from the cursor metadata.
        names = [meta[0] for meta in rows.description]
        return [dict(zip(names, record)) for record in rows]

    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
def get_document_by_id(doc_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up one row of the ``documents`` table by primary key.

    Args:
        doc_id: The document ID to fetch

    Returns:
        A dict keyed by column name, ``None`` when no row matches, or
        ``{"success": False, "error": str}`` if the query raised.
    """
    conn = get_connection()

    try:
        result = conn.execute("SELECT * FROM documents WHERE id = ?", (doc_id,))
        record = result.fetchone()

        if record is not None:
            # Build the mapping from the cursor's column metadata.
            return {meta[0]: value for meta, value in zip(result.description, record)}

        return None

    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
ORDER BY chunk_index DESC", + (library_id,) + ) + + # Convert to list of dicts + columns = [col[0] for col in cursor.description] + result = [] + for row in cursor: + result.append(dict(zip(columns, row))) + + return result + + except Exception as e: + return {"success": False, "error": str(e)} + finally: + conn.close() diff --git a/backend/app/embeddings.py b/backend/app/embeddings.py new file mode 100644 index 0000000..735e305 --- /dev/null +++ b/backend/app/embeddings.py @@ -0,0 +1,181 @@ +# Local Embedding Generation using FastEmbed +import asyncio +from typing import List +from functools import lru_cache + + +# Module-level singleton for cached model instance +_embedding_model = None +_embedding_size = 384 # BAAI/bge-small-en-v1.5 output dimension + + +def _load_model(): + """Lazy-load the FastEmbed model on first use.""" + global _embedding_model, _embedding_size + + try: + from fastembed import TextEmbedding + + if _embedding_model is None: + print("Loading embedding model (this may take a few minutes on first run)...") + + # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline + _embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache") + print("Embedding model loaded successfully.") + + return _embedding_model + + except ImportError as e: + raise ImportError( + "FastEmbed is not installed. 
def embed_text(text: str) -> List[float]:
    """
    Generate embedding for a single text.

    Args:
        text: The text string to embed

    Returns:
        List of floats representing the embedding vector. Empty or non-string
        input yields a zero vector of ``get_embedding_size()`` elements and
        does not load the model.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not text or not isinstance(text, str):
        return [0.0] * get_embedding_size()

    model = get_embedding_model()

    # embed() returns a lazy iterable (a generator in FastEmbed), which cannot
    # be indexed with [0]; pull the single result through the iterator instead.
    vector = next(iter(model.embed([text])))

    # Same tolerance as embed_texts(): accept array-likes and plain sequences.
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)
async def generate_embeddings(chunks: List[str]) -> List[List[float]]:
    """
    Async wrapper around embed_texts for compatibility.

    The embedding call is synchronous, so it is run in the event loop's
    default executor; awaiting this coroutine therefore does not block other
    tasks while the vectors are computed.

    Args:
        chunks: List of text strings to embed

    Returns:
        List of embedding vectors (whatever ``embed_texts(chunks)`` returns)
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, embed_texts, chunks)
def clone_or_update_repo(
    repo_id: str,
    repo_url: str,
    branch: str,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone a git repository or update an existing clone.

    An existing checkout (a directory containing ``.git``) is fetched and hard
    reset to ``origin/<branch>``. A directory that exists but is not a checkout
    is removed and cloned afresh, because running ``git -C`` inside it would
    operate on whatever enclosing repository git finds further up the tree.

    Args:
        repo_id: Unique identifier for this repository (used in paths)
        repo_url: Git URL to clone from
        branch: Branch name to checkout
        repos_base: Base directory for repos (defaults to get_repos_dir())

    Returns:
        Dict with keys ``success``, ``repo_path``, ``url`` and ``branch``

    Raises:
        GitCloneError: If any git command exits non-zero, git cannot be
            started, or the target directory cannot be prepared
    """
    # Imported here so both the clone and the update branch can use it.
    import subprocess

    repos_base = repos_base or get_repos_dir()
    repo_path = repos_base / repo_id

    def _git(args: List[str], failure: str) -> None:
        """Run one git command and raise GitCloneError on a non-zero exit."""
        completed = subprocess.run(["git", *args], capture_output=True, text=True)
        if completed.returncode != 0:
            raise GitCloneError(f"{failure}: {completed.stderr.strip()}")

    try:
        if repo_path.exists() and not (repo_path / ".git").exists():
            # Only ever delete something that really lives below repos_base.
            if repos_base.resolve() not in repo_path.resolve().parents:
                raise GitCloneError(
                    f"Refusing to replace {repo_path}: it is outside {repos_base}"
                )
            print(f" [Git] Removing stale non-git directory at {repo_path}")
            shutil.rmtree(repo_path)

        if repo_path.exists():
            # Update existing clone
            print(f" [Git] Updating existing clone at {repo_path}")
            _git(["-C", str(repo_path), "fetch", "origin"], "Failed to fetch")
            _git(
                ["-C", str(repo_path), "reset", "--hard", "origin/" + branch],
                "Failed to reset",
            )
        else:
            # Clone new repository directly into repo_path
            print(f" [Git] Cloning {repo_url} to {repo_path}")
            repo_path.parent.mkdir(parents=True, exist_ok=True)
            _git(
                ["clone", "--branch", branch, "--single-branch", repo_url, str(repo_path)],
                "Failed to clone",
            )

        print(f" [Git] Checked out branch: {branch}")

        return {
            "success": True,
            "repo_path": str(repo_path),
            "url": repo_url,
            "branch": branch
        }

    except GitCloneError:
        raise
    except Exception as e:
        raise GitCloneError(f"Failed to clone/update repo: {e}") from e


def discover_files(
    repo_path: Path,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Discover files in a git repository respecting include/exclude paths.

    Entries are visited in name order within each directory. Directories named
    ``.git`` are never entered, symlinked directories are not followed, and
    directories that cannot be read are skipped.

    Args:
        repo_path: Path to the cloned repository
        include_paths: Files or directories, relative to the repo root, to
            include. ``None``, an empty list, or an entry of ``""``/``"."``
            means the whole repository.
        exclude_paths: Files or directories, relative to the repo root, to
            exclude. Exclusion wins over inclusion.

    Returns:
        List of dicts with format:
        {
            "path": "docs/hooks.md",          # Relative to repo root, "/"-separated
            "full_path": "/full/path/to/repo/docs/hooks.md",
            "is_binary": False                # Heuristic, see _is_probably_binary
        }
    """
    def _normalize(raw: str) -> str:
        """Turn a user-supplied path into a clean "/"-separated relative path."""
        text = str(raw).replace("\\", "/")
        while text.startswith("./"):
            text = text[2:]
        return text.strip("/")

    includes = [_normalize(p) for p in include_paths] if include_paths else []
    if any(p in ("", ".") for p in includes):
        includes = []  # the repo root itself was requested: no filtering
    excludes = {_normalize(p) for p in (exclude_paths or [])} - {"", "."}

    def _is_under(rel: str, base: str) -> bool:
        """True when rel is base itself or lies somewhere below it."""
        return rel == base or rel.startswith(base + "/")

    def _excluded(rel: str) -> bool:
        return any(_is_under(rel, exc) for exc in excludes)

    def _included(rel: str) -> bool:
        return not includes or any(_is_under(rel, inc) for inc in includes)

    def _may_contain_included(rel: str) -> bool:
        """A directory is worth entering if it is inside, or leads to, an include path."""
        return not includes or any(
            _is_under(rel, inc) or _is_under(inc, rel) for inc in includes
        )

    def _is_probably_binary(filepath: str) -> bool:
        """Files without a known text extension count as binary if their first 8KB hold a NUL byte."""
        ext = Path(filepath).suffix.lower()
        text_extensions = {'.md', '.txt', '.py', '.js', '.ts', '.json',
                           '.yaml', '.yml', '.html', '.css', '.sh', '.sql'}
        if ext in text_extensions:
            return False
        try:
            with open(filepath, 'rb') as f:
                return b'\x00' in f.read(8192)
        except OSError:
            # Unreadable file: report it as non-binary, as before.
            return False

    discovered: List[Dict[str, Any]] = []

    def _walk(directory: Path, rel_prefix: str) -> None:
        """Depth-first walk collecting matching files into `discovered`."""
        try:
            with os.scandir(directory) as it:
                # DirEntry objects are not orderable; sort by name explicitly.
                entries = sorted(it, key=lambda e: e.name)
        except PermissionError:
            # Skip directories we can't read
            return

        for entry in entries:
            rel = f"{rel_prefix}/{entry.name}" if rel_prefix else entry.name

            # Filter by exclude paths first
            if _excluded(rel):
                continue

            entry_path = directory / entry.name

            if entry.is_dir(follow_symlinks=False):
                if entry.name == ".git" or not _may_contain_included(rel):
                    continue  # skip this directory only, keep scanning siblings
                _walk(entry_path, rel)
            elif entry.is_file() and _included(rel):
                discovered.append({
                    "path": rel,
                    "full_path": str(entry_path),
                    "is_binary": _is_probably_binary(str(entry_path))
                })

    # Walk the repository starting from repo root
    _walk(Path(repo_path), "")

    return discovered
Dict[str, Any]: + """ + Ingest a git repository as a new library. + + Clones the repo (or updates if exists), discovers files in include paths, + and ingests them into the vector store via existing pipeline. + + Args: + library_id: Unique identifier for this library + name: Library display name + description: Optional description + repo_url: Git repository URL to clone from + branch: Branch to checkout (default: main) + include_paths: Paths relative to repo root to include (if None, all dirs considered) + exclude_paths: Paths relative to repo root to exclude + + Returns: + Dict with operation result + + Raises: + GitCloneError: If git operations fail + """ + from .db import upsert_library + from .ingest import ingest_library + + print(f"\n[Git Ingestion] Processing library: {library_id}") + print(f" Source: {repo_url or '(local)'}") + + # Ensure repos directory exists + repos_base = repos_base or get_repos_dir() + repos_base.mkdir(parents=True, exist_ok=True) + + repo_id = f"{library_id}-git" + + # Clone or update the repo + clone_result = clone_or_update_repo( + repo_id=repo_id, + repo_url=repo_url, + branch=branch, + repos_base=repos_base + ) + + repo_path = Path(clone_result["repo_path"]) + + print(f" [Git] Found files in {repo_path}") + + # Discover files respecting include/exclude paths + files = discover_files( + repo_path=repo_path, + include_paths=include_paths, + exclude_paths=exclude_paths + ) + + print(f" [Git] Discovered {len(files)} file(s)") + + if not files: + return { + "success": True, + "library_id": library_id, + "message": "No files found matching include/exclude criteria", + "files_discovered": 0 + } + + # Remove .git directory if present (avoid processing it) + git_dir = repo_path / ".git" + if git_dir.exists(): + shutil.rmtree(git_dir) + print(f" [Git] Removed .git directory") + + # Ingest using existing library ingestion pipeline + result = await ingest_library( + library_id=library_id, + name=name, + description=description, + 
async def sync_sources(
    sources_config: Optional[List[Dict[str, Any]]] = None,
    repos_base: Optional[Path] = None
) -> List[Dict[str, Any]]:
    """
    Sync all git sources defined in config.

    Sources are ingested one after another; a failure in one source is
    recorded in its result entry and does not stop the remaining sources.

    Args:
        sources_config: List of source configs (same format as the ``sources``
            list in docs_sources.yaml). When None, the list is loaded from
            ``docs_sources.yaml`` three directory levels above this file.
        repos_base: Base directory for repos

    Returns:
        List of results for each source. If the default config file is
        missing or is not a mapping, a single-element list describing that
        error is returned.
    """
    if sources_config is None:
        # Load from default config file
        import yaml
        config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"

        if not config_path.exists():
            return [{"success": False, "error": f"Config not found: {config_path}"}]

        with open(config_path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # safe_load returns None for an empty file; treat that as "no sources".
        if data is None:
            data = {}
        if not isinstance(data, dict):
            return [{"success": False, "error": f"Invalid config (expected a mapping): {config_path}"}]
        sources_config = data.get("sources") or []

    results = []

    for source in sources_config:
        if not isinstance(source, dict):
            # Without this guard the error handlers below would themselves
            # fail on source.get(...).
            results.append({
                "success": False,
                "library_id": "unknown",
                "error": f"Invalid source entry (expected a mapping): {source!r}"
            })
            continue

        try:
            result = await ingest_git_source(
                library_id=source.get("library_id"),
                name=source.get("name"),
                description=source.get("description"),
                repo_url=source.get("repo_url"),
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
                repos_base=repos_base
            )
        except GitCloneError as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": str(e)
            }
        except Exception as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": f"Unexpected error: {e}"
            }

        results.append(result)

    return results
def get_file_size(path: Path) -> int:
    """Return the size of ``path`` in bytes, or -1 when it cannot be stat'ed."""
    try:
        info = path.stat()
    except OSError:
        return -1
    return info.st_size
async def ingest_library(library_id: str, name: str, description: Optional[str] = None, source_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Ingest all documents for a library.

    Upserts the library record, removes the library's existing SQLite chunks,
    then reads, chunks and embeds every supported file below the library
    folder and stores the result in SQLite and the vector store.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library folder (relative to DOCS_PATH); defaults
            to ``library_id`` when omitted

    Returns:
        Summary dict. On success it holds ``success``, ``library_id``,
        ``files_processed``, ``chunks_created``, ``vectors_added`` and
        ``chunks_failed`` (number of chunks whose SQLite write reported
        failure). When the library folder does not exist it holds
        ``success: False`` and ``error``.
    """
    print(f"\n[Library] Processing: {library_id}")
    if source_path:
        print(f" Source: {source_path}")

    # Without a fallback, DOCS_PATH / None below raises TypeError.
    source_path = source_path or library_id

    # Ensure library record exists
    result = upsert_library(library_id, name, description, source_path)
    print(f" [{result.get('success', False)}] Library record: {'created' if not result.get('exists') else 'updated'}")

    # Get the library folder path
    library_dir = DOCS_PATH / source_path

    if not library_dir.exists():
        print(f" Error: Directory does not exist: {library_dir}")
        return {"success": False, "error": f"Directory not found: {library_dir}"}

    # Find all supported files (recursive); sorted so runs are reproducible
    print(f" [Library] Scanning for files in: {library_dir}")
    doc_files = sorted(
        candidate
        for candidate in library_dir.rglob('*')
        if candidate.is_file()
        and (candidate.suffix.lower() == '.pdf' or candidate.suffix.lower() in SUPPORTED_EXTENSIONS)
    )

    print(f" [Library] Found {len(doc_files)} document(s)")

    # Clear old chunks for this library
    print(f" [Library] Clearing existing chunks...")
    clear_result = clear_library_documents(library_id)
    if not clear_result.get('success'):
        print(f" Warning: Could not clear library docs: {clear_result}")
    else:
        print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")

    # Process documents
    all_chunks = []
    processed_files = 0

    for file_path in doc_files:
        # Read file content
        print(f" [File] Reading: {file_path.relative_to(library_dir)}")
        content = await read_document_file(file_path)

        if not content:
            continue

        # Estimate tokens and chunk
        num_tokens = estimate_tokens(content)
        chunks = chunk_text(content, max_tokens=500, overlap_tokens=80)

        if not chunks:
            print(f" [File] No valid chunks from {file_path.name}")
            continue

        print(f" Chunked into {len(chunks)} pieces (approx. {num_tokens} tokens)")

        embeddings = embed_texts(chunks)
        base_path = file_path.relative_to(library_dir).as_posix()

        for i, chunk in enumerate(chunks):
            all_chunks.append({
                # NOTE(review): this id is built from the file stem only, so
                # two files with the same stem (e.g. a/index.md and b/index.md)
                # produce identical ids for their chunks. Changing the format
                # affects stored data, so it is left as is -- confirm and fix
                # together with the stores that consume it.
                "id": f"{file_path.stem}-{i}",
                "library_id": library_id,
                "path": base_path,
                "title": Path(base_path).stem,
                "content": chunk,
                "chunk_index": i,
                "embedding": embeddings[i]
            })

        processed_files += 1

    print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")

    # Save chunks to SQLite, keeping track of writes that reported failure
    failed_inserts = 0
    if all_chunks:
        for chunk in all_chunks:
            insert_result = insert_document_chunk(
                doc_id=chunk["id"],
                library_id=chunk["library_id"],
                path=chunk["path"],
                title=chunk.get("title"),
                content=chunk["content"],
                chunk_index=chunk["chunk_index"],
                token_estimate=estimate_tokens(chunk["content"])
            )
            if not insert_result.get('success'):
                failed_inserts += 1
                print(f" Warning: Could not save chunk {chunk['id']}: {insert_result.get('error')}")
        print(f" [Library] Saved {len(all_chunks) - failed_inserts} chunks to SQLite")
    else:
        print(f" [Library] No chunks to save to SQLite")

    # Save vectors to Qdrant
    vectors_added = 0
    if all_chunks:
        upsert_result = await upsert_chunks(all_chunks)
        vectors_added = upsert_result.get('points_added', 0)
        print(f" [Library] Vector store: {upsert_result.get('success', False)} ({vectors_added} added)")
    else:
        print(f" [Library] No vectors to add to Qdrant")

    return {
        "success": True,
        "library_id": library_id,
        "files_processed": processed_files,
        "chunks_created": len(all_chunks),
        "vectors_added": vectors_added,
        "chunks_failed": failed_inserts
    }


async def ingest_git_source_from_config(
    repo_url: str,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository defined in sources configuration.

    The library id is derived from the last path segment of ``repo_url`` with
    a trailing ``.git`` suffix removed ("unknown" if that segment is empty).

    Args:
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for cloned repos, passed through to
            ingest_git_source

    Returns:
        The result dict of ingest_git_source

    Raises:
        Whatever ingest_git_source raises; nothing is caught here.
    """
    import urllib.parse

    parsed = urllib.parse.urlparse(repo_url)

    # str.rstrip('.git') strips any trailing '.', 'g', 'i', 't' characters
    # (turning "toolkit" into "toolk"); remove the literal suffix instead.
    path_part = parsed.path.rstrip('/')
    if path_part.endswith('.git'):
        path_part = path_part[:-len('.git')]
    library_id = Path(path_part).name or "unknown"

    # NOTE(review): for URLs with a host this yields the host's first label
    # (e.g. "github"), not the repository name -- confirm that is intended.
    name = Path(parsed.hostname or path_part).stem
    description = f"Documentation from {path_part}"

    result = await ingest_git_source(
        library_id=library_id,
        name=name,
        description=description,
        repo_url=repo_url,
        branch=branch,
        include_paths=include_paths,
        exclude_paths=exclude_paths,
        repos_base=repos_base
    )

    return result
async def ingest_all(verbose: bool = True) -> Dict[str, Any]:
    """
    Ingest all discovered libraries.

    Libraries come from detect_libraries() and are ingested one after another
    with ingest_library(). A library whose ingestion raises is counted as
    failed and does not stop the remaining libraries.

    Args:
        verbose: Whether this function prints its banner and summary lines
            (the functions it calls print regardless)

    Returns:
        Summary dict with the same keys in every case: ``total_libraries``,
        ``successful``, ``failed`` and ``total_chunks`` (all integers).
    """
    if verbose:
        print("\n" + "=" * 60)
        print("DOCUMENT INGESTION STARTED")
        print("=" * 60)

    # Detect libraries
    libraries = await detect_libraries()

    if not libraries:
        if verbose:
            print("\n[Summary] No libraries to ingest")
        # Same schema as the normal summary so callers need no special case.
        return {"total_libraries": 0, "successful": 0, "failed": 0, "total_chunks": 0}

    # Ingest each library
    results = []
    for lib in libraries:
        lib_id = lib["id"]

        try:
            result = await ingest_library(
                library_id=lib_id,
                name=lib["name"],
                description=None,
                source_path=lib.get("source_path")
            )
        except Exception as e:
            # Record the failure and carry on with the next library.
            result = {"success": False, "library_id": lib_id, "error": str(e)}
            if verbose:
                print(f" [Library] Failed: {lib_id} - {e}")

        if verbose and result.get('success'):
            print(f" [Library] Done: {result.get('library_id')} - {result.get('chunks_created', 0)} chunks")

        results.append(result)

    # Calculate totals
    total_chunks = sum(r.get('chunks_created', 0) for r in results)
    successful = sum(1 for r in results if r.get('success'))

    summary = {
        "total_libraries": len(libraries),
        "successful": successful,
        "failed": len(results) - successful,
        "total_chunks": total_chunks
    }

    if verbose:
        print("\n" + "=" * 60)
        print("INGESTION COMPLETE")
        print("=" * 60)
        print(f" Libraries processed: {summary['total_libraries']}")
        print(f" Successful: {summary['successful']}")
        print(f" Failed: {summary['failed']}")
        print(f" Total chunks created: {summary['total_chunks']}")

    return summary
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
    """
    Enforce the X-API-Key header when ``settings.is_auth_enabled`` is true.

    With auth disabled every request passes through. With auth enabled, GET
    requests whose path starts with "/health", "/libraries" or "/docs/" pass
    through without a key; every other request must send an ``X-API-Key``
    header equal to ``settings.api_key_docs_api`` or it is answered with 401.
    """
    # Imported locally so this block does not depend on the file's import header.
    import hmac

    if not settings.is_auth_enabled:
        return await call_next(request)

    public_prefixes = ("/health", "/libraries", "/docs/")
    if request.method == "GET" and request.url.path.startswith(public_prefixes):
        return await call_next(request)

    provided = request.headers.get("X-API-Key") or ""
    expected = settings.api_key_docs_api or ""

    # Compare as bytes in constant time so response timing does not reveal how
    # much of the key matched. An empty configured key never authenticates.
    authorized = bool(expected) and hmac.compare_digest(
        provided.encode("utf-8"), expected.encode("utf-8")
    )
    if not authorized:
        return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})

    return await call_next(request)
def safe_upload_filename(filename: str) -> str:
    """Return a sanitised file name for an upload, or reject it with HTTP 400.

    The extension (lower-cased) must be a member of ``ALLOWED_EXTENSIONS``.
    The stem keeps only alphanumeric characters plus ``-``, ``_`` and space,
    and is stripped of surrounding whitespace.

    Raises:
        HTTPException: 400 when the extension is not allowed, or when nothing
            is left of the stem after filtering.
    """
    parsed = Path(filename)
    suffix = parsed.suffix.lower()

    if suffix not in ALLOWED_EXTENSIONS:
        allowed = ", ".join(sorted(ALLOWED_EXTENSIONS))
        raise HTTPException(
            status_code=400,
            detail=f"Unsafe extension: {suffix}. Allowed extensions: {allowed}",
        )

    kept = [ch for ch in parsed.stem if ch.isalnum() or ch in "-_ "]
    cleaned = "".join(kept).strip()
    if cleaned:
        return cleaned + suffix

    raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")
@app.post("/api/v1/libraries/{library_id}")
async def api_create_library(
    library_id: str,
    name: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
):
    """Create (or update) a library: make its docs directory and upsert its record.

    The library ID is validated with ``safe_library_id`` first. When ``name`` is
    omitted, the library ID doubles as the display name.

    Raises:
        HTTPException: 500 when ``upsert_library`` does not report success.
    """
    library_id = safe_library_id(library_id)
    display_name = name or library_id

    # The directory is created before the DB row, as in the original flow.
    target_dir = docs_root() / library_id
    target_dir.mkdir(parents=True, exist_ok=True)

    outcome = upsert_library(library_id, display_name, description, library_id)
    if outcome.get("success"):
        return {
            "success": True,
            "created": not outcome.get("exists", False),
            "library_id": library_id,
            "name": display_name,
            "description": description,
            "path": str(target_dir),
        }

    raise HTTPException(status_code=500, detail=outcome.get("error", "Failed to create library"))
@app.get("/api/v1/sources")
@app.get("/sources/config")
async def api_list_sources():
    """List the git sources declared in ``docs_sources.yaml``.

    Two file layouts are accepted: a mapping with a top-level ``sources`` list,
    or a bare top-level list of sources. A missing file, an empty file, or any
    other shape yields an empty list.

    Returns:
        ``{"success": True, "sources": [...], "count": N}``
    """
    path = sources_config_path()
    if not path.exists():
        return {"success": True, "sources": [], "count": 0}

    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # Branch on the root type before touching it: the previous code called
    # data.get(...) first, which raised AttributeError for a list-rooted file
    # even though a list root was clearly meant to be supported.
    sources = data.get("sources") if isinstance(data, dict) else data
    if not isinstance(sources, list):
        sources = []

    return {"success": True, "sources": sources, "count": len(sources)}
class DocumentChunk:
    """Represents a chunk of text to be embedded.

    Attributes:
        text: The chunk's text content.
        metadata: Arbitrary metadata for the chunk; defaults to an empty dict.
    """

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.text = text
        self.metadata = metadata or {}

    @property
    def doc_id(self) -> str:
        """Return a stable ID derived from the chunk text.

        The ID is ``"doc-"`` followed by the first 16 hex digits of the SHA-256
        of the UTF-8 encoded text. The built-in ``hash()`` used previously is
        salted per interpreter process for ``str``, so the same text produced a
        different ID on every run.
        """
        # Function-scope import so the file needs no new top-level import.
        import hashlib

        digest = hashlib.sha256(self.text.encode("utf-8")).hexdigest()
        return f"doc-{digest[:16]}"
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return

    Returns:
        List of dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0,
            "content": "..."
        }
        A blank query, or any error (which is printed), yields an empty list.
    """
    # Nothing to embed for a blank query.
    if not query or not query.strip():
        return []

    try:
        query_embedding = embed_text(query)
        client = get_client()

        # Restrict to one library when requested. If qdrant models cannot be
        # imported the search fails (-> []) instead of silently running
        # unfiltered across every library.
        query_filter = None
        if library_id:
            from qdrant_client.models import FieldCondition, Filter, MatchValue
            query_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # qdrant-client names this keyword `query_filter`; the previous
        # `search_filter=` was rejected, so every search fell into the except
        # branch and returned [].
        hits = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            query_filter=query_filter,
            limit=limit,
        )

        formatted_results = []
        for hit in hits:
            payload = hit.payload
            if hit.score > 0 and payload:
                formatted_results.append({
                    # Fall back to the point ID if the payload lacks "id".
                    "id": payload.get("id", str(hit.id)),
                    "score": float(hit.score),
                    "library_id": payload.get("library_id", ""),
                    "path": payload.get("path", ""),
                    "title": payload.get("title", ""),
                    "chunk_index": payload.get("chunk_index", 0),
                    # Needed by get_library_docs(topic=...) and by the web UI,
                    # which reads r.get("content").
                    "content": payload.get("content", ""),
                })

        return formatted_results

    except Exception as e:
        print(f"Search error: {e}")
        return []


def _select_within_budget(chunks, token_limit):
    """Take chunks in order until the ~4-chars-per-token budget is used up.

    The first chunk that does not fit is truncated to the remaining budget and
    ends the selection. Returns ``(selected_chunks, estimated_tokens_used)``.
    """
    selected = []
    used = 0
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        cost = len(content) // 4  # Simple token estimate
        if used + cost <= token_limit:
            selected.append(chunk)
            used += cost
            continue

        remaining = token_limit - used
        if remaining > 0:
            selected.append({"content": content[:remaining * 4], "title": chunk.get("title", "")})
            used = token_limit
        # Stop here: without this, every later oversized chunk was truncated
        # and appended too, overrunning the limit.
        break
    return selected, used


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, the most relevant chunks
            (by semantic search) are used, in relevance order; otherwise all
            stored chunks are used in (path, chunk_index) order
        token_limit: Maximum estimated tokens (len(text) // 4) to include

    Returns:
        Combined markdown content as string. On failure a human-readable
        message is returned instead of raising.
    """
    try:
        if topic:
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            candidates = search_docs(query=topic, library_id=library_id, limit=20)

            if not candidates:
                return f"No documents found in library '{library_id}' matching topic: {topic}"

            print(f" [Search] Found {len(candidates)} relevant chunks")
        else:
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)

            if not chunks_data:
                return f"No documents found in library '{library_id}'"

            # Reading order; the previous descending sort emitted each
            # document back to front.
            # NOTE(review): assumes chunk dicts may carry "path"; missing keys
            # fall back to "" / 0.
            candidates = sorted(
                chunks_data,
                key=lambda c: (c.get("path", "") or "", c.get("chunk_index", 0) or 0),
            )

        # Previously the topic branch never assigned selected_chunks, so every
        # topic request ended in a NameError reported as an error string.
        selected_chunks, total_tokens = _select_within_budget(candidates, token_limit)
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown, emitting a heading whenever the title changes.
        md_parts = []
        last_title = None
        for chunk in selected_chunks:
            content = chunk.get("content", "") or ""
            if not content.strip():
                continue

            title = chunk.get("title")
            if title and title != last_title:
                md_parts.append(f"# {title}")
                last_title = title

            md_parts.append(content)

        result = "\n\n".join(md_parts)

        # If no heading appears in the first lines, prepend the library title.
        if not any(part.startswith("#") for part in result.split("\n")[:3]):
            result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result

        return result.rstrip()

    except Exception as e:
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"


def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    Args:
        library_name: Partial or full library name to search for
            (case-insensitive substring match against name and id)

    Returns:
        At most 5 Context7-style candidate dicts:
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
        An empty list is returned when nothing matches or on any error.
    """
    max_candidates = 5
    try:
        libraries = list_libraries()

        # list_libraries() may return an error dict (the API layer checks for
        # this); treat that like "no libraries" rather than iterating its keys.
        if not libraries or isinstance(libraries, dict):
            return []

        needle = library_name.lower()
        candidates = []
        for lib in libraries:
            # Tolerate rows whose name/description are missing or None.
            lib_id = str(lib.get("id") or "")
            lib_name = str(lib.get("name") or "")
            if not lib_id:
                continue

            if needle in lib_name.lower() or needle in lib_id.lower():
                candidates.append({
                    "id": f"/local/{lib_id}",
                    "name": lib_name or lib_id,
                    "description": lib.get("description") or "",
                    "source": "local"
                })
                if len(candidates) == max_candidates:
                    break

        print(f" [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")

        return candidates

    except Exception as e:
        print(f"Error resolving library ID: {e}")
        return []
def get_client() -> Any:
    """Return the shared Qdrant client, building it on first use.

    Connection settings come from ``QDRANT_URL`` when that environment variable
    is set (after an optional ``.env`` load via python-dotenv); otherwise from
    ``settings.vector_store_host`` / ``settings.vector_store_port``.

    Raises:
        RuntimeError: when the ``qdrant-client`` package could not be imported.
    """
    global _client

    # Fast path: already constructed.
    if _client is not None:
        return _client

    if QdrantClient is None:
        raise RuntimeError("qdrant-client is not installed")

    # python-dotenv is optional; skip quietly when it is absent.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    import os
    url_from_env = os.getenv("QDRANT_URL")

    if url_from_env:
        _client = QdrantClient(url=url_from_env)
        return _client

    from .config import settings
    store_host = settings.vector_store_host
    store_port = settings.vector_store_port
    _client = QdrantClient(host=store_host, port=store_port)
    return _client
async def ensure_collection(vector_size: Optional[int] = None) -> Dict[str, Any]:
    """
    Ensure the Qdrant collection exists with proper schema.

    If the collection is missing it is created with cosine distance. If it
    exists with a different vector size it is deleted and recreated (this
    discards the stored vectors).

    Args:
        vector_size: Override embedding dimension (uses get_embedding_size() if not provided)

    Returns:
        Dict with operation result: ``success``, ``collection``, ``vector_size``,
        ``created`` and, after a recreate, ``resized``; or ``success: False``
        with ``error`` on failure.
    """
    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}

        client = get_client()
        size = vector_size or get_embedding_size()
        distance = Distance.COSINE

        def _create() -> None:
            # The keyword is `vectors_config`; the previous `vectors=` and
            # `wait=True` are not create_collection parameters, so creation
            # always failed.
            client.create_collection(
                collection_name=_collection_name,
                vectors_config=VectorParams(size=size, distance=distance),
            )

        # A failure to list collections now surfaces as an error (the caller
        # retries) instead of being mistaken for "collection does not exist".
        existing_names = {c.name for c in client.get_collections().collections}

        if _collection_name not in existing_names:
            _create()
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": True
            }

        # Reading the current size stays best-effort: if it cannot be
        # determined (e.g. a named-vector config without `.size`), keep the
        # collection untouched.
        try:
            vectors_cfg = client.get_collection(_collection_name).config.params.vectors
            current_size = getattr(vectors_cfg, "size", None)
        except Exception:
            current_size = None

        if current_size is not None and current_size != size:
            # Errors here propagate to the outer handler; previously a failed
            # recreate after a successful delete was swallowed and reported as
            # success.
            client.delete_collection(_collection_name)
            _create()
            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": False,
                "resized": True
            }

        return {
            "success": True,
            "collection": _collection_name,
            "vector_size": size,
            "created": False
        }

    except Exception as e:
        return {"success": False, "error": str(e)}
async def delete_library_vectors(library_id: str) -> Dict[str, Any]:
    """
    Delete all vectors for a given library.

    Args:
        library_id: The library ID to delete vectors for

    Returns:
        Dict with operation result: ``{"success": True, "library_id": ...}``
        (plus ``skipped`` when qdrant-client is not installed), or
        ``{"success": False, "error": ...}`` when the delete fails.
    """
    try:
        if QdrantClient is None:
            return {"success": True, "library_id": library_id, "skipped": "qdrant-client is not installed"}

        client = get_client()

        # Match only points whose payload belongs to this library.
        filter_condition = Filter(
            must=[
                FieldCondition(
                    key="library_id",
                    match=MatchValue(value=library_id),
                )
            ]
        )

        # Delete by filter in one call. The previous scroll/delete loop reused
        # the last deleted point ID as the scroll offset and swallowed every
        # exception, so a failed delete was still reported as success.
        client.delete(
            collection_name=_collection_name,
            points_selector=filter_condition,
        )

        return {
            "success": True,
            "library_id": library_id
        }

    except Exception as e:
        return {"success": False, "error": str(e)}
Testing delete_library_vectors()...") + result = asyncio.run(delete_library_vectors("test-library")) + print(f" Result: {result}\n") + + print("βœ… All tests completed!") diff --git a/backend/app/webui/__init__.py b/backend/app/webui/__init__.py new file mode 100644 index 0000000..3fe3f0f --- /dev/null +++ b/backend/app/webui/__init__.py @@ -0,0 +1 @@ +"""WebUI module for Context7 Docs.""" \ No newline at end of file diff --git a/backend/app/webui/static/css/main.css b/backend/app/webui/static/css/main.css new file mode 100644 index 0000000..fb4d50d --- /dev/null +++ b/backend/app/webui/static/css/main.css @@ -0,0 +1,166 @@ +.container { + max-width: 1000px; + margin: 0 auto; + padding: 20px; +} + +header { + border-bottom: 1px solid #ccc; + padding-bottom: 15px; + margin-bottom: 20px; +} + +header h1 { + margin: 0 0 10px 0; + font-size: 1.5rem; +} + +nav { + display: flex; + gap: 15px; +} + +nav a { + text-decoration: none; + color: #0066cc; + font-size: 0.9rem; +} + +nav a.active { + font-weight: bold; + text-decoration: underline; +} + +main h2 { + margin-bottom: 15px; +} + +footer { + margin-top: 40px; + padding-top: 15px; + border-top: 1px solid #ccc; + font-size: 0.8rem; + color: #666; +} + +/* Status cards */ +.status-card { + background: #f5f5f5; + padding: 20px; + border-radius: 8px; + border-left: 4px solid #00c467; +} + +.status-message { + background: #e8f4fd; + padding: 10px; + border-radius: 4px; + margin: 5px 0; +} + +/* Tables */ +.library-table { + width: 100%; + border-collapse: collapse; + margin-top: 10px; +} + +.library-table th, .library-table td { + padding: 10px; + text-align: left; + border-bottom: 1px solid #ddd; +} + +.library-table th { + background: #f5f5f5; + font-weight: bold; +} + +/* Forms */ +form input[type="text"], form textarea, form select { + padding: 8px; + border: 1px solid #ccc; + border-radius: 4px; + margin-right: 10px; + margin-bottom: 10px; +} + +button { + background: #0066cc; + color: white; + border: none; + padding: 
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    params: Optional[dict] = None,
    timeout: float = 300.0,
) -> dict:
    """Make internal API request to docs-api.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        endpoint: Path appended to ``DOCS_API_URL`` (should start with ``/``).
        data: Optional JSON body.
        params: Optional query-string parameters. Callers such as the docs view
            already pass ``params=``, which previously raised ``TypeError``.
        timeout: Seconds to wait before giving up, so an unresponsive backend
            cannot hang the UI forever. The default is generous because
            ingestion calls can be slow.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: on connection problems or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = f"{DOCS_API_URL}{endpoint}"

    headers = {}
    api_key = os.environ.get("WEBUI_API_KEY")
    if api_key:
        headers["X-API-Key"] = api_key

    resp = requests.request(
        method,
        url,
        headers=headers,
        json=data,
        params=params,
        timeout=timeout,
    )
    return resp.json()
"""Generate navigation bar HTML.""" + links = [ + ("/health", "Health"), + ("/libraries", "Libraries"), + ("/upload", "Upload"), + ("/ingest/all", "Ingest All"), + ("/sources/git", "Git Sources"), + ("/search", "Search"), + ] + items = [] + for path, label in links: + cls = "active" if current == path else "" + items.append(f'{label}') + return f"""""".strip() + + +def footer_html() -> str: + """Generate footer HTML.""" + return "" + + +def health(request: Request) -> HTML: + """System health dashboard.""" + try: + data = api_request("GET", "/health") + status = data.get("status", "unknown") + service = data.get("service", "Service") + except Exception as e: + status = "error" + service = str(e) + + return HTML(f""" + + + + + Context7 Docs - Health + + + +
+

Context7 Docs UI

{navbar_html("/health")}
+

System Health

+

{service}

+

Status: {status}

+
{footer_html()}
+""", media_type="text/html") + + +def libraries(request: Request) -> HTML: + """List all libraries.""" + try: + data = api_request("GET", "/libraries") + libs = data.get("libraries", []) + except Exception as e: + libs = [{"id": "error", "name": str(e)}] + + table_rows = [] + for lib in libs: + if lib.get("id") != "error": + table_rows.append( + f"""{lib.get('id')} + {lib.get('name', '')} + {lib.get('description', '') or '(no description)'} + View Docs""" + ) + + return HTML(f""" + + + + + Context7 Docs - Libraries + + + +
+

Context7 Docs UI

{navbar_html("/libraries")}
+
+

Libraries ({len(libs)})

+
+
+ + +
+
+ + + {"".join(table_rows)} +
IDNameDescriptionActions
+
{footer_html()}
+""", media_type="text/html") + + +def upload(request: Request) -> HTML: + """File upload form.""" + if "file" in request.files: + uploaded_file = request.files["file"] + try: + content = uploaded_file.read().decode("utf-8")[:5000] + # Escape HTML + safe_content = content.replace("&", "&").replace("<", "<").replace(">", ">") + truncated = safe_content[:1000] + "..." if len(safe_content) > 1000 else safe_content + + return HTML(f""" + + + + + Context7 Docs - Upload + + + +
+

Context7 Docs UI

{navbar_html("/upload")}
+
+

Upload Complete!

+
{truncated}
+
+ + + + +
+
{footer_html()}
+""", media_type="text/html") + except Exception: + return HTML(f""" + + + + + Context7 Docs - Upload + + + +
+

Context7 Docs UI

{navbar_html("/upload")}
+
+

File too large!

+

Please upload smaller text files (limit: ~5MB).

+
{footer_html()}
+""", media_type="text/html") + else: + return HTML(f""" + + + + + Context7 Docs - Upload + + + +
+

Context7 Docs UI

{navbar_html("/upload")}
+
+

Upload Documentation Files

+
+ + + +
+

Supported formats: .txt, .md, .json, .py, .js, .html, .css, .yaml

+
{footer_html()}
def ingest_all(request: Request) -> JSONResponse:
    """Trigger ingestion for all libraries.

    Calls the backend's ``POST /ingest/all`` route and reports the number of
    chunks it created.

    Returns:
        ``{"status": "ok", "message": ...}`` on success, or a 500 response with
        ``{"error": ...}`` when the backend call fails.
    """
    try:
        # The backend registers this route as /ingest/all; the previous /ingest
        # path is not a route, so the UI always reported "Processed 0 chunks".
        result = api_request("POST", "/ingest/all")
        # The ingestion summary reports its count under "total_chunks".
        chunks = result.get("total_chunks", 0)
        return JSONResponse(content={"status": "ok", "message": f"Processed {chunks} chunks"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
+

Context7 Docs UI

{navbar_html("/ingest/{library_id}")}
+
+

Ingest for Library: {library_id}

+
+ + + +
+
{footer_html()}
+""", media_type="text/html") + else: + try: + result = api_request("POST", f"/ingest/{library_id}") + safe_msg = result.get('message', '') or '' + safe_json = json.dumps(result, indent=2).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + + + + + Context7 Docs - Ingest Result + + + +
+

Context7 Docs UI

{navbar_html("/ingest/{library_id}")}
+
+

Ingestion Complete!

+

{safe_msg}

+
{safe_json}
+ ← Back to Libraries +
{footer_html()}
+""", media_type="text/html") + except Exception as e: + safe_error = str(e).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + + + + + Context7 Docs - Error + + + +
+

Context7 Docs UI

{navbar_html("/ingest/{library_id}")}
+
+

Error

+
{safe_error}
+
{footer_html()}
async def folders_create(request: Request) -> JSONResponse:
    """Create a new library folder.

    Reads ``name`` from the submitted form and registers a library with that
    ID/name and source path ``/docs/<name>``.

    Returns:
        200 ``{"status": "ok", ...}`` on success, 400 for a missing or unsafe
        name, 500 with ``{"error": ...}`` when the database call fails.
    """
    try:
        # Request.form is an async method; it must be awaited to get the form
        # data (the previous `request.form.get` raised AttributeError).
        form = await request.form()
        name = str(form.get("name") or "").strip()

        # The name becomes a path segment under /docs, so reject empty values
        # and anything that could escape that directory.
        if not name or name in {".", ".."} or ".." in name or "/" in name or "\\" in name:
            return JSONResponse(status_code=400, content={"error": "Invalid folder name"})

        from backend.app.db import upsert_library
        # upsert_library is a plain function returning a result dict (the API
        # layer calls it without await); awaiting it raised TypeError.
        result = upsert_library(library_id=name, name=name, description=None, source_path=f"/docs/{name}")
        if isinstance(result, dict) and not result.get("success", True):
            return JSONResponse(
                status_code=500,
                content={"error": result.get("error") or "Failed to create folder"},
            )
        return JSONResponse(content={"status": "ok", "message": f"Created folder '{name}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})


async def folders_delete(request: Request) -> JSONResponse:
    """Delete a library.

    Reads the library ID from the ``id`` query parameter.

    Returns:
        200 ``{"status": "ok", ...}`` on success, 400 when ``id`` is missing,
        500 with ``{"error": ...}`` when the database call fails.
    """
    library_id = request.query_params.get("id", "").strip()
    if not library_id:
        return JSONResponse(status_code=400, content={"error": "Missing library id"})

    try:
        from backend.app.db import delete_library
        # delete_library is synchronous and returns a result dict; do not await it.
        result = delete_library(library_id)
        if isinstance(result, dict) and not result.get("success", True):
            return JSONResponse(
                status_code=500,
                content={"error": result.get("error") or "Failed to delete library"},
            )
        return JSONResponse(content={"status": "ok", "message": f"Deleted library '{library_id}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
+

Context7 Docs UI

{navbar_html("/upload")}
+
+

Ingestion Complete!

+

{safe_msg}

+
{safe_json}
+ ← Upload Another +
{footer_html()}
+""", media_type="text/html") + except Exception as e: + safe_error = str(e).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + +Error +

Upload Ingest Error

{safe_error}
← Try Again +""", media_type="text/html") + + +def docs(request: Request, library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> HTML: + """View docs from a library.""" + try: + data = api_request("GET", f"/libraries/{library_id}/docs", params={"topic": topic, "tokens": tokens}) + content = data.get("content", "") + except Exception as e: + content = str(e) + + safe_content = content.replace("&", "&").replace("<", "<").replace(">", ">")[:10000] + return HTML(f""" + + + + + Context7 Docs - Library: {library_id} + + + +
+

Context7 Docs UI

{navbar_html("/docs/{}".format(library_id))}
+
+

Library: {library_id.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")}

+

Topic: {topic or '(all)'} | Tokens: {tokens}

+
{safe_content}
+
{footer_html()}
+""", media_type="text/html") + + +def search_redirect(request: Request) -> JSONResponse: + """Redirect to search form.""" + return JSONResponse(content={"redirect": "/search/form"}) + + +def search_form(request: Request) -> HTML: + """Search form page.""" + return HTML(f""" + + + + + Context7 Docs - Search + + + +
+

Context7 Docs UI

{navbar_html("/search")}
+
+

Search Docs

+
+ + + + + + + +
+
{footer_html()}
+""", media_type="text/html") + + +def search_results(request: Request) -> HTML: + """Display search results.""" + try: + query = request.query_params.get("q", "") + limit = int(request.query_params.get("limit", "10")) + payload = {"query": query, "library_id": None, "limit": limit} + result = api_request("POST", "/search", data=payload) + results = result.get("results", []) + except Exception as e: + return HTML(f""" + +Error +

Error

{str(e).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")}
← Try Again +""", media_type="text/html") + + cards = [] + for r in results: + title = r.get("title", "Untitled") or (r.get("content", "")[:100] + "...")[:200] + content = (r.get("content", "") or r.get("chunk", ""))[:500] + cards.append(f"""

{str(title).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")}

+

{content}...

View Full
""") + + return HTML(f""" + + + + + Context7 Docs - Search Results + + + +
+

Context7 Docs UI

{navbar_html("/search")}
+
+

Search Results for "{query}"

+
{len(results)} results found
+ {''.join(cards)} + ← New Search +
{footer_html()}
+""", media_type="text/html") + + +def sync_sources(request: Request) -> HTML: + """Sync git sources.""" + if request.method == "POST": + try: + data = api_request("POST", "/sources/sync") + safe_json = json.dumps(data, indent=2).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + +Sync Result + +
+

Context7 Docs UI

{navbar_html("/sync/sources")}
+

Git Sync Complete!

{safe_json}
+
+
{footer_html()}
+""", media_type="text/html") + except Exception as e: + safe_error = str(e).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + +Error +

Sync Error

{safe_error}
← Try Again +""", media_type="text/html") + else: + try: + data = api_request("GET", "/libraries") + libs = [l.get("id") for l in data.get("libraries", []) if l.get("id") != "error"] + except Exception: + libs = [] + + lib_list = ", ".join(libs) if libs else "(none)" + return HTML(f""" + + + + + Context7 Docs - Git Sync + + + +
+

Context7 Docs UI

{navbar_html("/sources/git")}
+
+

Sync Git Repositories

+

Syncs all git repositories configured in docs_sources.yaml.

+
+ + + +
+

Libraries Found: {lib_list}

+
{footer_html()}
+""", media_type="text/html") + + +def git_sources(request: Request) -> HTML: + """List configured git sources.""" + import yaml + config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml" + + try: + with open(config_path) as f: + data = yaml.safe_load(f) + sources = data.get("sources", []) + + source_blocks = [] + for src in sources: + url = src.get("repo_url", "")[:50] + "..." if len(src.get("repo_url", "")) > 50 else src.get("repo_url", "") + branch = src.get("branch", "main") + include = src.get("include_paths", ["*"]) + exclude = src.get("exclude_paths", []) + source_blocks.append(f"""
+ {src.get('library_id', 'unknown')}
+ URL: {url}
+ Branch: {branch}
+ Include: {', '.join(include)}{' | Exclude: ' + ', '.join(exclude) if exclude else ''} +
""") + + return HTML(f""" + + + + + Context7 Docs - Git Sources + + + +
+

Context7 Docs UI

{navbar_html("/sources/git")}
+
+

Configured Git Sources ({len(sources)})

+ {''.join(source_blocks)} +
{footer_html()}
+""", media_type="text/html") + except Exception as e: + safe_error = str(e).replace("&", "&").replace("<", "<").replace(">", ">") + return HTML(f""" + +Error +

Git Sources Error

{safe_error}
+""", media_type="text/html") + + +def logs(request: Request) -> HTML: + """Logs/status page.""" + return HTML(f""" + + + + + Context7 Docs - Logs + + + +
+

Context7 Docs UI

{navbar_html("/logs")}
+
+

Status Messages

+
Docs API: {DOCS_API_URL}
+
Qdrant Health: healthy | MCP OK: yes
+

Logs are printed to container stdout/stderr. For full logs, inspect Docker containers directly.

+
{footer_html()}
+""", media_type="text/html") + + +# Register all routes +__all__ = [ + "health", "libraries", "upload", "ingest_all", "ingest_library", + "folders_create", "folders_delete", "docs", "search_redirect", + "search_form", "search_results", "sync_sources", "git_sources", "logs" +] \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..3935263 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,37 @@ +# Backend API Dependencies +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +pydantic==2.5.3 +python-dotenv==1.0.0 +python-multipart==0.0.6 + +# Qdrant Vector Store Client +qdrant-client==1.7.0 + +# Text Processing for token estimation +tiktoken==0.7.0 + +# Local Embeddings using FastEmbed +fastembed==0.3.0 + +# PDF support for document ingestion +pypdf==5.0.0 + +# HTTP client for MCP server communication +httpx==0.26.0 + +# HTTP client for WebUI (used to call docs-api from WebUI) +requests==2.31.0 + +# FastMCP for MCP server integration (also used by backend) +fastmcp==0.6.0 + +# YAML parser for sources configuration +PyYAML==6.0.1 + +# ============================================================================= +# TEST DEPENDENCIES +# ============================================================================= +pytest==8.3.2 +pytest-mock==3.14.0 +pytest-asyncio==0.23.7 diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..4052d2f --- /dev/null +++ b/data/.gitkeep @@ -0,0 +1,2 @@ +# This directory is intentionally left empty to preserve the folder structure for Docker volumes. +# Data from Qdrant will be mounted here via docker-compose.yml. 
\ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1bd1689 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,99 @@ +# Context7-style MCP System - Docker Compose (Production/Home Server Hardened) +services: + # Qdrant Vector Database Service + qdrant: + image: qdrant/qdrant:latest + container_name: qdrant + ports: + - "${QDRANT_PORT:-6333}:6333" + volumes: + - ./data/qdrant:/qdrant/storage + environment: + - QDRANT__MEMORY_MAPPED_INDEXES=1 + restart: unless-stopped + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + networks: + - context7-network + + # Docs API Backend Service (FastAPI); the healthcheck below runs inside the container, so it targets the fixed container port 8787, not HOST_PORT + docs-api: + build: + context: ./backend + dockerfile: Dockerfile + container_name: docs-api + ports: + - "${HOST_PORT:-8787}:8787" + environment: + - VECTOR_STORE_HOST=qdrant + - VECTOR_STORE_PORT=6333 + - DOCS_PATH=/docs + - DB_PATH=/data/db.sqlite + - LOG_LEVEL=INFO + - API_KEY_DOCS_API=${DOCS_API_KEY:-} + volumes: + - ./docs:/docs + - ./data:/data + depends_on: + - qdrant + networks: + - context7-network + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8787/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + + # MCP Server Service (FastMCP via streamable HTTP) + docs-mcp: + build: + context: ./mcp-server + dockerfile: Dockerfile + container_name: docs-mcp + ports: + - "${MCP_HOST_PORT:-8788}:8788" + environment: + - DOCS_API_URL=http://docs-api:8787 + - MCP_API_KEY=${MCP_API_KEY:-} + volumes: + - ./docs:/docs:ro + - ./data:/data + restart: unless-stopped + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + depends_on: + docs-api: + condition: service_healthy + networks: + - context7-network + + # WebUI Service (HTML interface) + webui: + build: + context: ./webui + dockerfile: Dockerfile + container_name: webui + ports: + - "${WEBUI_PORT:-8790}:8790" + environment: + - DOCS_API_URL=http://docs-api:8787 + - 
WEBUI_API_KEY=${DOCS_WEBUI_API_KEY:-} + volumes: + - ./docs:/docs + - ./data:/data + depends_on: + docs-api: + condition: service_healthy + networks: + - context7-network + +networks: + context7-network: + driver: bridge diff --git a/docs/example/getting-started.md b/docs/example/getting-started.md new file mode 100644 index 0000000..0827c03 --- /dev/null +++ b/docs/example/getting-started.md @@ -0,0 +1,143 @@ +# Getting Started + +Welcome to the Context7-style MCP System documentation! + +## Overview + +This system provides a self-hosted, local context7-compatible MCP (Model Context Protocol) solution using Docker containers. It enables you to: + +- Ingest and index your own documents +- Perform semantic search on vector embeddings +- Integrate with MCP-enabled IDEs for intelligent tool interactions + +## Architecture + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Client │────▶│ docs-api │◀────│ docs-mcp │ +│ (IDE/Tool) │ │ (FastAPI) │ │ (MCP Server)│ +└─────────────┘ └─────────────┘ └─────────────┘ + │ + ▼ + ┌─────────────┐ + │ Qdrant │ + │ (Vector DB) │ + └─────────────┘ +``` + +## Quick Start + +### 1. Start All Services + +```bash +docker compose up -d --build +``` + +### 2. Verify Services Are Running + +```bash +docker compose ps +``` + +You should see every service in "Up" status, including: +- `qdrant` (port 6333) +- `docs-api` (port 8787) +- `docs-mcp` (port 8788) + +### 3. Access the API + +Open your browser and navigate to: +``` +http://localhost:8787/docs +``` + +You should see the FastAPI documentation page. 
+ +## API Endpoints + +### Health Check + +```bash +curl http://localhost:8787/health +``` + +Expected response: +```json +{"status":"ok"} +``` + +### Ingest Document + +Upload a text document to be processed and indexed: + +```bash +curl -X POST "http://localhost:8787/api/v1/ingest" \ + -H "Content-Type: application/json" \ + -d '{ + "content": "This is sample document content for semantic search testing.", + "metadata": {"source": "example", "type": "text"} + }' +``` + +### Search Documents + +Perform a similarity search on ingested documents: + +```bash +curl "http://localhost:8787/api/v1/search" \ + -H "Content-Type: application/json" \ + -d '{ + "query": "semantic search", + "top_k": 5, + "threshold": 0.7 + }' +``` + +## Configuration + +### Environment Variables + +Copy the example environment file and customize: + +```bash +cp .env.example .env +``` + +Key variables: +- `VECTOR_STORE_HOST`: Internal hostname of Qdrant (default: qdrant) +- `VECTOR_STORE_PORT`: Qdrant port (default: 6333) + +### Docker Compose + +All services are defined in `docker-compose.yml`. Key networking details: +- Services communicate internally via `context7-network` +- Qdrant uses service name `qdrant` for internal connections +- Vector store is exposed externally on port 6333 for debugging + +## Next Steps + +1. Review the project structure to understand component roles +2. Customize the backend API endpoints in `backend/app/main.py` +3. Implement MCP tools in `mcp-server/server.py` +4. Add more example documents in the `docs/` directory + +## Troubleshooting + +### Check Logs + +```bash +docker compose logs -f docs-api +docker compose logs -f qdrant +docker compose logs -f docs-mcp +``` + +### Reset All Services + +```bash +docker compose down -v +docker compose up -d --build +``` + +## Support + +For issues, refer to the `README.md` or check the Qdrant documentation. 
\ No newline at end of file diff --git a/docs_sources.yaml b/docs_sources.yaml new file mode 100644 index 0000000..b7c9163 --- /dev/null +++ b/docs_sources.yaml @@ -0,0 +1,27 @@ +# Git Repository Sources Configuration +# Each source defines a library to ingest from a git repository +# Paths are relative to the cloned repo root + +sources: + - library_id: foundryvtt + name: Foundry VTT + description: Foundry Virtual Tabletop system documentation + repo_url: https://github.com/foundryvtt/foundryvtt.git + branch: main + include_paths: + - docs + - src + exclude_paths: + - node_modules + - .git + + # Add more sources here following the same structure: + # - library_id: my-repo + # name: My Repository + # description: My documentation + # repo_url: https://github.com/user/my-repo.git + # branch: main + # include_paths: + # - docs + # exclude_paths: + # - node_modules \ No newline at end of file diff --git a/mcp-server/Dockerfile b/mcp-server/Dockerfile new file mode 100644 index 0000000..bbcb94e --- /dev/null +++ b/mcp-server/Dockerfile @@ -0,0 +1,30 @@ +# MCP Server Service +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies cleanly +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy server code +COPY server.py . 
+ +# Mount volumes at these paths (configured in docker-compose) +# ./docs -> /docs +# ./data -> /data +# /data holds: db.sqlite, sqlite file for SQLite storage + +# Expose MCP port +EXPOSE 8788 + +# Healthcheck +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import socket; s=socket.create_connection(('127.0.0.1', 8788), 5); s.close()" + +# Run the MCP server using streamable HTTP transport +CMD ["python", "server.py"] diff --git a/mcp-server/requirements.txt b/mcp-server/requirements.txt new file mode 100644 index 0000000..707d165 --- /dev/null +++ b/mcp-server/requirements.txt @@ -0,0 +1,21 @@ +# MCP Server Dependencies +fastmcp==0.6.0 +httpx==0.26.0 + +# For Qdrant vector store operations +qdrant-client==1.7.0 + +# Text processing for token estimation +tiktoken==0.7.0 + +# Local embeddings using FastEmbed +fastembed==0.3.0 + +# PDF support for document ingestion +pypdf==5.0.0 + +# Environment variables loader +python-dotenv==1.0.0 + +# YAML parser for sources configuration +PyYAML==6.0.1 \ No newline at end of file diff --git a/mcp-server/server.py b/mcp-server/server.py new file mode 100644 index 0000000..cfbf5a7 --- /dev/null +++ b/mcp-server/server.py @@ -0,0 +1,337 @@ +# MCP Server for local-context7 Docs API with Git Sources Support +""" +MCP server providing Context7-style tools for interacting with the local docs API. 
+ +This server exposes 6 tools: +- resolve-library-id: Find libraries matching a name (with /local/ prefix) +- get-library-docs: Retrieve documentation from a library +- list-libraries: List all discovered libraries +- search-docs: Semantic search across documents +- refresh-library: Re-ingest documents for a library or all libraries +- sync-sources: Sync git repositories from configuration file +""" + +import asyncio +import os +from typing import Optional, List, Dict, Any + +try: + import httpx +except ImportError: + httpx = None + +try: + from fastmcp import FastMCP +except ImportError: + class _Tool: + def __init__(self, name: str): + self.name = name + + class FastMCP: + """Import-time fallback used by tests when fastmcp is not installed.""" + + def __init__(self, *args, **kwargs): + self.tools = [] + + def tool(self): + def decorator(func): + self.tools.append(_Tool(func.__name__)) + return func + return decorator + + def run(self, *args, **kwargs): + raise RuntimeError("fastmcp is not installed") + + +# Environment configuration +DOCS_API_URL = os.getenv("DOCS_API_URL", "http://docs-api:${HOST_PORT:-8787}") +MCP_API_KEY = os.getenv("MCP_API_KEY", "") + + +def strip_local_prefix(lib_id: str) -> str: + """Strip /local/ prefix from library ID for API calls.""" + if lib_id.startswith("/local/"): + return lib_id[7:] # Remove "/local/" prefix + return lib_id + + +# Create FastMCP instance with tools +mcp = FastMCP("context7-docs", root_path="/app") + + +@mcp.tool() +async def resolve_library_id(library_name: str) -> List[Dict[str, Any]]: + """ + Resolve a library name to Context7-style candidates. + + Searches the docs API for libraries matching the given name (partial match). 
+ + Args: + library_name: The library name to search for (e.g., "foundryvtt") + + Returns: + List of candidate libraries with /local/ prefix in ID: + [ + { + "id": "/local/foundryvtt", + "name": "Foundry VTT", + "description": "Fantasy tabletop virtual table...", + "source": "local" + }, + ... + ] + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + response = await client.get("/libraries/search", params={"q": library_name}) + + if response.status_code == 200: + data = response.json() + return data.get("matches", []) + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error resolving library '{library_name}': {e}") + return [] + + +@mcp.tool() +async def get_library_docs(context7_compatible_library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> str: + """ + Retrieve documentation content from a library. 
+ + Args: + context7_compatible_library_id: The Context7-style library ID (with /local/ prefix) + topic: Optional topic to search within the library (default: None - returns most relevant content) + tokens: Maximum tokens to include in response (default: 8000) + + Returns: + Markdown string containing the documentation content + + Example: + get_library_docs("/local/foundryvtt", topic="hooks", tokens=8000) + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + # Strip /local/ prefix for API call + library_id = strip_local_prefix(context7_compatible_library_id) + + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + params = {"tokens": tokens} + if topic: + params["topic"] = topic + + response = await client.get(f"/libraries/{library_id}/docs", params=params) + + if response.status_code == 200: + data = response.json() + return data.get("content", "") + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error getting library docs for '{context7_compatible_library_id}': {e}") + return f"Error retrieving documentation: {str(e)}" + + +@mcp.tool() +async def list_libraries() -> List[Dict[str, Any]]: + """ + List all discovered libraries in the system. + + Returns: + List of library objects with metadata: + [ + { + "id": "/local/foundryvtt", + "name": "Foundry VTT", + "description": "...", + "source": "local" + }, + ... 
+ ] + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + response = await client.get("/libraries") + + if response.status_code == 200: + data = response.json() + return data.get("libraries", []) + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error listing libraries: {e}") + return [] + + +@mcp.tool() +async def search_docs(query: str, library_id: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]: + """ + Perform semantic search across documents. + + Args: + query: The search query string + library_id: Optional library ID filter (with /local/ prefix). If None, searches all libraries. + limit: Maximum number of results to return (default: 10) + + Returns: + List of search results with content snippets: + [ + { + "id": "...", + "score": 0.123, + "library_id": "...", + "path": "...", + "title": "...", + "chunk_index": 0 + }, + ... + ] + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + payload = {"query": query, "limit": limit} + if library_id: + payload["library_id"] = strip_local_prefix(library_id) + + response = await client.post("/search", json=payload) + + if response.status_code == 200: + data = response.json() + return data.get("results", []) + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error searching for query '{query}': {e}") + return [] + + +@mcp.tool() +async def refresh_library(library_id: Optional[str] = None) -> Dict[str, Any]: + """ + Re-ingest documents for a library or all libraries. + + Args: + library_id: If provided, re-ingests only this library (with /local/ prefix). + If None, ingests all libraries. 
+ + Returns: + Ingestion result summary: + { + "total_libraries": 2, + "successful": 2, + "failed": 0, + "total_chunks": 150 + } + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + response = await client.post("/ingest/all") + + if response.status_code == 200: + data = response.json() + return { + "success": True, + "total_libraries": data.get("total_libraries", 0), + "successful": data.get("successful", 0), + "failed": data.get("failed", 0), + "total_chunks": data.get("total_chunks", 0) + } + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error refreshing library '{library_id or 'all'}': {e}") + return {"success": False, "error": str(e)} + + +@mcp.tool() +async def sync_sources(override: bool = False) -> Dict[str, Any]: + """ + Sync all git repositories defined in the sources configuration file. + + Clones/updates each configured repository and ingests matching files + into the vector store. Existing repos are updated to latest state unless + override is true (clears existing repo before cloning). + + Args: + override: If true, clears existing repo before cloning. Default: false + + Returns: + Sync result summary: + { + "success": true, + "total_sources": 2, + "successful": 1, + "failed": 1, + "results": [ + { + "library_id": "foundryvtt", + "success": true, + "message": "...", + "files_discovered": 450, + "chunks_created": 2340, + "vectors_added": 2340 + }, + ... 
+ ] + } + """ + try: + if httpx is None: + raise RuntimeError("httpx is not installed") + async with httpx.AsyncClient(base_url=DOCS_API_URL, timeout=60.0) as client: + payload = {"override": override} if override else {} + + response = await client.post("/sources/sync", json=payload) + + if response.status_code == 200: + data = response.json() + return { + "success": True, + "total_sources": data.get("total_sources", 0), + "successful": data.get("successful", 0), + "failed": data.get("failed", 0), + "results": data.get("results", []) + } + else: + raise Exception(f"API error: {response.status_code} - {response.text}") + + except Exception as e: + print(f"Error syncing git sources: {e}") + return {"success": False, "error": str(e)} + + +if __name__ == "__main__": + # Run MCP server using streamable HTTP transport + host = os.getenv("MCP_HOST", "0.0.0.0") + port = int(os.getenv("MCP_PORT", 8788)) + + print(f"Starting MCP server on http://{host}:{port}") + print("Tools available:") + print(" - resolve-library-id(libraryName)") + print(" - get-library-docs(context7_compatible_library_id, topic=None, tokens=8000)") + print(" - list-libraries()") + print(" - search_docs(query, library_id=None, limit=10)") + print(" - refresh_library(library_id=None)") + print(" - sync_sources(override=false)") + + if hasattr(mcp, "run"): + mcp.run(transport="streamable-http", host=host, port=port) + else: + import uvicorn + + uvicorn.run(mcp, host=host, port=port) diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py new file mode 100644 index 0000000..a6bbfc0 --- /dev/null +++ b/mcp_server/__init__.py @@ -0,0 +1 @@ +"""Compatibility package for importing the mcp-server source tree in tests.""" diff --git a/mcp_server/server.py b/mcp_server/server.py new file mode 100644 index 0000000..1ec3b04 --- /dev/null +++ b/mcp_server/server.py @@ -0,0 +1,13 @@ +"""Import wrapper for ../mcp-server/server.py.""" +import importlib.util +from pathlib import Path + +_source = 
Path(__file__).resolve().parents[1] / "mcp-server" / "server.py" +_spec = importlib.util.spec_from_file_location("_local_context7_mcp_server", _source) +_module = importlib.util.module_from_spec(_spec) +assert _spec and _spec.loader +_spec.loader.exec_module(_module) + +for _name, _value in vars(_module).items(): + if not _name.startswith("__"): + globals()[_name] = _value diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..183b908 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,35 @@ +# Pytest configuration for local-context7 tests + +[pytest] +# Test discovery pattern (where to look for tests) +testpaths = tests + +# Pattern of test files to discover +python_files = test_*.py + +# Pattern of test functions to run +python_functions = test_* + +# Markers for test categorization +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + integration: marks tests as integration tests requiring external services + unit: marks tests as pure unit tests + +# Add console output during test collection +console_output_style = classic + +# Test execution options +asyncio_mode = auto +testsessionstartfixturesscope = function + +# Logging configuration +log_cli = true +log_cli_level = INFO +log_cli_format = %(asctime)s [%(levelname)s] %(name)s: %(message)s +log_cli_date_format = %Y-%m-%d %H:%M:%S + +# Ignore specific warnings during tests +filterwarnings = + ignore::DeprecationWarning + ignore::PendingDeprecationWarning \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..3adde32 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +# Tests package for local-context7 +# Contains unit tests for chunking, database operations, search, and MCP server modules \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..dad5736 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,191 @@ +""" +Pytest configuration and fixtures for local-context7 
tests. + +This module provides: +- Mocks for external dependencies (Qdrant, FastEmbed) +- Database fixtures for SQLite operations +- Common test utilities +""" +from unittest.mock import MagicMock, patch +import pytest +import os +import json +from pathlib import Path + +from backend.app.db import init_db, upsert_library, insert_document_chunk, get_chunks_for_library, list_libraries, clear_library_documents, get_connection + + +# ============================================================================= +# FIXTURES +# ============================================================================= + +@pytest.fixture(scope="function") +def test_database(): + """ + Create a fresh SQLite database for testing. + + Yields: + Database connection with tables initialized + """ + # Use an in-memory or temporary file database + db_path = Path(__file__).parent.parent / "backend" / "data" / "test_db.sqlite" + + # Ensure data directory exists + db_path.parent.mkdir(parents=True, exist_ok=True) + + # Remove existing test DB if present + if db_path.exists(): + db_path.unlink() + + # Initialize database with tables + result = init_db() + assert result["success"], f"Failed to initialize test DB: {result.get('error')}" + + yield + + # Cleanup: remove test database after tests + if db_path.exists(): + db_path.unlink() + + +@pytest.fixture(scope="function") +def sample_text(): + """Sample text for chunking tests.""" + return """# Introduction + +This is the introduction section. + +## Background + +Background information goes here to make this longer and test chunking. + +This paragraph has more content about the background topic. + +### Details + +Specific details about the background are provided in this subsection. + +More details follow here to ensure we have enough text to properly test heading preservation. 
+ +## Conclusion + +The conclusion wraps up everything nicely.""" + + +# ============================================================================= +# MOCKS +# ============================================================================= + +@pytest.fixture +def mock_embedding_model(): + """ + Mock FastEmbed model that returns dummy vectors. + + This avoids needing to download and load the actual embedding model. + Returns 384-dimensional zero vectors for any input. + """ + mock_model = MagicMock() + + # Mock embed method - returns list of lists with float values + def mock_embed(texts): + return [ + [0.0] * 384 # Zero vector placeholder + for _ in texts + ] + + mock_model.embed = mock_embed + + return mock_model + + +@pytest.fixture +def mock_qdrant_client(): + """ + Mock Qdrant client that returns empty or test results. + + Allows testing search logic without needing a running Qdrant server. + """ + mock_client = MagicMock() + + # Mock search method + def mock_search(collection_name, query_vector, limit=10, search_filter=None): + # Return empty list (simulating no results) + return [] + + mock_client.search = mock_search + + # Mock delete_collection for cleanup + mock_client.delete_collection = MagicMock(return_value=True) + + return mock_client + + +@pytest.fixture +def mock_embedding_model_batch(): + """ + Batch embedding model mock that returns deterministic fake vectors. + + Returns slightly different vectors for different input lengths/first chars, + allowing tests to verify vector retrieval if needed. 
+ """ + def hash_text(text): + # Simple hash-based pseudo-random vector generation + text_hash = hash(text) % 1000000 + return [(hash_text(text) / 1000000 + (i * 0.001)) for i in range(384)] + + mock_model = MagicMock() + mock_model.embed = lambda texts: [hash_text(t) for t in texts] + + return mock_model + + +# ============================================================================= +# SETUP TEARDOWN FIXTURES +# ============================================================================= + +@pytest.fixture(autouse=True) +def clear_test_database(test_database): + """ + Clear test database before and after each test function. + + Note: This fixture runs the teardown (cleanup) AFTER the test, + so we manually clear at the end of the yield context. + The db_path is cleaned up by the test_database fixture's yield block. + """ + pass # Cleanup handled in test_database fixture + + +@pytest.fixture +def empty_vector(): + """Empty/dummy embedding vector for tests.""" + return [0.0] * 384 + + +@pytest.fixture +def fake_embeddings(sample_text): + """Fake embedding vectors for sample text.""" + def hash_text(text): + return [(hash(text) + len(text)) % 1000 / 10000 for _ in range(384)] + + return [hash_text(s) for s in sample_text.split("\n\n") if s.strip()] + + +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +@pytest.fixture +def temp_file(tmp_path): + """Create a temporary file and yield its path.""" + test_file = tmp_path / "test.txt" + return test_file + + +# Register custom marker for slow tests (if needed) +def pytest_configure(config): + config.addinivalue_line("markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')") + + +def pytest_runtest_setup(item): + """Add custom markers if needed.""" + pass \ No newline at end of file diff --git a/tests/test_chunking.py b/tests/test_chunking.py new file mode 100644 index 
0000000..fcb0e20 --- /dev/null +++ b/tests/test_chunking.py @@ -0,0 +1,238 @@ +""" +Tests for backend/app/chunking.py + +These are pure unit tests that don't require any external dependencies. +They test text chunking logic, token estimation, and heading-aware splitting. +""" +import pytest + + +class TestEstimateTokens: + """Tests for the estimate_tokens() function.""" + + def test_empty_text(self): + """Empty text should return 0 tokens.""" + from backend.app.chunking import estimate_tokens + assert estimate_tokens("") == 0 + + def test_single_char(self): + """Single character = 1 token (using 4 chars per token approximation).""" + from backend.app.chunking import estimate_tokens + assert estimate_tokens("a") == 0 # 1 char // 4 = 0 tokens + + def test_4_chars(self): + """4 characters = 1 token.""" + from backend.app.chunking import estimate_tokens + assert estimate_tokens("abcd") == 1 + + def test_400_chars(self): + """400 characters = 100 tokens.""" + from backend.app.chunking import estimate_tokens + text = "a" * 400 + assert estimate_tokens(text) == 100 + + def test_whitespace_only(self): + """Whitespace-only text should be counted.""" + from backend.app.chunking import estimate_tokens + assert estimate_tokens(" ") == 0 # 3 chars // 4 = 0 + + +class TestChunkText: + """Tests for the chunk_text() function.""" + + def test_empty_input(self, sample_text): + """Empty input should return empty list.""" + from backend.app.chunking import chunk_text + assert chunk_text("") == [] + + def test_small_text_single_chunk(self, sample_text): + """Small text under limit should be single chunk.""" + from backend.app.chunking import chunk_text + small = "This is a very short text that should be returned as a single chunk." 
+ chunks = chunk_text(small, max_tokens=500) + assert len(chunks) == 1 + assert chunks[0] == small + + def test_exact_token_limit(self, sample_text): + """Text exactly at limit should be one chunk.""" + from backend.app.chunking import chunk_text, estimate_tokens + # Create text that is exactly 500 tokens (2000 chars) + text = "a" * 2000 + chunks = chunk_text(text, max_tokens=500) + assert len(chunks) == 1 + assert estimate_tokens(chunks[0]) == 500 + + def test_over_limit_splits(self, sample_text): + """Text over limit should be split into multiple chunks.""" + from backend.app.chunking import chunk_text, estimate_tokens + # Create text that is 2500 tokens (10000 chars) + text = "b" * 10000 + chunks = chunk_text(text, max_tokens=500) + assert len(chunks) >= 2 # Should be split + + def test_preserves_content(self, sample_text): + """All content should be preserved in chunks (combined).""" + from backend.app.chunking import chunk_text + original = "Hello world! This is a test of chunking functionality." + chunks = chunk_text(original, max_tokens=100) + combined = "".join(chunks) + assert len(chunks) == 1 + assert combined == original + + def test_headings_split(self, sample_text): + """Heading-aware splitting should preserve heading boundaries.""" + from backend.app.chunking import chunk_text + markdown_with_headings = """# Introduction + +This is the introduction section. + +## Background + +Background information goes here.""" + + # With very small token limit, headings should cause splits + chunks = chunk_text(markdown_with_headings, max_tokens=20) + heading_chunks = [c for c in chunks if c.strip().startswith('#')] + assert len(heading_chunks) >= 1 # At least some heading preserved + + def test_paragraph_split(self): + """Paragraph splitting should respect paragraph boundaries.""" + from backend.app.chunking import chunk_text + text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph." 
+ chunks = chunk_text(text, max_tokens=15) # Small limit forces splits + assert len(chunks) >= 3 # At least as many paragraphs + + def test_no_empty_chunks(self): + """Should not return empty chunks.""" + from backend.app.chunking import chunk_text + text = "Hello world" + chunks = chunk_text(text, max_tokens=10) + for chunk in chunks: + assert chunk.strip() != "" + + +class TestTokenEstimationBoundaries: + """Tests for token estimation boundaries.""" + + def test_boundary_precision(self): + """Test boundary conditions around the 4-char-per-token limit.""" + from backend.app.chunking import estimate_tokens + + # Edge cases around boundary + assert estimate_tokens("abcd") == 1 # exactly 4 chars + assert estimate_tokens("abcde") == 1 # 5 chars still 1 token + assert estimate_tokens("abcdef") == 1 # 6 chars still 1 token + assert estimate_tokens("abcdefg") == 1 # 7 chars still 1 token + assert estimate_tokens("abcdefgh") == 2 # 8 chars = 2 tokens + + def test_various_languages_chars(self): + """Token estimation uses character count, not unicode complexity.""" + from backend.app.chunking import estimate_tokens + + # Chinese characters (each counts as 1 char) + chinese = "δ½ ε₯½δΈ–η•Œ" # 4 characters + assert estimate_tokens(chinese) == 1 + + # Emoji + emoji = "Hello πŸŽ‰ world" # Spaces + letters + emoji + # emoji count varies by implementation, just check it's counted + assert isinstance(estimate_tokens(emoji), int) + + +class TestChunkOverlapBehavior: + """Tests for overlap handling between chunks.""" + + def test_overlap_not_exceeded(self): + """Chunks should not have excessive overlap.""" + from backend.app.chunking import chunk_text + + # Text that will be split at a known boundary + text = "The quick brown fox jumps over the lazy dog. 
" * 10 + chunks = chunk_text(text, max_tokens=30, overlap_tokens=5) + + if len(chunks) > 1: + # Last few chars of first chunk shouldn't duplicate excessively + assert len(chunks[0]) <= len("".join(chunks)) // 2 # Rough check + + +class TestChunkEdgeCases: + """Tests for edge cases and error conditions.""" + + def test_whitespace_only_text(self): + """Whitespace-only text should handle gracefully.""" + from backend.app.chunking import chunk_text + chunks = chunk_text(" \n\n ", max_tokens=100) + # May return empty or whitespace chunk, shouldn't crash + assert isinstance(chunks, list) + + def test_very_long_paragraph(self): + """Long paragraph without breaks should be split.""" + from backend.app.chunking import chunk_text + + long_para = "The quick brown fox jumps over the lazy dog. " * 100 + chunks = chunk_text(long_para, max_tokens=50) + assert len(chunks) > 1 # Should be split + + def test_none_input_raises(self): + """None input should be handled (return empty or raise).""" + from backend.app.chunking import chunk_text + with pytest.raises((TypeError, AssertionError)): + chunk_text(None, max_tokens=100) + + def test_unicode_text(self): + """Unicode text should be handled.""" + from backend.app.chunking import chunk_text + unicode_text = "Hello δΈ–η•Œ Ω…Ψ±Ψ­Ψ¨Ψ§ πŸŽ‰" + chunks = chunk_text(unicode_text, max_tokens=50) + assert len(chunks) == 1 # Small enough to be single chunk + + +# ============================================================================= +# SAMPLE TEXT FIXTURE +# ============================================================================= + +@pytest.fixture +def heading_markdown(): + """Sample markdown with headings for chunking tests.""" + return """# Introduction + +This is the introduction section. It contains some introductory text here. + +## Background + +Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context. 
+ +### Details + +Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation. + +## Conclusion + +The conclusion wraps up everything nicely.""" + + +class TestHeadingPreservation: + """Tests for heading-aware chunking with sample text.""" + + def test_headings_in_separate_chunks(self, heading_markdown): + """Headings should appear in their own chunks when possible.""" + from backend.app.chunking import chunk_text + + # Very small token limit forces splits at headings + chunks = chunk_text(heading_markdown, max_tokens=30) + + heading_sections = [c for c in chunks if c.strip().startswith('#')] + assert len(heading_sections) >= 1 + + def test_all_content_present(self, heading_markdown): + """All content should be preserved when combined.""" + from backend.app.chunking import chunk_text + + original = heading_markdown + chunks = chunk_text(original, max_tokens=500) + combined = "".join(chunks) + + # Content shouldn't be truncated or corrupted + assert "Introduction" in combined + assert "Background" in combined + assert "Conclusion" in combined diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000..b854d3c --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,316 @@ +""" +Tests for backend/app/db.py + +These tests verify SQLite database operations including: +- Table creation (init_db) +- Library CRUD operations +- Document chunk storage and retrieval +- Full-text search functionality + +All tests use a temporary test database file. 
+""" +import pytest +from datetime import datetime + + +class TestInitDatabase: + """Tests for init_db() - table creation.""" + + def test_init_db_creates_tables(self, test_database): + """Database should have libraries and documents tables after init.""" + import sqlite3 + from backend.app.db import get_connection, get_db_path + + conn = get_connection() + cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") + tables = [row[0] for row in cursor.fetchall()] + + # Should have libraries, documents, and FTS virtual table + assert "libraries" in tables or any("libraries" in t.lower() for t in tables) + conn.close() + + def test_init_db_returns_success(self, test_database): + """init_db should return success indicator.""" + from backend.app.db import init_db + + result = init_db() + assert result["success"] is True + + +class TestLibraryOperations: + """Tests for library CRUD operations.""" + + def test_upsert_library_new(self, test_database): + """Upsert should create new library.""" + from backend.app.db import upsert_library + + result = upsert_library( + library_id="/local/testlib", + name="Test Library", + description="A test library for unit tests" + ) + + assert result["success"] is True + assert result["id"] == "/local/testlib" + + def test_upsert_library_update(self, test_database): + """Upsert should update existing library.""" + from backend.app.db import upsert_library + + # Insert first library + upsert_library( + library_id="/local/upsertlib", + name="Original Name", + description="Original description" + ) + + # Update it + result = upsert_library( + library_id="/local/upsertlib", + name="Updated Name", + description="Updated description" + ) + + assert result["success"] is True + + def test_upsert_library_id_normalization(self, test_database): + """Library ID normalization - /local/ prefix should be preserved.""" + from backend.app.db import upsert_library + + # Test various ID formats + test_ids = [ + 
"/local/foundryvtt", + "foundryvtt", + "/local/mydocs", + ] + + for lib_id in test_ids: + result = upsert_library(library_id=lib_id, name="Test", description="Desc") + assert result["success"] is True + # Verify we can retrieve it back + from backend.app.db import get_chunks_for_library + # Just ensure no errors occur + + def test_list_libraries(self, test_database): + """list_libraries should return list of libraries.""" + from backend.app.db import upsert_library, list_libraries + + # Create some libraries + for i in range(3): + upsert_library( + library_id=f"/local/lib{i}", + name=f"Library {i}", + description=f"Description {i}" + ) + + libs = list_libraries() + assert isinstance(libs, list) + assert len(libs) >= 3 + + def test_search_libraries(self, test_database): + """search_libraries should find libraries by name/description.""" + from backend.app.db import upsert_library, search_libraries + + # Create libraries with searchable names + upsert_library(library_id="/local/foo1", name="Foo Library", description="Bar baz") + upsert_library(library_id="/local/foo2", name="Other Library", description="Different content") + + results = search_libraries("foo") + assert isinstance(results, list) + + +class TestDocumentChunkOperations: + """Tests for document chunk storage and retrieval.""" + + def test_insert_document_chunk_new(self, test_database): + """insert_document_chunk should create new chunk record.""" + from backend.app.db import insert_document_chunk + + result = insert_document_chunk( + doc_id="doc-1", + library_id="/local/testlib", + path="docs/example.md", + title="Example Document", + content="# Example\n\nThis is the content.", + chunk_index=0, + token_estimate=100 + ) + + assert result["success"] is True + + def test_insert_document_chunk_update(self, test_database): + """insert_document_chunk should update existing record.""" + from backend.app.db import insert_document_chunk + + # Insert first + insert_document_chunk( + doc_id="doc-update-test", + 
library_id="/local/uplib", + path="old-path.md", + title="Old Title", + content="# Old\nContent here.", + chunk_index=0, + token_estimate=50 + ) + + # Update it + result = insert_document_chunk( + doc_id="doc-update-test", + library_id="/local/uplib", + path="new-path.md", + title="New Title", + content="# New\nUpdated content.", + chunk_index=1, + token_estimate=75 + ) + + assert result["success"] is True + + def test_get_document_by_id(self, test_database): + """get_document_by_id should retrieve document by ID.""" + from backend.app.db import insert_document_chunk, get_document_by_id + + # Insert document + doc_id = "unique-doc-id-12345" + insert_document_chunk( + doc_id=doc_id, + library_id="/local/testlib", + path="docs/test.md", + title="Test Document", + content="# Test\n\nTest content here.", + chunk_index=None, + token_estimate=200 + ) + + # Retrieve it + doc = get_document_by_id(doc_id) + assert doc is not None + assert doc["id"] == doc_id + + def test_get_chunks_for_library(self, test_database): + """get_chunks_for_library should return all chunks for a library.""" + from backend.app.db import upsert_library, insert_document_chunk, get_chunks_for_library + + # Create library + upsert_library(library_id="/local/chunktest", name="Chunk Test", description="Test") + + # Add some chunks + for i in range(3): + insert_document_chunk( + doc_id=f"chunk-{i}", + library_id="/local/chunktest", + path=f"path{i}.md", + title=f"Section {i}", + content=f"Content section {i}.", + chunk_index=i, + token_estimate=50 + ) + + chunks = get_chunks_for_library("/local/chunktest") + assert isinstance(chunks, list) + assert len(chunks) >= 3 + + def test_clear_library_documents(self, test_database): + """clear_library_documents should delete all docs for a library.""" + from backend.app.db import upsert_library, insert_document_chunk, clear_library_documents, get_chunks_for_library + + # Create and populate library + upsert_library(library_id="/local/cleartest", name="Clear Test", 
description="Test") + for i in range(5): + insert_document_chunk( + doc_id=f"clear-{i}", + library_id="/local/cleartest", + path=f"path{i}.md", + content=f"Content {i}.", + token_estimate=20 + ) + + # Clear it + result = clear_library_documents("/local/cleartest") + assert result["success"] is True + + # Verify cleared + remaining = get_chunks_for_library("/local/cleartest") + assert len(remaining) == 0 + + +class TestDatabaseEdgeCases: + """Tests for edge cases and error handling.""" + + def test_empty_library_id(self, test_database): + """Operations with empty ID should handle gracefully.""" + from backend.app.db import upsert_library + + result = upsert_library(library_id="", name="Test", description="Desc") + # Should not crash, though may not be a valid operation + + def test_special_characters_in_content(self, test_database): + """Content with special characters should be stored.""" + from backend.app.db import insert_document_chunk + + content = "Hello \"world\" & amp; 'apostrophe'" + result = insert_document_chunk( + doc_id="special-test", + library_id="/local/speciallib", + path="special.md", + content=content, + token_estimate=100 + ) + + assert result["success"] is True + + def test_very_long_content(self, test_database): + """Long content should be stored.""" + from backend.app.db import insert_document_chunk + + long_content = "a" * 5000 + result = insert_document_chunk( + doc_id="long-test", + library_id="/local/longlib", + path="long.md", + content=long_content, + token_estimate=1000 + ) + + assert result["success"] is True + + def test_none_description(self, test_database): + """Library with None description should work.""" + from backend.app.db import upsert_library + + result = upsert_library( + library_id="/local/nonedesc", + name="No Description Lib", + description=None + ) + + assert result["success"] is True + + +class TestDatabaseInitialization: + """Tests for database initialization state.""" + + def test_database_is_empty_after_init(self, 
test_database): + """Database should be empty right after init.""" + from backend.app.db import list_libraries + + libs = list_libraries() + assert isinstance(libs, list) + + +# ============================================================================= +# FIXTURES +# ============================================================================= + +@pytest.fixture +def sample_doc(): + """Sample document chunk for testing.""" + return { + "doc_id": "sample-doc-1", + "library_id": "/local/samplelib", + "path": "docs/guide.md", + "title": "Getting Started Guide", + "content": "# Getting Started\n\nWelcome to the guide. This is a sample document for testing.\n\n## Installation\n\nInstall with pip.", + "chunk_index": 0, + "token_estimate": 500 + } diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py new file mode 100644 index 0000000..e1e7708 --- /dev/null +++ b/tests/test_mcp_server.py @@ -0,0 +1,262 @@ +""" +Tests for mcp-server/server.py + +These are pure unit tests that don't require any external dependencies. 
+They test: +- The strip_local_prefix() function directly (no network) +- MCP server tool definitions and structure +""" + +import pytest + + +class TestStripLocalPrefix: + """Tests for the strip_local_prefix() function.""" + + def test_strips_prefix_from_full_id(self): + """Should strip /local/ prefix from full library ID.""" + from mcp_server.server import strip_local_prefix + + input_id = "/local/foundryvtt" + expected_output = "foundryvtt" + + result = strip_local_prefix(input_id) + assert result == expected_output + + def test_preserves_id_without_prefix(self): + """Should preserve ID that doesn't have /local/ prefix.""" + from mcp_server.server import strip_local_prefix + + input_id = "foundryvtt" + + result = strip_local_prefix(input_id) + assert result == input_id # Should be unchanged + + def test_strips_from_multiple_local_prefixes(self): + """Should handle edge case of multiple prefixes.""" + from mcp_server.server import strip_local_prefix + + input_id = "/local//local/foundryvtt" + + result = strip_local_prefix(input_id) + # Should only strip first occurrence + assert result == "/local/foundryvtt" + + def test_empty_string(self): + """Empty string should remain empty.""" + from mcp_server.server import strip_local_prefix + + input_id = "" + + result = strip_local_prefix(input_id) + assert result == input_id # Should be unchanged + + def test_whitespace_only(self): + """Whitespace only should remain whitespace (no /local/ to strip).""" + from mcp_server.server import strip_local_prefix + + input_id = " \t\n" + + result = strip_local_prefix(input_id) + assert result == input_id + + def test_case_sensitive_prefix(self): + """Prefix matching is case-sensitive.""" + from mcp_server.server import strip_local_prefix + + # Lowercase - should strip + result1 = strip_local_prefix("/local/test") + assert result1 == "test" + + # Uppercase - should NOT strip (not a match) + result2 = strip_local_prefix("/LOCAL/test") + assert result2 == "/LOCAL/test" # Unchanged + 
+ def test_partial_match_does_not_strip(self): + """Only exact /local/ prefix is stripped, not partial matches.""" + from mcp_server.server import strip_local_prefix + + # Partial match - should NOT strip + input_id = "/local-docs/test" + result = strip_local_prefix(input_id) + assert result == input_id # Unchanged + + # Different separator - should NOT strip + input_id2 = "/localdocs/test" + result2 = strip_local_prefix(input_id2) + assert result2 == input_id2 + + def test_prefix_with_trailing_slash(self): + """Should handle trailing slash in ID.""" + from mcp_server.server import strip_local_prefix + + input_id = "/local/foundryvtt/" + expected_output = "foundryvtt/" + + result = strip_local_prefix(input_id) + assert result == expected_output + + +class TestMcpServerStructure: + """Tests for MCP server tool structure (without starting the server).""" + + def test_import_fastmcp(self): + """Should be able to import FastMCP.""" + try: + from fastmcp import FastMCP + # Import successful + except ImportError as e: + pytest.skip(f"fastmcp not installed: {e}") + + +class TestMcpServerToolsExistence: + """Tests to verify MCP server has expected tools defined.""" + + def test_mcp_instance_created(self): + """MCP instance should be created with tools.""" + from mcp_server.server import mcp + + assert mcp is not None + + def test_resolve_library_id_tool_exists(self): + """resolve-library-id tool should be registered.""" + from mcp_server.server import mcp + + # Check if the tool exists by trying to access it + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "resolve_library_id" in tool_names + + def test_get_library_docs_tool_exists(self): + """get-library-docs tool should be registered.""" + from mcp_server.server import mcp + + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "get_library_docs" in tool_names + + def test_list_libraries_tool_exists(self): + """list-libraries tool should be registered.""" + from 
mcp_server.server import mcp + + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "list_libraries" in tool_names + + def test_search_docs_tool_exists(self): + """search-docs tool should be registered.""" + from mcp_server.server import mcp + + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "search_docs" in tool_names + + def test_refresh_library_tool_exists(self): + """refresh-library tool should be registered.""" + from mcp_server.server import mcp + + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "refresh_library" in tool_names + + def test_sync_sources_tool_exists(self): + """sync-sources tool should be registered.""" + from mcp_server.server import mcp + + if hasattr(mcp, 'tools'): + tool_names = [t.name for t in mcp.tools] + assert "sync_sources" in tool_names + + +class TestMcpServerStripPrefixIntegration: + """Integration tests for strip_prefix usage in MCP server functions.""" + + def test_resolve_library_id_calls_strip_prefix(self): + """resolve_library_id should handle /local/ prefix in responses.""" + # This test verifies that the tool is available and uses the prefix correctly + from mcp_server.server import strip_local_prefix + + # Verify the function exists and works + assert callable(strip_local_prefix) + + # Test with sample IDs + test_ids = [ + "/local/foundryvtt", + "/local/pytest", + "/local/mydocs/reference", + ] + + for lib_id in test_ids: + stripped = strip_local_prefix(lib_id) + assert not stripped.startswith("/local/") + + +class TestMcpServerPrefixHandlingVariations: + """Additional tests for prefix handling variations.""" + + def test_long_library_id(self): + """Should handle long library IDs with /local/ prefix.""" + from mcp_server.server import strip_local_prefix + + input_id = "/local/very-long-library-id-with-many-chars-in-name" + expected_output = "very-long-library-id-with-many-chars-in-name" + + result = strip_local_prefix(input_id) + assert 
result == expected_output + + def test_special_characters_in_id(self): + """Should handle special characters in library ID.""" + from mcp_server.server import strip_local_prefix + + # IDs can have underscores, dashes, numbers + input_id = "/local/my-doc_v2-3_test" + + result = strip_local_prefix(input_id) + assert result == "my-doc_v2-3_test" + + def test_unicode_in_stripped_name(self): + """Stripped name should preserve unicode characters.""" + from mcp_server.server import strip_local_prefix + + # Library IDs sometimes have unicode in them + input_id = "/local/δΈ–η•Œζ–‡ζ‘£" # Chinese characters + + result = strip_local_prefix(input_id) + assert result == "δΈ–η•Œζ–‡ζ‘£" + + def test_mixed_case_stripped_name(self): + """Stripped name can have mixed case.""" + from mcp_server.server import strip_local_prefix + + input_id = "/local/FoundryVTT" + + result = strip_local_prefix(input_id) + assert result == "FoundryVTT" + + +# ============================================================================= +# FIXTURES +# ============================================================================= + +@pytest.fixture +def sample_library_ids(): + """Sample library IDs for testing prefix stripping.""" + return [ + "/local/foundryvtt", + "/local/pytest", + "/local/mydocs/reference/guide.md", + "/local/my-app", + "/local/documentation/tutorial/getting-started", + ] + + +@pytest.fixture +def expected_stripped_ids(sample_library_ids): + """Expected stripped versions of sample library IDs.""" + return [ + "foundryvtt", + "pytest", + "mydocs/reference/guide.md", + "my-app", + "documentation/tutorial/getting-started", + ] diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..f6fa6ad --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,368 @@ +""" +Tests for backend/app/search.py + +These tests verify search functionality without requiring: +- A running Qdrant vector database (mocked) +- Loaded embedding models (mocked) + +The tests focus on: +- 
Response shape validation +- Library filtering +- Error handling +- Async function behavior +""" +import pytest + + +class TestResolveLibraryId: + """Tests for resolve_library_id() - Context7-style resolution.""" + + def test_returns_candidates_list(self, test_database): + """resolve_library_id should return a list of candidates.""" + from backend.app.search import resolve_library_id + + # Create some libraries first + from backend.app.db import upsert_library + for i in range(3): + upsert_library( + library_id=f"/local/searchtest{i}", + name=f"Search Test Library {i}", + description=f"Description for search test {i}" + ) + + candidates = resolve_library_id("search") + + assert isinstance(candidates, list) + + def test_captures_matching_names(self, test_database): + """Should capture libraries where query matches name.""" + from backend.app.db import upsert_library + from backend.app.search import resolve_library_id + + # Create a library that should match "search" + upsert_library( + library_id="/local/searchlib", + name="Search Library", + description="Main search documentation" + ) + + candidates = resolve_library_id("search") + + assert isinstance(candidates, list) + + def test_context7_style_prefix(self, test_database): + """Candidates should have /local/ prefix added to ID.""" + from backend.app.db import upsert_library + from backend.app.search import resolve_library_id + + upsert_library( + library_id="foundryvtt", # Without /local/ + name="Foundry VTT", + description="Fantasy tabletop virtual table" + ) + + candidates = resolve_library_id("foundry") + + for candidate in candidates: + assert candidate.get("source") == "local" + + def test_partial_name_match(self, test_database): + """Should match on partial name.""" + from backend.app.db import upsert_library + from backend.app.search import resolve_library_id + + upsert_library( + library_id="/local/gamefoundry", + name="Foundry Game Module", + description="Module for foundry games" + ) + + candidates = 
resolve_library_id("game") + assert isinstance(candidates, list) + + def test_empty_result_on_no_matches(self, test_database): + """Should return empty list when no matches.""" + from backend.app.search import resolve_library_id + + # No libraries matching "xyznonexistent123" + candidates = resolve_library_id("xyznonexistent123") + + assert isinstance(candidates, list) + + +class TestSearchDocs: + """Tests for search_docs() - semantic search with mocked vector store.""" + + def test_returns_results_list(self, mock_qdrant_client, test_database): + """search_docs should return a list of results.""" + from backend.app.search import search_docs + + # Create some chunks first + from backend.app.db import upsert_library, insert_document_chunk + upsert_library(library_id="/local/searchdocslib", name="Search Docs Lib", description="Test") + + for i in range(5): + insert_document_chunk( + doc_id=f"searchdoc-{i}", + library_id="/local/searchdocslib", + path=f"path{i}.md", + title=f"Section {i}", + content=f"# Section {i}\n\nContent about section {i} that matches search queries.", + chunk_index=i, + token_estimate=100 + ) + + results = search_docs("section") + + assert isinstance(results, list) + + def test_empty_query_returns_empty_list(self): + """Empty query should return empty results.""" + from backend.app.search import search_docs + + results = search_docs("") + assert isinstance(results, list) + + def test_limit_parameter(self, mock_qdrant_client): + """Limit parameter should affect result count.""" + from backend.app.search import search_docs + + results_10 = search_docs("test", limit=10) + results_5 = search_docs("test", limit=5) + + assert isinstance(results_10, list) + assert isinstance(results_5, list) + + def test_response_shape_matches_spec(self): + """Verify response shape when mocked returns data.""" + from unittest.mock import patch + from backend.app.search import search_docs + + # Mock client to return formatted results + mock_results = [ + { + "id": 
"test-id-1", + "score": 0.95, + "library_id": "/local/testlib", + "path": "docs/example.md", + "title": "Example Document", + "chunk_index": 0 + } + ] + + with patch('backend.app.vector_store.get_client') as mock_get_client: + # Setup mock client to return our test data + mock_client = mock_get_client.return_value + mock_point = type('ScoredPoint', (), { + 'score': 0.95, + 'payload': { + "id": "test-id-1", + "library_id": "/local/testlib", + "path": "docs/example.md", + "title": "Example Document", + "chunk_index": 0 + } + })() + mock_client.search.return_value = [mock_point] + + results = search_docs("test query") + + assert isinstance(results, list) + if results: + # Verify each result has expected fields + result = results[0] + assert "id" in result + assert "score" in result + assert "library_id" in result + assert "path" in result + assert "title" in result + assert "chunk_index" in result + + +class TestGetLibraryDocs: + """Tests for get_library_docs() - document retrieval.""" + + def test_returns_empty_string_when_no_documents(self, mock_qdrant_client): + """Should return empty/error when no docs exist.""" + from backend.app.search import get_library_docs + + result = get_library_docs("/local/nonexistent") + + # Either returns empty string or error message + assert isinstance(result, str) + + def test_returns_content_when_documents_exist(self, mock_qdrant_client): + """Should return combined document content.""" + from backend.app.db import upsert_library, insert_document_chunk + from backend.app.search import get_library_docs + + # Create library with chunks + upsert_library(library_id="/local/docretrievetest", name="Doc Retrieve", description="Test") + insert_document_chunk( + doc_id="doc-retrieve-1", + library_id="/local/docretrievetest", + path="docs/getting-started.md", + title="Getting Started", + content="# Getting Started\n\nWelcome to the documentation. 
This is a test document.", + chunk_index=0, + token_estimate=200 + ) + + result = get_library_docs("/local/docretrievetest") + + assert isinstance(result, str) + # Should contain at least library title or content + + def test_topic_filter_searches(self, mock_qdrant_client): + """With topic filter, should search for relevant chunks.""" + from backend.app.db import upsert_library, insert_document_chunk + from backend.app.search import get_library_docs + + upsert_library(library_id="/local/topicsearchlib", name="Topic Search", description="Test") + + # Add documents with different topics + insert_document_chunk( + doc_id="topic-install", + library_id="/local/topicsearchlib", + path="docs/install.md", + title="Installation Guide", + content="# Installation\n\nInstall with pip install mypackage.", + chunk_index=0, + token_estimate=150 + ) + + insert_document_chunk( + doc_id="topic-usage", + library_id="/local/topicsearchlib", + path="docs/usage.md", + title="Usage Guide", + content="# Usage\n\nUse mycommand --help for help.", + chunk_index=0, + token_estimate=150 + ) + + # Search for "install" topic + result = get_library_docs("/local/topicsearchlib", topic="install") + + assert isinstance(result, str) + + def test_token_limit_respected(self): + """Token limit should truncate content appropriately.""" + from backend.app.search import get_library_docs + + # Create a library with lots of content + from backend.app.db import upsert_library, insert_document_chunk + + upsert_library(library_id="/local/tokenlimittest", name="Token Limit", description="Test") + + long_content = "# Long Content\n\n" + " ".join(["word"] * 500) + insert_document_chunk( + doc_id="long-doc", + library_id="/local/tokenlimittest", + path="docs/long.md", + title="Long Document", + content=long_content, + chunk_index=0, + token_estimate=2000 + ) + + # Request with small token limit + result = get_library_docs("/local/tokenlimittest", token_limit=100) + + assert isinstance(result, str) + + +class 
TestGetLibraryDocsWithMock: + """Tests that verify content retrieval when mocked data is available.""" + + def test_retrieves_chunks_by_library_id(self, mock_qdrant_client): + """get_library_docs without topic should fetch all chunks for library.""" + from backend.app.db import upsert_library, insert_document_chunk + from backend.app.search import get_library_docs + + upsert_library(library_id="/local/mockretrievetest", name="Mock Retrieve", description="Test") + + for i in range(3): + insert_document_chunk( + doc_id=f"mock-retrieve-{i}", + library_id="/local/mockretrievetest", + path=f"path{i}.md", + title=f"Path {i}", + content=f"Content for path {i}.", + chunk_index=i, + token_estimate=50 + ) + + result = get_library_docs("/local/mockretrievetest") + + assert isinstance(result, str) + + +class TestSearchErrorHandling: + """Tests for error handling in search functions.""" + + def test_search_handles_missing_library(self): + """Should handle missing library gracefully.""" + from backend.app.search import search_docs + + results = search_docs("test", library_id="/local/missing_lib_xyz123") + assert isinstance(results, list) + + def test_resolve_handles_no_libraries_in_db(self): + """Should handle empty database gracefully.""" + from backend.app.db import init_db + from backend.app.search import resolve_library_id + + # Initialize fresh DB (empty) + from backend.app.db import get_connection, get_chunks_for_library + # The test_database fixture already does this + + def test_get_library_docs_handles_empty_library(self): + """Should handle library with no chunks.""" + from backend.app.search import get_library_docs + + result = get_library_docs("/local/emptylib") + assert isinstance(result, str) + + +# ============================================================================= +# FIXTURES FOR SEARCH TESTS +# ============================================================================= + +@pytest.fixture +def search_sample_text(): + """Sample text with headings for 
search chunking tests.""" + return """# Installation Guide + +To install the package: +```bash +pip install mypackage +``` + +## Configuration + +Configure your environment by setting these variables: +- MY_VAR=123 +- DEBUG=true + +## Usage Examples + +Example 1: Basic usage +```python +import mymodule +module = mymodule.Module() +result = module.run() +print(result) +``` + +Example 2: Advanced usage with options +```python +options = {"verbose": True, "output": "stdout"} +result = module.run(options=options) +``` + +## Troubleshooting + +Common issues and their solutions: +- ImportError: Ensure package is installed +- AttributeError: Check that attributes exist on object""" diff --git a/webui.env.example b/webui.env.example new file mode 100644 index 0000000..a84d20a --- /dev/null +++ b/webui.env.example @@ -0,0 +1,29 @@ +# Context7 Docs WebUI Configuration +# Copy this file to .env and configure for your environment + +# === Ports (optional - use if you need custom ports) === +HOST_PORT=8787 # docs-api port (default: 8787) +MCP_HOST_PORT=8788 # docs-mcp port (default: 8788) +WEBUI_PORT=8790 # WebUI port (default: 8790) + +# === API Keys (optional - uncomment to enable auth) === +# Docs API key for protecting endpoints like /search, /ingest, etc. 
+# DOCS_API_KEY=your-secret-docs-api-key + +# WebUI API key (optional - separate from docs-api for UI authentication) +# DOCS_WEBUI_API_KEY=your-webui-api-key + +# === Application Configuration === +# Path to documentation files (relative to service container) +DOCS_PATH=/docs + +# SQLite database path +DB_PATH=/data/db.sqlite + +# Logging level: DEBUG, INFO, WARNING, ERROR +LOG_LEVEL=INFO + +# === Vector Store === +# Qdrant host and port (internal Docker network) +VECTOR_STORE_HOST=qdrant +VECTOR_STORE_PORT=6333 \ No newline at end of file diff --git a/webui/Dockerfile b/webui/Dockerfile new file mode 100644 index 0000000..6c15f12 --- /dev/null +++ b/webui/Dockerfile @@ -0,0 +1,19 @@ +# WebUI Dockerfile +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + DOCS_API_URL=http://docs-api:8787 + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app /app/webui + +RUN mkdir -p /app/webui/templates/uploads + +EXPOSE 8790 + +CMD ["uvicorn", "webui.main:app", "--host", "0.0.0.0", "--port", "8790"] diff --git a/webui/app/api_client.py b/webui/app/api_client.py new file mode 100644 index 0000000..442d882 --- /dev/null +++ b/webui/app/api_client.py @@ -0,0 +1,72 @@ +"""Async docs-api client for the WebUI.""" +import os +from typing import Any, Dict, Optional + +from httpx import AsyncClient, Timeout + + +class DocsAPIClient: + """Small async HTTP client for the docs-api backend.""" + + def __init__(self, base_url: Optional[str] = None, api_key: Optional[str] = None): + self.base_url = (base_url or os.environ.get("DOCS_API_URL", "http://docs-api:8787")).rstrip("/") + self.api_key = api_key if api_key is not None else os.environ.get("WEBUI_API_KEY") + self.headers = {"X-API-Key": self.api_key} if self.api_key else {} + self._client: Optional[AsyncClient] = None + + async def _get_client(self) -> AsyncClient: + if self._client is None or self._client.is_closed: + self._client = AsyncClient( + 
base_url=self.base_url, + headers=self.headers, + timeout=Timeout(120.0), + ) + return self._client + + async def request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]: + client = await self._get_client() + resp = await client.request(method, path, **kwargs) + if resp.status_code >= 400: + raise RuntimeError(f"{method} {path} failed: {resp.status_code} {resp.text}") + if resp.headers.get("content-type", "").startswith("application/json"): + data = resp.json() + return data if isinstance(data, dict) else {"data": data} + return {"data": resp.text} + + async def get(self, path: str, **kwargs: Any) -> Dict[str, Any]: + return await self.request("GET", path, **kwargs) + + async def post(self, path: str, **kwargs: Any) -> Dict[str, Any]: + return await self.request("POST", path, **kwargs) + + async def delete(self, path: str, **kwargs: Any) -> Dict[str, Any]: + return await self.request("DELETE", path, **kwargs) + + async def health(self) -> Dict[str, Any]: + try: + return await self.get("/health") + except Exception as e: + return {"status": "error", "message": str(e)} + + async def upload_file(self, library_id: str, filename: str, content: bytes) -> Dict[str, Any]: + files = {"file": (filename, content)} + return await self.post(f"/api/v1/upload/{library_id}", files=files) + + async def close(self) -> None: + if self._client is not None and not self._client.is_closed: + await self._client.aclose() + + +_client_instance: Optional[DocsAPIClient] = None + + +async def get_client() -> DocsAPIClient: + global _client_instance + if _client_instance is None: + _client_instance = DocsAPIClient() + return _client_instance + + +async def close_client() -> None: + if _client_instance is not None: + await _client_instance.close() diff --git a/webui/app/config.py b/webui/app/config.py new file mode 100644 index 0000000..a2d23bf --- /dev/null +++ b/webui/app/config.py @@ -0,0 +1,17 @@ +"""WebUI configuration.""" +from typing import Optional + + +class Settings: + 
"""WebUI settings from environment variables.""" + + # Core API connection + DOCS_API_URL: str = "http://docs-api:8787" + WEBUI_API_KEY: Optional[str] = None + + # Default parameters for common operations + DEFAULT_SEARCH_LIMIT: int = 10 + DEFAULT_RESULT_TOKENS: int = 8000 + + +settings = Settings() \ No newline at end of file diff --git a/webui/app/main.py b/webui/app/main.py new file mode 100644 index 0000000..0899669 --- /dev/null +++ b/webui/app/main.py @@ -0,0 +1,259 @@ +"""WebUI FastAPI application.""" +import html +import os +from pathlib import Path +from typing import List, Optional + +from fastapi import FastAPI, File, Form, Request, UploadFile +from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates + +from .api_client import DocsAPIClient + + +app = FastAPI( + title="Context7 Docs WebUI", + description="Web dashboard for managing documentation system", + version="1.0.0", +) + +templates = Jinja2Templates(directory=os.path.join(os.path.dirname(__file__), "templates")) +templates.env.globals["escapeHtml"] = lambda value: html.escape(str(value or "")) +app.mount("/static", StaticFiles(directory=os.path.join(os.path.dirname(__file__), "static")), name="static") + +_client: Optional[DocsAPIClient] = None + + +def get_client() -> DocsAPIClient: + global _client + if _client is None: + _client = DocsAPIClient( + os.environ.get("DOCS_API_URL", "http://docs-api:8787"), + os.environ.get("WEBUI_API_KEY"), + ) + return _client + + +@app.on_event("shutdown") +async def shutdown() -> None: + if _client is not None: + await _client.close() + + +def page(title: str, body: str) -> HTMLResponse: + return HTMLResponse( + f""" +{html.escape(title)} +{body}""" + ) + + +@app.get("/") +async def dashboard(request: Request): + client = get_client() + health = await client.health() + + try: + collections_data = await client.get("/collections") + total_vectors = sum( + 
item.get("vectors", 0) + for item in collections_data.get("collections", {}).values() + if isinstance(item, dict) + ) + except Exception: + total_vectors = 0 + + try: + libs_data = await client.get("/libraries") + libraries = libs_data.get("libraries", []) + except Exception: + libraries = [] + + return templates.TemplateResponse( + "dashboard.html", + {"request": request, "health": health, "vectors": total_vectors, "libraries": libraries}, + ) + + +@app.post("/actions/ingest-all") +async def ingest_all(): + client = get_client() + try: + result = await client.post("/ingest/all") + body = f"

Ingestion Complete

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Ingestion Failed

{html.escape(str(e))}
Back" + return page("Ingestion", body) + + +@app.post("/actions/sync-sources") +async def sync_sources_action(): + client = get_client() + try: + result = await client.post("/sources/sync", json={"override": False}) + body = f"

Git Sync Complete

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Git Sync Failed

{html.escape(str(e))}
Back" + return page("Git Sync", body) + + +@app.get("/libraries") +async def libraries(request: Request): + client = get_client() + try: + data = await client.get("/libraries") + libraries_data = data.get("libraries", []) + except Exception: + libraries_data = [] + return templates.TemplateResponse("libraries.html", {"request": request, "data": libraries_data}) + + +@app.post("/libraries/create") +async def create_library( + library_id: str = Form(...), + name: str = Form(...), + description: Optional[str] = Form(None), +): + client = get_client() + try: + result = await client.post( + f"/api/v1/libraries/{library_id.strip()}", + data={"name": name, "description": description or ""}, + ) + body = f"

Library Created

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Create Failed

{html.escape(str(e))}
Back" + return page("Library Created", body) + + +@app.post("/libraries/{library_id}/ingest") +async def ingest_library(library_id: str): + client = get_client() + try: + result = await client.post(f"/ingest/{library_id}") + body = f"

Ingestion Complete

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Ingestion Failed

{html.escape(str(e))}
Back" + return page("Ingest Library", body) + + +@app.post("/libraries/{library_id}/delete") +async def delete_library(library_id: str): + client = get_client() + try: + result = await client.delete(f"/api/v1/libraries/{library_id}") + body = f"

Library Deleted

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Delete Failed

{html.escape(str(e))}
Back" + return page("Delete Library", body) + + +@app.get("/libraries/{library_id}/docs") +async def view_library_docs(library_id: str): + client = get_client() + try: + result = await client.get(f"/docs/{library_id}") + content = result.get("content", "") + except Exception as e: + content = str(e) + return page( + f"Docs: {library_id}", + f"

{html.escape(library_id)}

{html.escape(content)}
Back", + ) + + +@app.get("/upload") +async def upload_form(request: Request): + client = get_client() + try: + libs_data = await client.get("/libraries") + libraries = libs_data.get("libraries", []) + except Exception: + libraries = [] + return templates.TemplateResponse("upload.html", {"request": request, "libraries": libraries}) + + +@app.post("/upload") +async def upload_file( + request: Request, + library_id: str = Form(""), + ingest_after_upload: Optional[str] = Form(None), + files: List[UploadFile] = File(...), +): + client = get_client() + results = [] + total_size = 0 + + for upload in files: + filename = upload.filename or "upload.txt" + target_library = library_id.strip() + if not target_library: + target_library = Path(filename).stem.lower().replace(" ", "-") or "uploaded" + + try: + contents = await upload.read() + total_size += len(contents) + result = await client.upload_file(target_library, filename, contents) + results.append({"filename": filename, "status": "success", "message": result}) + except Exception as e: + results.append({"filename": filename, "status": "error", "message": str(e)}) + + if ingest_after_upload == "on": + for result in list(results): + if result["status"] != "success": + continue + target_library = result["message"]["library_id"] + try: + ingest_result = await client.post(f"/ingest/{target_library}") + results.append({"filename": "__INGEST__", "status": "success", "message": ingest_result}) + except Exception as e: + results.append({"filename": "__INGEST__", "status": "error", "message": str(e)}) + + return templates.TemplateResponse( + "upload.html", + {"request": request, "libraries": [], "results": results, "total_size_bytes": total_size}, + ) + + +@app.get("/search") +async def search_form(request: Request): + return templates.TemplateResponse("search.html", {"request": request, "query": "", "results": []}) + + +@app.get("/search/results") +async def search_results(request: Request, q: str = "", limit: int = 10): + client 
= get_client() + results = [] + if q: + try: + data = await client.post("/search", json={"query": q, "library_id": None, "limit": limit}) + results = data.get("results", []) + except Exception: + results = [] + return templates.TemplateResponse( + "search.html", + {"request": request, "query": q, "results": results, "limit": limit}, + ) + + +@app.get("/sources") +async def sources_page(request: Request): + client = get_client() + try: + data = await client.get("/api/v1/sources") + sources = data.get("sources", []) + except Exception: + sources = [] + return templates.TemplateResponse("sources.html", {"request": request, "sources": sources}) + + +@app.post("/sources/sync") +async def sync_sources(override: bool = Form(False)): + client = get_client() + try: + result = await client.post("/sources/sync", json={"override": override}) + body = f"

Git Sync Complete

{html.escape(str(result))}
Back" + except Exception as e: + body = f"

Git Sync Failed

{html.escape(str(e))}
Back" + return page("Git Sync", body) diff --git a/webui/app/static/app.js b/webui/app/static/app.js new file mode 100644 index 0000000..05ecdbb --- /dev/null +++ b/webui/app/static/app.js @@ -0,0 +1,159 @@ +// WebUI Static JavaScript Utilities +// Simple helper functions shared across templates + +/** + * Escape HTML to prevent XSS attacks when displaying user content + */ +function escapeHtml(text) { + if (typeof text !== 'string') return ""; + var e = document.createElement('div'); + try { + e.textContent = text; + return e.innerHTML; + } catch (err) { + return String(text).replace(/[&<>"']/g, function(m) { + switch (m) { + case '&': return '&'; + case '<': return '<'; + case '>': return '>'; + case '"': return '"'; + case "'": return '''; + default: return m; + } + }); + } +} + +/** + * Format number with thousands separators + */ +function formatNumber(num) { + if (num === null || num === undefined) return "N/A"; + return new Intl.NumberFormat().format(Math.floor(num)); +} + +/** + * Show loading spinner + */ +function showLoading(elementId) { + var el = document.getElementById(elementId); + if (el) { + el.innerHTML = '
Loading...
'; + } +} + +/** + * Hide loading spinner + */ +function hideLoading(elementId) { + var el = document.getElementById(elementId); + if (el) { + el.innerHTML = ""; + } +} + +/** + * Create a toast notification + */ +function showToast(message, type) { + var toast = document.createElement('div'); + toast.className = 'toast ' + (type || 'info'); + toast.textContent = message; + toast.style.cssText = 'position:fixed;bottom:20px;right:20px;' + + 'padding:12px 20px;border-radius:4px;margin-bottom:10px;' + + 'background:#333;color:white;font-size:0.9rem;z-index:1000'; + document.body.appendChild(toast); + + setTimeout(function() { + toast.style.opacity = '0'; + setTimeout(function() { toast.remove(); }, 200); + }, 3000); +} + +/** + * Show error notification + */ +function showError(message) { + showToast("Error: " + message, "error"); +} + +/** + * Show success notification + */ +function showSuccess(message) { + showToast("Success: " + message, "success"); +} + +/** + * Make an API request with error handling + */ +async function apiRequest(endpoint, method = 'GET', data = null) { + const config = window.webuiConfig; + let url = config.apiUrl; + + if (!url.endsWith('/')) url += '/'; + url += endpoint; + + const headers = {}; + if (config.apiKey) { + headers['X-API-Key'] = config.apiKey; + } + + try { + let response; + if (method === 'POST') { + response = await fetch(url, { + method: method, + headers: headers, + body: JSON.stringify(data) + }); + } else { + response = await fetch(url, { + method: method, + headers: headers + }); + } + + if (!response.ok) { + throw new Error(response.statusText); + } + + const contentType = response.headers.get('content-type'); + if (contentType && contentType.includes('application/json')) { + return await response.json(); + } else { + return await response.text(); + } + } catch (err) { + console.error('API request failed:', err); + throw err; + } +} + +/** + * Initialize tooltips if using them + */ +function initTooltips() { + // Add 
tooltip functionality here if needed +} + +/** + * Debounce function for input handling + */ +function debounce(func, wait) { + var timeout; + return function executedFunction(...args) { + var later = function() { + clearTimeout(timeout); + func.apply(this, args); + }; + timeout = setTimeout(later, wait); + }; +} + +// Export to window for use in templates +window.escapeHtml = escapeHtml; +window.formatNumber = formatNumber; +window.showToast = showToast; +window.showError = showError; +window.showSuccess = showSuccess; diff --git a/webui/app/static/style.css b/webui/app/static/style.css new file mode 100644 index 0000000..d40e001 --- /dev/null +++ b/webui/app/static/style.css @@ -0,0 +1,395 @@ +.container { + max-width: 1000px; + margin: 0 auto; + padding: 20px; +} + +header { + border-bottom: 1px solid #ccc; + padding-bottom: 15px; + margin-bottom: 20px; +} + +header h1 { + margin: 0 0 10px 0; + font-size: 1.5rem; +} + +nav { + display: flex; + gap: 15px; +} + +nav a { + text-decoration: none; + color: #0066cc; + font-size: 0.9rem; +} + +nav a.active { + font-weight: bold; + text-decoration: underline; +} + +main h2 { + margin-bottom: 15px; +} + +footer { + margin-top: 40px; + padding-top: 15px; + border-top: 1px solid #ccc; + font-size: 0.8rem; + color: #666; +} + +.status-card { + background: #f5f5f5; + padding: 20px; + border-radius: 8px; + border-left: 4px solid #00c467; + margin-bottom: 15px; +} + +.status-message { + background: #e8f4fd; + padding: 10px; + border-radius: 4px; + margin: 5px 0; +} + +pre.code-block { + background: #f5f5f5; + padding: 15px; + border-radius: 4px; + overflow-x: auto; + white-space: pre-wrap; + word-break: break-word; +} + +/* Tables */ +.library-table { + width: 100%; + border-collapse: collapse; + margin-top: 10px; +} + +.library-table th, .library-table td { + padding: 10px; + text-align: left; + border-bottom: 1px solid #ddd; +} + +.library-table th { + background: #f5f5f5; + font-weight: bold; +} + +/* Forms */ +form 
input[type="text"], form textarea, form select { + padding: 8px; + border: 1px solid #ccc; + border-radius: 4px; + margin-right: 10px; + margin-bottom: 10px; +} + +button { + background: #0066cc; + color: white; + border: none; + padding: 10px 20px; + border-radius: 4px; + cursor: pointer; +} + +button:hover { + background: #0055aa; +} + +/* Upload form */ +.upload-form, .search-form, .sync-form { + max-width: 600px; +} + +/* Search results */ +.results-count { + background: #e8f4fd; + padding: 10px; + border-radius: 4px; + margin-bottom: 15px; +} + +.result-card { + background: #fff; + border: 1px solid #ddd; + padding: 15px; + margin: 10px 0; + border-radius: 4px; +} + +.result-card h3 { + margin: 0 0 8px 0; +} + +/* Results box */ +.results-box { + max-height: 600px; + overflow-y: auto; +} + +.results-box .new-search-link { + display: block; + text-align: center; + margin-top: 15px; +} + +/* Source cards */ +.source-cards { + display: grid; + gap: 10px; +} + +.source-card { + background: #f5f5f5; + padding: 15px; + border-radius: 4px; + border-left: 4px solid #666; +} + +.status-message code { + background: #333; + color: #fff; + padding: 2px 6px; + border-radius: 3px; +} + +.hint { + color: #666; + font-size: 0.85rem; + margin-top: 15px; +} + +.results-box .error { + color: #cc0000; + font-weight: bold; +} + +.source-list, .source-cards, pre { + white-space: normal; +} + +/* Status cards grid */ +.status-cards { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 15px; + margin-bottom: 20px; +} + +.status-card h3 { + margin: 0 0 8px 0; + font-size: 0.9rem; + color: #555; +} + +.status-card p { + margin: 0; + font-size: 1.2rem; + font-weight: bold; +} + +/* Message box */ +.message-box { + background: #e8f4fd; + padding: 12px; + border-radius: 6px; + margin-bottom: 20px; + border-left: 4px solid #3b82f6; +} + +/* Action buttons */ +.action-buttons { + display: flex; + gap: 15px; + margin-bottom: 20px; +} + +.btn { + padding: 
10px 20px; + border: none; + border-radius: 4px; + cursor: pointer; + text-decoration: none; + display: inline-block; + font-size: 0.9rem; +} + +.btn-primary { + background: #00c467; + color: white; +} + +.btn-primary:hover { + background: #00a855; +} + +.btn-secondary { + background: #2563eb; + color: white; +} + +.btn-secondary:hover { + background: #1d4ed8; +} + +/* Links section */ +.links-section h2 { + font-size: 1rem; + margin-bottom: 10px; +} + +.links-section a { + color: #0066cc; + text-decoration: none; + padding: 5px 10px; +} + +.links-section a:hover { + text-decoration: underline; +} + +/* Create library form */ +.create-form { + background: #f9f9f9; + padding: 15px; + border-radius: 6px; + margin-bottom: 20px; + border-left: 4px solid #00c467; +} + +.create-form label { + display: block; + margin-bottom: 8px; + font-weight: bold; + color: #333; +} + +.create-form input[type="text"] { + width: 100%; + padding: 8px; + margin-bottom: 12px; + border: 1px solid #ccc; + border-radius: 4px; + box-sizing: border-box; +} + +/* Table actions column */ +.actions { + white-space: nowrap; +} + +/* Button sizes */ +.btn-sm { + padding: 5px 12px; + font-size: 0.8rem; +} + +/* Additional action button colors */ +.btn-info { + background: #17a2b8; + color: white; +} + +.btn-info:hover { + background: #138496; +} + +.btn-warning { + background: #ffc107; + color: black; +} + +.btn-warning:hover { + background: #ffa000; +} + +.btn-danger { + background: #dc3545; + color: white; +} + +.btn-danger:hover { + background: #c82333; +} + +.btn-primary { + background: #007bff; + color: white; +} + +.btn-primary:hover { + background: #0056b3; +} + +/* Highlight row for popular libraries */ +tr.highlight { + background: #f0fdf4; +} + +/* Upload form specific styles */ +#library_id, #files { + width: 100%; + padding: 8px; + border: 1px solid #ccc; + border-radius: 4px; + margin-bottom: 12px; + box-sizing: border-box; +} + +#files { + font-family: sans-serif; +} + +/* Results box 
for upload */ +.result-box { + background: #fff; + border: 1px solid #ddd; + border-radius: 4px; + padding: 10px; + margin-top: 20px; + min-height: 100px; +} + +.result-box.error { + border-color: #dc3545; + background: #fff5f5; +} + +/* Result items */ +.result-item { + padding: 6px; + margin: 4px 0; + border-radius: 3px; + font-family: monospace; + font-size: 0.85rem; + word-break: break-word; +} + +.result-item.success { + background: #d4edda; + border-left: 3px solid #28a745; + color: #155724; +} + +.result-item.error { + background: #f8d7da; + border-left: 3px solid #dc3545; + color: #721c24; +} + +.result-item.info { + background: #d1ecf1; + border-left: 3px solid #17a2b8; + color: #0c5460; +} diff --git a/webui/app/templates/base.html b/webui/app/templates/base.html new file mode 100644 index 0000000..9fe2460 --- /dev/null +++ b/webui/app/templates/base.html @@ -0,0 +1,32 @@ + + + + + + {% block title %}Context7 Docs{% endblock %} + + + +
+
+

Context7 Docs UI

+ +
+ +
+ {% block content %}{% endblock %} +
+ +
Context7 Docs WebUI
+
+ + + {% block scripts %}{% endblock %} + + \ No newline at end of file diff --git a/webui/app/templates/dashboard.html b/webui/app/templates/dashboard.html new file mode 100644 index 0000000..ad7d0eb --- /dev/null +++ b/webui/app/templates/dashboard.html @@ -0,0 +1,83 @@ +{% extends "base.html" %} + +{% block title %}Dashboard - Context7 Docs{% endblock %} + +{% block content %} +

Dashboard

+ + +
+
+

Docs API Service

+ {% if health.status and health.status == 'ok' %} +

Status: Online ✓

+ {% else %} +

Status: {% if health.status == 'error' %}Error{% else %}Offline{% endif %}

+ {% endif %} +
+ +
+

Vectors Stored

+

{{ vectors|default(0) }}

+
+ +
+

Libraries Registered

+

{{ libraries|length }}

+
+
+ + +{% if libraries and libraries|length > 0 %} +
+ Libraries: {{ escapeHtml(libraries) }} +
+{% endif %} + + +
+
+ +
+ +
+ + +
+
+ + + + + + +{% endblock %} \ No newline at end of file diff --git a/webui/app/templates/libraries.html b/webui/app/templates/libraries.html new file mode 100644 index 0000000..d96b207 --- /dev/null +++ b/webui/app/templates/libraries.html @@ -0,0 +1,74 @@ +{% extends "base.html" %} + +{% block title %}Libraries - Context7 Docs{% endblock %} + +{% block content %} +

Libraries

+ + +
+
+ + + + + + + + + + +
+
+ +
+ + + + + + + + + + + + + + + {% if data|length > 0 %} + {% for lib in data %} + + + + + + + + + {% endfor %} + {% else %} + + + + {% endif %} + +
IDNameDescriptionSource PathUpdated AtActions
{{ escapeHtml(lib.id) }}{{ escapeHtml(lib.name) }}{{ escapeHtml(lib.description) or '-' }}{{ escapeHtml(lib.source_path) or '-' }}{{ lib.updated_at|default('N/A') }} + View Docs | +
+ +
| +
+ +
+
No libraries found. Create one above.
+ +{% if data and data[0] and data[0].get('content') %} + +
{# `data` is the list of libraries, so the chunks are read from its first entry (the same object the surrounding guard checks); a list has no .get(). #}{% for chunk in data[0].get('content', []) %}{% if chunk|length > 0 %}{{ chunk.text | default(chunk.content) | default(chunk) }}{% endif %}{% endfor %}
+← Back to Libraries +{% endif %} + +{% endblock %} \ No newline at end of file diff --git a/webui/app/templates/search.html b/webui/app/templates/search.html new file mode 100644 index 0000000..dc57230 --- /dev/null +++ b/webui/app/templates/search.html @@ -0,0 +1,71 @@ +{% extends "base.html" %} + +{% block title %}Search - Context7 Docs{% endblock %} + +{% block content %} +

Search Documentation

+ +
+ + + + + + + +
+ +
+ +{% if results %} +
{{ results|length }} results found
+{% endif %} + + +{% endblock %} \ No newline at end of file diff --git a/webui/app/templates/sources.html b/webui/app/templates/sources.html new file mode 100644 index 0000000..9af0a3d --- /dev/null +++ b/webui/app/templates/sources.html @@ -0,0 +1,34 @@ +{% extends "base.html" %} + +{% block title %}Sources - Context7 Docs{% endblock %} + +{% block content %} +

Git Repository Sync

+ +
Syncs all git repositories configured in docs_sources.yaml.
+ +
+ + + +
+ +
+ +{% if sources %} +

Configured Sources

+
+ {% for src in sources %} +
+ {{ src.library_id | default('unknown') }}
+ URL: {# parenthesised so the slice applies to the filtered value; a subscript cannot directly follow a filter #}{{ (src.repo_url | default('N/A'))[:60] }}
+ Branch: {{ src.branch | default('main') }}
+ Include: {{ (src.include_paths | default(['*']) | join(', ')) }} +
+ {% endfor %} +
+{% else %} +

No git sources configured. Add repositories to docs_sources.yaml.

+{% endif %} + +{% endblock %} diff --git a/webui/app/templates/upload.html b/webui/app/templates/upload.html new file mode 100644 index 0000000..85d1cfb --- /dev/null +++ b/webui/app/templates/upload.html @@ -0,0 +1,48 @@ +{% extends "base.html" %} + +{% block title %}Upload - Context7 Docs{% endblock %} + +{% block content %} +

Upload Documentation Files

+ +
+ + + + + + + + + +
+ +
+ + +
+ + +

Allowed: .md, .txt, .py, .js, .ts, .json, .yaml, .yml, .html, .css, .pdf (max 5MB each)

+ + +
+ +{% if results %} +

Upload Results

+ +{% endif %} + +{% endblock %} diff --git a/webui/requirements.txt b/webui/requirements.txt new file mode 100644 index 0000000..368d7f6 --- /dev/null +++ b/webui/requirements.txt @@ -0,0 +1,7 @@ +# WebUI Dependencies +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +pydantic==2.5.3 +python-multipart==0.0.6 +httpx==0.26.0 +PyYAML==6.0.1 \ No newline at end of file