Files
george ff4da0cb9e Fix git sync and add repo browser with path selection
- Fix discover_files: rel_path always computed (was stuck at '.' at root),
  include_path_match now uses relative path, 'return' changed to 'continue'
- Fix ingest_git_source: files were cloned but ingested from wrong path
  (docs/repo-id instead of data/repos/repo-id). Now stages filtered files
  into DOCS_PATH/library_id before calling ingest_library.
- Add browse_repo_tree() for interactive repo exploration
- Add POST /api/v1/sources/browse endpoint to backend
- Add /sources/browse proxy route to webui
- Rewrite sources.html: browse repo, expand/collapse tree, check paths to
  include, then save source and sync

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-06 01:28:10 +01:00

488 lines
15 KiB
Python

"""Context7 Docs API."""
import asyncio
import io
import os
import shutil
import zipfile
import yaml
from pathlib import Path
from posixpath import normpath
from typing import Optional
from fastapi import FastAPI, File, Form, HTTPException, Query, Request, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from .config import settings
from .db import (
clear_library_documents,
delete_library,
init_db,
list_libraries,
search_libraries,
upsert_library,
)
from .git_source import browse_repo_tree, clone_or_update_repo, ingest_git_source
from .ingest import ingest_all, ingest_library
from .search import get_library_docs, resolve_library_id, search_docs
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
app = FastAPI(
title="Context7 Docs API",
description="Document ingestion and semantic search API for local-context7",
version="1.0.0",
)
class SearchRequest(BaseModel):
query: str = Field(..., min_length=1)
library_id: Optional[str] = None
limit: int = Field(10, ge=1, le=50)
class SyncSourcesRequest(BaseModel):
override: bool = False
class GitSourceRequest(BaseModel):
library_id: str = Field(..., min_length=1)
repo_url: str = Field(..., min_length=1)
name: Optional[str] = None
description: Optional[str] = None
branch: str = "main"
include_paths: Optional[list[str]] = None
exclude_paths: Optional[list[str]] = None
class BrowseRepoRequest(BaseModel):
repo_url: str = Field(..., min_length=1)
branch: str = "main"
DOCUMENT_EXTENSIONS = {
".md",
".txt",
".py",
".js",
".ts",
".json",
".yaml",
".yml",
".html",
".css",
".pdf",
}
ALLOWED_EXTENSIONS = DOCUMENT_EXTENSIONS | {".zip"}
MAX_FILE_UPLOAD_BYTES = 5 * 1024 * 1024
MAX_ZIP_UPLOAD_BYTES = 100 * 1024 * 1024
MAX_ZIP_EXTRACTED_BYTES = 250 * 1024 * 1024
MAX_ZIP_FILES = 2000
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
"""Require X-API-Key for mutating endpoints when API_KEY_DOCS_API is set."""
if not settings.is_auth_enabled:
return await call_next(request)
public_prefixes = ("/health", "/libraries", "/docs/")
if request.method == "GET" and request.url.path.startswith(public_prefixes):
return await call_next(request)
if request.headers.get("X-API-Key") != settings.api_key_docs_api:
return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})
return await call_next(request)
@app.on_event("startup")
async def startup() -> None:
init_result = init_db()
if not init_result.get("success"):
raise RuntimeError(f"Failed to initialize SQLite database: {init_result.get('error')}")
last_error = None
for _ in range(20):
collection_result = await ensure_collection()
if collection_result.get("success"):
return
last_error = collection_result.get("error")
await asyncio.sleep(1)
raise RuntimeError(f"Failed to initialize Qdrant collection: {last_error}")
def safe_library_id(library_id: str) -> str:
"""Normalize user-provided library IDs to a single path segment."""
base = Path(library_id).name.strip()
if not base or base in {".", ".."} or ".." in library_id or "/" in library_id or "\\" in library_id:
raise HTTPException(status_code=400, detail="Invalid library ID")
return base
def safe_upload_filename(filename: str) -> str:
ext = Path(filename).suffix.lower()
if ext not in ALLOWED_EXTENSIONS:
raise HTTPException(
status_code=400,
detail=f"Unsafe extension: {ext}. Allowed extensions: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
)
stem = "".join(c for c in Path(filename).stem if c.isalnum() or c in "-_ ").strip()
if not stem:
raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")
return f"{stem}{ext}"
def safe_archive_member_path(member_name: str) -> Optional[Path]:
normalized = normpath(member_name.replace("\\", "/")).lstrip("/")
if not normalized or normalized == ".":
return None
path = Path(normalized)
if path.is_absolute() or ".." in path.parts or path.suffix.lower() not in DOCUMENT_EXTENSIONS:
return None
safe_parts = []
for part in path.parts:
cleaned = "".join(c for c in part if c.isalnum() or c in "-_ .").strip()
if not cleaned:
return None
safe_parts.append(cleaned)
return Path(*safe_parts)
def assert_inside_directory(root: Path, target: Path) -> None:
root_resolved = root.resolve()
target_resolved = target.resolve()
if os.path.commonpath([str(root_resolved), str(target_resolved)]) != str(root_resolved):
raise HTTPException(status_code=400, detail="Archive contains unsafe paths")
def extract_zip_upload(contents: bytes, lib_dir: Path) -> dict:
if len(contents) > MAX_ZIP_UPLOAD_BYTES:
raise HTTPException(status_code=400, detail="ZIP too large (max 100MB)")
extracted = []
skipped = []
total_uncompressed = 0
try:
archive = zipfile.ZipFile(io.BytesIO(contents))
except zipfile.BadZipFile:
raise HTTPException(status_code=400, detail="Invalid ZIP archive")
with archive:
files = [info for info in archive.infolist() if not info.is_dir()]
if len(files) > MAX_ZIP_FILES:
raise HTTPException(status_code=400, detail=f"ZIP contains too many files (max {MAX_ZIP_FILES})")
for info in files:
relative_path = safe_archive_member_path(info.filename)
if relative_path is None:
skipped.append(info.filename)
continue
total_uncompressed += info.file_size
if total_uncompressed > MAX_ZIP_EXTRACTED_BYTES:
raise HTTPException(status_code=400, detail="ZIP extracted content too large (max 250MB)")
target = lib_dir / relative_path
assert_inside_directory(lib_dir, target)
target.parent.mkdir(parents=True, exist_ok=True)
with archive.open(info) as src:
target.write_bytes(src.read())
extracted.append(str(target.relative_to(lib_dir)))
return {"extracted": extracted, "skipped": skipped, "size_bytes": len(contents)}
def docs_root() -> Path:
return Path(settings.docs_path)
def sources_config_path() -> Path:
return Path(__file__).resolve().parents[2] / "docs_sources.yaml"
def clean_source_paths(paths: Optional[list[str]]) -> list[str]:
cleaned = []
for raw_path in paths or []:
path = raw_path.strip().strip("/")
if not path or path == "." or ".." in Path(path).parts or Path(path).is_absolute():
continue
cleaned.append(path)
return cleaned
def load_sources_config() -> dict:
path = sources_config_path()
if not path.exists():
return {"sources": []}
with path.open() as f:
data = yaml.safe_load(f) or {}
if isinstance(data, list):
return {"sources": data}
if not isinstance(data, dict):
return {"sources": []}
sources = data.get("sources", [])
data["sources"] = sources if isinstance(sources, list) else []
return data
def save_sources_config(data: dict) -> None:
path = sources_config_path()
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w") as f:
yaml.safe_dump(data, f, sort_keys=False)
@app.get("/health")
async def health_check():
return {"status": "ok", "service": "docs-api"}
@app.get("/collections")
async def collections():
try:
client = get_client()
info = client.get_collection(get_collection_name())
vectors = getattr(info, "vectors_count", None) or getattr(info, "points_count", 0) or 0
return {"collections": {get_collection_name(): {"vectors": vectors}}}
except Exception as e:
return {"collections": {}, "warning": str(e)}
@app.get("/libraries")
async def list_libraries_api():
libs = list_libraries()
if isinstance(libs, dict) and not libs.get("success", True):
raise HTTPException(status_code=500, detail=libs.get("error", "Failed to list libraries"))
return {"libraries": libs, "count": len(libs)}
@app.get("/libraries/search")
async def search_libraries_api(q: str = Query(..., min_length=1)):
matches = resolve_library_id(q)
return {"matches": matches, "count": len(matches)}
@app.post("/search")
async def search_docs_api(payload: SearchRequest):
results = search_docs(payload.query, library_id=payload.library_id, limit=payload.limit)
return {
"query": payload.query,
"library_id": payload.library_id,
"results": results,
"count": len(results),
}
@app.get("/docs/{library_id}")
@app.get("/libraries/{library_id}/docs")
async def get_library_docs_api(
library_id: str,
topic: Optional[str] = Query(None),
tokens: int = Query(8000, ge=1),
):
docs = get_library_docs(library_id=library_id, topic=topic, token_limit=tokens)
return {"library_id": library_id, "content": docs}
@app.post("/ingest/all")
async def ingest_all_api():
return await ingest_all()
@app.post("/ingest/{library_id}")
async def ingest_library_api(library_id: str):
library_id = safe_library_id(library_id)
source_path = library_id
return await ingest_library(library_id=library_id, name=library_id, source_path=source_path)
@app.post("/api/v1/libraries/{library_id}")
async def api_create_library(
library_id: str,
name: Optional[str] = Form(None),
description: Optional[str] = Form(None),
):
library_id = safe_library_id(library_id)
lib_dir = docs_root() / library_id
lib_dir.mkdir(parents=True, exist_ok=True)
result = upsert_library(library_id, name or library_id, description, library_id)
if not result.get("success"):
raise HTTPException(status_code=500, detail=result.get("error", "Failed to create library"))
return {
"success": True,
"created": not result.get("exists", False),
"library_id": library_id,
"name": name or library_id,
"description": description,
"path": str(lib_dir),
}
@app.delete("/api/v1/libraries/{library_id}")
async def api_delete_library(library_id: str):
library_id = safe_library_id(library_id)
lib_dir = docs_root() / library_id
deleted_files = 0
if lib_dir.exists():
for path in lib_dir.rglob("*"):
if path.is_file():
deleted_files += 1
shutil.rmtree(lib_dir)
docs_result = clear_library_documents(library_id)
vectors_result = await delete_library_vectors(library_id)
library_result = delete_library(library_id)
failures = [
r.get("error")
for r in (docs_result, vectors_result, library_result)
if isinstance(r, dict) and not r.get("success", True)
]
if failures:
raise HTTPException(status_code=500, detail="; ".join(failures))
return {"success": True, "library_id": library_id, "deleted_files": deleted_files}
@app.post("/api/v1/upload/{library_id}")
async def api_upload(library_id: str, file: UploadFile = File(...)):
library_id = safe_library_id(library_id)
safe_name = safe_upload_filename(file.filename or "upload.txt")
lib_dir = docs_root() / library_id
lib_dir.mkdir(parents=True, exist_ok=True)
contents = await file.read()
if safe_name.lower().endswith(".zip"):
result = extract_zip_upload(contents, lib_dir)
upsert_library(library_id, library_id, None, library_id)
return {
"success": True,
"library_id": library_id,
"filename": safe_name,
"archive": True,
**result,
}
if len(contents) > MAX_FILE_UPLOAD_BYTES:
raise HTTPException(status_code=400, detail="File too large (max 5MB)")
target = lib_dir / safe_name
target.write_bytes(contents)
upsert_library(library_id, library_id, None, library_id)
return {
"success": True,
"library_id": library_id,
"filename": safe_name,
"path": str(target.relative_to(docs_root())),
"size_bytes": len(contents),
}
@app.get("/api/v1/sources")
@app.get("/sources/config")
async def api_list_sources():
data = load_sources_config()
sources = data["sources"]
return {"success": True, "sources": sources, "count": len(sources)}
@app.post("/api/v1/sources")
async def api_add_source(source: GitSourceRequest):
library_id = safe_library_id(source.library_id)
branch = source.branch.strip() or "main"
include_paths = clean_source_paths(source.include_paths)
exclude_paths = clean_source_paths(source.exclude_paths) or ["node_modules", ".git"]
source_entry = {
"library_id": library_id,
"name": (source.name or library_id).strip(),
"description": (source.description or "").strip(),
"repo_url": source.repo_url.strip(),
"branch": branch,
"include_paths": include_paths or ["docs"],
"exclude_paths": exclude_paths,
}
data = load_sources_config()
sources = data["sources"]
existing_index = next(
(index for index, item in enumerate(sources) if item.get("library_id") == library_id),
None,
)
if existing_index is None:
sources.append(source_entry)
created = True
else:
sources[existing_index] = source_entry
created = False
save_sources_config(data)
return {"success": True, "created": created, "source": source_entry}
@app.post("/api/v1/sources/browse")
async def browse_repo_api(payload: BrowseRepoRequest):
"""Shallow-clone a repo and return its directory tree for path selection."""
import hashlib
repo_hash = hashlib.md5(payload.repo_url.encode()).hexdigest()[:10]
repo_id = f"browse-{repo_hash}"
try:
clone_result = clone_or_update_repo(
repo_id=repo_id,
repo_url=payload.repo_url,
branch=payload.branch,
)
repo_path = Path(clone_result["repo_path"])
tree = browse_repo_tree(repo_path)
return {"success": True, "tree": tree, "repo_url": payload.repo_url, "branch": payload.branch}
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc))
@app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
source_data = await api_list_sources()
sources = source_data["sources"]
override = payload.override if payload else False
results = []
for source in sources:
library_id = source.get("library_id", "unknown")
try:
result = await ingest_git_source(
library_id=library_id,
name=source.get("name") or library_id,
description=source.get("description"),
repo_url=source["repo_url"],
branch=source.get("branch", "main"),
include_paths=source.get("include_paths"),
exclude_paths=source.get("exclude_paths"),
)
except Exception as exc:
result = {
"success": False,
"library_id": library_id,
"repo_url": source.get("repo_url"),
"error": str(exc),
}
results.append(result)
successful = len([r for r in results if r.get("success")])
return {
"success": successful == len(results),
"total_sources": len(results),
"successful": successful,
"failed": len(results) - successful,
"results": results,
}