ff4da0cb9e
- Fix discover_files: rel_path always computed (was stuck at '.' at root), include_path_match now uses relative path, 'return' changed to 'continue' - Fix ingest_git_source: files were cloned but ingested from wrong path (docs/repo-id instead of data/repos/repo-id). Now stages filtered files into DOCS_PATH/library_id before calling ingest_library. - Add browse_repo_tree() for interactive repo exploration - Add POST /api/v1/sources/browse endpoint to backend - Add /sources/browse proxy route to webui - Rewrite sources.html: browse repo, expand/collapse tree, check paths to include, then save source and sync Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
488 lines
15 KiB
Python
488 lines
15 KiB
Python
"""Context7 Docs API."""
|
|
import asyncio
|
|
import io
|
|
import os
|
|
import shutil
|
|
import zipfile
|
|
import yaml
|
|
from pathlib import Path
|
|
from posixpath import normpath
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, File, Form, HTTPException, Query, Request, UploadFile
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
|
|
from .config import settings
|
|
from .db import (
|
|
clear_library_documents,
|
|
delete_library,
|
|
init_db,
|
|
list_libraries,
|
|
search_libraries,
|
|
upsert_library,
|
|
)
|
|
from .git_source import browse_repo_tree, clone_or_update_repo, ingest_git_source
|
|
from .ingest import ingest_all, ingest_library
|
|
from .search import get_library_docs, resolve_library_id, search_docs
|
|
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
|
|
|
|
|
|
app = FastAPI(
|
|
title="Context7 Docs API",
|
|
description="Document ingestion and semantic search API for local-context7",
|
|
version="1.0.0",
|
|
)
|
|
|
|
|
|
class SearchRequest(BaseModel):
|
|
query: str = Field(..., min_length=1)
|
|
library_id: Optional[str] = None
|
|
limit: int = Field(10, ge=1, le=50)
|
|
|
|
|
|
class SyncSourcesRequest(BaseModel):
|
|
override: bool = False
|
|
|
|
|
|
class GitSourceRequest(BaseModel):
|
|
library_id: str = Field(..., min_length=1)
|
|
repo_url: str = Field(..., min_length=1)
|
|
name: Optional[str] = None
|
|
description: Optional[str] = None
|
|
branch: str = "main"
|
|
include_paths: Optional[list[str]] = None
|
|
exclude_paths: Optional[list[str]] = None
|
|
|
|
|
|
class BrowseRepoRequest(BaseModel):
|
|
repo_url: str = Field(..., min_length=1)
|
|
branch: str = "main"
|
|
|
|
|
|
DOCUMENT_EXTENSIONS = {
|
|
".md",
|
|
".txt",
|
|
".py",
|
|
".js",
|
|
".ts",
|
|
".json",
|
|
".yaml",
|
|
".yml",
|
|
".html",
|
|
".css",
|
|
".pdf",
|
|
}
|
|
ALLOWED_EXTENSIONS = DOCUMENT_EXTENSIONS | {".zip"}
|
|
MAX_FILE_UPLOAD_BYTES = 5 * 1024 * 1024
|
|
MAX_ZIP_UPLOAD_BYTES = 100 * 1024 * 1024
|
|
MAX_ZIP_EXTRACTED_BYTES = 250 * 1024 * 1024
|
|
MAX_ZIP_FILES = 2000
|
|
|
|
|
|
@app.middleware("http")
|
|
async def auth_middleware(request: Request, call_next):
|
|
"""Require X-API-Key for mutating endpoints when API_KEY_DOCS_API is set."""
|
|
if not settings.is_auth_enabled:
|
|
return await call_next(request)
|
|
|
|
public_prefixes = ("/health", "/libraries", "/docs/")
|
|
if request.method == "GET" and request.url.path.startswith(public_prefixes):
|
|
return await call_next(request)
|
|
|
|
if request.headers.get("X-API-Key") != settings.api_key_docs_api:
|
|
return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})
|
|
|
|
return await call_next(request)
|
|
|
|
|
|
@app.on_event("startup")
|
|
async def startup() -> None:
|
|
init_result = init_db()
|
|
if not init_result.get("success"):
|
|
raise RuntimeError(f"Failed to initialize SQLite database: {init_result.get('error')}")
|
|
|
|
last_error = None
|
|
for _ in range(20):
|
|
collection_result = await ensure_collection()
|
|
if collection_result.get("success"):
|
|
return
|
|
last_error = collection_result.get("error")
|
|
await asyncio.sleep(1)
|
|
raise RuntimeError(f"Failed to initialize Qdrant collection: {last_error}")
|
|
|
|
|
|
def safe_library_id(library_id: str) -> str:
|
|
"""Normalize user-provided library IDs to a single path segment."""
|
|
base = Path(library_id).name.strip()
|
|
if not base or base in {".", ".."} or ".." in library_id or "/" in library_id or "\\" in library_id:
|
|
raise HTTPException(status_code=400, detail="Invalid library ID")
|
|
return base
|
|
|
|
|
|
def safe_upload_filename(filename: str) -> str:
|
|
ext = Path(filename).suffix.lower()
|
|
if ext not in ALLOWED_EXTENSIONS:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsafe extension: {ext}. Allowed extensions: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
|
|
)
|
|
|
|
stem = "".join(c for c in Path(filename).stem if c.isalnum() or c in "-_ ").strip()
|
|
if not stem:
|
|
raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")
|
|
return f"{stem}{ext}"
|
|
|
|
|
|
def safe_archive_member_path(member_name: str) -> Optional[Path]:
|
|
normalized = normpath(member_name.replace("\\", "/")).lstrip("/")
|
|
if not normalized or normalized == ".":
|
|
return None
|
|
|
|
path = Path(normalized)
|
|
if path.is_absolute() or ".." in path.parts or path.suffix.lower() not in DOCUMENT_EXTENSIONS:
|
|
return None
|
|
|
|
safe_parts = []
|
|
for part in path.parts:
|
|
cleaned = "".join(c for c in part if c.isalnum() or c in "-_ .").strip()
|
|
if not cleaned:
|
|
return None
|
|
safe_parts.append(cleaned)
|
|
return Path(*safe_parts)
|
|
|
|
|
|
def assert_inside_directory(root: Path, target: Path) -> None:
|
|
root_resolved = root.resolve()
|
|
target_resolved = target.resolve()
|
|
if os.path.commonpath([str(root_resolved), str(target_resolved)]) != str(root_resolved):
|
|
raise HTTPException(status_code=400, detail="Archive contains unsafe paths")
|
|
|
|
|
|
def extract_zip_upload(contents: bytes, lib_dir: Path) -> dict:
|
|
if len(contents) > MAX_ZIP_UPLOAD_BYTES:
|
|
raise HTTPException(status_code=400, detail="ZIP too large (max 100MB)")
|
|
|
|
extracted = []
|
|
skipped = []
|
|
total_uncompressed = 0
|
|
|
|
try:
|
|
archive = zipfile.ZipFile(io.BytesIO(contents))
|
|
except zipfile.BadZipFile:
|
|
raise HTTPException(status_code=400, detail="Invalid ZIP archive")
|
|
|
|
with archive:
|
|
files = [info for info in archive.infolist() if not info.is_dir()]
|
|
if len(files) > MAX_ZIP_FILES:
|
|
raise HTTPException(status_code=400, detail=f"ZIP contains too many files (max {MAX_ZIP_FILES})")
|
|
|
|
for info in files:
|
|
relative_path = safe_archive_member_path(info.filename)
|
|
if relative_path is None:
|
|
skipped.append(info.filename)
|
|
continue
|
|
|
|
total_uncompressed += info.file_size
|
|
if total_uncompressed > MAX_ZIP_EXTRACTED_BYTES:
|
|
raise HTTPException(status_code=400, detail="ZIP extracted content too large (max 250MB)")
|
|
|
|
target = lib_dir / relative_path
|
|
assert_inside_directory(lib_dir, target)
|
|
target.parent.mkdir(parents=True, exist_ok=True)
|
|
with archive.open(info) as src:
|
|
target.write_bytes(src.read())
|
|
extracted.append(str(target.relative_to(lib_dir)))
|
|
|
|
return {"extracted": extracted, "skipped": skipped, "size_bytes": len(contents)}
|
|
|
|
|
|
def docs_root() -> Path:
|
|
return Path(settings.docs_path)
|
|
|
|
|
|
def sources_config_path() -> Path:
|
|
return Path(__file__).resolve().parents[2] / "docs_sources.yaml"
|
|
|
|
|
|
def clean_source_paths(paths: Optional[list[str]]) -> list[str]:
|
|
cleaned = []
|
|
for raw_path in paths or []:
|
|
path = raw_path.strip().strip("/")
|
|
if not path or path == "." or ".." in Path(path).parts or Path(path).is_absolute():
|
|
continue
|
|
cleaned.append(path)
|
|
return cleaned
|
|
|
|
|
|
def load_sources_config() -> dict:
|
|
path = sources_config_path()
|
|
if not path.exists():
|
|
return {"sources": []}
|
|
|
|
with path.open() as f:
|
|
data = yaml.safe_load(f) or {}
|
|
|
|
if isinstance(data, list):
|
|
return {"sources": data}
|
|
if not isinstance(data, dict):
|
|
return {"sources": []}
|
|
sources = data.get("sources", [])
|
|
data["sources"] = sources if isinstance(sources, list) else []
|
|
return data
|
|
|
|
|
|
def save_sources_config(data: dict) -> None:
|
|
path = sources_config_path()
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
with path.open("w") as f:
|
|
yaml.safe_dump(data, f, sort_keys=False)
|
|
|
|
|
|
@app.get("/health")
|
|
async def health_check():
|
|
return {"status": "ok", "service": "docs-api"}
|
|
|
|
|
|
@app.get("/collections")
|
|
async def collections():
|
|
try:
|
|
client = get_client()
|
|
info = client.get_collection(get_collection_name())
|
|
vectors = getattr(info, "vectors_count", None) or getattr(info, "points_count", 0) or 0
|
|
return {"collections": {get_collection_name(): {"vectors": vectors}}}
|
|
except Exception as e:
|
|
return {"collections": {}, "warning": str(e)}
|
|
|
|
|
|
@app.get("/libraries")
|
|
async def list_libraries_api():
|
|
libs = list_libraries()
|
|
if isinstance(libs, dict) and not libs.get("success", True):
|
|
raise HTTPException(status_code=500, detail=libs.get("error", "Failed to list libraries"))
|
|
return {"libraries": libs, "count": len(libs)}
|
|
|
|
|
|
@app.get("/libraries/search")
|
|
async def search_libraries_api(q: str = Query(..., min_length=1)):
|
|
matches = resolve_library_id(q)
|
|
return {"matches": matches, "count": len(matches)}
|
|
|
|
|
|
@app.post("/search")
|
|
async def search_docs_api(payload: SearchRequest):
|
|
results = search_docs(payload.query, library_id=payload.library_id, limit=payload.limit)
|
|
return {
|
|
"query": payload.query,
|
|
"library_id": payload.library_id,
|
|
"results": results,
|
|
"count": len(results),
|
|
}
|
|
|
|
|
|
@app.get("/docs/{library_id}")
|
|
@app.get("/libraries/{library_id}/docs")
|
|
async def get_library_docs_api(
|
|
library_id: str,
|
|
topic: Optional[str] = Query(None),
|
|
tokens: int = Query(8000, ge=1),
|
|
):
|
|
docs = get_library_docs(library_id=library_id, topic=topic, token_limit=tokens)
|
|
return {"library_id": library_id, "content": docs}
|
|
|
|
|
|
@app.post("/ingest/all")
|
|
async def ingest_all_api():
|
|
return await ingest_all()
|
|
|
|
|
|
@app.post("/ingest/{library_id}")
|
|
async def ingest_library_api(library_id: str):
|
|
library_id = safe_library_id(library_id)
|
|
source_path = library_id
|
|
return await ingest_library(library_id=library_id, name=library_id, source_path=source_path)
|
|
|
|
|
|
@app.post("/api/v1/libraries/{library_id}")
|
|
async def api_create_library(
|
|
library_id: str,
|
|
name: Optional[str] = Form(None),
|
|
description: Optional[str] = Form(None),
|
|
):
|
|
library_id = safe_library_id(library_id)
|
|
lib_dir = docs_root() / library_id
|
|
lib_dir.mkdir(parents=True, exist_ok=True)
|
|
result = upsert_library(library_id, name or library_id, description, library_id)
|
|
if not result.get("success"):
|
|
raise HTTPException(status_code=500, detail=result.get("error", "Failed to create library"))
|
|
return {
|
|
"success": True,
|
|
"created": not result.get("exists", False),
|
|
"library_id": library_id,
|
|
"name": name or library_id,
|
|
"description": description,
|
|
"path": str(lib_dir),
|
|
}
|
|
|
|
|
|
@app.delete("/api/v1/libraries/{library_id}")
|
|
async def api_delete_library(library_id: str):
|
|
library_id = safe_library_id(library_id)
|
|
lib_dir = docs_root() / library_id
|
|
deleted_files = 0
|
|
|
|
if lib_dir.exists():
|
|
for path in lib_dir.rglob("*"):
|
|
if path.is_file():
|
|
deleted_files += 1
|
|
shutil.rmtree(lib_dir)
|
|
|
|
docs_result = clear_library_documents(library_id)
|
|
vectors_result = await delete_library_vectors(library_id)
|
|
library_result = delete_library(library_id)
|
|
|
|
failures = [
|
|
r.get("error")
|
|
for r in (docs_result, vectors_result, library_result)
|
|
if isinstance(r, dict) and not r.get("success", True)
|
|
]
|
|
if failures:
|
|
raise HTTPException(status_code=500, detail="; ".join(failures))
|
|
|
|
return {"success": True, "library_id": library_id, "deleted_files": deleted_files}
|
|
|
|
|
|
@app.post("/api/v1/upload/{library_id}")
|
|
async def api_upload(library_id: str, file: UploadFile = File(...)):
|
|
library_id = safe_library_id(library_id)
|
|
safe_name = safe_upload_filename(file.filename or "upload.txt")
|
|
lib_dir = docs_root() / library_id
|
|
lib_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
contents = await file.read()
|
|
if safe_name.lower().endswith(".zip"):
|
|
result = extract_zip_upload(contents, lib_dir)
|
|
upsert_library(library_id, library_id, None, library_id)
|
|
return {
|
|
"success": True,
|
|
"library_id": library_id,
|
|
"filename": safe_name,
|
|
"archive": True,
|
|
**result,
|
|
}
|
|
|
|
if len(contents) > MAX_FILE_UPLOAD_BYTES:
|
|
raise HTTPException(status_code=400, detail="File too large (max 5MB)")
|
|
|
|
target = lib_dir / safe_name
|
|
target.write_bytes(contents)
|
|
|
|
upsert_library(library_id, library_id, None, library_id)
|
|
return {
|
|
"success": True,
|
|
"library_id": library_id,
|
|
"filename": safe_name,
|
|
"path": str(target.relative_to(docs_root())),
|
|
"size_bytes": len(contents),
|
|
}
|
|
|
|
|
|
@app.get("/api/v1/sources")
|
|
@app.get("/sources/config")
|
|
async def api_list_sources():
|
|
data = load_sources_config()
|
|
sources = data["sources"]
|
|
return {"success": True, "sources": sources, "count": len(sources)}
|
|
|
|
|
|
@app.post("/api/v1/sources")
|
|
async def api_add_source(source: GitSourceRequest):
|
|
library_id = safe_library_id(source.library_id)
|
|
branch = source.branch.strip() or "main"
|
|
include_paths = clean_source_paths(source.include_paths)
|
|
exclude_paths = clean_source_paths(source.exclude_paths) or ["node_modules", ".git"]
|
|
|
|
source_entry = {
|
|
"library_id": library_id,
|
|
"name": (source.name or library_id).strip(),
|
|
"description": (source.description or "").strip(),
|
|
"repo_url": source.repo_url.strip(),
|
|
"branch": branch,
|
|
"include_paths": include_paths or ["docs"],
|
|
"exclude_paths": exclude_paths,
|
|
}
|
|
|
|
data = load_sources_config()
|
|
sources = data["sources"]
|
|
existing_index = next(
|
|
(index for index, item in enumerate(sources) if item.get("library_id") == library_id),
|
|
None,
|
|
)
|
|
if existing_index is None:
|
|
sources.append(source_entry)
|
|
created = True
|
|
else:
|
|
sources[existing_index] = source_entry
|
|
created = False
|
|
|
|
save_sources_config(data)
|
|
return {"success": True, "created": created, "source": source_entry}
|
|
|
|
|
|
@app.post("/api/v1/sources/browse")
|
|
async def browse_repo_api(payload: BrowseRepoRequest):
|
|
"""Shallow-clone a repo and return its directory tree for path selection."""
|
|
import hashlib
|
|
|
|
repo_hash = hashlib.md5(payload.repo_url.encode()).hexdigest()[:10]
|
|
repo_id = f"browse-{repo_hash}"
|
|
try:
|
|
clone_result = clone_or_update_repo(
|
|
repo_id=repo_id,
|
|
repo_url=payload.repo_url,
|
|
branch=payload.branch,
|
|
)
|
|
repo_path = Path(clone_result["repo_path"])
|
|
tree = browse_repo_tree(repo_path)
|
|
return {"success": True, "tree": tree, "repo_url": payload.repo_url, "branch": payload.branch}
|
|
except Exception as exc:
|
|
raise HTTPException(status_code=400, detail=str(exc))
|
|
|
|
|
|
@app.post("/sources/sync")
|
|
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
|
|
source_data = await api_list_sources()
|
|
sources = source_data["sources"]
|
|
override = payload.override if payload else False
|
|
results = []
|
|
|
|
for source in sources:
|
|
library_id = source.get("library_id", "unknown")
|
|
try:
|
|
result = await ingest_git_source(
|
|
library_id=library_id,
|
|
name=source.get("name") or library_id,
|
|
description=source.get("description"),
|
|
repo_url=source["repo_url"],
|
|
branch=source.get("branch", "main"),
|
|
include_paths=source.get("include_paths"),
|
|
exclude_paths=source.get("exclude_paths"),
|
|
)
|
|
except Exception as exc:
|
|
result = {
|
|
"success": False,
|
|
"library_id": library_id,
|
|
"repo_url": source.get("repo_url"),
|
|
"error": str(exc),
|
|
}
|
|
results.append(result)
|
|
|
|
successful = len([r for r in results if r.get("success")])
|
|
return {
|
|
"success": successful == len(results),
|
|
"total_sources": len(results),
|
|
"successful": successful,
|
|
"failed": len(results) - successful,
|
|
"results": results,
|
|
}
|