def _normalize_rel(path_like) -> str:
    """Return *path_like* as a repo-relative, forward-slash string.

    Backslashes become "/", leading "./" segments are dropped and leading or
    trailing slashes are stripped, so user-supplied patterns such as
    "docs/", "./docs" and "/docs" all compare equal to the walked path "docs".
    """
    text = str(path_like).replace("\\", "/")
    while text.startswith("./"):
        text = text[2:]
    return text.strip("/")


def _is_at_or_under(rel_str: str, base_str: str) -> bool:
    """True if *rel_str* equals *base_str* or is located beneath it."""
    return rel_str == base_str or rel_str.startswith(base_str + "/")


def should_exclude(rel_path: Path) -> bool:
    """True if *rel_path* equals an exclude pattern or lies beneath one."""
    rel_str = _normalize_rel(rel_path)
    for exc_pattern in exclude_patterns:
        exc_str = _normalize_rel(exc_pattern)
        # An empty pattern (e.g. "/" or "") must not match anything.
        if exc_str and _is_at_or_under(rel_str, exc_str):
            return True
    return False


def file_under_include(rel_path: Path) -> bool:
    """True if the file is inside an include path (or no includes are set)."""
    if not include_patterns:
        return True
    rel_str = _normalize_rel(rel_path)
    for inc_pattern in include_patterns:
        inc_str = _normalize_rel(inc_pattern)
        if inc_str and _is_at_or_under(rel_str, inc_str):
            return True
    return False


def dir_may_contain_includes(rel_path: Path) -> bool:
    """True if this directory is inside, equal to, or a parent of any include path.

    Parents of an include path must be walked so the include path itself can
    be reached; everything else is pruned.
    """
    if not include_patterns:
        return True
    rel_str = _normalize_rel(rel_path)
    for inc_pattern in include_patterns:
        inc_str = _normalize_rel(inc_pattern)
        if not inc_str:
            continue
        if _is_at_or_under(rel_str, inc_str) or inc_str.startswith(rel_str + "/"):
            return True
    return False


def walk_and_collect(current: Path, rel_prefix: Path) -> None:
    """Recursively append matching files below *current* to ``discovered``.

    Args:
        current: Directory on disk to scan.
        rel_prefix: Path of *current* relative to the walk root; used to build
            the relative "path" recorded for each file.

    Entries are visited in name order. Excluded paths are skipped entirely,
    directories are only entered when they can still lead to an include path,
    and symlinks are never followed. Unreadable entries are skipped.
    """
    try:
        # The context manager closes the scandir handle even if sorting fails.
        with os.scandir(current) as scan:
            entries = sorted(scan, key=lambda e: e.name)
    except PermissionError:
        return

    for entry in entries:
        entry_path = current / entry.name
        rel_path = rel_prefix / entry.name

        if should_exclude(rel_path):
            continue

        # The guard is per entry so that one unreadable file or directory does
        # not silently drop all of its remaining siblings.
        try:
            # follow_symlinks=False: a symlink inside the checkout could point
            # outside of it, so links are neither collected nor descended into.
            if entry.is_file(follow_symlinks=False):
                if file_under_include(rel_path):
                    discovered.append({
                        "path": str(rel_path).replace("\\", "/"),
                        "full_path": str(entry_path),
                        # NOTE(review): is_probably_binary is assumed to read
                        # the file and so may raise PermissionError - confirm.
                        "is_binary": is_probably_binary(str(entry_path)),
                    })
            elif entry.is_dir(follow_symlinks=False):
                if dir_may_contain_includes(rel_path):
                    walk_and_collect(entry_path, rel_path)
        except PermissionError:
            continue
def browse_repo_tree(repo_path: Path, max_depth: int = 4) -> List[Dict[str, Any]]:
    """
    Return a nested directory tree for browsing a cloned repository.

    Returns a list of nodes:
      {"path": "docs", "type": "dir", "children": [...]}
      {"path": "README.md", "type": "file"}
    Paths are relative to repo_path and always use forward slashes.

    Args:
        repo_path: Root directory to list.
        max_depth: Number of directory levels to list. Directories at the
            deepest listed level are returned with an empty "children" list;
            a value <= 0 yields an empty tree.

    Skipped entries: names starting with '.', directories named in SKIP_DIRS,
    and symlinks (never followed, so the listing cannot leave repo_path).
    Directories that cannot be read are reported with no children.
    Within each level directories come first, then files, each sorted
    case-insensitively by name.
    """
    SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", ".tox"}

    def sort_key(entry) -> tuple:
        # The exact name is the tie-breaker so that "A" and "a" get a stable
        # order instead of whatever order the filesystem returned them in.
        return (entry.name.lower(), entry.name)

    def build(current: Path, rel_prefix: Path, depth: int) -> List[Dict[str, Any]]:
        if depth <= 0:
            return []

        dirs = []
        files = []
        try:
            with os.scandir(current) as scan:
                for entry in scan:
                    if entry.name.startswith(".") or entry.is_symlink():
                        continue
                    if entry.is_dir(follow_symlinks=False):
                        # SKIP_DIRS only applies to directories; a regular
                        # file that happens to be called "venv" is still listed.
                        if entry.name not in SKIP_DIRS:
                            dirs.append(entry)
                    else:
                        files.append(entry)
        except PermissionError:
            return []

        nodes: List[Dict[str, Any]] = []
        for entry in sorted(dirs, key=sort_key):
            rel_path = rel_prefix / entry.name
            nodes.append({
                "path": str(rel_path).replace("\\", "/"),
                "type": "dir",
                "children": build(Path(entry.path), rel_path, depth - 1),
            })
        for entry in sorted(files, key=sort_key):
            rel_path = rel_prefix / entry.name
            nodes.append({"path": str(rel_path).replace("\\", "/"), "type": "file"})
        return nodes

    return build(repo_path, Path(), max_depth)
class BrowseRepoRequest(BaseModel):
    """Request body for POST /api/v1/sources/browse."""

    repo_url: str = Field(..., min_length=1)
    # An empty branch name can never be checked out; reject it at validation
    # time, consistent with repo_url.
    # NOTE(review): both values are handed to clone_or_update_repo as-is;
    # confirm it guards against values starting with "-" being read as options.
    branch: str = Field("main", min_length=1)


# One lock per browse checkout so that concurrent requests for the same URL
# do not operate on the same clone directory at the same time.
_BROWSE_LOCKS = {}


@app.post("/api/v1/sources/browse")
async def browse_repo_api(payload: BrowseRepoRequest):
    """Clone (or update) a repo and return its directory tree for path selection.

    The checkout is keyed by a short hash of the repository URL
    ("browse-<hash>"), so repeated requests for the same URL reuse one clone.

    Returns:
        {"success": True, "tree": [...], "repo_url": ..., "branch": ...} where
        "tree" is the result of browse_repo_tree.

    Raises:
        HTTPException: 400 with the error text if cloning or listing fails.
    """
    import asyncio
    import hashlib

    # The digest only names a cache directory; usedforsecurity=False keeps the
    # same ids as before while also working on FIPS-restricted builds.
    repo_hash = hashlib.md5(
        payload.repo_url.encode(), usedforsecurity=False
    ).hexdigest()[:10]
    repo_id = f"browse-{repo_hash}"

    def clone_and_list():
        clone_result = clone_or_update_repo(
            repo_id=repo_id,
            repo_url=payload.repo_url,
            branch=payload.branch,
        )
        return browse_repo_tree(Path(clone_result["repo_path"]))

    lock = _BROWSE_LOCKS.setdefault(repo_id, asyncio.Lock())
    try:
        async with lock:
            # clone_or_update_repo is synchronous; running it in a worker
            # thread keeps the event loop free to serve other requests.
            tree = await asyncio.to_thread(clone_and_list)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return {
        "success": True,
        "tree": tree,
        "repo_url": payload.repo_url,
        "branch": payload.branch,
    }
b/webui/app/templates/sources.html @@ -3,59 +3,267 @@ {% block title %}Sources - Context7 Docs{% endblock %} {% block content %} -