From ff4da0cb9eaf342694a33aceb80216c9235c0aaa Mon Sep 17 00:00:00 2001 From: george Date: Sat, 6 Jun 2026 01:28:10 +0100 Subject: [PATCH] Fix git sync and add repo browser with path selection - Fix discover_files: rel_path always computed (was stuck at '.' at root), include_path_match now uses relative path, 'return' changed to 'continue' - Fix ingest_git_source: files were cloned but ingested from wrong path (docs/repo-id instead of data/repos/repo-id). Now stages filtered files into DOCS_PATH/library_id before calling ingest_library. - Add browse_repo_tree() for interactive repo exploration - Add POST /api/v1/sources/browse endpoint to backend - Add /sources/browse proxy route to webui - Rewrite sources.html: browse repo, expand/collapse tree, check paths to include, then save source and sync Co-Authored-By: Claude Sonnet 4.6 --- backend/app/git_source.py | 178 ++++++++++-------- backend/app/main.py | 27 ++- webui/app/main.py | 15 +- webui/app/templates/sources.html | 302 ++++++++++++++++++++++++++----- 4 files changed, 394 insertions(+), 128 deletions(-) diff --git a/backend/app/git_source.py b/backend/app/git_source.py index b553f1e..26a0a14 100644 --- a/backend/app/git_source.py +++ b/backend/app/git_source.py @@ -128,85 +128,60 @@ def discover_files( } discovered = [] - - def should_include(path: Path, rel_path: Path) -> bool: - """Check if a path matches any include pattern.""" - if not include_patterns: - return True - - # Normalize paths for comparison (handle trailing slashes, etc.) - path_str = str(path).replace("\\", "/") + + def should_exclude(rel_path: Path) -> bool: rel_str = str(rel_path).replace("\\", "/") - - for inc_pattern in include_patterns: - inc_str = str(inc_pattern).replace("\\", "/") - - # If pattern has subdirs, check prefix match - if "/" in inc_str and not inc_str.endswith("/"): - pattern_base = inc_str.rsplit("/", 1)[0] + "/" - if rel_str.startswith(pattern_base): - return True - elif rel_str == inc_str: - return True - - return False - - def should_exclude(path: Path, rel_path: Path) -> bool: - """Check if a path matches any exclude pattern (simple prefix/exact match).""" for exc_pattern in exclude_patterns: exc_str = str(exc_pattern).replace("\\", "/") - rel_str = str(rel_path).replace("\\", "/") - - # Exact match or parent directory match if rel_str == exc_str or rel_str.startswith(exc_str + "/"): return True - return False - + + def file_under_include(rel_path: Path) -> bool: + """True if the file is inside an include path.""" + if not include_patterns: + return True + rel_str = str(rel_path).replace("\\", "/") + for inc_pattern in include_patterns: + inc_str = str(inc_pattern).replace("\\", "/") + if rel_str == inc_str or rel_str.startswith(inc_str + "/"): + return True + return False + + def dir_may_contain_includes(rel_path: Path) -> bool: + """True if this directory is inside, equal to, or a parent of any include path.""" + if not include_patterns: + return True + rel_str = str(rel_path).replace("\\", "/") + for inc_pattern in include_patterns: + inc_str = str(inc_pattern).replace("\\", "/") + if rel_str == inc_str or rel_str.startswith(inc_str + "/") or inc_str.startswith(rel_str + "/"): + return True + return False + def walk_and_collect(current: Path, rel_prefix: Path): - """Recursive walk function.""" try: for entry in sorted(os.scandir(current), key=lambda e: e.name): entry_path = current / entry.name - rel_path = (rel_prefix / entry.name).replace("\\", "/") if str(rel_prefix) != "." else rel_prefix - - # Filter by exclude paths first - if should_exclude(entry_path, rel_path): + rel_path = rel_prefix / entry.name + + if should_exclude(rel_path): continue - - # If include_paths specified, only go into matching directories - if include_patterns and not include_path_match(entry_path, rel_path): - if entry.is_dir(): - return # Don't descend into this directory - + if entry.is_file(): - discovered.append({ - "path": str(rel_path).lstrip("/"), - "full_path": str(entry_path), - "is_binary": is_probably_binary(str(entry_path)) - }) + if file_under_include(rel_path): + discovered.append({ + "path": str(rel_path).replace("\\", "/"), + "full_path": str(entry_path), + "is_binary": is_probably_binary(str(entry_path)) + }) elif entry.is_dir(): - walk_and_collect(entry_path, rel_path) - + if dir_may_contain_includes(rel_path): + walk_and_collect(entry_path, rel_path) + except PermissionError: - # Skip directories we can't read pass - - def include_path_match(path: Path, rel_path: Path) -> bool: - """Check if path matches any include pattern (for filtering on the fly).""" - if not include_patterns: - return True - - path_str = str(path).replace("\\", "/") - for inc_pattern in include_patterns: - inc_str = str(inc_pattern).replace("\\", "/") - - # Exact match or parent directory match - if path_str == inc_str or path_str.startswith(inc_str + "/"): - return True - - return False - + def is_probably_binary(filepath: str) -> bool: """Simple binary detection based on file extension and first bytes.""" ext = Path(filepath).suffix.lower() @@ -224,14 +199,44 @@ def discover_files( return False - root_str = str(repo_path).replace("\\", "/") - - # Walk the repository starting from repo root - walk_and_collect(repo_path, Path(".")) - + walk_and_collect(repo_path, Path()) return discovered +def browse_repo_tree(repo_path: Path, max_depth: int = 4) -> List[Dict[str, Any]]: + """ + Return a nested directory tree for browsing a cloned repository. + + Returns a list of nodes: + {"path": "docs", "type": "dir", "children": [...]} + {"path": "README.md", "type": "file"} + Paths are relative to repo_path. Hidden entries (starting with '.') are skipped. + """ + SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", ".tox"} + + def build(current: Path, rel_prefix: Path, depth: int) -> List[Dict[str, Any]]: + if depth <= 0: + return [] + nodes: List[Dict[str, Any]] = [] + try: + entries = sorted(os.scandir(current), key=lambda e: (not e.is_dir(), e.name.lower())) + except PermissionError: + return [] + for entry in entries: + if entry.name.startswith(".") or entry.name in SKIP_DIRS: + continue + rel_path = rel_prefix / entry.name + rel_str = str(rel_path).replace("\\", "/") + if entry.is_dir(): + children = build(Path(entry.path), rel_path, depth - 1) + nodes.append({"path": rel_str, "type": "dir", "children": children}) + else: + nodes.append({"path": rel_str, "type": "file"}) + return nodes + + return build(repo_path, Path(), max_depth) + + async def ingest_git_source( library_id: str, name: str, @@ -263,7 +268,6 @@ async def ingest_git_source( Raises: GitCloneError: If git operations fail """ - from .db import upsert_library from .ingest import ingest_library print(f"\n[Git Ingestion] Processing library: {library_id}") @@ -303,19 +307,35 @@ async def ingest_git_source( "message": "No files found matching include/exclude criteria", "files_discovered": 0 } - - # Remove .git directory if present (avoid processing it) - git_dir = repo_path / ".git" - if git_dir.exists(): - shutil.rmtree(git_dir) - print(f" [Git] Removed .git directory") - - # Ingest using existing library ingestion pipeline + + # Stage only the filtered files into DOCS_PATH/library_id so that + # ingest_library reads exactly what discover_files selected. + from .config import settings + docs_dir = Path(settings.docs_path) / library_id + if docs_dir.exists(): + shutil.rmtree(docs_dir) + docs_dir.mkdir(parents=True, exist_ok=True) + + staged = 0 + for file_info in files: + if file_info.get("is_binary"): + continue + src = Path(file_info["full_path"]) + dst = docs_dir / file_info["path"] + dst.parent.mkdir(parents=True, exist_ok=True) + try: + shutil.copy2(src, dst) + staged += 1 + except OSError as exc: + print(f" [Git] Warning: could not copy {src}: {exc}") + + print(f" [Git] Staged {staged} file(s) to {docs_dir}") + result = await ingest_library( library_id=library_id, name=name, description=description, - source_path=repo_id # Use repo_id as the "source path" for tracking + source_path=library_id, ) return { diff --git a/backend/app/main.py b/backend/app/main.py index b6ff0e7..207cfbd 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -22,7 +22,7 @@ from .db import ( search_libraries, upsert_library, ) -from .git_source import ingest_git_source +from .git_source import browse_repo_tree, clone_or_update_repo, ingest_git_source from .ingest import ingest_all, ingest_library from .search import get_library_docs, resolve_library_id, search_docs from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name @@ -55,6 +55,11 @@ class GitSourceRequest(BaseModel): exclude_paths: Optional[list[str]] = None +class BrowseRepoRequest(BaseModel): + repo_url: str = Field(..., min_length=1) + branch: str = "main" + + DOCUMENT_EXTENSIONS = { ".md", ".txt", @@ -424,6 +429,26 @@ async def api_add_source(source: GitSourceRequest): return {"success": True, "created": created, "source": source_entry} +@app.post("/api/v1/sources/browse") +async def browse_repo_api(payload: BrowseRepoRequest): + """Shallow-clone a repo and return its directory tree for path selection.""" + import hashlib + + repo_hash = hashlib.md5(payload.repo_url.encode()).hexdigest()[:10] + repo_id = f"browse-{repo_hash}" + try: + clone_result = clone_or_update_repo( + repo_id=repo_id, + repo_url=payload.repo_url, + branch=payload.branch, + ) + repo_path = Path(clone_result["repo_path"]) + tree = browse_repo_tree(repo_path) + return {"success": True, "tree": tree, "repo_url": payload.repo_url, "branch": payload.branch} + except Exception as exc: + raise HTTPException(status_code=400, detail=str(exc)) + + @app.post("/sources/sync") async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None): source_data = await api_list_sources() diff --git a/webui/app/main.py b/webui/app/main.py index 52a7705..9aefd13 100644 --- a/webui/app/main.py +++ b/webui/app/main.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import List, Optional from fastapi import FastAPI, File, Form, Request, UploadFile -from fastapi.responses import HTMLResponse, RedirectResponse +from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse from fastapi.staticfiles import StaticFiles from fastapi.templating import Jinja2Templates @@ -257,6 +257,19 @@ def parse_path_list(value: str) -> List[str]: return paths +@app.post("/sources/browse") +async def browse_source(repo_url: str = Form(...), branch: str = Form("main")): + client = get_client() + try: + result = await client.post( + "/api/v1/sources/browse", + json={"repo_url": repo_url, "branch": branch}, + ) + return JSONResponse(content=result) + except Exception as e: + return JSONResponse(status_code=400, content={"success": False, "error": str(e)}) + + @app.post("/sources/add") async def add_source( library_id: str = Form(...), diff --git a/webui/app/templates/sources.html b/webui/app/templates/sources.html index 1c5b349..d4b8a3c 100644 --- a/webui/app/templates/sources.html +++ b/webui/app/templates/sources.html @@ -3,59 +3,267 @@ {% block title %}Sources - Context7 Docs{% endblock %} {% block content %} -

Git Repository Sync

+

Git Repository Sources

-
Add Git repositories to docs_sources.yaml, then sync them into searchable libraries.
+
Add a Git repository, browse its structure, select paths to include, then save and sync.
-

Add Git Source

-
- - + +
+

Add Git Source

- - - - - - - - - - - - - - - - - - - - - -
- - - -
- -
- -{% if sources %} -

Configured Sources

-
- {% for src in sources %} -
- {{ src.library_id | default('unknown') }}
- URL: {{ src.repo_url | default('N/A') }}
- Branch: {{ src.branch | default('main') }}
- Include: {{ src.include_paths | default(['*']) | join(', ') }} +
+
+ +
- {% endfor %} +
+ + +
+
+ +
+
+ + + + + + +
+ + +{% if sources %} +

Configured Sources ({{ sources|length }})

+
+ {% for src in sources %} +
+ {{ src.library_id | default('unknown') }} +  {{ src.repo_url | default('') }}
+ Branch: {{ src.branch | default('main') }} +  |  Include: {{ src.include_paths | default(['*']) | join(', ') }} + {% if src.exclude_paths %} +  |  Exclude: {{ src.exclude_paths | join(', ') }} + {% endif %} +
+ {% endfor %}
{% else %} -

No git sources configured. Add repositories to docs_sources.yaml.

+

No git sources configured yet.

{% endif %} +
+
+ +
+
+ +{% endblock %} + +{% block scripts %} + {% endblock %}