Fix git sync and add repo browser with path selection

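- Fix discover_files: rel_path is now always computed (was stuck at '.' at root);
  include_path_match is replaced by repo-relative checks (file_under_include /
  dir_may_contain_includes), and a non-matching directory is now skipped
  instead of aborting the walk of its parent ('return' no longer ends the loop)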
- Fix ingest_git_source: files were cloned but ingested from wrong path
  (docs/repo-id instead of data/repos/repo-id). Now stages filtered files
  into DOCS_PATH/library_id before calling ingest_library.
- Add browse_repo_tree() for interactive repo exploration
- Add POST /api/v1/sources/browse endpoint to backend
- Add /sources/browse proxy route to webui
- Rewrite sources.html: browse repo, expand/collapse tree, check paths to
  include, then save source and sync

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
george
2026-06-06 01:28:10 +01:00
parent 1b61af8873
commit ff4da0cb9e
4 changed files with 394 additions and 128 deletions
+85 -65
View File
@@ -129,84 +129,59 @@ def discover_files(
discovered = [] discovered = []
def should_include(path: Path, rel_path: Path) -> bool: def should_exclude(rel_path: Path) -> bool:
"""Check if a path matches any include pattern."""
if not include_patterns:
return True
# Normalize paths for comparison (handle trailing slashes, etc.)
path_str = str(path).replace("\\", "/")
rel_str = str(rel_path).replace("\\", "/") rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
# If pattern has subdirs, check prefix match
if "/" in inc_str and not inc_str.endswith("/"):
pattern_base = inc_str.rsplit("/", 1)[0] + "/"
if rel_str.startswith(pattern_base):
return True
elif rel_str == inc_str:
return True
return False
def should_exclude(path: Path, rel_path: Path) -> bool:
"""Check if a path matches any exclude pattern (simple prefix/exact match)."""
for exc_pattern in exclude_patterns: for exc_pattern in exclude_patterns:
exc_str = str(exc_pattern).replace("\\", "/") exc_str = str(exc_pattern).replace("\\", "/")
rel_str = str(rel_path).replace("\\", "/")
# Exact match or parent directory match
if rel_str == exc_str or rel_str.startswith(exc_str + "/"): if rel_str == exc_str or rel_str.startswith(exc_str + "/"):
return True return True
return False
def file_under_include(rel_path: Path) -> bool:
"""True if the file is inside an include path."""
if not include_patterns:
return True
rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
if rel_str == inc_str or rel_str.startswith(inc_str + "/"):
return True
return False
def dir_may_contain_includes(rel_path: Path) -> bool:
"""True if this directory is inside, equal to, or a parent of any include path."""
if not include_patterns:
return True
rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
if rel_str == inc_str or rel_str.startswith(inc_str + "/") or inc_str.startswith(rel_str + "/"):
return True
return False return False
def walk_and_collect(current: Path, rel_prefix: Path): def walk_and_collect(current: Path, rel_prefix: Path):
"""Recursive walk function."""
try: try:
for entry in sorted(os.scandir(current), key=lambda e: e.name): for entry in sorted(os.scandir(current), key=lambda e: e.name):
entry_path = current / entry.name entry_path = current / entry.name
rel_path = (rel_prefix / entry.name).replace("\\", "/") if str(rel_prefix) != "." else rel_prefix rel_path = rel_prefix / entry.name
# Filter by exclude paths first if should_exclude(rel_path):
if should_exclude(entry_path, rel_path):
continue continue
# If include_paths specified, only go into matching directories
if include_patterns and not include_path_match(entry_path, rel_path):
if entry.is_dir():
return # Don't descend into this directory
if entry.is_file(): if entry.is_file():
if file_under_include(rel_path):
discovered.append({ discovered.append({
"path": str(rel_path).lstrip("/"), "path": str(rel_path).replace("\\", "/"),
"full_path": str(entry_path), "full_path": str(entry_path),
"is_binary": is_probably_binary(str(entry_path)) "is_binary": is_probably_binary(str(entry_path))
}) })
elif entry.is_dir(): elif entry.is_dir():
if dir_may_contain_includes(rel_path):
walk_and_collect(entry_path, rel_path) walk_and_collect(entry_path, rel_path)
except PermissionError: except PermissionError:
# Skip directories we can't read
pass pass
def include_path_match(path: Path, rel_path: Path) -> bool:
"""Check if path matches any include pattern (for filtering on the fly)."""
if not include_patterns:
return True
path_str = str(path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
# Exact match or parent directory match
if path_str == inc_str or path_str.startswith(inc_str + "/"):
return True
return False
def is_probably_binary(filepath: str) -> bool: def is_probably_binary(filepath: str) -> bool:
"""Simple binary detection based on file extension and first bytes.""" """Simple binary detection based on file extension and first bytes."""
ext = Path(filepath).suffix.lower() ext = Path(filepath).suffix.lower()
@@ -224,14 +199,44 @@ def discover_files(
return False return False
root_str = str(repo_path).replace("\\", "/") walk_and_collect(repo_path, Path())
# Walk the repository starting from repo root
walk_and_collect(repo_path, Path("."))
return discovered return discovered
def browse_repo_tree(repo_path: Path, max_depth: int = 4) -> List[Dict[str, Any]]:
    """
    Return a nested directory tree for browsing a cloned repository.

    Each node is a dict whose "path" is relative to ``repo_path`` and always
    uses forward slashes:

        {"path": "docs", "type": "dir", "children": [...]}
        {"path": "README.md", "type": "file"}

    Directories are listed before other entries; within each group entries
    are ordered case-insensitively by name. Entries whose name starts with
    '.' or appears in SKIP_DIRS are omitted.

    Symbolic links are never followed. A symlink (even one that points at a
    directory) is reported as a "file" leaf, so a repository cannot make the
    browser list locations outside the clone or recurse through link loops.

    Args:
        repo_path: Root directory to list.
        max_depth: Number of directory levels to descend. Directories at the
            limit are still listed, but with empty "children". A value <= 0
            yields an empty list.

    Returns:
        The list of top-level nodes. Directories that cannot be read
        (PermissionError) contribute an empty list of children.
    """
    SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", ".tox"}

    def is_real_dir(entry: "os.DirEntry[str]") -> bool:
        # follow_symlinks=False: the repository content is untrusted, so a
        # link must never be treated as a directory we descend into.
        return entry.is_dir(follow_symlinks=False)

    def build(current: Path, rel_prefix: Path, depth: int) -> List[Dict[str, Any]]:
        if depth <= 0:
            return []
        try:
            # The context manager releases the directory handle promptly.
            with os.scandir(current) as scan:
                entries = sorted(
                    scan,
                    key=lambda e: (not is_real_dir(e), e.name.lower()),
                )
        except PermissionError:
            return []

        nodes: List[Dict[str, Any]] = []
        for entry in entries:
            if entry.name.startswith(".") or entry.name in SKIP_DIRS:
                continue
            rel_path = rel_prefix / entry.name
            rel_str = str(rel_path).replace("\\", "/")
            if is_real_dir(entry):
                children = build(Path(entry.path), rel_path, depth - 1)
                nodes.append({"path": rel_str, "type": "dir", "children": children})
            else:
                nodes.append({"path": rel_str, "type": "file"})
        return nodes

    return build(repo_path, Path(), max_depth)
async def ingest_git_source( async def ingest_git_source(
library_id: str, library_id: str,
name: str, name: str,
@@ -263,7 +268,6 @@ async def ingest_git_source(
Raises: Raises:
GitCloneError: If git operations fail GitCloneError: If git operations fail
""" """
from .db import upsert_library
from .ingest import ingest_library from .ingest import ingest_library
print(f"\n[Git Ingestion] Processing library: {library_id}") print(f"\n[Git Ingestion] Processing library: {library_id}")
@@ -304,18 +308,34 @@ async def ingest_git_source(
"files_discovered": 0 "files_discovered": 0
} }
# Remove .git directory if present (avoid processing it) # Stage only the filtered files into DOCS_PATH/library_id so that
git_dir = repo_path / ".git" # ingest_library reads exactly what discover_files selected.
if git_dir.exists(): from .config import settings
shutil.rmtree(git_dir) docs_dir = Path(settings.docs_path) / library_id
print(f" [Git] Removed .git directory") if docs_dir.exists():
shutil.rmtree(docs_dir)
docs_dir.mkdir(parents=True, exist_ok=True)
staged = 0
for file_info in files:
if file_info.get("is_binary"):
continue
src = Path(file_info["full_path"])
dst = docs_dir / file_info["path"]
dst.parent.mkdir(parents=True, exist_ok=True)
try:
shutil.copy2(src, dst)
staged += 1
except OSError as exc:
print(f" [Git] Warning: could not copy {src}: {exc}")
print(f" [Git] Staged {staged} file(s) to {docs_dir}")
# Ingest using existing library ingestion pipeline
result = await ingest_library( result = await ingest_library(
library_id=library_id, library_id=library_id,
name=name, name=name,
description=description, description=description,
source_path=repo_id # Use repo_id as the "source path" for tracking source_path=library_id,
) )
return { return {
+26 -1
View File
@@ -22,7 +22,7 @@ from .db import (
search_libraries, search_libraries,
upsert_library, upsert_library,
) )
from .git_source import ingest_git_source from .git_source import browse_repo_tree, clone_or_update_repo, ingest_git_source
from .ingest import ingest_all, ingest_library from .ingest import ingest_all, ingest_library
from .search import get_library_docs, resolve_library_id, search_docs from .search import get_library_docs, resolve_library_id, search_docs
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
@@ -55,6 +55,11 @@ class GitSourceRequest(BaseModel):
exclude_paths: Optional[list[str]] = None exclude_paths: Optional[list[str]] = None
# Request body accepted by POST /api/v1/sources/browse.
class BrowseRepoRequest(BaseModel):
    # URL of the repository to browse; required and must be non-empty.
    repo_url: str = Field(..., min_length=1)
    # Branch to browse; defaults to "main" when omitted.
    branch: str = "main"
DOCUMENT_EXTENSIONS = { DOCUMENT_EXTENSIONS = {
".md", ".md",
".txt", ".txt",
@@ -424,6 +429,26 @@ async def api_add_source(source: GitSourceRequest):
return {"success": True, "created": created, "source": source_entry} return {"success": True, "created": created, "source": source_entry}
@app.post("/api/v1/sources/browse")
async def browse_repo_api(payload: BrowseRepoRequest):
    """Clone (or update) a repository and return its directory tree for path selection.

    Args:
        payload: Repository URL and branch to browse.

    Returns:
        A dict with ``success``, the ``tree`` produced by ``browse_repo_tree``,
        and the ``repo_url`` / ``branch`` that were requested.

    Raises:
        HTTPException: Status 400 carrying the error text when cloning or
            listing fails for any reason.
    """
    import hashlib

    # The working-copy id is derived from the URL only, so repeated browses of
    # the same repository map to the same "browse-<hash>" id.
    # NOTE(review): the branch is not part of the id — confirm that
    # clone_or_update_repo switches branches on an existing clone.
    repo_hash = hashlib.md5(payload.repo_url.encode()).hexdigest()[:10]
    repo_id = f"browse-{repo_hash}"
    try:
        # NOTE(review): this is a synchronous call inside an async handler;
        # presumably it blocks the event loop for the duration of the clone.
        clone_result = clone_or_update_repo(
            repo_id=repo_id,
            repo_url=payload.repo_url,
            branch=payload.branch,
        )
        repo_path = Path(clone_result["repo_path"])
        tree = browse_repo_tree(repo_path)
        return {"success": True, "tree": tree, "repo_url": payload.repo_url, "branch": payload.branch}
    except Exception as exc:
        # Chain the cause so the original traceback stays visible in logs.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
@app.post("/sources/sync") @app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None): async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
source_data = await api_list_sources() source_data = await api_list_sources()
+14 -1
View File
@@ -5,7 +5,7 @@ from pathlib import Path
from typing import List, Optional from typing import List, Optional
from fastapi import FastAPI, File, Form, Request, UploadFile from fastapi import FastAPI, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
@@ -257,6 +257,19 @@ def parse_path_list(value: str) -> List[str]:
return paths return paths
@app.post("/sources/browse")
async def browse_source(repo_url: str = Form(...), branch: str = Form("main")):
    """Forward a browse request from the web UI form to the backend API.

    Posts the submitted ``repo_url`` and ``branch`` as JSON to
    ``/api/v1/sources/browse`` and relays the backend's answer. Any failure
    is reported as a 400 JSON body with ``success`` set to False and the
    error text under ``error``.
    """
    backend = get_client()
    request_body = {"repo_url": repo_url, "branch": branch}
    try:
        backend_reply = await backend.post("/api/v1/sources/browse", json=request_body)
        # Built inside the try so a reply that cannot be serialised is also
        # reported through the error path below.
        return JSONResponse(content=backend_reply)
    except Exception as e:
        failure = {"success": False, "error": str(e)}
        return JSONResponse(status_code=400, content=failure)
@app.post("/sources/add") @app.post("/sources/add")
async def add_source( async def add_source(
library_id: str = Form(...), library_id: str = Form(...),
+241 -33
View File
@@ -3,59 +3,267 @@
{% block title %}Sources - Context7 Docs{% endblock %} {% block title %}Sources - Context7 Docs{% endblock %}
{% block content %} {% block content %}
<h2>Git Repository Sync</h2> <h2>Git Repository Sources</h2>
<div class="status-message">Add Git repositories to <code>docs_sources.yaml</code>, then sync them into searchable libraries.</div> <div class="status-message">Add a Git repository, browse its structure, select paths to include, then save and sync.</div>
<!-- Step 1: Enter repo details and browse -->
<div class="create-form">
<h3>Add Git Source</h3> <h3>Add Git Source</h3>
<form method="post" action="/sources/add" class="sync-form">
<label for="library_id">Library ID:</label>
<input type="text" id="library_id" name="library_id" placeholder="my-project" required>
<label for="repo_url">Repository URL:</label> <div style="display:flex;gap:10px;flex-wrap:wrap;margin-bottom:8px;">
<input type="text" id="repo_url" name="repo_url" placeholder="https://github.com/user/repo.git" required> <div style="flex:2;min-width:220px;">
<label for="browse_url" style="display:block;font-weight:bold;margin-bottom:4px;">Repository URL</label>
<input type="text" id="browse_url" placeholder="https://github.com/user/repo.git"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div style="flex:0 0 120px;">
<label for="browse_branch" style="display:block;font-weight:bold;margin-bottom:4px;">Branch</label>
<input type="text" id="browse_branch" value="main"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div style="flex:0 0 auto;align-self:flex-end;padding-bottom:1px;">
<button type="button" id="browse-btn" class="btn btn-secondary" onclick="browseRepo()">Browse Repository</button>
</div>
</div>
<label for="name">Display Name:</label> <!-- Tree viewer -->
<input type="text" id="name" name="name" placeholder="My Project"> <div id="tree-section" style="display:none;margin-top:12px;">
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:6px;">
<strong>Select paths to include:</strong>
<span>
<button type="button" class="btn btn-sm btn-info" onclick="selectAll()">Select All Dirs</button>
<button type="button" class="btn btn-sm" style="background:#888;color:#fff;" onclick="clearAll()">Clear</button>
</span>
</div>
<div id="tree-container"
style="border:1px solid #ccc;border-radius:4px;padding:10px;max-height:380px;overflow-y:auto;background:#fafafa;font-family:monospace;font-size:0.88rem;">
</div>
</div>
<label for="description">Description:</label> <!-- Step 2: Library details (shown after browse) -->
<input type="text" id="description" name="description" placeholder="Project documentation"> <div id="save-section" style="display:none;margin-top:16px;">
<form method="post" action="/sources/add" id="save-form">
<input type="hidden" id="include_paths" name="include_paths" value="">
<label for="branch">Branch:</label> <div style="display:grid;grid-template-columns:1fr 1fr;gap:10px;margin-bottom:10px;">
<input type="text" id="branch" name="branch" value="main"> <div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Library ID <span style="color:red;">*</span></label>
<input type="text" name="library_id" id="lib_id" placeholder="my-project" required
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Display Name</label>
<input type="text" name="name" id="lib_name" placeholder="My Project"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Description</label>
<input type="text" name="description" placeholder="Optional description"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Exclude Paths</label>
<input type="text" name="exclude_paths" value="node_modules,.git" placeholder="node_modules,.git"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
</div>
<label for="include_paths">Include Paths:</label> <input type="hidden" name="repo_url" id="form_repo_url">
<textarea id="include_paths" name="include_paths" rows="4">docs</textarea> <input type="hidden" name="branch" id="form_branch">
<label for="exclude_paths">Exclude Paths:</label> <div id="selected-paths-preview"
<textarea id="exclude_paths" name="exclude_paths" rows="4">node_modules style="background:#e8f4fd;border-radius:4px;padding:8px;margin-bottom:10px;font-size:0.85rem;display:none;">
.git</textarea> <strong>Selected include paths:</strong> <span id="paths-list"></span>
</div>
<button type="submit">Save Git Source</button> <button type="submit" class="btn btn-primary">Save Git Source</button>
</form> </form>
</div>
</div>
<form method="post" action="/sources/sync" class="sync-form"> <!-- Configured sources list -->
<label for="override">Override existing repos:</label>
<input type="checkbox" id="override" name="override">
<button type="submit">Sync All Repositories</button>
</form>
<div id="source-list"></div>
{% if sources %} {% if sources %}
<h3>Configured Sources</h3> <h3 style="margin-top:30px;">Configured Sources ({{ sources|length }})</h3>
<div class="source-cards"> <div class="source-cards">
{% for src in sources %} {% for src in sources %}
<div class="source-card"> <div class="source-card">
<strong>{{ src.library_id | default('unknown') }}</strong><br> <strong>{{ src.library_id | default('unknown') }}</strong>
URL: {{ src.repo_url | default('N/A') }}<br> &nbsp;<span style="color:#666;font-size:0.85rem;">{{ src.repo_url | default('') }}</span><br>
Branch: {{ src.branch | default('main') }}<br> Branch: {{ src.branch | default('main') }}
Include: {{ src.include_paths | default(['*']) | join(', ') }} &nbsp;|&nbsp; Include: {{ src.include_paths | default(['*']) | join(', ') }}
{% if src.exclude_paths %}
&nbsp;|&nbsp; Exclude: {{ src.exclude_paths | join(', ') }}
{% endif %}
</div> </div>
{% endfor %} {% endfor %}
</div> </div>
{% else %} {% else %}
<p>No git sources configured. Add repositories to <code>docs_sources.yaml</code>.</p> <p style="margin-top:20px;color:#666;">No git sources configured yet.</p>
{% endif %} {% endif %}
<div style="margin-top:24px;">
<form method="post" action="/sources/sync" style="display:inline;">
<button type="submit" class="btn btn-primary">Sync All Repositories</button>
</form>
</div>
{% endblock %}
{% block scripts %}
<script>
// Repo-relative paths currently ticked in the tree; updatePreview() writes
// them into the hidden include_paths form field.
const checkedPaths = new Set();
// Ask the server to clone the repository entered in the form, render the
// returned tree, reveal the save form and pre-fill it from the repo name.
// The button is disabled while the request is in flight.
async function browseRepo() {
  const url = document.getElementById('browse_url').value.trim();
  const branch = document.getElementById('browse_branch').value.trim() || 'main';
  if (!url) { alert('Enter a repository URL first.'); return; }

  const btn = document.getElementById('browse-btn');
  btn.disabled = true;
  btn.textContent = 'Cloning…';

  try {
    const fd = new FormData();
    fd.append('repo_url', url);
    fd.append('branch', branch);
    const resp = await fetch('/sources/browse', { method: 'POST', body: fd });
    const data = await resp.json();
    if (!data.success) {
      alert('Browse failed: ' + (data.detail || data.error || 'unknown error'));
      return;
    }

    checkedPaths.clear();
    renderTree(data.tree);
    document.getElementById('tree-section').style.display = 'block';
    document.getElementById('save-section').style.display = 'block';
    document.getElementById('form_repo_url').value = url;
    document.getElementById('form_branch').value = branch;

    // Auto-fill library ID from repo name. Trailing slashes are dropped first
    // so "https://host/user/repo/" still yields "repo" instead of "".
    const repoName = url
      .replace(/\/+$/, '')
      .split('/')
      .pop()
      .replace(/\.git$/, '')
      .toLowerCase()
      .replace(/[^a-z0-9-]/g, '-');
    const libIdInput = document.getElementById('lib_id');
    // Never overwrite what the user typed, and never blank the display name
    // when no usable repo name could be derived.
    if (repoName && !libIdInput.value) {
      libIdInput.value = repoName;
      document.getElementById('lib_name').value = repoName;
    }
  } catch (e) {
    alert('Request failed: ' + e.message);
  } finally {
    btn.disabled = false;
    btn.textContent = 'Browse Repository';
  }
}
// Replace the contents of the tree container with markup for `nodes`.
function renderTree(nodes) {
  const markup = buildTreeHTML(nodes, 0);
  document.getElementById('tree-container').innerHTML = markup;
}
// Build the nested <ul> markup for a list of tree nodes.
//   nodes: array of {path, type: 'dir'|'file', children?} objects
//   depth: nesting level; only level 0 is rendered without left padding
// Returns '' for a missing or empty list.
// Paths come from the cloned repository and are therefore untrusted: every
// value is HTML-escaped before being placed in text or in the data-path
// attribute, so a crafted file name cannot inject markup or break out of it.
function buildTreeHTML(nodes, depth) {
  if (!nodes || nodes.length === 0) return '';

  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => entities[ch]);

  let html = '<ul style="list-style:none;padding-left:' + (depth === 0 ? 0 : 18) + 'px;margin:0;">';
  for (const node of nodes) {
    const safePath = escapeHtml(node.path);
    const safeName = escapeHtml(String(node.path).split('/').pop());
    if (node.type === 'dir') {
      const hasChildren = node.children && node.children.length > 0;
      html += `<li style="margin:2px 0;">
<span class="tree-toggle" onclick="toggleDir(this)" style="cursor:pointer;user-select:none;">&#9656;</span>
<label style="cursor:pointer;">
<input type="checkbox" class="dir-check" data-path="${safePath}"
onchange="onDirCheck(this)"> &#128193; ${safeName}/
</label>
<div class="tree-children" style="display:none;">
${hasChildren ? buildTreeHTML(node.children, depth + 1) : ''}
</div>
</li>`;
    } else {
      html += `<li style="margin:2px 0;color:#555;">
<label style="cursor:pointer;padding-left:20px;">
<input type="checkbox" class="file-check" data-path="${safePath}"
onchange="onFileCheck(this)"> &#128196; ${safeName}
</label>
</li>`;
    }
  }
  html += '</ul>';
  return html;
}
// Expand or collapse the children of the directory row that owns `arrow`,
// switching the arrow glyph to match the new state.
function toggleDir(arrow) {
  const childBox = arrow.closest('li').querySelector('.tree-children');
  const isCollapsed = childBox.style.display === 'none';
  childBox.style.display = isCollapsed ? 'block' : 'none';
  arrow.innerHTML = isCollapsed ? '&#9662;' : '&#9656;';
}
// Handle a change on a directory checkbox.
// Ticking a directory records its path and unticks every checkbox nested
// inside the same row, since the directory already covers them. Unticking
// only removes the directory's own path. Ancestor checkboxes are left as is.
function onDirCheck(cb) {
  const dirPath = cb.dataset.path;
  if (!cb.checked) {
    checkedPaths.delete(dirPath);
  } else {
    checkedPaths.add(dirPath);
    const row = cb.closest('li');
    for (const box of row.querySelectorAll('.dir-check,.file-check')) {
      if (box === cb) continue;
      box.checked = false;
      checkedPaths.delete(box.dataset.path);
    }
  }
  updatePreview();
}
// Handle a change on a file checkbox: record or forget its path, then
// refresh the preview.
function onFileCheck(cb) {
  const filePath = cb.dataset.path;
  if (cb.checked) {
    checkedPaths.add(filePath);
  } else {
    checkedPaths.delete(filePath);
  }
  updatePreview();
}
// Select every top-level directory and nothing else.
// All checkboxes are reset first so the ticks on screen always match
// checkedPaths; previously root-level file checkboxes stayed ticked even
// though their paths had been removed from the selection.
function selectAll() {
  checkedPaths.clear();
  document.querySelectorAll('.dir-check,.file-check').forEach(cb => {
    // A top-level directory is a dir checkbox whose path has no separator.
    const isTopLevelDir = cb.classList.contains('dir-check') && !cb.dataset.path.includes('/');
    cb.checked = isTopLevelDir;
    if (isTopLevelDir) {
      checkedPaths.add(cb.dataset.path);
    }
  });
  updatePreview();
}
// Drop the whole selection and untick every checkbox in the tree.
function clearAll() {
  checkedPaths.clear();
  for (const box of document.querySelectorAll('.dir-check,.file-check')) {
    box.checked = false;
  }
  updatePreview();
}
// Write the sorted selection into the hidden include_paths field (one path
// per line) and show it, comma-separated, in the preview box. The preview
// box is hidden while nothing is selected.
function updatePreview() {
  const sortedPaths = Array.from(checkedPaths).sort();
  const hasSelection = sortedPaths.length > 0;

  document.getElementById('include_paths').value = sortedPaths.join('\n');
  document.getElementById('selected-paths-preview').style.display = hasSelection ? 'block' : 'none';
  document.getElementById('paths-list').textContent = hasSelection ? sortedPaths.join(', ') : '';
}
// Saving with no paths selected means the whole repository is ingested, so
// ask for confirmation and cancel the submit if the user declines.
document.getElementById('save-form').addEventListener('submit', function(e) {
  if (checkedPaths.size !== 0) return;
  const proceed = confirm('No paths selected — this will ingest the entire repository. Continue?');
  if (!proceed) {
    e.preventDefault();
  }
});
</script>
{% endblock %} {% endblock %}