Fix git sync and add repo browser with path selection

- Fix discover_files: rel_path always computed (was stuck at '.' at root),
  include_path_match now uses relative path, 'return' changed to 'continue'
- Fix ingest_git_source: files were cloned but ingested from wrong path
  (docs/repo-id instead of data/repos/repo-id). Now stages filtered files
  into DOCS_PATH/library_id before calling ingest_library.
- Add browse_repo_tree() for interactive repo exploration
- Add POST /api/v1/sources/browse endpoint to backend
- Add /sources/browse proxy route to webui
- Rewrite sources.html: browse repo, expand/collapse tree, check paths to
  include, then save source and sync

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
george
2026-06-06 01:28:10 +01:00
parent 1b61af8873
commit ff4da0cb9e
4 changed files with 394 additions and 128 deletions
+85 -65
View File
@@ -129,84 +129,59 @@ def discover_files(
discovered = []
def should_include(path: Path, rel_path: Path) -> bool:
"""Check if a path matches any include pattern."""
if not include_patterns:
return True
# Normalize paths for comparison (handle trailing slashes, etc.)
path_str = str(path).replace("\\", "/")
def should_exclude(rel_path: Path) -> bool:
rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
# If pattern has subdirs, check prefix match
if "/" in inc_str and not inc_str.endswith("/"):
pattern_base = inc_str.rsplit("/", 1)[0] + "/"
if rel_str.startswith(pattern_base):
return True
elif rel_str == inc_str:
return True
return False
def should_exclude(path: Path, rel_path: Path) -> bool:
"""Check if a path matches any exclude pattern (simple prefix/exact match)."""
for exc_pattern in exclude_patterns:
exc_str = str(exc_pattern).replace("\\", "/")
rel_str = str(rel_path).replace("\\", "/")
# Exact match or parent directory match
if rel_str == exc_str or rel_str.startswith(exc_str + "/"):
return True
return False
def file_under_include(rel_path: Path) -> bool:
"""True if the file is inside an include path."""
if not include_patterns:
return True
rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
if rel_str == inc_str or rel_str.startswith(inc_str + "/"):
return True
return False
def dir_may_contain_includes(rel_path: Path) -> bool:
"""True if this directory is inside, equal to, or a parent of any include path."""
if not include_patterns:
return True
rel_str = str(rel_path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
if rel_str == inc_str or rel_str.startswith(inc_str + "/") or inc_str.startswith(rel_str + "/"):
return True
return False
def walk_and_collect(current: Path, rel_prefix: Path):
"""Recursive walk function."""
try:
for entry in sorted(os.scandir(current), key=lambda e: e.name):
entry_path = current / entry.name
rel_path = (rel_prefix / entry.name).replace("\\", "/") if str(rel_prefix) != "." else rel_prefix
rel_path = rel_prefix / entry.name
# Filter by exclude paths first
if should_exclude(entry_path, rel_path):
if should_exclude(rel_path):
continue
# If include_paths specified, only go into matching directories
if include_patterns and not include_path_match(entry_path, rel_path):
if entry.is_dir():
return # Don't descend into this directory
if entry.is_file():
if file_under_include(rel_path):
discovered.append({
"path": str(rel_path).lstrip("/"),
"path": str(rel_path).replace("\\", "/"),
"full_path": str(entry_path),
"is_binary": is_probably_binary(str(entry_path))
})
elif entry.is_dir():
if dir_may_contain_includes(rel_path):
walk_and_collect(entry_path, rel_path)
except PermissionError:
# Skip directories we can't read
pass
def include_path_match(path: Path, rel_path: Path) -> bool:
"""Check if path matches any include pattern (for filtering on the fly)."""
if not include_patterns:
return True
path_str = str(path).replace("\\", "/")
for inc_pattern in include_patterns:
inc_str = str(inc_pattern).replace("\\", "/")
# Exact match or parent directory match
if path_str == inc_str or path_str.startswith(inc_str + "/"):
return True
return False
def is_probably_binary(filepath: str) -> bool:
"""Simple binary detection based on file extension and first bytes."""
ext = Path(filepath).suffix.lower()
@@ -224,14 +199,44 @@ def discover_files(
return False
root_str = str(repo_path).replace("\\", "/")
# Walk the repository starting from repo root
walk_and_collect(repo_path, Path("."))
walk_and_collect(repo_path, Path())
return discovered
def browse_repo_tree(repo_path: Path, max_depth: int = 4) -> List[Dict[str, Any]]:
    """
    Return a nested directory tree for browsing a cloned repository.

    Each node is a dict of one of two shapes:
        {"path": "docs", "type": "dir", "children": [...]}
        {"path": "README.md", "type": "file"}

    Paths are relative to repo_path and use forward slashes. Within each
    directory, sub-directories are listed before files and both groups are
    sorted case-insensitively by name. Hidden entries (starting with '.')
    and the names in SKIP_DIRS are skipped.

    Symlinked directories are reported as "dir" nodes with no children and
    are never descended into: repository content is untrusted, and a link
    may point outside repo_path or back at one of its own ancestors.

    Args:
        repo_path: Root directory of the checkout to list.
        max_depth: Number of directory levels to descend. Directories at the
            limit are reported with an empty "children" list.

    Returns:
        The list of top-level nodes. Empty when max_depth <= 0 or when the
        root raises PermissionError on listing.
    """
    SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", ".tox"}

    def build(current: Path, rel_prefix: Path, depth: int) -> List[Dict[str, Any]]:
        if depth <= 0:
            return []
        try:
            # The context manager releases the directory handle promptly.
            with os.scandir(current) as scan:
                entries = sorted(scan, key=lambda e: (not e.is_dir(), e.name.lower()))
        except PermissionError:
            # Unreadable directories are shown as empty rather than failing the browse.
            return []

        nodes: List[Dict[str, Any]] = []
        for entry in entries:
            if entry.name.startswith(".") or entry.name in SKIP_DIRS:
                continue
            rel_path = rel_prefix / entry.name
            rel_str = str(rel_path).replace("\\", "/")
            if entry.is_dir():
                if entry.is_symlink():
                    # Do not follow links out of (or around inside) the repository.
                    children: List[Dict[str, Any]] = []
                else:
                    children = build(Path(entry.path), rel_path, depth - 1)
                nodes.append({"path": rel_str, "type": "dir", "children": children})
            else:
                nodes.append({"path": rel_str, "type": "file"})
        return nodes

    return build(repo_path, Path(), max_depth)
async def ingest_git_source(
library_id: str,
name: str,
@@ -263,7 +268,6 @@ async def ingest_git_source(
Raises:
GitCloneError: If git operations fail
"""
from .db import upsert_library
from .ingest import ingest_library
print(f"\n[Git Ingestion] Processing library: {library_id}")
@@ -304,18 +308,34 @@ async def ingest_git_source(
"files_discovered": 0
}
# Remove .git directory if present (avoid processing it)
git_dir = repo_path / ".git"
if git_dir.exists():
shutil.rmtree(git_dir)
print(f" [Git] Removed .git directory")
# Stage only the filtered files into DOCS_PATH/library_id so that
# ingest_library reads exactly what discover_files selected.
from .config import settings
docs_dir = Path(settings.docs_path) / library_id
if docs_dir.exists():
shutil.rmtree(docs_dir)
docs_dir.mkdir(parents=True, exist_ok=True)
staged = 0
for file_info in files:
if file_info.get("is_binary"):
continue
src = Path(file_info["full_path"])
dst = docs_dir / file_info["path"]
dst.parent.mkdir(parents=True, exist_ok=True)
try:
shutil.copy2(src, dst)
staged += 1
except OSError as exc:
print(f" [Git] Warning: could not copy {src}: {exc}")
print(f" [Git] Staged {staged} file(s) to {docs_dir}")
# Ingest using existing library ingestion pipeline
result = await ingest_library(
library_id=library_id,
name=name,
description=description,
source_path=repo_id # Use repo_id as the "source path" for tracking
source_path=library_id,
)
return {
+26 -1
View File
@@ -22,7 +22,7 @@ from .db import (
search_libraries,
upsert_library,
)
from .git_source import ingest_git_source
from .git_source import browse_repo_tree, clone_or_update_repo, ingest_git_source
from .ingest import ingest_all, ingest_library
from .search import get_library_docs, resolve_library_id, search_docs
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
@@ -55,6 +55,11 @@ class GitSourceRequest(BaseModel):
exclude_paths: Optional[list[str]] = None
class BrowseRepoRequest(BaseModel):
    """Request body for POST /api/v1/sources/browse."""

    # URL of the Git repository to browse; must be a non-empty string.
    repo_url: str = Field(..., min_length=1)
    # Branch handed to the clone step; defaults to "main" when omitted.
    branch: str = "main"
DOCUMENT_EXTENSIONS = {
".md",
".txt",
@@ -424,6 +429,26 @@ async def api_add_source(source: GitSourceRequest):
return {"success": True, "created": created, "source": source_entry}
@app.post("/api/v1/sources/browse")
async def browse_repo_api(payload: BrowseRepoRequest):
    """Clone (or update) a repository and return its directory tree for path selection.

    The working copy is keyed by a short hash of the repository URL, so
    browsing the same URL again reuses the same ``browse-<hash>`` repo id.

    Args:
        payload: Repository URL and branch to browse.

    Returns:
        ``{"success": True, "tree": [...], "repo_url": ..., "branch": ...}``
        where ``tree`` is the output of ``browse_repo_tree``.

    Raises:
        HTTPException: 400 with the underlying error message if cloning or
            listing fails for any reason.
    """
    import hashlib

    # md5 only derives a stable directory name here; flagging it as
    # non-security keeps it usable on FIPS-restricted Python builds.
    repo_hash = hashlib.md5(
        payload.repo_url.encode(), usedforsecurity=False
    ).hexdigest()[:10]
    repo_id = f"browse-{repo_hash}"
    try:
        clone_result = clone_or_update_repo(
            repo_id=repo_id,
            repo_url=payload.repo_url,
            branch=payload.branch,
        )
        repo_path = Path(clone_result["repo_path"])
        tree = browse_repo_tree(repo_path)
    except Exception as exc:
        # Chain the cause so server logs keep the original traceback.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return {"success": True, "tree": tree, "repo_url": payload.repo_url, "branch": payload.branch}
@app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
source_data = await api_list_sources()
+14 -1
View File
@@ -5,7 +5,7 @@ from pathlib import Path
from typing import List, Optional
from fastapi import FastAPI, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
@@ -257,6 +257,19 @@ def parse_path_list(value: str) -> List[str]:
return paths
@app.post("/sources/browse")
async def browse_source(repo_url: str = Form(...), branch: str = Form("main")):
    """Forward a browse request from the web UI form to the backend API.

    Posts ``repo_url`` and ``branch`` as JSON to ``/api/v1/sources/browse``
    and relays the backend's result as a JSON response. Any failure is
    reported as a 400 JSON body of the form
    ``{"success": False, "error": "<message>"}``.
    """
    backend = get_client()
    request_body = {"repo_url": repo_url, "branch": branch}
    try:
        tree_payload = await backend.post("/api/v1/sources/browse", json=request_body)
        response = JSONResponse(content=tree_payload)
    except Exception as err:
        response = JSONResponse(
            status_code=400,
            content={"success": False, "error": str(err)},
        )
    return response
@app.post("/sources/add")
async def add_source(
library_id: str = Form(...),
+243 -35
View File
@@ -3,59 +3,267 @@
{% block title %}Sources - Context7 Docs{% endblock %}
{% block content %}
<h2>Git Repository Sync</h2>
<h2>Git Repository Sources</h2>
<div class="status-message">Add Git repositories to <code>docs_sources.yaml</code>, then sync them into searchable libraries.</div>
<div class="status-message">Add a Git repository, browse its structure, select paths to include, then save and sync.</div>
<h3>Add Git Source</h3>
<form method="post" action="/sources/add" class="sync-form">
<label for="library_id">Library ID:</label>
<input type="text" id="library_id" name="library_id" placeholder="my-project" required>
<!-- Step 1: Enter repo details and browse -->
<div class="create-form">
<h3>Add Git Source</h3>
<label for="repo_url">Repository URL:</label>
<input type="text" id="repo_url" name="repo_url" placeholder="https://github.com/user/repo.git" required>
<div style="display:flex;gap:10px;flex-wrap:wrap;margin-bottom:8px;">
<div style="flex:2;min-width:220px;">
<label for="browse_url" style="display:block;font-weight:bold;margin-bottom:4px;">Repository URL</label>
<input type="text" id="browse_url" placeholder="https://github.com/user/repo.git"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div style="flex:0 0 120px;">
<label for="browse_branch" style="display:block;font-weight:bold;margin-bottom:4px;">Branch</label>
<input type="text" id="browse_branch" value="main"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div style="flex:0 0 auto;align-self:flex-end;padding-bottom:1px;">
<button type="button" id="browse-btn" class="btn btn-secondary" onclick="browseRepo()">Browse Repository</button>
</div>
</div>
<label for="name">Display Name:</label>
<input type="text" id="name" name="name" placeholder="My Project">
<!-- Tree viewer -->
<div id="tree-section" style="display:none;margin-top:12px;">
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:6px;">
<strong>Select paths to include:</strong>
<span>
<button type="button" class="btn btn-sm btn-info" onclick="selectAll()">Select All Dirs</button>
<button type="button" class="btn btn-sm" style="background:#888;color:#fff;" onclick="clearAll()">Clear</button>
</span>
</div>
<div id="tree-container"
style="border:1px solid #ccc;border-radius:4px;padding:10px;max-height:380px;overflow-y:auto;background:#fafafa;font-family:monospace;font-size:0.88rem;">
</div>
</div>
<label for="description">Description:</label>
<input type="text" id="description" name="description" placeholder="Project documentation">
<!-- Step 2: Library details (shown after browse) -->
<div id="save-section" style="display:none;margin-top:16px;">
<form method="post" action="/sources/add" id="save-form">
<input type="hidden" id="include_paths" name="include_paths" value="">
<label for="branch">Branch:</label>
<input type="text" id="branch" name="branch" value="main">
<div style="display:grid;grid-template-columns:1fr 1fr;gap:10px;margin-bottom:10px;">
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Library ID <span style="color:red;">*</span></label>
<input type="text" name="library_id" id="lib_id" placeholder="my-project" required
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Display Name</label>
<input type="text" name="name" id="lib_name" placeholder="My Project"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Description</label>
<input type="text" name="description" placeholder="Optional description"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
<div>
<label style="display:block;font-weight:bold;margin-bottom:4px;">Exclude Paths</label>
<input type="text" name="exclude_paths" value="node_modules,.git" placeholder="node_modules,.git"
style="width:100%;box-sizing:border-box;padding:8px;border:1px solid #ccc;border-radius:4px;">
</div>
</div>
<label for="include_paths">Include Paths:</label>
<textarea id="include_paths" name="include_paths" rows="4">docs</textarea>
<input type="hidden" name="repo_url" id="form_repo_url">
<input type="hidden" name="branch" id="form_branch">
<label for="exclude_paths">Exclude Paths:</label>
<textarea id="exclude_paths" name="exclude_paths" rows="4">node_modules
.git</textarea>
<div id="selected-paths-preview"
style="background:#e8f4fd;border-radius:4px;padding:8px;margin-bottom:10px;font-size:0.85rem;display:none;">
<strong>Selected include paths:</strong> <span id="paths-list"></span>
</div>
<button type="submit">Save Git Source</button>
</form>
<form method="post" action="/sources/sync" class="sync-form">
<label for="override">Override existing repos:</label>
<input type="checkbox" id="override" name="override">
<button type="submit">Sync All Repositories</button>
</form>
<div id="source-list"></div>
<button type="submit" class="btn btn-primary">Save Git Source</button>
</form>
</div>
</div>
<!-- Configured sources list -->
{% if sources %}
<h3>Configured Sources</h3>
<h3 style="margin-top:30px;">Configured Sources ({{ sources|length }})</h3>
<div class="source-cards">
{% for src in sources %}
<div class="source-card">
<strong>{{ src.library_id | default('unknown') }}</strong><br>
URL: {{ src.repo_url | default('N/A') }}<br>
Branch: {{ src.branch | default('main') }}<br>
Include: {{ src.include_paths | default(['*']) | join(', ') }}
<strong>{{ src.library_id | default('unknown') }}</strong>
&nbsp;<span style="color:#666;font-size:0.85rem;">{{ src.repo_url | default('') }}</span><br>
Branch: {{ src.branch | default('main') }}
&nbsp;|&nbsp; Include: {{ src.include_paths | default(['*']) | join(', ') }}
{% if src.exclude_paths %}
&nbsp;|&nbsp; Exclude: {{ src.exclude_paths | join(', ') }}
{% endif %}
</div>
{% endfor %}
</div>
{% else %}
<p>No git sources configured. Add repositories to <code>docs_sources.yaml</code>.</p>
<p style="margin-top:20px;color:#666;">No git sources configured yet.</p>
{% endif %}
<div style="margin-top:24px;">
<form method="post" action="/sources/sync" style="display:inline;">
<button type="submit" class="btn btn-primary">Sync All Repositories</button>
</form>
</div>
{% endblock %}
{% block scripts %}
<script>
// Repo-relative paths currently ticked in the tree. updatePreview() serializes
// this set into the hidden include_paths form field.
const checkedPaths = new Set();
/**
 * Ask the web UI to clone the repository entered in the form and render its tree.
 *
 * Reads the URL and branch inputs, POSTs them as form data to /sources/browse,
 * and on success resets the current selection, renders the tree, reveals the
 * tree and save sections, and copies URL/branch into the hidden save-form
 * fields. The Library ID and Display Name are pre-filled from the repository
 * name only when the Library ID is still empty. Failures are shown via alert().
 */
async function browseRepo() {
  const url = document.getElementById('browse_url').value.trim();
  const branch = document.getElementById('browse_branch').value.trim() || 'main';
  if (!url) { alert('Enter a repository URL first.'); return; }

  const btn = document.getElementById('browse-btn');
  btn.disabled = true;
  btn.textContent = 'Cloning…';

  try {
    const fd = new FormData();
    fd.append('repo_url', url);
    fd.append('branch', branch);
    const resp = await fetch('/sources/browse', { method: 'POST', body: fd });
    const data = await resp.json();
    if (!data.success) {
      alert('Browse failed: ' + (data.detail || data.error || 'unknown error'));
      return;
    }

    checkedPaths.clear();
    renderTree(data.tree);
    document.getElementById('tree-section').style.display = 'block';
    document.getElementById('save-section').style.display = 'block';
    document.getElementById('form_repo_url').value = url;
    document.getElementById('form_branch').value = branch;

    // Auto-fill library ID from repo name. Trailing slashes are dropped first
    // so "https://host/user/repo/" still yields "repo" instead of "".
    const repoName = url
      .replace(/\/+$/, '')
      .split('/')
      .pop()
      .replace(/\.git$/, '')
      .toLowerCase()
      .replace(/[^a-z0-9-]/g, '-');
    if (repoName && !document.getElementById('lib_id').value) {
      document.getElementById('lib_id').value = repoName;
      document.getElementById('lib_name').value = repoName;
    }
  } catch (e) {
    alert('Request failed: ' + e.message);
  } finally {
    // Always restore the button, including after the early return above.
    btn.disabled = false;
    btn.textContent = 'Browse Repository';
  }
}
/** Replace the contents of #tree-container with the markup for `nodes`. */
function renderTree(nodes) {
  const markup = buildTreeHTML(nodes, 0);
  document.getElementById('tree-container').innerHTML = markup;
}
/**
 * Build the nested <ul> markup for a list of tree nodes.
 *
 * @param {Array<{path: string, type: string, children?: Array}>} nodes
 *   Nodes as returned by the browse endpoint.
 * @param {number} depth  Nesting level; only level 0 gets no left padding.
 * @returns {string} HTML string, or '' when `nodes` is empty or missing.
 */
function buildTreeHTML(nodes, depth) {
  if (!nodes || nodes.length === 0) return '';

  // Paths come from an arbitrary repository and end up in innerHTML, so they
  // must be escaped for both attribute and text context. The browser decodes
  // the entities again, so dataset.path still returns the real path.
  const esc = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  let html = '<ul style="list-style:none;padding-left:' + (depth === 0 ? 0 : 18) + 'px;margin:0;">';
  for (const node of nodes) {
    const safePath = esc(node.path);
    const safeName = esc(node.path.split('/').pop());
    if (node.type === 'dir') {
      const hasChildren = node.children && node.children.length > 0;
      html += `<li style="margin:2px 0;">
<span class="tree-toggle" onclick="toggleDir(this)" style="cursor:pointer;user-select:none;">&#9656;</span>
<label style="cursor:pointer;">
<input type="checkbox" class="dir-check" data-path="${safePath}"
onchange="onDirCheck(this)"> &#128193; ${safeName}/
</label>
<div class="tree-children" style="display:none;">
${hasChildren ? buildTreeHTML(node.children, depth + 1) : ''}
</div>
</li>`;
    } else {
      html += `<li style="margin:2px 0;color:#555;">
<label style="cursor:pointer;padding-left:20px;">
<input type="checkbox" class="file-check" data-path="${safePath}"
onchange="onFileCheck(this)"> &#128196; ${safeName}
</label>
</li>`;
    }
  }
  html += '</ul>';
  return html;
}
/** Expand or collapse the child list of the directory row owning `arrow`. */
function toggleDir(arrow) {
  const childBox = arrow.closest('li').querySelector('.tree-children');
  const collapsed = childBox.style.display === 'none';
  childBox.style.display = collapsed ? 'block' : 'none';
  // Down-pointing triangle when open, right-pointing when closed.
  arrow.innerHTML = collapsed ? '&#9662;' : '&#9656;';
}
/**
 * Handle a change on a directory checkbox.
 *
 * Checking a directory records its path and unticks every checkbox nested
 * beneath it, since those paths are already covered by the directory itself.
 * Unchecking only removes the directory's own path.
 */
function onDirCheck(cb) {
  const dirPath = cb.dataset.path;
  if (!cb.checked) {
    checkedPaths.delete(dirPath);
    updatePreview();
    return;
  }

  checkedPaths.add(dirPath);
  for (const box of cb.closest('li').querySelectorAll('.dir-check,.file-check')) {
    if (box === cb) continue;
    box.checked = false;
    checkedPaths.delete(box.dataset.path);
  }
  updatePreview();
}
/** Add or remove a file's path from the selection when its checkbox changes. */
function onFileCheck(cb) {
  const filePath = cb.dataset.path;
  if (cb.checked) {
    checkedPaths.add(filePath);
  } else {
    checkedPaths.delete(filePath);
  }
  updatePreview();
}
/**
 * Select every top-level directory and nothing else.
 *
 * The selection set is rebuilt from scratch, so every checkbox is unticked
 * first. That includes root-level files, which are not nested under any
 * directory row and would otherwise stay visually checked while no longer
 * being part of the include list.
 */
function selectAll() {
  checkedPaths.clear();
  document.querySelectorAll('.dir-check,.file-check').forEach(c => { c.checked = false; });
  document.querySelectorAll('.dir-check').forEach(cb => {
    // Only select top-level dirs; their contents are implied.
    if (!cb.dataset.path.includes('/')) {
      cb.checked = true;
      checkedPaths.add(cb.dataset.path);
    }
  });
  updatePreview();
}
/** Empty the selection and untick every directory and file checkbox. */
function clearAll() {
  checkedPaths.clear();
  for (const box of document.querySelectorAll('.dir-check,.file-check')) {
    box.checked = false;
  }
  updatePreview();
}
/**
 * Sync the hidden include_paths field and the preview banner with the
 * current selection. Paths are sorted and newline-separated in the form field,
 * comma-separated in the banner. The banner is hidden when nothing is selected.
 */
function updatePreview() {
  const selected = Array.from(checkedPaths).sort();
  document.getElementById('include_paths').value = selected.join('\n');

  const banner = document.getElementById('selected-paths-preview');
  const label = document.getElementById('paths-list');
  const hasSelection = selected.length > 0;
  banner.style.display = hasSelection ? 'block' : 'none';
  label.textContent = hasSelection ? selected.join(', ') : '';
}
// Saving with no include paths means the whole repository is ingested, so
// ask for confirmation and cancel the submit if the user declines.
document.getElementById('save-form').addEventListener('submit', (event) => {
  if (checkedPaths.size > 0) return;
  const proceed = confirm('No paths selected — this will ingest the entire repository. Continue?');
  if (!proceed) {
    event.preventDefault();
  }
});
</script>
{% endblock %}