Fix git sync and add repo browser with path selection
- Fix discover_files: rel_path is now always computed (it was stuck at '.' at the repo root), include_path_match now uses the relative path, and 'return' was changed to 'continue'.
- Fix ingest_git_source: files were cloned but ingested from the wrong path (docs/repo-id instead of data/repos/repo-id). It now stages the filtered files into DOCS_PATH/library_id before calling ingest_library.
- Add browse_repo_tree() for interactive repo exploration.
- Add POST /api/v1/sources/browse endpoint to the backend.
- Add /sources/browse proxy route to the webui.
- Rewrite sources.html: browse the repo, expand/collapse the tree, check paths to include, then save the source and sync.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+99
-79
@@ -128,85 +128,60 @@ def discover_files(
|
||||
}
|
||||
|
||||
discovered = []
|
||||
|
||||
def should_include(path: Path, rel_path: Path) -> bool:
|
||||
"""Check if a path matches any include pattern."""
|
||||
if not include_patterns:
|
||||
return True
|
||||
|
||||
# Normalize paths for comparison (handle trailing slashes, etc.)
|
||||
path_str = str(path).replace("\\", "/")
|
||||
|
||||
def should_exclude(rel_path: Path) -> bool:
|
||||
rel_str = str(rel_path).replace("\\", "/")
|
||||
|
||||
for inc_pattern in include_patterns:
|
||||
inc_str = str(inc_pattern).replace("\\", "/")
|
||||
|
||||
# If pattern has subdirs, check prefix match
|
||||
if "/" in inc_str and not inc_str.endswith("/"):
|
||||
pattern_base = inc_str.rsplit("/", 1)[0] + "/"
|
||||
if rel_str.startswith(pattern_base):
|
||||
return True
|
||||
elif rel_str == inc_str:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def should_exclude(path: Path, rel_path: Path) -> bool:
|
||||
"""Check if a path matches any exclude pattern (simple prefix/exact match)."""
|
||||
for exc_pattern in exclude_patterns:
|
||||
exc_str = str(exc_pattern).replace("\\", "/")
|
||||
rel_str = str(rel_path).replace("\\", "/")
|
||||
|
||||
# Exact match or parent directory match
|
||||
if rel_str == exc_str or rel_str.startswith(exc_str + "/"):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def file_under_include(rel_path: Path) -> bool:
    """Return True if the file is equal to, or located below, an include path.

    ``include_patterns`` is read from the enclosing scope (it is not defined
    in this block). When it is empty, every file is accepted.

    Args:
        rel_path: Path of the file relative to the repository root.

    Returns:
        True when no include patterns are configured, or when ``rel_path``
        equals an include pattern or lies inside the directory it names.
    """
    if not include_patterns:
        return True
    rel_str = str(rel_path).replace("\\", "/")
    for inc_pattern in include_patterns:
        # Relative paths never carry a leading or trailing slash, so a
        # pattern written as "docs/" or "/docs" must be reduced to "docs"
        # before comparing; otherwise it would silently match nothing.
        inc_str = str(inc_pattern).replace("\\", "/").strip("/")
        # The "/" suffix keeps "src/app" from matching "src/application".
        if rel_str == inc_str or rel_str.startswith(inc_str + "/"):
            return True
    return False
|
||||
|
||||
def dir_may_contain_includes(rel_path: Path) -> bool:
    """Return True if a directory is worth descending into.

    A directory qualifies when it is an include path itself, lies inside an
    include path, or is an ancestor of one (so the walk can reach it).
    ``include_patterns`` is read from the enclosing scope (it is not defined
    in this block). When it is empty, every directory qualifies.

    Args:
        rel_path: Path of the directory relative to the repository root.

    Returns:
        True if the directory may contain included files, False otherwise.
    """
    if not include_patterns:
        return True
    rel_str = str(rel_path).replace("\\", "/")
    for inc_pattern in include_patterns:
        # Reduce "docs/api/" or "/docs/api" to "docs/api": relative paths
        # never carry surrounding slashes, so an unnormalized pattern would
        # let the walk enter "docs" but never "docs/api" itself.
        inc_str = str(inc_pattern).replace("\\", "/").strip("/")
        if (
            rel_str == inc_str
            # Directory is inside an include path.
            or rel_str.startswith(inc_str + "/")
            # Directory is an ancestor of an include path.
            or inc_str.startswith(rel_str + "/")
        ):
            return True
    return False
|
||||
|
||||
def walk_and_collect(current: Path, rel_prefix: Path):
|
||||
"""Recursive walk function."""
|
||||
try:
|
||||
for entry in sorted(os.scandir(current), key=lambda e: e.name):
|
||||
entry_path = current / entry.name
|
||||
rel_path = (rel_prefix / entry.name).replace("\\", "/") if str(rel_prefix) != "." else rel_prefix
|
||||
|
||||
# Filter by exclude paths first
|
||||
if should_exclude(entry_path, rel_path):
|
||||
rel_path = rel_prefix / entry.name
|
||||
|
||||
if should_exclude(rel_path):
|
||||
continue
|
||||
|
||||
# If include_paths specified, only go into matching directories
|
||||
if include_patterns and not include_path_match(entry_path, rel_path):
|
||||
if entry.is_dir():
|
||||
return # Don't descend into this directory
|
||||
|
||||
|
||||
if entry.is_file():
|
||||
discovered.append({
|
||||
"path": str(rel_path).lstrip("/"),
|
||||
"full_path": str(entry_path),
|
||||
"is_binary": is_probably_binary(str(entry_path))
|
||||
})
|
||||
if file_under_include(rel_path):
|
||||
discovered.append({
|
||||
"path": str(rel_path).replace("\\", "/"),
|
||||
"full_path": str(entry_path),
|
||||
"is_binary": is_probably_binary(str(entry_path))
|
||||
})
|
||||
elif entry.is_dir():
|
||||
walk_and_collect(entry_path, rel_path)
|
||||
|
||||
if dir_may_contain_includes(rel_path):
|
||||
walk_and_collect(entry_path, rel_path)
|
||||
|
||||
except PermissionError:
|
||||
# Skip directories we can't read
|
||||
pass
|
||||
|
||||
def include_path_match(path: Path, rel_path: Path) -> bool:
    """Check whether an entry matches any include pattern.

    Include patterns are compared against ``rel_path``. Comparing the
    on-disk ``path`` against them can never succeed, because that path
    carries the clone directory as a prefix.
    ``include_patterns`` is read from the enclosing scope (it is not defined
    in this block). When it is empty, every entry matches.

    Args:
        path: On-disk path of the entry. Kept for signature compatibility;
            it is not used for matching.
        rel_path: Path of the entry relative to the repository root.

    Returns:
        True if no include patterns are configured, or if ``rel_path``
        equals an include pattern or lies below it.
    """
    if not include_patterns:
        return True

    rel_str = str(rel_path).replace("\\", "/")
    for inc_pattern in include_patterns:
        inc_str = str(inc_pattern).replace("\\", "/")

        # Exact match, or the entry lives under the include directory.
        if rel_str == inc_str or rel_str.startswith(inc_str + "/"):
            return True

    return False
|
||||
|
||||
|
||||
def is_probably_binary(filepath: str) -> bool:
|
||||
"""Simple binary detection based on file extension and first bytes."""
|
||||
ext = Path(filepath).suffix.lower()
|
||||
@@ -224,14 +199,44 @@ def discover_files(
|
||||
|
||||
return False
|
||||
|
||||
root_str = str(repo_path).replace("\\", "/")
|
||||
|
||||
# Walk the repository starting from repo root
|
||||
walk_and_collect(repo_path, Path("."))
|
||||
|
||||
walk_and_collect(repo_path, Path())
|
||||
return discovered
|
||||
|
||||
|
||||
def browse_repo_tree(repo_path: Path, max_depth: int = 4) -> List[Dict[str, Any]]:
    """
    Return a nested directory tree for browsing a cloned repository.

    Returns a list of nodes:
        {"path": "docs", "type": "dir", "children": [...]}
        {"path": "README.md", "type": "file"}

    Paths are relative to repo_path and use forward slashes. Hidden entries
    (starting with '.') and entries named in SKIP_DIRS are skipped. Within
    each directory, directories come before files and both are ordered
    case-insensitively by name.

    Symbolic links are never followed: a link is reported as a "file" node
    even when it points at a directory, so a link inside a cloned repository
    cannot expose directory listings from outside repo_path.

    Args:
        repo_path: Root directory of the cloned repository.
        max_depth: Number of directory levels to list. Directories at the
            depth limit are returned with an empty "children" list; a value
            of 0 or less returns an empty list.

    Returns:
        The list of top-level nodes. Directories that cannot be read
        (PermissionError) contribute an empty list.
    """
    SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", ".tox"}

    def build(current: Path, rel_prefix: Path, depth: int) -> List[Dict[str, Any]]:
        if depth <= 0:
            return []
        try:
            # The context manager releases the directory handle promptly,
            # even if sorting raises part-way through.
            with os.scandir(current) as scan:
                entries = sorted(
                    scan,
                    key=lambda e: (
                        not e.is_dir(follow_symlinks=False),
                        e.name.lower(),
                    ),
                )
        except PermissionError:
            return []

        nodes: List[Dict[str, Any]] = []
        for entry in entries:
            if entry.name.startswith(".") or entry.name in SKIP_DIRS:
                continue
            rel_path = rel_prefix / entry.name
            rel_str = str(rel_path).replace("\\", "/")
            # Only real directories are descended into; symlinks fall
            # through to the "file" branch.
            if entry.is_dir(follow_symlinks=False):
                children = build(Path(entry.path), rel_path, depth - 1)
                nodes.append({"path": rel_str, "type": "dir", "children": children})
            else:
                nodes.append({"path": rel_str, "type": "file"})
        return nodes

    return build(repo_path, Path(), max_depth)
|
||||
|
||||
|
||||
async def ingest_git_source(
|
||||
library_id: str,
|
||||
name: str,
|
||||
@@ -263,7 +268,6 @@ async def ingest_git_source(
|
||||
Raises:
|
||||
GitCloneError: If git operations fail
|
||||
"""
|
||||
from .db import upsert_library
|
||||
from .ingest import ingest_library
|
||||
|
||||
print(f"\n[Git Ingestion] Processing library: {library_id}")
|
||||
@@ -303,19 +307,35 @@ async def ingest_git_source(
|
||||
"message": "No files found matching include/exclude criteria",
|
||||
"files_discovered": 0
|
||||
}
|
||||
|
||||
# Remove .git directory if present (avoid processing it)
|
||||
git_dir = repo_path / ".git"
|
||||
if git_dir.exists():
|
||||
shutil.rmtree(git_dir)
|
||||
print(f" [Git] Removed .git directory")
|
||||
|
||||
# Ingest using existing library ingestion pipeline
|
||||
|
||||
# Stage only the filtered files into DOCS_PATH/library_id so that
|
||||
# ingest_library reads exactly what discover_files selected.
|
||||
from .config import settings
|
||||
docs_dir = Path(settings.docs_path) / library_id
|
||||
if docs_dir.exists():
|
||||
shutil.rmtree(docs_dir)
|
||||
docs_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
staged = 0
|
||||
for file_info in files:
|
||||
if file_info.get("is_binary"):
|
||||
continue
|
||||
src = Path(file_info["full_path"])
|
||||
dst = docs_dir / file_info["path"]
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
shutil.copy2(src, dst)
|
||||
staged += 1
|
||||
except OSError as exc:
|
||||
print(f" [Git] Warning: could not copy {src}: {exc}")
|
||||
|
||||
print(f" [Git] Staged {staged} file(s) to {docs_dir}")
|
||||
|
||||
result = await ingest_library(
|
||||
library_id=library_id,
|
||||
name=name,
|
||||
description=description,
|
||||
source_path=repo_id # Use repo_id as the "source path" for tracking
|
||||
source_path=library_id,
|
||||
)
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user