Initial DocsMCP stack

This commit is contained in:
george
2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
+389
View File
@@ -0,0 +1,389 @@
# Git Source Operations for Repository Cloning and File Discovery
import os
import shutil
from pathlib import Path
from typing import List, Optional, Dict, Any
def get_repos_dir() -> Path:
    """Return the base directory under which cloned repositories are stored.

    The location is ``data/repos`` below the directory found by stepping three
    ``parent`` levels up from this module file (treated by this module as the
    project root). Nothing is created on disk here; the path is only computed.

    Returns:
        Path to the repositories base directory.
    """
    module_file = Path(__file__)
    project_root = module_file.parent.parent.parent
    return project_root.joinpath("data", "repos")
def ensure_repos_dir():
    """Create the repositories base directory if it is missing.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call can be repeated safely.

    Returns:
        The path returned by ``get_repos_dir()``.
    """
    target = get_repos_dir()
    os.makedirs(target, exist_ok=True)
    return target
# Create the repos directory as a side effect of importing this module.
# ensure_repos_dir() tolerates an existing directory, so repeated imports or
# reloads are harmless.
ensure_repos_dir()
class GitCloneError(Exception):
    """Raised when cloning, fetching or checking out a git repository fails."""
def clone_or_update_repo(
    repo_id: str,
    repo_url: str,
    branch: str,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone a git repository or update an existing clone.

    Behaviour by state of ``repos_base / repo_id``:

    * contains a ``.git`` entry -> fetch ``branch`` from ``origin`` and hard
      reset the work tree to it;
    * exists but has no ``.git`` entry (e.g. a previous export whose metadata
      was stripped) -> the stale directory is removed and cloned afresh;
    * does not exist -> fresh single-branch clone.

    Args:
        repo_id: Unique identifier for this repository (used in paths). Must
            resolve to a location strictly inside ``repos_base``.
        repo_url: Git URL to clone from (only required when a clone is needed)
        branch: Branch name to checkout
        repos_base: Base directory for repos (defaults to get_repos_dir())
    Returns:
        Dict with keys ``success``, ``repo_path``, ``url`` and ``branch``
    Raises:
        GitCloneError: If the target path is invalid, git cannot be run, or
            any git command exits with a non-zero status
    """
    # Imported before any branching so that every code path (and the error
    # handling below) can see the name.
    import subprocess

    repos_base = Path(repos_base or get_repos_dir())
    repo_path = repos_base / repo_id

    def run_git(args: List[str], failure: str) -> None:
        """Run one git command; raise GitCloneError on a non-zero exit."""
        completed = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True
        )
        if completed.returncode != 0:
            raise GitCloneError(f"{failure}: {completed.stderr}")

    try:
        # Refuse ids such as "" or "../x": the path may be deleted or hard
        # reset below, so it has to stay strictly inside the repos base.
        if repos_base.resolve() not in repo_path.resolve().parents:
            raise GitCloneError(
                f"Invalid repo_id {repo_id!r}: resolves outside {repos_base}"
            )

        # Only treat the directory as a clone when it carries its own git
        # metadata. Without this check `git -C` would walk up the tree and
        # could fetch/reset an enclosing repository instead.
        if (repo_path / ".git").exists():
            print(f" [Git] Updating existing clone at {repo_path}")
            # Explicit refspec: a --single-branch clone only tracks the branch
            # it was created with, so a plain `fetch origin` would not create
            # origin/<branch> if the requested branch has changed.
            run_git(
                ["-C", str(repo_path), "fetch", "origin",
                 f"+refs/heads/{branch}:refs/remotes/origin/{branch}"],
                "Failed to fetch"
            )
            run_git(
                ["-C", str(repo_path), "reset", "--hard",
                 f"refs/remotes/origin/{branch}"],
                f"Failed to reset to origin/{branch}"
            )
        else:
            if not repo_url:
                raise GitCloneError(
                    f"repo_url is required to clone into {repo_path}"
                )
            if repo_path.is_dir() and not repo_path.is_symlink():
                # Leftover directory without git metadata: git refuses to
                # clone into a non-empty directory, so start from scratch.
                shutil.rmtree(repo_path)
            elif repo_path.exists() or repo_path.is_symlink():
                repo_path.unlink()
            repo_path.parent.mkdir(parents=True, exist_ok=True)

            print(f" [Git] Cloning {repo_url} to {repo_path}")
            # Clone directly into repo_path; "--" keeps a URL that starts with
            # "-" from being parsed as an option.
            run_git(
                ["clone",
                 "--branch", branch,
                 "--single-branch",
                 "--", repo_url, str(repo_path)],
                "Failed to clone"
            )
        print(f" [Git] Checked out branch: {branch}")
        return {
            "success": True,
            "repo_path": str(repo_path),
            "url": repo_url,
            "branch": branch
        }
    except GitCloneError:
        # Already carries a precise message; do not wrap it a second time.
        raise
    except Exception as e:
        # E.g. git executable missing, filesystem errors, bad argument types.
        raise GitCloneError(f"Failed to clone/update repo: {e}") from e
def discover_files(
    repo_path: Path,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Discover files in a git repository respecting include/exclude paths.

    Patterns are plain paths relative to the repository root (no globbing). A
    pattern matches the path itself and everything below it, compared on whole
    path components (``doc`` does not match ``docs/x.md``). Leading ``./``,
    trailing ``/`` and backslash separators in patterns are tolerated.
    Excludes win over includes. Directories named ``.git`` are never entered,
    and symlinked directories are not followed. Entries are visited in name
    order, so the result order is deterministic.

    Args:
        repo_path: Path to the cloned repository
        include_paths: List of paths relative to repo root to include (if None
            or empty, everything is considered)
        exclude_paths: List of paths relative to repo root to exclude
    Returns:
        List of dicts with format:
        {
            "path": "docs/hooks.md",  # Relative to repo root, "/" separated
            "full_path": "/full/path/to/repo/docs/hooks.md",
            "is_binary": False  # Heuristic, see is_probably_binary below
        }
    Raises:
        OSError: If ``repo_path`` cannot be scanned for a reason other than
            missing permissions (e.g. it does not exist)
    """
    text_extensions = {'.md', '.txt', '.py', '.js', '.ts', '.json',
                       '.yaml', '.yml', '.html', '.css', '.sh', '.sql'}

    def normalize(raw: Any) -> str:
        """Reduce a pattern to 'a/b/c' form ('' means the repository root)."""
        pieces = str(raw).replace("\\", "/").split("/")
        return "/".join(p for p in pieces if p not in ("", "."))

    includes = [normalize(p) for p in (include_paths or [])]
    if "" in includes:
        # A pattern such as "." or "/" selects the whole repository.
        includes = []
    # An exclude that normalizes to the root is ignored rather than excluding
    # the entire repository.
    excludes = [p for p in (normalize(e) for e in (exclude_paths or [])) if p]

    def covered_by(rel: str, patterns: List[str]) -> bool:
        """True if rel equals a pattern or lies below one."""
        return any(rel == p or rel.startswith(p + "/") for p in patterns)

    def leads_to_include(rel_dir: str) -> bool:
        """True if rel_dir is inside an include path or is an ancestor of one."""
        return covered_by(rel_dir, includes) or any(
            p.startswith(rel_dir + "/") for p in includes
        )

    def is_probably_binary(filepath: str) -> bool:
        """Known text extensions are text; otherwise look for NUL in the first 8KB."""
        if Path(filepath).suffix.lower() in text_extensions:
            return False
        try:
            with open(filepath, 'rb') as f:
                return b'\x00' in f.read(8192)
        except OSError:
            # Unreadable file: keep the previous best-effort answer.
            return False

    discovered: List[Dict[str, Any]] = []

    def walk(current: Path, rel_prefix: str) -> None:
        """Recursively collect matching files below ``current``."""
        try:
            # DirEntry objects are not orderable, so sort by name explicitly;
            # the context manager releases the directory handle promptly.
            with os.scandir(current) as scanner:
                entries = sorted(scanner, key=lambda e: e.name)
        except PermissionError:
            # Skip directories we can't read
            return

        for entry in entries:
            rel = f"{rel_prefix}/{entry.name}" if rel_prefix else entry.name

            if covered_by(rel, excludes):
                continue

            if entry.is_dir(follow_symlinks=False):
                if entry.name == ".git":
                    continue  # git metadata is never documentation
                if includes and not leads_to_include(rel):
                    continue  # skip this directory only, keep scanning siblings
                walk(current / entry.name, rel)
            elif entry.is_file():
                if includes and not covered_by(rel, includes):
                    continue
                full_path = str(current / entry.name)
                discovered.append({
                    "path": rel,
                    "full_path": full_path,
                    "is_binary": is_probably_binary(full_path)
                })

    walk(Path(repo_path), "")
    return discovered
async def ingest_git_source(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    repo_url: str = None,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository as a library.

    Steps, in order: make sure the repos base directory exists, clone or
    update the repository under the id ``"<library_id>-git"``, discover files
    honouring the include/exclude paths, delete the clone's ``.git``
    directory, then hand over to ``ingest_library``.

    Args:
        library_id: Unique identifier for this library
        name: Library display name
        description: Optional description
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for repos (defaults to get_repos_dir())
    Returns:
        Dict summarising the run. When no file is discovered it holds
        ``success``, ``library_id``, ``message`` and ``files_discovered``;
        otherwise ``success``, ``library_id``, ``name``, ``files_discovered``,
        ``chunks_created`` and ``vectors_added``.
    Raises:
        GitCloneError: If git operations fail
    """
    # Project-local imports are resolved at call time.
    from .db import upsert_library
    from .ingest import ingest_library

    print(f"\n[Git Ingestion] Processing library: {library_id}")
    print(f" Source: {repo_url or '(local)'}")

    base_dir = repos_base or get_repos_dir()
    base_dir.mkdir(parents=True, exist_ok=True)

    clone_name = f"{library_id}-git"
    clone_info = clone_or_update_repo(
        repo_id=clone_name,
        repo_url=repo_url,
        branch=branch,
        repos_base=base_dir
    )
    checkout_dir = Path(clone_info["repo_path"])
    print(f" [Git] Found files in {checkout_dir}")

    matched = discover_files(
        repo_path=checkout_dir,
        include_paths=include_paths,
        exclude_paths=exclude_paths
    )
    file_count = len(matched)
    print(f" [Git] Discovered {file_count} file(s)")

    # Nothing to ingest: report success without touching the clone further.
    if file_count == 0:
        return {
            "success": True,
            "library_id": library_id,
            "message": "No files found matching include/exclude criteria",
            "files_discovered": 0
        }

    # Strip git metadata so it is not processed during ingestion.
    # NOTE(review): afterwards the directory is no longer a git work tree, so
    # a later clone_or_update_repo() call on the same path cannot fetch into
    # it — confirm this is intended.
    metadata_dir = checkout_dir / ".git"
    if metadata_dir.exists():
        shutil.rmtree(metadata_dir)
        print(" [Git] Removed .git directory")

    # The clone id (not a filesystem path) is passed as the source path;
    # presumably ingest_library resolves it itself — verify against .ingest.
    outcome = await ingest_library(
        library_id=library_id,
        name=name,
        description=description,
        source_path=clone_name
    )

    summary = {
        "success": outcome.get("success", False),
        "library_id": library_id,
        "name": name,
        "files_discovered": file_count,
        "chunks_created": outcome.get("chunks_created", 0),
        "vectors_added": outcome.get("vectors_added", 0)
    }
    return summary
async def sync_sources(
    sources_config: Optional[List[Dict[str, Any]]] = None,
    repos_base: Optional[Path] = None
) -> List[Dict[str, Any]]:
    """
    Sync all git sources defined in config.

    Each source is ingested independently: a failure is recorded in that
    source's result and the remaining sources are still processed.

    Args:
        sources_config: List of source configs (same format as the ``sources``
            entries of docs_sources.yaml). A mapping with a ``sources`` key is
            also accepted. If None, ``docs_sources.yaml`` is loaded from three
            ``parent`` levels above this module file.
        repos_base: Base directory for repos
    Returns:
        List of results for each source. Failed entries contain
        ``success: False`` and an ``error`` message.
    """
    if sources_config is None:
        # Load from default config file
        import yaml
        config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"
        if not config_path.exists():
            return [{"success": False, "error": f"Config not found: {config_path}"}]
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document.
            sources_config = yaml.safe_load(f) or {}

    if isinstance(sources_config, dict):
        # Whole config mapping: pick the list out of it. `or []` also covers
        # a present-but-empty `sources:` key, which loads as None.
        sources_config = sources_config.get("sources") or []

    results = []
    for source in sources_config:
        # Validate before entering the try block so that the error handlers
        # below never have to call .get() on something that is not a mapping.
        if not isinstance(source, dict):
            results.append({
                "success": False,
                "library_id": "unknown",
                "error": f"Invalid source entry (expected a mapping): {source!r}"
            })
            continue

        library_id = source.get("library_id")
        if not library_id:
            # Without an id the clone would end up in a "None-git" directory.
            results.append({
                "success": False,
                "library_id": "unknown",
                "error": "Source entry is missing 'library_id'"
            })
            continue

        try:
            result = await ingest_git_source(
                library_id=library_id,
                name=source.get("name"),
                description=source.get("description"),
                repo_url=source.get("repo_url"),
                # `branch:` with no value loads as None; fall back to main.
                branch=source.get("branch") or "main",
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
                repos_base=repos_base
            )
        except GitCloneError as e:
            result = {
                "success": False,
                "library_id": library_id,
                "error": str(e)
            }
        except Exception as e:
            # Boundary handler: one broken source must not abort the sync.
            result = {
                "success": False,
                "library_id": library_id,
                "error": f"Unexpected error: {e}"
            }
        results.append(result)
    return results