Fix Git source sync failures

This commit is contained in:
george
2026-06-06 00:48:21 +01:00
parent 54986cda99
commit fc683b2803
4 changed files with 111 additions and 38 deletions
+1
View File
@@ -6,6 +6,7 @@ WORKDIR /app
# Install system dependencies for PDF parsing and embeddings # Install system dependencies for PDF parsing and embeddings
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
curl \ curl \
git \
libgl1 \ libgl1 \
libglib2.0-0 \ libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
+22 -25
View File
@@ -1,6 +1,7 @@
# Git Source Operations for Repository Cloning and File Discovery # Git Source Operations for Repository Cloning and File Discovery
import os import os
import shutil import shutil
from subprocess import run
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, Any from typing import List, Optional, Dict, Any
@@ -27,6 +28,13 @@ class GitCloneError(Exception):
pass pass
def run_git(command: List[str]) -> None:
    """Run a git command and raise ``GitCloneError`` if it does not succeed.

    Args:
        command: Full argument vector to execute, including the leading
            program name (callers in this module pass ``"git"`` first).

    Raises:
        GitCloneError: If the process cannot be started at all (for example
            the executable is not installed), or if it exits with a non-zero
            status. For a non-zero exit the message is the process's stderr,
            falling back to stdout, then to a generic placeholder.
    """
    try:
        result = run(command, capture_output=True, text=True)
    except OSError as exc:
        # A missing or non-executable binary surfaces as OSError before any
        # exit status exists; report it through the same exception type.
        raise GitCloneError(f"Failed to run git command: {exc}") from exc

    if result.returncode != 0:
        error = (result.stderr or result.stdout or "unknown git error").strip()
        raise GitCloneError(error)
def clone_or_update_repo( def clone_or_update_repo(
repo_id: str, repo_id: str,
repo_url: str, repo_url: str,
@@ -56,36 +64,25 @@ def clone_or_update_repo(
# Update existing clone # Update existing clone
print(f" [Git] Updating existing clone at {repo_path}") print(f" [Git] Updating existing clone at {repo_path}")
from subprocess import run, CalledProcessError
import subprocess
# Fetch latest changes # Fetch latest changes
result = run( run_git(["git", "-C", str(repo_path), "fetch", "origin"])
["git", "-C", str(repo_path), "fetch", "origin"],
capture_output=True,
text=True
)
if result.returncode != 0:
raise GitCloneError(f"Failed to fetch: {result.stderr}")
# Reset to branch # Reset to branch
run( run_git(["git", "-C", str(repo_path), "reset", "--hard", "origin/" + branch])
["git", "-C", str(repo_path), "reset", "--hard", "origin/" + branch],
capture_output=True,
text=True
)
else: else:
# Clone new repository # Clone new repository
print(f" [Git] Cloning {repo_url} to {repo_path}") print(f" [Git] Cloning {repo_url} to {repo_path}")
run( run_git(
["git", "-C", str(repo_path.parent), "clone", [
"--branch", branch, "git",
"--single-branch", "clone",
repo_url, "."], "--branch",
capture_output=True, branch,
text=True "--single-branch",
repo_url,
str(repo_path),
]
) )
print(f" [Git] Checked out branch: {branch}") print(f" [Git] Checked out branch: {branch}")
@@ -97,8 +94,8 @@ def clone_or_update_repo(
"branch": branch "branch": branch
} }
except CalledProcessError as e: except GitCloneError:
raise GitCloneError(f"Git command failed: {e.stderr}") from e raise
except Exception as e: except Exception as e:
raise GitCloneError(f"Failed to clone/update repo: {e}") from e raise GitCloneError(f"Failed to clone/update repo: {e}") from e
+18 -9
View File
@@ -432,15 +432,24 @@ async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
results = [] results = []
for source in sources: for source in sources:
result = await ingest_git_source( library_id = source.get("library_id", "unknown")
library_id=source["library_id"], try:
name=source.get("name") or source["library_id"], result = await ingest_git_source(
description=source.get("description"), library_id=library_id,
repo_url=source["repo_url"], name=source.get("name") or library_id,
branch=source.get("branch", "main"), description=source.get("description"),
include_paths=source.get("include_paths"), repo_url=source["repo_url"],
exclude_paths=source.get("exclude_paths"), branch=source.get("branch", "main"),
) include_paths=source.get("include_paths"),
exclude_paths=source.get("exclude_paths"),
)
except Exception as exc:
result = {
"success": False,
"library_id": library_id,
"repo_url": source.get("repo_url"),
"error": str(exc),
}
results.append(result) results.append(result)
successful = len([r for r in results if r.get("success")]) successful = len([r for r in results if r.get("success")])
+66
View File
@@ -0,0 +1,66 @@
import pytest
from backend.app import git_source, main
def test_clone_or_update_repo_clones_into_repo_path(monkeypatch, tmp_path):
    """A first-time clone must pass the repository directory as the target.

    ``git_source.run`` is swapped for a recorder that always reports success,
    so the test can check the exact argument vector that was issued.
    """
    recorded = []

    class _Completed:
        # Minimal stand-in exposing the attributes read from the run result.
        returncode = 0
        stdout = ""
        stderr = ""

    def fake_run(command, capture_output=True, text=True):
        recorded.append(command)
        return _Completed()

    monkeypatch.setattr(git_source, "run", fake_run)

    outcome = git_source.clone_or_update_repo(
        repo_id="neoforge-git",
        repo_url="https://github.com/neoforged/Documentation.git",
        branch="main",
        repos_base=tmp_path,
    )

    expected_clone = [
        "git",
        "clone",
        "--branch",
        "main",
        "--single-branch",
        "https://github.com/neoforged/Documentation.git",
        str(tmp_path / "neoforge-git"),
    ]

    assert outcome["success"] is True
    # Exactly one git invocation is expected: the clone itself.
    assert recorded == [expected_clone]
@pytest.mark.asyncio
async def test_sync_sources_returns_failed_result_for_source_exception(monkeypatch):
    """An exception raised while ingesting a source is reported in the results.

    The source listing and the ingestion coroutine on ``main`` are both
    replaced: the listing yields a single source, and ingestion always raises.
    """

    async def fake_list_sources():
        only_source = {
            "library_id": "neoforge",
            "repo_url": "https://github.com/neoforged/Documentation.git",
            "branch": "main",
        }
        return {"sources": [only_source]}

    async def fake_ingest_git_source(**kwargs):
        raise RuntimeError("git is unavailable")

    replacements = (
        ("api_list_sources", fake_list_sources),
        ("ingest_git_source", fake_ingest_git_source),
    )
    for attr_name, stand_in in replacements:
        monkeypatch.setattr(main, attr_name, stand_in)

    response = await main.sync_sources_api()

    assert response["success"] is False
    assert response["failed"] == 1

    first_entry = response["results"][0]
    assert first_entry["library_id"] == "neoforge"
    assert first_entry["error"] == "git is unavailable"