Initial DocsMCP stack

2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
@@ -0,0 +1,387 @@
+# Document Ingestion Logic
+import asyncio
+import os
+from pathlib import Path
+from typing import List, Dict, Any, Optional, BinaryIO
+
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+
+# Import local modules
+from .config import settings
+from .chunking import chunk_text, estimate_tokens
+from .embeddings import embed_texts
+from .vector_store import upsert_chunks
+from .db import insert_document_chunk, upsert_library, clear_library_documents
+from .git_source import ingest_git_source
+
+SUPPORTED_EXTENSIONS = {'.md', '.txt', '.py', '.js', '.ts', '.json', 
+                        '.yaml', '.yml', '.html', '.css', '.pdf'}
+
+# Default documents path from environment or fallback
+DOCS_PATH = Path(os.getenv("DOCS_PATH", "./docs"))
+
+
+def get_file_size(path: Path) -> int:
+    """Get file size in bytes."""
+    try:
+        return path.stat().st_size
+    except OSError:
+        return -1
+
+
+async def read_document_file(path: Path) -> str:
+    """
+    Read document content from a file.
+    
+    Args:
+        path: Path to the file
+        
+    Returns:
+        Content as string, or empty string if error
+    
+    Raises:
+        ValueError: If file type not supported
+    """
+    if not path.exists():
+        return ""
+    
+    # Check extension
+    suffix = path.suffix.lower()
+    if suffix == '.pdf':
+        from pypdf import PdfReader
+        
+        try:
+            reader = PdfReader(str(path))
+            pages = []
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                text = page.extract_text()
+                if text:
+                    pages.append(text)
+            return "\n\n".join(pages)
+        except ImportError:
+            raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf")
+        except Exception as e:
+            print(f"  Warning: Could not read PDF {path}: {e}")
+            return ""
+    elif suffix not in SUPPORTED_EXTENSIONS:
+        print(f"  Unsupported file type: {suffix}")
+        return ""
+    
+    # Read text-based files
+    try:
+        content = path.read_text(encoding='utf-8')
+        return content if content.strip() else ""
+    except Exception as e:
+        print(f"  Warning: Could not read {path}: {e}")
+        return ""
+
+
+async def ingest_library(library_id: str, name: str, description: Optional[str] = None, source_path: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Ingest all documents for a library.
+    
+    Args:
+        library_id: Unique identifier for the library
+        name: Library name
+        description: Optional description
+        source_path: Path to library folder (relative to DOCS_PATH)
+        
+    Returns:
+        Summary dict with operation results
+    """
+    print(f"\n[Library] Processing: {library_id}")
+    if source_path:
+        print(f"  Source: {source_path}")
+    
+    # Ensure library record exists
+    result = upsert_library(library_id, name, description, source_path)
+    print(f"  [{result.get('success', False)}] Library record: {'created' if not result.get('exists') else 'updated'}")
+    
+    # Get the library folder path
+    library_dir = DOCS_PATH / source_path
+    
+    if not library_dir.exists():
+        print(f"  Error: Directory does not exist: {library_dir}")
+        return {"success": False, "error": f"Directory not found: {library_dir}"}
+    
+    # Find all supported files (recursive)
+    print(f"  [Library] Scanning for files in: {library_dir}")
+    doc_files = []
+    
+    for file_path in library_dir.rglob('*'):
+        if file_path.is_file():
+            suffix = file_path.suffix.lower()
+            if suffix == '.pdf':
+                doc_files.append(file_path)
+            elif suffix in SUPPORTED_EXTENSIONS:
+                doc_files.append(file_path)
+    
+    print(f"  [Library] Found {len(doc_files)} document(s)")
+    
+    # Clear old chunks for this library
+    print(f"  [Library] Clearing existing chunks...")
+    clear_result = clear_library_documents(library_id)
+    if not clear_result.get('success'):
+        print(f"  Warning: Could not clear library docs: {clear_result}")
+    else:
+        print(f"  [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")
+    
+    # Process documents
+    all_chunks = []
+    processed_files = 0
+    
+    for file_path in doc_files:
+        # Read file content
+        print(f"  [File] Reading: {file_path.relative_to(library_dir)}")
+        content = await read_document_file(file_path)
+        
+        if not content:
+            continue
+        
+        # Estimate tokens and chunk
+        num_tokens = estimate_tokens(content)
+        chunks = chunk_text(content, max_tokens=500, overlap_tokens=80)
+        
+        if not chunks:
+            print(f"  [File] No valid chunks from {file_path.name}")
+            continue
+        
+        # Embed chunks and prepare for storage
+        print(f"    Chunked into {len(chunks)} pieces (approx. {num_tokens} tokens)")
+        
+        embeddings = embed_texts(chunks)
+        
+        # Build chunk dicts
+        chunk_dicts = []
+        base_path = file_path.relative_to(library_dir).as_posix()
+        
+        for i, chunk in enumerate(chunks):
+            chunk_dict = {
+                "id": f"{file_path.stem}-{i}",
+                "library_id": library_id,
+                "path": base_path,
+                "title": Path(base_path).stem,
+                "content": chunk,
+                "chunk_index": i,
+                "embedding": embeddings[i]
+            }
+            all_chunks.append(chunk_dict)
+        
+        processed_files += 1
+    
+    print(f"  [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")
+    
+    # Save chunks to SQLite
+    if all_chunks:
+        for chunk in all_chunks:
+            insert_result = insert_document_chunk(
+                doc_id=chunk["id"],
+                library_id=chunk["library_id"],
+                path=chunk["path"],
+                title=chunk.get("title"),
+                content=chunk["content"],
+                chunk_index=chunk["chunk_index"],
+                token_estimate=estimate_tokens(chunk["content"])
+            )
+            if insert_result.get('success'):
+                continue
+        print(f"  [Library] Saved {len(all_chunks)} chunks to SQLite")
+    else:
+        print(f"  [Library] No chunks to save to SQLite")
+    
+    # Save vectors to Qdrant
+    if all_chunks:
+        upsert_result = await upsert_chunks(all_chunks)
+        print(f"  [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
+    else:
+        print(f"  [Library] No vectors to add to Qdrant")
+    
+    return {
+        "success": True,
+        "library_id": library_id,
+        "files_processed": processed_files,
+        "chunks_created": len(all_chunks),
+        "vectors_added": upsert_result.get('points_added', 0) if 'upsert_result' in locals() else len(all_chunks)
+    }
+
+
+async def ingest_git_source_from_config(
+    repo_url: str,
+    branch: str = "main",
+    include_paths: Optional[List[str]] = None,
+    exclude_paths: Optional[List[str]] = None,
+    repos_base: Optional[Path] = None
+) -> Dict[str, Any]:
+    """
+    Ingest a git repository defined in sources configuration.
+    
+    Args:
+        repo_url: Git repository URL to clone from
+        branch: Branch to checkout (default: main)
+        include_paths: Paths relative to repo root to include (if None, all dirs considered)
+        exclude_paths: Paths relative to repo root to exclude
+        repos_base: Base directory for cloned repos (defaults to ./data/repos)
+        
+    Returns:
+        Dict with operation result
+        
+    Raises:
+        GitCloneError: If git operations fail
+    """
+    # Auto-generate library_id from URL if not provided
+    import urllib.parse
+    parsed = urllib.parse.urlparse(repo_url)
+    path_part = parsed.path.rstrip('.git')
+    library_id = Path(path_part).name or "unknown"
+    
+    name = Path(parsed.hostname or path_part).stem
+    description = f"Documentation from {path_part}"
+    
+    result = await ingest_git_source(
+        library_id=library_id,
+        name=name,
+        description=description,
+        repo_url=repo_url,
+        branch=branch,
+        include_paths=include_paths,
+        exclude_paths=exclude_paths,
+        repos_base=repos_base
+    )
+    
+    return result
+
+
+async def detect_libraries() -> List[Dict[str, Any]]:
+    """
+    Detect all top-level folders under DOCS_PATH as libraries.
+    
+    Returns:
+        List of dicts with library metadata
+    """
+    print(f"\n[Detection] Scanning for libraries in: {DOCS_PATH}")
+    
+    if not DOCS_PATH.exists():
+        print(f"  [Detection] Directory does not exist: {DOCS_PATH}")
+        return []
+    
+    # Get top-level directories
+    directories = list(DOCS_PATH.iterdir())
+    dirs_only = [d for d in directories if d.is_dir()]
+    
+    libraries = []
+    for i, lib_dir in enumerate(dirs_only, 1):
+        name = lib_dir.name
+        
+        # Create library record with defaults
+        result = upsert_library(
+            library_id=lib_dir.name.lower(),
+            name=name,
+            description=None,
+            source_path=lib_dir.name
+        )
+        
+        libraries.append({
+            "id": lib_dir.name.lower(),
+            "name": name,
+            "source_path": lib_dir.name
+        })
+        
+        print(f"  [{i}/{len(dirs_only)}] Library detected: {name} (id: {lib_dir.name.lower()})")
+    
+    print(f"\n[Detection] Found {len(libraries)} library(ies)")
+    return libraries
+
+
+async def ingest_all(verbose: bool = True) -> Dict[str, Any]:
+    """
+    Ingest all discovered libraries.
+    
+    Args:
+        verbose: Whether to print progress messages
+        
+    Returns:
+        Summary dict with overall results
+    """
+    if verbose:
+        print("\n" + "=" * 60)
+        print("DOCUMENT INGESTION STARTED")
+        print("=" * 60)
+    
+    # Detect libraries
+    libraries = await detect_libraries()
+    
+    if not libraries:
+        result = {"total_libraries": 0, "total_chunks": 0, "successful": []}
+        if verbose:
+            print("\n[Summary] No libraries to ingest")
+        return result
+    
+    # Ingest each library
+    results = []
+    for lib in libraries:
+        lib_id = lib["id"]
+        
+        result = await ingest_library(
+            library_id=lib_id,
+            name=lib["name"],
+            description=None,
+            source_path=lib.get("source_path")
+        )
+        
+        if verbose and result.get('success'):
+            print(f"  [Library] Done: {result.get('library_id')} - {result.get('chunks_created', 0)} chunks")
+        
+        results.append(result)
+    
+    # Calculate totals
+    total_chunks = sum(r.get('chunks_created', 0) for r in results)
+    successful = len([r for r in results if r.get('success')])
+    
+    result = {
+        "total_libraries": len(libraries),
+        "successful": successful,
+        "failed": len(results) - successful,
+        "total_chunks": total_chunks
+    }
+    
+    if verbose:
+        print("\n" + "=" * 60)
+        print("INGESTION COMPLETE")
+        print("=" * 60)
+        print(f"  Libraries processed: {result['total_libraries']}")
+        print(f"  Successful: {result['successful']}")
+        print(f"  Failed: {result['failed']}")
+        print(f"  Total chunks created: {result['total_chunks']}")
+    
+    return result
+
+
+if __name__ == "__main__":
+    # Run ingestion tests
+    import asyncio
+    
+    async def test_run():
+        print("Testing ingestion module...\n")
+        
+        # Test detect_libraries
+        libs = await detect_libraries()
+        print(f"\nDetected libraries: {len(libs)}")
+        
+        if libs:
+            # Try to ingest the first library (may fail if no docs exist, which is ok for test)
+            print("\nAttempting sample ingestion...")
+            result = await ingest_library(
+                library_id=libs[0]["id"],
+                name=libs[0]["name"],
+                source_path=libs[0].get("source_path")
+            )
+            print(f"Result: {result}")
+        
+        print("\n✅ Tests completed!")
+    
+    asyncio.run(test_run())