Initial DocsMCP stack

2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
@@ -0,0 +1,235 @@
+# Search Operations for Semantic Query and Library Navigation
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+from .config import settings
+from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
+from .embeddings import embed_text, get_embedding_size
+from .db import get_chunks_for_library, list_libraries
+
+
+def search_docs(
+    query: str,
+    library_id: Optional[str] = None,
+    limit: int = 10
+) -> List[Dict[str, Any]]:
+    """
+    Search documents by semantic similarity in Qdrant.
+    
+    Args:
+        query: The search query string
+        library_id: Optional filter to search only within a library
+        limit: Maximum number of results to return
+        
+    Returns:
+        List of dicts with format:
+            {
+              "id": "...",
+              "score": 0.123,
+              "library_id": "...",
+              "path": "...",
+              "title": "...",
+              "chunk_index": 0
+            }
+    """
+    try:
+        # Generate embedding for the query
+        query_embedding = embed_text(query)
+        
+        client = get_client()
+        
+        # Build filter if library_id is specified
+        search_filter = None
+        if library_id:
+            try:
+                from qdrant_client.models import FieldCondition, Filter, MatchValue
+                search_filter = Filter(
+                    must=[
+                        FieldCondition(
+                            key="library_id",
+                            match=MatchValue(value=library_id),
+                        )
+                    ]
+                )
+            except ImportError:
+                search_filter = None
+        
+        # Perform vector search
+        results = client.search(
+            collection_name=VECTOR_COLLECTION,
+            query_vector=query_embedding,
+            limit=limit,
+            search_filter=search_filter
+        )
+        
+        # Format and return results
+        formatted_results = []
+        for result in results:
+            if result.score > 0 and result.payload:
+                formatted_results.append({
+                    "id": result.payload["id"],
+                    "score": float(result.score),
+                    "library_id": result.payload.get("library_id", ""),
+                    "path": result.payload.get("path", ""),
+                    "title": result.payload.get("title", ""),
+                    "chunk_index": result.payload.get("chunk_index", 0)
+                })
+        
+        return formatted_results
+        
+    except Exception as e:
+        print(f"Search error: {e}")
+        return []
+
+
+def get_library_docs(
+    library_id: str,
+    topic: Optional[str] = None,
+    token_limit: int = 8000
+) -> str:
+    """
+    Retrieve documentation content from a library.
+    
+    Args:
+        library_id: The library ID to fetch docs from
+        topic: Optional topic filter - if provided, searches for topic first
+        token_limit: Maximum tokens to include in output
+        
+    Returns:
+        Combined markdown content as string
+    """
+    try:
+        # If topic is specified, search for relevant chunks
+        if topic:
+            print(f"  [Search] Searching library '{library_id}' for topic: {topic}")
+            search_results = search_docs(query=topic, library_id=library_id, limit=20)
+            
+            if not search_results:
+                return f"No documents found in library '{library_id}' matching topic: {topic}"
+            
+            print(f"  [Search] Found {len(search_results)} relevant chunks")
+        else:
+            # Fetch all chunks for the library and select most useful ones
+            print(f"  [Fetch] Retrieving chunks from library '{library_id}'")
+            chunks_data = get_chunks_for_library(library_id)
+            
+            if not chunks_data:
+                return f"No documents found in library '{library_id}'"
+            
+            # Sort by chunk_index descending and pick top ones to respect token limit
+            sorted_chunks = sorted(chunks_data, key=lambda x: x.get("chunk_index", 0), reverse=True)
+            selected_chunks = []
+            total_tokens = 0
+            
+            for chunk in sorted_chunks:
+                content = chunk.get("content", "")
+                tokens = len(content) // 4  # Simple token estimate
+                
+                if total_tokens + tokens <= token_limit:
+                    selected_chunks.append(chunk)
+                    total_tokens += tokens
+                else:
+                    # Take part of this chunk to fill remaining space
+                    remaining = token_limit - total_tokens
+                    content_preview = content[:remaining * 4] if remaining > 0 else ""
+                    if content_preview:
+                        selected_chunks.append({"content": content_preview, "title": chunk.get("title", "")})
+            
+            print(f"  [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")
+        
+        # Combine chunks into markdown
+        md_parts = []
+        for chunk in selected_chunks:
+            title = chunk.get("title")
+            content = chunk.get("content", "")
+            
+            if title and content.strip():
+                # Add heading before first chunk or if this is the first chunk
+                if not md_parts or "\n\n" not in "".join(md_parts):
+                    md_parts.append(f"# {title}")
+                elif not any(part.startswith("#") for part in md_parts[-5:]):
+                    md_parts.append(f"\n# {title}\n")
+            
+            md_parts.append(content)
+        
+        result = "\n\n".join(md_parts)
+        
+        # If no headings were added, prepend library title
+        if not any(part.startswith("#") for part in result.split("\n")[:3]):
+            result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result
+        
+        return result.rstrip()
+        
+    except Exception as e:
+        print(f"Error getting library docs: {e}")
+        return f"Error retrieving documents from library '{library_id}': {str(e)}"
+
+
+def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
+    """
+    Resolve a library name to potential matches (Context7-style).
+    
+    Args:
+        library_name: Partial or full library name to search for
+        
+    Returns:
+        List of Context7-style candidate dicts:
+            {
+              "id": "/local/foundryvtt",
+              "name": "foundryvtt",
+              "description": "...",
+              "source": "local"
+            }
+    """
+    try:
+        libraries = list_libraries()
+        
+        if not libraries:
+            return []
+        
+        # Filter by name match (case-insensitive)
+        candidates = []
+        for lib in libraries:
+            lib_name = lib.get("name", "").lower()
+            lib_id = lib.get("id", "").lower()
+            
+            if library_name.lower() in lib_name or library_name.lower() in lib_id:
+                candidates.append({
+                    "id": f"/local/{lib['id']}",
+                    "name": lib["name"],
+                    "description": lib.get("description", ""),
+                    "source": "local"
+                })
+        
+        # Return top matches (or all if less than 3)
+        candidates = candidates[:min(5, len(candidates))]
+        
+        print(f"  [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")
+        
+        return candidates
+        
+    except Exception as e:
+        print(f"Error resolving library ID: {e}")
+        return []
+
+
+if __name__ == "__main__":
+    import asyncio
+    
+    async def test_search():
+        """Test search functionality."""
+        print("Testing search module...\n")
+        
+        # Test 1: Simple search with dummy vector (simulated)
+        print("1. Testing resolve_library_id()...")
+        results = await resolve_library_id("foundryvtt")
+        print(f"   Results: {len(results)} candidates\n")
+        
+        # Test 2: Empty query should return empty list
+        print("2. Testing search_docs() with empty query...")
+        results = await search_docs("")
+        print(f"   Results: {len(results)} chunks\n")
+        
+        print("✅ All tests completed!")
+    
+    asyncio.run(test_search())