# DocsMCP/backend/app/search.py

# Search Operations for Semantic Query and Library Navigation
from typing import List, Dict, Any, Optional
from pathlib import Path

from .config import settings
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
from .embeddings import embed_text, get_embedding_size
from .db import get_chunks_for_library, list_libraries


def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string. A blank query returns no results.
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return. Values below 1 return
            no results.

    Returns:
        List of dicts, in the order returned by the vector store, with format:
            {
              "id": "...",
              "score": 0.123,
              "library_id": "...",
              "path": "...",
              "title": "...",
              "chunk_index": 0
            }
        An empty list is returned when the query is blank, nothing matches,
        or the search fails (the error is printed, never raised).
    """
    # Nothing to embed or nothing requested: skip the embedding call and the
    # round trip to the vector store entirely.
    if not query or not query.strip() or limit < 1:
        return []

    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)

        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            try:
                from qdrant_client.models import FieldCondition, Filter, MatchValue
            except ImportError:
                # Best effort: fall back to an unfiltered search, but say so,
                # because the results may then come from other libraries.
                print("Search warning: qdrant_client.models unavailable; library filter ignored")
            else:
                search_filter = Filter(
                    must=[
                        FieldCondition(
                            key="library_id",
                            match=MatchValue(value=library_id),
                        )
                    ]
                )

        # Perform vector search. The Qdrant client names this keyword
        # ``query_filter``; any other keyword is rejected by the client.
        results = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            limit=limit,
            query_filter=search_filter,
        )

        # Format and return results, dropping hits without a payload or with
        # a non-positive similarity score.
        formatted_results = []
        for result in results:
            payload = result.payload
            if not payload or result.score <= 0:
                continue
            formatted_results.append({
                # Fall back to the point id when the payload carries no "id",
                # so one incomplete payload does not discard every result.
                "id": payload.get("id", getattr(result, "id", "")),
                "score": float(result.score),
                "library_id": payload.get("library_id", ""),
                "path": payload.get("path", ""),
                "title": payload.get("title", ""),
                "chunk_index": payload.get("chunk_index", 0),
            })

        return formatted_results

    except Exception as e:
        # Search is best effort for callers: report the failure and return
        # an empty result instead of propagating.
        print(f"Search error: {e}")
        return []


def _fit_to_token_budget(chunks: List[Dict[str, Any]], token_limit: int) -> tuple:
    """Take chunks in order until the estimated token budget is used up.

    Tokens are estimated as ``len(content) // 4``. The first chunk that does
    not fit is truncated to the remaining budget and selection stops there.

    Returns:
        ``(selected_chunks, estimated_tokens_used)``
    """
    selected = []
    used = 0

    for chunk in chunks:
        content = chunk.get("content") or ""
        cost = len(content) // 4  # Simple token estimate

        if used + cost <= token_limit:
            selected.append(chunk)
            used += cost
            continue

        # Take part of this chunk to fill the remaining space, then stop so
        # the budget is never exceeded by later chunks.
        remaining = token_limit - used
        if remaining > 0:
            selected.append({
                "content": content[:remaining * 4],
                "title": chunk.get("title", ""),
            })
            used += remaining
        break

    return selected, used


def _chunks_for_hits(
    hits: List[Dict[str, Any]],
    chunks: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the stored chunks that correspond to search hits, best hit first.

    Search hits carry no content, so each hit is mapped back to a stored
    chunk. NOTE(review): this assumes stored chunks expose an "id" matching
    the hit "id", or else "path" + "chunk_index" - confirm against the
    chunk store schema.
    """
    by_id = {}
    by_position = {}
    for chunk in chunks:
        if chunk.get("id") is not None:
            by_id.setdefault(str(chunk["id"]), chunk)
        # Only index by position when a path is known; without one the
        # chunk_index alone is ambiguous across documents.
        if chunk.get("path"):
            by_position.setdefault((chunk["path"], chunk.get("chunk_index", 0)), chunk)

    matched = []
    seen = set()
    for hit in hits:
        chunk = by_id.get(str(hit.get("id")))
        if chunk is None and hit.get("path"):
            chunk = by_position.get((hit["path"], hit.get("chunk_index", 0)))
        if chunk is not None and id(chunk) not in seen:
            seen.add(id(chunk))
            matched.append(chunk)

    return matched


def _render_markdown(chunks: List[Dict[str, Any]], library_id: str) -> str:
    """Join chunk contents into markdown, adding a heading when the title changes."""
    md_parts = []
    last_title = None

    for chunk in chunks:
        content = chunk.get("content") or ""
        if not content.strip():
            continue

        title = chunk.get("title")
        if title and title != last_title:
            md_parts.append(f"# {title}")
            last_title = title

        md_parts.append(content)

    result = "\n\n".join(md_parts)

    # If no heading opens the document, prepend the library name as title
    if not any(line.startswith("#") for line in result.split("\n")[:3]):
        result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result

    return result.rstrip()


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    With a topic, the library is searched semantically and the matching
    chunks are returned best match first. Without a topic, the library's
    chunks are returned in reading order (by path, then chunk index). In
    both cases the output is capped at roughly ``token_limit`` tokens,
    estimated as four characters per token.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, searches for topic first
        token_limit: Maximum tokens to include in output

    Returns:
        Combined markdown content as string. When nothing is found, or an
        error occurs, a human-readable message is returned instead; this
        function does not raise.
    """
    try:
        if topic:
            # Search for the chunks most relevant to the topic
            print(f"  [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)

            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"

            print(f"  [Search] Found {len(search_results)} relevant chunks")

            # Hits only carry metadata; look up the stored chunks for content.
            candidates = _chunks_for_hits(
                search_results, get_chunks_for_library(library_id) or []
            )
            if not candidates:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
        else:
            # Fetch all chunks for the library
            print(f"  [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)

            if not chunks_data:
                return f"No documents found in library '{library_id}'"

            # Reading order: group by document path, then ascending chunk
            # index, so the budget keeps the start of each document.
            candidates = sorted(
                chunks_data,
                key=lambda c: (str(c.get("path") or ""), c.get("chunk_index") or 0),
            )

        selected_chunks, total_tokens = _fit_to_token_budget(candidates, token_limit)
        print(f"  [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        return _render_markdown(selected_chunks, library_id)

    except Exception as e:
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"


def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    Matching is a case-insensitive substring test against each library's
    name and id. Candidates are ranked exact match first, then prefix
    match, then any other substring match; ties keep the listing order.
    At most five candidates are returned.

    Args:
        library_name: Partial or full library name to search for. A blank
            name matches nothing.

    Returns:
        List of Context7-style candidate dicts:
            {
              "id": "/local/foundryvtt",
              "name": "foundryvtt",
              "description": "...",
              "source": "local"
            }
        An empty list is returned when nothing matches or the lookup fails
        (the error is printed, never raised).
    """
    max_candidates = 5

    try:
        needle = (library_name or "").strip().lower()
        if not needle:
            # An empty needle is a substring of everything; treat it as
            # "no query" rather than returning arbitrary libraries.
            return []

        libraries = list_libraries()

        if not libraries:
            return []

        ranked = []
        for lib in libraries:
            lib_id = str(lib.get("id") or "")
            if not lib_id:
                # Without an id no usable "/local/<id>" can be built.
                continue
            lib_name = lib.get("name") or lib_id

            haystacks = (str(lib_name).lower(), lib_id.lower())
            if needle in haystacks:
                rank = 0  # exact match on name or id
            elif any(h.startswith(needle) for h in haystacks):
                rank = 1  # prefix match
            elif any(needle in h for h in haystacks):
                rank = 2  # substring match
            else:
                continue

            ranked.append((rank, {
                "id": f"/local/{lib_id}",
                "name": lib_name,
                "description": lib.get("description") or "",
                "source": "local"
            }))

        # Stable sort: best rank first, listing order preserved within a rank,
        # so truncation never drops an exact match in favour of a weaker one.
        ranked.sort(key=lambda pair: pair[0])
        candidates = [candidate for _, candidate in ranked[:max_candidates]]

        print(f"  [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")

        return candidates

    except Exception as e:
        print(f"Error resolving library ID: {e}")
        return []


if __name__ == "__main__":
    # Manual smoke test. This module uses package-relative imports, so it
    # must be run as a module (e.g. ``python -m app.search`` from the backend
    # directory), not as a plain script.

    def test_search():
        """Run a quick smoke test of the search helpers and print the results."""
        print("Testing search module...\n")

        # Test 1: resolve a library name against the registered libraries
        print("1. Testing resolve_library_id()...")
        # The helpers are synchronous and return plain lists; they must be
        # called directly, not awaited.
        results = resolve_library_id("foundryvtt")
        print(f"   Results: {len(results)} candidates\n")

        # Test 2: Empty query should return empty list
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f"   Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    test_search()