Files
DocsMCP/backend/app/search.py
T
2026-06-06 13:01:52 +01:00

231 lines
8.0 KiB
Python

# Search Operations for Semantic Query and Library Navigation
from typing import List, Dict, Any, Optional
from pathlib import Path
from .config import settings
from .vector_store import get_client, query_points
from .embeddings import embed_text, get_embedding_size
from .db import get_chunks_for_library, list_libraries
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string. Blank or whitespace-only queries
            return an empty list without touching the embedding model.
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return. Values <= 0 return
            an empty list.

    Returns:
        List of dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        Hits with a non-positive score or an empty payload are dropped.
        Any failure while embedding or querying is printed and reported as
        an empty list; this function does not raise.
    """
    try:
        # Nothing meaningful to embed or fetch: answer immediately instead of
        # spending an embedding call and a vector-store round trip.
        if not query or not query.strip() or limit <= 0:
            return []

        # Generate embedding for the query
        query_embedding = embed_text(query)
        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            try:
                from qdrant_client.models import FieldCondition, Filter, MatchValue
            except ImportError:
                # Best effort: the search still runs, and the library scope is
                # enforced on the results below so other libraries never leak.
                print(
                    " [Search] qdrant_client models unavailable; "
                    f"filtering results for library '{library_id}' after the query"
                )
            else:
                search_filter = Filter(
                    must=[
                        FieldCondition(
                            key="library_id",
                            match=MatchValue(value=library_id),
                        )
                    ]
                )

        # Perform vector search
        results = query_points(client, query_embedding, limit, search_filter)

        # Only needed when the server-side filter could not be built.
        filter_locally = bool(library_id) and search_filter is None

        # Format and return results
        formatted_results = []
        for result in results:
            payload = result.payload
            if not (result.score > 0 and payload):
                continue

            hit_library_id = payload.get("library_id", "")
            if filter_locally and hit_library_id != library_id:
                continue

            formatted_results.append({
                # A payload without "id" must not discard the whole result set;
                # fall back to the point's own id when the object exposes one.
                "id": payload.get("id", getattr(result, "id", "")),
                "score": float(result.score),
                "library_id": hit_library_id,
                "path": payload.get("path", ""),
                "title": payload.get("title", ""),
                "chunk_index": payload.get("chunk_index", 0)
            })
        return formatted_results
    except Exception as e:
        # Boundary handler: callers rely on an empty list rather than an
        # exception when the embedding model or vector store is unavailable.
        print(f"Search error: {e}")
        return []
def _estimate_tokens(text: str) -> int:
    """Return the rough token estimate used for budgeting (4 characters per token)."""
    return len(text) // 4


def _chunks_for_hits(
    chunks: List[Dict[str, Any]],
    hits: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the stored chunks referenced by search hits, most relevant hit first."""
    # NOTE(review): assumes chunk rows carry either the same "id" as the vector
    # payload, or "path" + "chunk_index" - confirm against get_chunks_for_library.
    by_id = {}
    by_location = {}
    for chunk in chunks:
        if chunk.get("id") is not None:
            by_id.setdefault(str(chunk["id"]), chunk)
        if chunk.get("path"):
            by_location.setdefault((chunk["path"], chunk.get("chunk_index")), chunk)

    ordered = []
    seen = set()
    for hit in hits:
        chunk = by_id.get(str(hit.get("id")))
        if chunk is None:
            chunk = by_location.get((hit.get("path"), hit.get("chunk_index")))
        # Two hits may resolve to the same stored chunk; emit it only once.
        if chunk is not None and id(chunk) not in seen:
            seen.add(id(chunk))
            ordered.append(chunk)
    return ordered


def _select_chunks_within_budget(
    chunks: List[Dict[str, Any]],
    token_limit: int
) -> List[Dict[str, Any]]:
    """Take chunks in order until the estimated token budget is spent."""
    selected = []
    used = 0
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        tokens = _estimate_tokens(content)
        if used + tokens <= token_limit:
            selected.append(chunk)
            used += tokens
            continue

        # First chunk that does not fit: take the part that does, then stop.
        # Continuing would add a preview of every later oversized chunk and
        # push the output past the limit.
        remaining = token_limit - used
        if remaining > 0:
            selected.append({
                "content": content[:remaining * 4],
                "title": chunk.get("title", "")
            })
        break
    return selected


def _render_markdown(chunks: List[Dict[str, Any]]) -> str:
    """Join chunk contents into markdown, adding a heading whenever the title changes."""
    md_parts = []
    last_title = None
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        if not content.strip():
            continue
        title = chunk.get("title")
        if title and title != last_title:
            md_parts.append(f"# {title}")
            last_title = title
        md_parts.append(content)
    return "\n\n".join(md_parts)


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, the library is searched
            for the topic and only the chunks behind the search hits are
            returned, most relevant first
        token_limit: Maximum tokens to include in output, estimated at
            4 characters per token. Headings are not counted.

    Returns:
        Combined markdown content as string. When nothing can be returned,
        a human-readable "No documents found ..." message is returned
        instead, and any unexpected failure is returned as an
        "Error retrieving documents ..." message; this function does not raise.
    """
    try:
        if topic:
            # If topic is specified, search for relevant chunks
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)
            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
            print(f" [Search] Found {len(search_results)} relevant chunks")

            # Search hits carry no text, so resolve them to the stored chunks.
            candidates = _chunks_for_hits(
                get_chunks_for_library(library_id) or [],
                search_results
            )
            if not candidates:
                print(" [Search] Search hits could not be matched to stored chunk content")
                return f"No documents found in library '{library_id}' matching topic: {topic}"
        else:
            # Fetch all chunks for the library
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)
            if not chunks_data:
                return f"No documents found in library '{library_id}'"
            # NOTE(review): descending chunk_index is kept from the previous
            # implementation; it emits later chunks first - confirm intended.
            candidates = sorted(
                chunks_data,
                key=lambda x: x.get("chunk_index", 0),
                reverse=True
            )

        selected_chunks = _select_chunks_within_budget(candidates, token_limit)
        total_tokens = sum(
            _estimate_tokens(chunk.get("content", "") or "")
            for chunk in selected_chunks
        )
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown
        result = _render_markdown(selected_chunks)

        # If the output does not open with a heading, prepend the library title
        if not any(line.startswith("#") for line in result.split("\n")[:3]):
            result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result
        return result.rstrip()
    except Exception as e:
        # Boundary handler: callers receive a message string, never an exception.
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    Matching is case-insensitive against each library's name and id.
    Exact matches are listed first, then prefix matches, then substring
    matches; libraries keep their original relative order within each
    group. At most 5 candidates are returned.

    Args:
        library_name: Partial or full library name to search for. A blank
            name returns an empty list rather than matching every library.

    Returns:
        List of Context7-style candidate dicts:
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
        Any failure is printed and reported as an empty list; this
        function does not raise.
    """
    try:
        needle = (library_name or "").strip().lower()
        if not needle:
            # An empty needle is a substring of everything, which would
            # turn this lookup into "list the first five libraries".
            return []

        libraries = list_libraries()
        if not libraries:
            return []

        ranked = []
        for lib in libraries:
            raw_id = lib.get("id")
            if raw_id is None or raw_id == "":
                # Without an id no "/local/<id>" candidate can be built;
                # skip the row instead of failing the whole lookup.
                continue

            lib_id = str(raw_id).lower()
            lib_name = str(lib.get("name") or "").lower()

            if needle in (lib_name, lib_id):
                rank = 0  # exact match
            elif lib_name.startswith(needle) or lib_id.startswith(needle):
                rank = 1  # prefix match
            elif needle in lib_name or needle in lib_id:
                rank = 2  # substring match
            else:
                continue

            ranked.append((rank, {
                "id": f"/local/{raw_id}",
                "name": lib.get("name", raw_id),
                "description": lib.get("description", ""),
                "source": "local"
            }))

        # list.sort is stable, so ties keep the order list_libraries() gave.
        ranked.sort(key=lambda pair: pair[0])

        # Return the top 5 matches (or all of them if there are fewer)
        candidates = [candidate for _, candidate in ranked[:5]]
        print(f" [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")
        return candidates
    except Exception as e:
        # Boundary handler: callers rely on an empty list rather than an exception.
        print(f"Error resolving library ID: {e}")
        return []
if __name__ == "__main__":
    # Manual smoke test for this module.
    # NOTE(review): this file uses relative imports, so it has to be run as a
    # module of its package (presumably `python -m app.search` from the
    # backend directory - confirm), not as a standalone script.

    def _smoke_test():
        """Exercise the public search helpers and print what they return."""
        print("Testing search module...\n")

        # Test 1: Resolve a library name to candidate IDs
        print("1. Testing resolve_library_id()...")
        # resolve_library_id and search_docs are plain synchronous functions;
        # awaiting their return values raises TypeError, so call them directly.
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: Empty query should return empty list
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    _smoke_test()