Initial DocsMCP stack
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
# Search Operations for Semantic Query and Library Navigation
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from .config import settings
|
||||
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
|
||||
from .embeddings import embed_text, get_embedding_size
|
||||
from .db import get_chunks_for_library, list_libraries
|
||||
|
||||
|
||||
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string. A blank query yields no results.
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return

    Returns:
        List of dicts (empty for a blank query, a non-positive limit, or
        when any error occurs) with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
    """
    # Nothing meaningful to embed or to return; skip the round trip entirely.
    if not query or not query.strip() or limit <= 0:
        return []

    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)

        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            # Imported lazily so the module loads without qdrant_client.
            # A failed import is reported by the handler below instead of
            # silently searching every library when a filter was requested.
            from qdrant_client.models import FieldCondition, Filter, MatchValue

            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # Perform vector search.
        # NOTE(review): assumes get_client() returns a qdrant_client.QdrantClient,
        # whose search() takes the filter as `query_filter` and rejects unknown
        # keyword arguments -- confirm against vector_store.
        results = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            limit=limit,
            query_filter=search_filter,
        )

        # Format and return results
        formatted_results = []
        for result in results:
            payload = result.payload
            if not (result.score > 0 and payload):
                continue
            formatted_results.append({
                # Fall back to the point id so one payload without "id"
                # does not discard the whole result set.
                "id": payload.get("id", getattr(result, "id", None)),
                "score": float(result.score),
                "library_id": payload.get("library_id", ""),
                "path": payload.get("path", ""),
                "title": payload.get("title", ""),
                "chunk_index": payload.get("chunk_index", 0),
            })

        return formatted_results

    except Exception as e:
        # Top-level boundary: callers expect a list, never an exception.
        print(f"Search error: {e}")
        return []
|
||||
|
||||
|
||||
def _select_chunks_within_budget(chunks: List[Dict[str, Any]], token_limit: int) -> tuple:
    """Take chunks in order until the token budget is spent; truncate the one that overflows."""
    selected = []
    used_tokens = 0

    for chunk in chunks:
        content = chunk.get("content") or ""
        cost = len(content) // 4  # Simple token estimate

        if used_tokens + cost <= token_limit:
            selected.append(chunk)
            used_tokens += cost
            continue

        # Take part of this chunk to fill remaining space, then stop:
        # the budget is exhausted, so later chunks must not be appended.
        remaining = token_limit - used_tokens
        if remaining > 0:
            selected.append({"content": content[:remaining * 4], "title": chunk.get("title", "")})
            used_tokens += remaining
        break

    return selected, used_tokens


def _match_hits_to_chunks(hits: List[Dict[str, Any]], chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the stored chunks corresponding to search hits, in hit (relevance) order."""
    # NOTE(review): assumes stored chunks carry an "id" equal to the vector
    # payload "id", or otherwise share "path" + "chunk_index" with it --
    # confirm against the db / indexing code.
    by_id = {str(c.get("id")): c for c in chunks if c.get("id") is not None}
    by_position = {
        (c.get("path"), c.get("chunk_index")): c
        for c in chunks
        if c.get("path") is not None and c.get("chunk_index") is not None
    }

    matched = []
    for hit in hits:
        chunk = by_id.get(str(hit.get("id")))
        if chunk is None:
            chunk = by_position.get((hit.get("path"), hit.get("chunk_index")))
        if chunk is not None:
            matched.append(chunk)
    return matched


def _chunks_to_markdown(chunks: List[Dict[str, Any]], library_id: str) -> str:
    """Join chunk contents into markdown, emitting a heading whenever the title changes."""
    md_parts = []
    last_title = None

    for chunk in chunks:
        content = chunk.get("content") or ""
        if not content.strip():
            continue

        title = chunk.get("title")
        if title and title != last_title:
            md_parts.append(f"# {title}")
            last_title = title

        md_parts.append(content)

    result = "\n\n".join(md_parts)

    # If no headings were added, prepend library title
    if not any(line.startswith("#") for line in result.split("\n")[:3]):
        result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result

    return result.rstrip()


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    With a topic, the library is searched semantically and the matching
    chunks are returned most-relevant first. Without a topic, chunks are
    returned in reading order (by path, then chunk index). In both cases
    output stops once the estimated token budget is used up.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, searches for topic first
        token_limit: Maximum tokens to include in output (estimated at
            four characters per token)

    Returns:
        Combined markdown content as string. On failure or when nothing is
        found, a human-readable message is returned instead of raising.
    """
    try:
        if topic:
            # If topic is specified, search for relevant chunks
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)

            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"

            print(f" [Search] Found {len(search_results)} relevant chunks")

            # Search hits carry only metadata; the text lives in the chunk store.
            chunks_data = get_chunks_for_library(library_id) or []
            ordered_chunks = _match_hits_to_chunks(search_results, chunks_data)

            if not ordered_chunks:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
        else:
            # Fetch all chunks for the library
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)

            if not chunks_data:
                return f"No documents found in library '{library_id}'"

            # Reading order: group by document path, then ascending chunk index.
            ordered_chunks = sorted(
                chunks_data,
                key=lambda c: (str(c.get("path") or ""), c.get("chunk_index") or 0),
            )

        selected_chunks, total_tokens = _select_chunks_within_budget(ordered_chunks, token_limit)
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown
        return _chunks_to_markdown(selected_chunks, library_id)

    except Exception as e:
        # Top-level boundary: callers expect a string, never an exception.
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
|
||||
|
||||
|
||||
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    Matching is a case-insensitive substring test against each library's
    name and id. Exact matches are ranked first, then prefix matches, then
    other substring matches; at most five candidates are returned.

    Args:
        library_name: Partial or full library name to search for

    Returns:
        List of Context7-style candidate dicts (empty when nothing matches
        or an error occurs):
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
    """
    try:
        libraries = list_libraries()

        if not libraries:
            return []

        needle = library_name.lower()

        # Filter by name match (case-insensitive)
        ranked = []
        for lib in libraries:
            raw_id = lib.get("id")
            if raw_id is None or raw_id == "":
                # Without an id there is no "/local/<id>" handle to return.
                continue

            lib_id = str(raw_id)
            # A missing or null name must not abort the whole resolution.
            lib_name = str(lib.get("name") or lib_id)
            haystacks = (lib_name.lower(), lib_id.lower())

            if not any(needle in text for text in haystacks):
                continue

            if needle in haystacks:
                rank = 0  # exact match on name or id
            elif any(text.startswith(needle) for text in haystacks):
                rank = 1  # prefix match
            else:
                rank = 2  # substring match

            ranked.append((rank, {
                "id": f"/local/{lib_id}",
                "name": lib_name,
                "description": lib.get("description") or "",
                "source": "local",
            }))

        # Stable sort keeps the original listing order within each rank, so
        # the cut to five never drops an exact match in favor of a weaker one.
        ranked.sort(key=lambda item: item[0])
        candidates = [candidate for _, candidate in ranked[:5]]

        print(f" [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")

        return candidates

    except Exception as e:
        # Top-level boundary: callers expect a list, never an exception.
        print(f"Error resolving library ID: {e}")
        return []
|
||||
|
||||
|
||||
if __name__ == "__main__":

    def test_search():
        """Smoke-test the module's public functions against the configured backends."""
        print("Testing search module...\n")

        # Test 1: Library name resolution
        print("1. Testing resolve_library_id()...")
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: Empty query should return empty list
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    # resolve_library_id() and search_docs() are plain synchronous functions
    # that return lists; awaiting their results raises TypeError, so the
    # smoke test runs without an event loop.
    test_search()
|
||||
Reference in New Issue
Block a user