236 lines
8.1 KiB
Python
236 lines
8.1 KiB
Python
# Search Operations for Semantic Query and Library Navigation
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
|
|
from .config import settings
|
|
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
|
|
from .embeddings import embed_text, get_embedding_size
|
|
from .db import get_chunks_for_library, list_libraries
|
|
|
|
|
|
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string. An empty or whitespace-only query
            returns an empty list without embedding or searching.
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return. Values below 1 return
            an empty list.

    Returns:
        List of dicts (empty when nothing matches or the search fails)
        with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
    """
    # Nothing meaningful to embed or return: skip the round trip entirely.
    if not query or not query.strip() or limit < 1:
        return []

    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)

        client = get_client()

        # Build a server-side filter if library_id is specified
        library_filter = None
        filter_client_side = False
        if library_id:
            try:
                from qdrant_client.models import FieldCondition, Filter, MatchValue
            except ImportError:
                # Best effort: keep searching, but never hand back hits from
                # other libraries just because the filter models are missing.
                print("Search warning: qdrant_client.models unavailable; filtering results client-side")
                filter_client_side = True
            else:
                library_filter = Filter(
                    must=[
                        FieldCondition(
                            key="library_id",
                            match=MatchValue(value=library_id),
                        )
                    ]
                )

        # Perform vector search. The filter keyword is `query_filter`; an
        # unknown keyword is rejected by the client, which previously made
        # every search fall into the except branch below.
        # NOTE(review): assumes get_client() returns a qdrant_client.QdrantClient
        # and that VECTOR_COLLECTION (imported `_collection_name`) is the
        # collection name string rather than a callable - confirm in vector_store.
        hits = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            limit=limit,
            query_filter=library_filter,
        )

        # Format and return results
        formatted_results = []
        for hit in hits:
            payload = hit.payload
            if not (hit.score > 0 and payload):
                continue
            if filter_client_side and payload.get("library_id") != library_id:
                continue
            formatted_results.append({
                # Fall back to the point id so a payload without "id" does
                # not discard the whole result set.
                "id": payload.get("id", getattr(hit, "id", "")),
                "score": float(hit.score),
                "library_id": payload.get("library_id", ""),
                "path": payload.get("path", ""),
                "title": payload.get("title", ""),
                "chunk_index": payload.get("chunk_index", 0),
            })

        return formatted_results

    except Exception as e:
        # Deliberate best-effort boundary: callers receive an empty list.
        print(f"Search error: {e}")
        return []
|
|
|
|
|
|
def _estimate_tokens(text: str) -> int:
    """Return a rough token estimate for *text* (four characters per token)."""
    return len(text) // 4


def _fit_to_token_budget(chunks: List[Dict[str, Any]], token_limit: int) -> List[Dict[str, Any]]:
    """Take chunks in order until the estimated token budget is exhausted."""
    selected = []
    used = 0
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        cost = _estimate_tokens(content)
        if used + cost <= token_limit:
            selected.append(chunk)
            used += cost
            continue

        # The first chunk that does not fit is truncated to the remaining
        # budget, then selection stops so the output stays within the limit.
        remaining = token_limit - used
        if remaining > 0:
            selected.append({"content": content[:remaining * 4], "title": chunk.get("title", "")})
        break
    return selected


def _chunks_for_search_hits(hits: List[Dict[str, Any]], chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the stored chunks referenced by search hits, best hit first."""
    # NOTE(review): assumes stored chunk dicts carry an "id" matching the
    # search payload id, or else "path" + "chunk_index" - confirm against
    # get_chunks_for_library().
    by_id = {}
    by_location = {}
    for chunk in chunks:
        chunk_id = chunk.get("id")
        if chunk_id is not None:
            by_id.setdefault(str(chunk_id), chunk)
        if chunk.get("path"):
            by_location.setdefault((chunk["path"], chunk.get("chunk_index")), chunk)

    matched = []
    for hit in hits:
        chunk = by_id.get(str(hit.get("id")))
        if chunk is None and hit.get("path"):
            chunk = by_location.get((hit["path"], hit.get("chunk_index")))
        if chunk is not None:
            matched.append(chunk)
    return matched


def _render_markdown(chunks: List[Dict[str, Any]], library_id: str) -> str:
    """Join chunk contents into markdown, adding a heading when the title changes."""
    parts = []
    last_title = None
    for chunk in chunks:
        content = chunk.get("content", "") or ""
        if not content.strip():
            continue
        title = chunk.get("title")
        if title and title != last_title:
            # Skip the generated heading when the chunk already opens with one.
            if not content.lstrip().startswith("#"):
                parts.append(f"# {title}")
            last_title = title
        parts.append(content)

    result = "\n\n".join(parts)

    # If no heading appears near the top, prepend one derived from the library id
    if not any(line.startswith("#") for line in result.split("\n")[:3]):
        result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result

    return result.rstrip()


def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, the library is searched
            for the topic and the matching chunks are returned in relevance
            order; otherwise all chunks of the library are considered
        token_limit: Maximum estimated tokens (len(content) // 4) to include
            in output; the chunk that crosses the limit is truncated

    Returns:
        Combined markdown content as string. When nothing is found or an
        error occurs, a human-readable message is returned instead of
        raising.
    """
    try:
        if topic:
            # Search for relevant chunks, then load their content: search
            # results carry only metadata (id, path, title, chunk_index).
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)

            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"

            print(f" [Search] Found {len(search_results)} relevant chunks")

            stored_chunks = get_chunks_for_library(library_id) or []
            ordered_chunks = _chunks_for_search_hits(search_results, stored_chunks)
            if not ordered_chunks:
                return (
                    f"Found {len(search_results)} chunk(s) in library '{library_id}' "
                    f"matching topic: {topic}, but their content could not be loaded"
                )
        else:
            # Fetch all chunks for the library and select as many as fit
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)

            if not chunks_data:
                return f"No documents found in library '{library_id}'"

            # NOTE(review): descending chunk_index is kept from the original
            # implementation; confirm that "latest chunks first" is intended.
            ordered_chunks = sorted(
                chunks_data,
                key=lambda chunk: chunk.get("chunk_index", 0) or 0,
                reverse=True,
            )

        selected_chunks = _fit_to_token_budget(ordered_chunks, token_limit)
        total_tokens = sum(_estimate_tokens(chunk.get("content", "") or "") for chunk in selected_chunks)
        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown
        return _render_markdown(selected_chunks, library_id)

    except Exception as e:
        # Best-effort boundary: report the failure as text, as callers expect a string.
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
|
|
|
|
|
|
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Resolve a library name to potential matches (Context7-style).

    Matching is a case-insensitive substring test against each library's
    name and id. Exact matches are listed first, then prefix matches, then
    other substring matches; within each group the order returned by
    list_libraries() is kept. At most 5 candidates are returned.

    Args:
        library_name: Partial or full library name to search for

    Returns:
        List of Context7-style candidate dicts (empty when nothing matches
        or the lookup fails):
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
    """
    max_candidates = 5

    try:
        libraries = list_libraries()

        if not libraries:
            return []

        needle = (library_name or "").strip().lower()

        # Collect (rank, candidate) pairs; lower rank means a better match
        ranked = []
        for lib in libraries:
            raw_id = lib.get("id")
            if not raw_id:
                # Without an id no "/local/<id>" handle can be built; skip the
                # record instead of failing the whole lookup.
                continue

            display_name = lib.get("name") or str(raw_id)
            keys = (str(display_name).lower(), str(raw_id).lower())

            if needle in keys:
                rank = 0
            elif any(key.startswith(needle) for key in keys):
                rank = 1
            elif any(needle in key for key in keys):
                rank = 2
            else:
                continue

            ranked.append((rank, {
                "id": f"/local/{raw_id}",
                "name": display_name,
                "description": lib.get("description", ""),
                "source": "local",
            }))

        # Stable sort, so ties keep their original order; rank before
        # truncating so an exact match is never cut off.
        ranked.sort(key=lambda pair: pair[0])
        candidates = [candidate for _, candidate in ranked[:max_candidates]]

        print(f" [Resolve] Found {len(candidates)} candidate(s) for: {library_name}")

        return candidates

    except Exception as e:
        # Best-effort boundary: callers receive an empty list on failure.
        print(f"Error resolving library ID: {e}")
        return []
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test. resolve_library_id() and search_docs() are plain
    # synchronous functions that return lists, so they are called directly;
    # awaiting their return values raises TypeError.

    def test_search():
        """Test search functionality."""
        print("Testing search module...\n")

        # Test 1: Resolve a library name to candidate ids
        print("1. Testing resolve_library_id()...")
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: Empty query should return empty list
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    test_search()
|