DocsMCP/backend/app/embeddings.py

# Local Embedding Generation using FastEmbed
import asyncio
from typing import List
from functools import lru_cache


# Module-level singleton for cached model instance
_embedding_model = None
_embedding_size = 384  # BAAI/bge-small-en-v1.5 output dimension


def _load_model():
    """Lazy-load the FastEmbed model on first use."""
    global _embedding_model, _embedding_size

    try:
        from fastembed import TextEmbedding

        if _embedding_model is None:
            print("Loading embedding model (this may take a few minutes on first run)...")

            # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
            _embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir=".embed_cache")
            print("Embedding model loaded successfully.")

        return _embedding_model

    except ImportError as e:
        raise ImportError(
            "FastEmbed is not installed. Please install with:\n"
            "  pip install fastembed\n\n"
            f"Import error details: {e}"
        ) from e

    except RuntimeError as e:
        # Model download/installation failed
        if "No space left" in str(e) or "disk quota exceeded" in str(e):
            raise RuntimeError(
                "Failed to load embedding model due to disk space constraints.\n\n"
                "Please free up space on your system (at least 500MB required).\n"
                "Or specify a custom cache directory with available space:\n"
                "  from fastembed import TextEmbedding\n"
                "  model = TextEmbedding(model_name='...', cache_dir='/path/to/large/storage')\n\n"
                f"Error: {e}"
            ) from e
        raise


def get_embedding_model():
    """
    Get the cached embedding model instance.

    Returns:
        FastEmbed TextEmbedding instance (lazy-loaded on first call)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model download/load failed
    """
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = _load_model()
    return _embedding_model


def embed_text(text: str) -> List[float]:
    """
    Generate embedding for a single text.

    Args:
        text: The text string to embed

    Returns:
        List of floats representing the embedding vector

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not text or not isinstance(text, str):
        return [0.0] * get_embedding_size()

    model = get_embedding_model()
    embedding = model.embed([text])
    return embedding[0].tolist()


def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings for multiple texts.

    Args:
        texts: List of text strings to embed

    Returns:
        List of lists containing embedding vectors (one per input text)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not texts:
        return []

    model = get_embedding_model()
    embeddings = model.embed(texts)

    result = []
    for emb in embeddings:
        if hasattr(emb, 'tolist'):
            result.append(emb.tolist())
        else:
            result.append(emb)

    return result


def get_embedding_size() -> int:
    """
    Get the embedding dimension size.

    Returns:
        Integer representing vector dimension (384 for bge-small-en-v1.5)

    Note:
        This returns a sensible default. Actual dimension is determined by model.
    """
    return _embedding_size


# Async wrapper for compatibility with existing code
async def generate_embeddings(chunks: List[str]) -> List[List[float]]:
    """
    Async wrapper around embed_texts for compatibility.

    Args:
        chunks: List of text strings to embed

    Returns:
        List of embedding vectors
    """
    return embed_texts(chunks)


if __name__ == "__main__":
    # Test the embeddings module
    print("Testing embeddings module...\n")

    # Test get_embedding_size
    size = get_embedding_size()
    print(f"Embedding dimension: {size}")

    # Test single text embedding
    test_text = "Hello, world! This is a test of the embedding generation."
    try:
        emb = embed_text(test_text)
        print(f"\nSingle text embedding shape: ({len(emb)},)")
        print(f"First 5 values: {emb[:5]}")
        print("✓ Single embedding works")
    except Exception as e:
        print(f"✗ Single embedding failed: {e}")

    # Test batch embedding
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing enables computers to understand human language."
    ]
    try:
        embeddings = embed_texts(test_texts)
        print(f"\nBatch embedding shape: ({len(embeddings)}, {len(embeddings[0])})")
        print("✓ Batch embeddings work")
    except Exception as e:
        print(f"✗ Batch embeddings failed: {e}")

    # Test empty inputs
    assert embed_text("") == [0.0] * size, "Empty text should return zero vector"
    assert embed_texts([]) == [], "Empty list should return empty list"
    print("✓ Empty input handling works")

    print("\n✅ All tests passed!")