Initial DocsMCP stack
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
# Backend API Package - Contains all FastAPI application modules
|
||||
# The presence of this file makes the directory an importable Python package
|
||||
@@ -0,0 +1,304 @@
|
||||
# Text Chunking Utilities with heading-aware splitting
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
|
||||
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens a piece of text contains.

    The heuristic assumes one token per four characters and rounds down,
    so strings shorter than four characters count as zero tokens.

    Args:
        text: The text to measure

    Returns:
        Estimated token count as integer
    """
    chars_per_token = 4
    whole_tokens, _ = divmod(len(text), chars_per_token)
    return whole_tokens
|
||||
|
||||
|
||||
def _split_at_headings(text: str) -> List[tuple]:
|
||||
"""
|
||||
Split text at markdown headings while preserving heading content.
|
||||
|
||||
Args:
|
||||
text: The full text
|
||||
|
||||
Returns:
|
||||
List of (heading_text, remaining_text) tuples or [(text,) if no headings]
|
||||
"""
|
||||
# Match markdown headings (##, ###, ####, etc.)
|
||||
pattern = r'(#{1,6})\s+(.+?)(?=\n#{1,6}|\Z)'
|
||||
|
||||
parts = []
|
||||
remaining = text
|
||||
|
||||
while True:
|
||||
match = re.search(pattern, remaining, re.MULTILINE)
|
||||
if not match:
|
||||
break
|
||||
|
||||
heading_start = match.start()
|
||||
heading_content = match.group(0).strip()
|
||||
|
||||
# Insert the heading chunk
|
||||
parts.append((heading_content, None))
|
||||
remaining = remaining[match.end():]
|
||||
|
||||
if remaining and not parts:
|
||||
return [(text,)]
|
||||
|
||||
if remaining:
|
||||
# Add final non-heading section
|
||||
last_h_start = sum(len(h) for _, h in parts)
|
||||
parts.append((remaining[last_h_start:], None))
|
||||
|
||||
if not parts and text:
|
||||
parts = [(text,)]
|
||||
|
||||
return parts
|
||||
|
||||
|
||||
def _split_at_paragraphs(text: str, max_tokens: int) -> List[str]:
    """
    Split text at paragraph boundaries, packing paragraphs greedily.

    Consecutive paragraphs are joined with a blank line for as long as the
    joined text stays within max_tokens (measured with estimate_tokens on
    the joined string). A paragraph that is too large on its own is handed
    to _split_at_sentences.

    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk

    Returns:
        List of non-empty chunks, each respecting max_tokens
    """
    # Split by blank lines (paragraphs); drop empty fragments.
    paragraphs = [p for p in re.split(r'\n\s*\n', text.strip()) if p] if text else []

    chunks: List[str] = []
    current = ""

    for para in paragraphs:
        candidate = f"{current}\n\n{para}" if current else para
        if estimate_tokens(candidate) <= max_tokens:
            current = candidate
            continue

        # The paragraph does not fit: emit what we have and start afresh.
        # Resetting here is essential, otherwise the emitted chunk would
        # keep growing and its content would be emitted twice.
        if current:
            chunks.append(current)
            current = ""

        if estimate_tokens(para) <= max_tokens:
            current = para
            continue

        # Paragraph alone is too big: fall back to sentence boundaries.
        # The last piece stays open so following paragraphs may join it.
        pieces = _split_at_sentences(para, max_tokens)
        chunks.extend(pieces[:-1])
        current = pieces[-1] if pieces else ""

    if current:
        chunks.append(current)

    return chunks
|
||||
|
||||
|
||||
def _split_at_sentences(text: str, max_tokens: int) -> List[str]:
    """
    Split text at sentence boundaries, packing sentences greedily.

    Sentences keep their terminating punctuation and are re-joined with a
    single space. A single sentence that exceeds max_tokens is cut into
    fixed-size character slices (4 characters per token, matching
    estimate_tokens).

    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk

    Returns:
        List of non-empty chunks respecting max_tokens
    """
    if not text:
        return []

    # Character budget equivalent to max_tokens; at least 1 so slicing
    # always makes progress even for a non-positive max_tokens.
    max_chars = max(1, max_tokens * 4)

    # Split on the whitespace that follows sentence-ending punctuation, so
    # the punctuation stays attached to its sentence.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    chunks: List[str] = []
    current = ""

    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if estimate_tokens(candidate) <= max_tokens:
            current = candidate
            continue

        if current:
            chunks.append(current)
            current = ""

        if estimate_tokens(sentence) <= max_tokens:
            current = sentence
            continue

        # Sentence is too long on its own: hard-split by characters.
        pieces = [sentence[i:i + max_chars] for i in range(0, len(sentence), max_chars)]
        chunks.extend(pieces[:-1])
        current = pieces[-1]

    if current:
        chunks.append(current)

    return chunks
|
||||
|
||||
|
||||
def chunk_text(text: str, max_tokens: int = 500, overlap_tokens: int = 80) -> List[str]:
    """
    Cut text into overlapping chunks, preferring natural boundaries.

    The text is scanned with a sliding window of ``max_tokens * 4``
    characters. Inside the second half of each window the last boundary of
    the highest-priority kind present is used as the cut point: markdown
    heading, blank line, sentence end, then any whitespace. If none is
    found the window is cut at its full length. The next window starts
    ``overlap_tokens * 4`` characters before the cut (capped at half the
    window size).

    Special case: when max_tokens is 20 or less and the text consists of
    several blank-line separated paragraphs that each fit the budget, the
    paragraphs themselves are returned without overlap.

    Args:
        text: The full text to chunk
        max_tokens: Maximum tokens per chunk (default 500)
        overlap_tokens: Number of overlapping tokens between chunks (default 80)

    Returns:
        List of non-empty, stripped chunk strings

    Raises:
        TypeError: If text is None
        ValueError: If max_tokens is not positive
    """
    if text is None:
        raise TypeError("text must be a string")
    if not text:
        return []
    if max_tokens <= 0:
        raise ValueError("max_tokens must be greater than 0")

    body = text.strip()
    budget_chars = max(1, max_tokens * 4)
    # Negative overlaps count as zero; never overlap more than half a window.
    overlap_chars = min(max(overlap_tokens, 0) * 4, budget_chars // 2)

    blocks = [b.strip() for b in re.split(r"\n\s*\n", body) if b.strip()]
    tiny_budget = max_tokens <= 20
    if len(blocks) > 1 and tiny_budget and all(estimate_tokens(b) <= max_tokens for b in blocks):
        return blocks

    # Boundary kinds in priority order: heading, blank line, sentence end, whitespace.
    boundary_patterns = (r"\n#{1,6}\s+", r"\n\s*\n", r"(?<=[.!?])\s+", r"\s+")

    total = len(body)
    pieces = []
    cursor = 0

    while cursor < total:
        limit = min(cursor + budget_chars, total)
        if limit == total:
            # Everything that is left fits into one window.
            pieces.append(body[cursor:].strip())
            break

        window = body[cursor:limit]
        earliest = max(1, len(window) // 2)  # never cut in the first half
        cut = len(window)
        for pattern in boundary_patterns:
            offsets = [m.start() for m in re.finditer(pattern, window) if m.start() >= earliest]
            if offsets:
                cut = offsets[-1]  # finditer yields ascending offsets
                break

        stop = cursor + cut
        pieces.append(body[cursor:stop].strip())

        # Step back by the overlap, but always move forward.
        resume = stop - overlap_chars
        cursor = resume if resume > cursor else stop

    return [p for p in pieces if p]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc smoke tests, run only when this module is executed directly.

    # estimate_tokens: 400 characters should count as 100 tokens
    four_hundred_chars = "a" * 400
    assert estimate_tokens(four_hundred_chars) == 100, f"Expected 100 tokens for 400 chars, got {estimate_tokens(four_hundred_chars)}"
    print(f"estimate_tokens test passed: 400 chars -> {estimate_tokens(four_hundred_chars)} tokens")

    # Empty input yields no chunks
    assert chunk_text("") == [], "Empty text should return empty list"
    print("chunk_text empty test passed")

    # A short text comes back unchanged as a single chunk
    short_text = "This is a very short text that should be returned as a single chunk."
    short_result = chunk_text(short_text)
    assert len(short_result) == 1, f"Short text should be one chunk, got {len(short_result)}"
    assert short_result[0] == short_text, "Content should match for small text"
    print("chunk_text single chunk test passed")

    # A markdown document with headings is split into several chunks
    sample_markdown = """# Introduction

This is the introduction section.

## Background

Background information goes here to make this longer and test chunking.

This paragraph has more content about the background topic.

### Details

Specific details about the background are provided in this subsection.

More details follow here to ensure we have enough text to properly test heading preservation.

## Conclusion

The conclusion wraps up everything nicely."""

    markdown_chunks = chunk_text(sample_markdown, max_tokens=50)

    # Report the chunks that begin with a heading
    starting_with_heading = []
    for piece in markdown_chunks:
        if piece.strip().startswith('#'):
            starting_with_heading.append(piece)
    print(f"\nFound {len(starting_with_heading)} heading chunks:")
    for piece in starting_with_heading:
        print(f" - {piece.strip()}")

    assert len(markdown_chunks) > 1, f"Should have multiple chunks, got {len(markdown_chunks)}"

    # No chunk may exceed the budget by more than a small tolerance
    oversized = [piece for piece in markdown_chunks if estimate_tokens(piece) > 50 + 20]
    assert not oversized, "Some chunks exceed token limit significantly"
    print("All chunks respect token limits")

    print("\nAll tests passed!")
|
||||
@@ -0,0 +1,25 @@
|
||||
# Configuration Settings
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Settings:
    """Immutable application configuration sourced from environment variables.

    Every default is read from the process environment once, when this
    class body is executed; keyword arguments passed to the constructor
    override the defaults.
    """

    # Vector store connection
    vector_store_host: str = os.environ.get("VECTOR_STORE_HOST", "qdrant")
    vector_store_port: int = int(os.environ.get("VECTOR_STORE_PORT", "6333"))
    collection_name: str = os.environ.get("COLLECTION_NAME", "local_context7_docs")

    # Embedding model identifier
    embedding_model_name: str = os.environ.get("EMBEDDING_MODEL_NAME", "all-MiniLM-L6-v2")

    # Filesystem locations
    docs_path: str = os.environ.get("DOCS_PATH", "./docs")
    db_path: str = os.environ.get("DB_PATH", "./data/db.sqlite")

    # Logging and API authentication
    log_level: str = os.environ.get("LOG_LEVEL", "INFO")
    api_key_docs_api: str = os.environ.get("API_KEY_DOCS_API", "")

    @property
    def is_auth_enabled(self) -> bool:
        """Report whether a non-empty API key has been configured."""
        return True if self.api_key_docs_api else False


# Module-wide settings instance built from the defaults above.
settings = Settings()
|
||||
@@ -0,0 +1,384 @@
|
||||
# SQLite Database Layer for local-context7
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .config import settings
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
except ImportError:
|
||||
QdrantClient = None
|
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Return the configured SQLite file location (settings.db_path) as a Path."""
    configured_location = settings.db_path
    return Path(configured_location)
|
||||
|
||||
|
||||
def ensure_db_dir():
    """Create the directory that holds the SQLite file; a no-op if it exists."""
    data_dir = get_db_path().parent
    data_dir.mkdir(parents=True, exist_ok=True)


# Run once when the module is imported; repeating the call is harmless.
ensure_db_dir()
|
||||
|
||||
|
||||
def get_connection():
    """
    Open a new connection to the SQLite database file.

    Rows fetched through the connection are sqlite3.Row objects, which
    allow access by column name as well as by index.

    Returns:
        sqlite3.Connection with row_factory set to sqlite3.Row
    """
    database_file = str(get_db_path())
    connection = sqlite3.connect(database_file)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def init_db():
    """
    Create the SQLite schema if it is not already present.

    Creates:
        - libraries table (id, name, description, source_path, created_at, updated_at)
        - documents table (id, library_id, path, title, content, chunk_index, token_estimate, created_at)
        - an index on documents.library_id and one on libraries.updated_at

    Returns:
        {"success": True} on success, otherwise
        {"success": False, "error": <message>} after rolling back
    """
    # Executed in order inside one transaction.
    schema_statements = (
        # Enable legacy mode for easier schema handling
        "PRAGMA legacy_alter_table = ON",
        # Libraries table
        """
            CREATE TABLE IF NOT EXISTS libraries (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                description TEXT,
                source_path TEXT NOT NULL,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        """,
        # Documents table
        """
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT PRIMARY KEY,
                library_id TEXT NOT NULL,
                path TEXT NOT NULL,
                title TEXT,
                content TEXT,
                chunk_index INTEGER,
                token_estimate INTEGER,
                created_at TEXT NOT NULL,
                FOREIGN KEY (library_id) REFERENCES libraries(id) ON DELETE CASCADE
            )
        """,
        # Indexes for better query performance
        """
            CREATE INDEX IF NOT EXISTS idx_documents_library_id ON documents(library_id)
        """,
        """
            CREATE INDEX IF NOT EXISTS idx_libraries_updated_at ON libraries(updated_at)
        """,
    )

    conn = get_connection()
    try:
        for statement in schema_statements:
            conn.execute(statement)
        conn.commit()
    except Exception as exc:
        conn.rollback()
        outcome = {"success": False, "error": str(exc)}
    else:
        outcome = {"success": True}
    finally:
        conn.close()
    return outcome
|
||||
|
||||
|
||||
def upsert_library(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    source_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Insert or update a library record.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library source files; falls back to
            library_id when empty or None

    Returns:
        On success: {"success": True, "id": library_id, "exists": bool},
        where "exists" tells whether the row was already present (update)
        or not (insert). On failure: {"success": False, "error": message}.
    """
    conn = get_connection()

    try:
        # Naive UTC timestamp in ISO format. datetime.utcnow() is
        # deprecated, so take an aware "now" and drop the tzinfo to keep
        # the stored string format unchanged.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        source_path = source_path or library_id

        # Check if library exists
        cursor = conn.execute("SELECT id FROM libraries WHERE id = ?", (library_id,))
        exists = cursor.fetchone() is not None

        if exists:
            # Update existing library (created_at is left untouched)
            conn.execute("""
                UPDATE libraries SET
                    name = ?, description = ?, source_path = ?, updated_at = ?
                WHERE id = ?
            """, (name, description, source_path, now, library_id))
        else:
            # Insert new library
            conn.execute("""
                INSERT INTO libraries (id, name, description, source_path, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (library_id, name, description, source_path, now, now))

        conn.commit()
        return {"success": True, "id": library_id, "exists": exists}

    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
|
||||
|
||||
|
||||
def insert_document_chunk(
    doc_id: str,
    library_id: str,
    path: str,
    title: Optional[str] = None,
    content: Optional[str] = None,
    chunk_index: Optional[int] = None,
    token_estimate: int = 0,
) -> Dict[str, Any]:
    """
    Insert or update a document chunk record.

    Args:
        doc_id: Unique identifier for this chunk
        library_id: Foreign key to libraries table
        path: Relative file path within the library
        title: Optional document title
        content: Full text content of the chunk
        chunk_index: Index within the full document (NULL if not chunked)
        token_estimate: Estimated token count (None or 0 is stored as 0)

    Returns:
        On success: {"success": True, "id": doc_id, "exists": bool},
        where "exists" tells whether the row was already present.
        On failure: {"success": False, "error": message}.
    """
    conn = get_connection()

    try:
        # Naive UTC timestamp in ISO format. datetime.utcnow() is
        # deprecated, so take an aware "now" and drop the tzinfo to keep
        # the stored string format unchanged.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        # Check if document chunk exists
        cursor = conn.execute(
            "SELECT id FROM documents WHERE id = ?", (doc_id,)
        )
        exists = cursor.fetchone() is not None

        if exists:
            # NOTE(review): an update also overwrites created_at with the
            # current time; the table has no separate updated_at column.
            conn.execute(
                """
                UPDATE documents
                SET library_id = ?, path = ?, title = ?, content = ?,
                    chunk_index = ?, token_estimate = ?, created_at = ?
                WHERE id = ?
                """,
                (library_id, path, title, content, chunk_index, token_estimate or 0, now, doc_id),
            )
        else:
            conn.execute(
                """
                INSERT INTO documents
                (id, library_id, path, title, content, chunk_index, token_estimate, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (doc_id, library_id, path, title, content, chunk_index, token_estimate or 0, now),
            )

        conn.commit()

        return {"success": True, "id": doc_id, "exists": exists}

    except Exception as e:
        conn.rollback()
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
|
||||
|
||||
|
||||
def clear_library_documents(library_id: str) -> Dict[str, Any]:
    """
    Remove every document chunk that belongs to one library.

    The library row itself is left in place.

    Args:
        library_id: The library to clear

    Returns:
        {"success": True, "deleted": <row count>, "library_id": library_id}
        on success, otherwise {"success": False, "error": <message>}
    """
    conn = get_connection()
    try:
        removed = conn.execute(
            "DELETE FROM documents WHERE library_id = ?", (library_id,)
        ).rowcount
        conn.commit()
    except Exception as exc:
        conn.rollback()
        outcome = {"success": False, "error": str(exc)}
    else:
        outcome = {"success": True, "deleted": removed, "library_id": library_id}
    finally:
        conn.close()
    return outcome
|
||||
|
||||
|
||||
def delete_library(library_id: str) -> Dict[str, Any]:
    """
    Remove a library together with all of its document chunks.

    The chunks are deleted explicitly before the library row, in the same
    transaction.

    Args:
        library_id: The library to delete

    Returns:
        {"success": True, "deleted": <library rows removed>, "library_id": library_id}
        on success, otherwise {"success": False, "error": <message>}
    """
    conn = get_connection()
    try:
        conn.execute("DELETE FROM documents WHERE library_id = ?", (library_id,))
        library_cursor = conn.execute("DELETE FROM libraries WHERE id = ?", (library_id,))
        conn.commit()
    except Exception as exc:
        conn.rollback()
        outcome = {"success": False, "error": str(exc)}
    else:
        outcome = {"success": True, "deleted": library_cursor.rowcount, "library_id": library_id}
    finally:
        conn.close()
    return outcome
|
||||
|
||||
|
||||
def list_libraries() -> List[Dict[str, Any]]:
    """
    Fetch every library, most recently updated first.

    Returns:
        List of dictionaries, one per library row, keyed by column name.
        If the query fails, a single dict {"success": False, "error": ...}
        is returned instead of a list.
    """
    conn = get_connection()
    try:
        cursor = conn.execute("SELECT * FROM libraries ORDER BY updated_at DESC")
        column_names = [meta[0] for meta in cursor.description]
        return [dict(zip(column_names, record)) for record in cursor]
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
|
||||
|
||||
|
||||
def search_libraries(query: str) -> List[Dict[str, Any]]:
    """
    Search libraries by id, name or description.

    This is a case-insensitive substring match (SQL LIKE), not a
    full-text search. The query is matched literally: the LIKE wildcards
    ``%`` and ``_`` in the query are escaped. An empty query matches every
    library.

    Args:
        query: Search query string

    Returns:
        List of matching library dictionaries, most recently updated first
        (empty if none found). If the query fails, a single dict
        {"success": False, "error": ...} is returned instead of a list.
    """
    conn = get_connection()

    try:
        # Escape the escape character first, then the two LIKE wildcards.
        literal = (
            query.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        like_query = f"%{literal}%"
        cursor = conn.execute("""
            SELECT * FROM libraries
            WHERE lower(id) LIKE lower(?) ESCAPE '\\'
               OR lower(name) LIKE lower(?) ESCAPE '\\'
               OR lower(coalesce(description, '')) LIKE lower(?) ESCAPE '\\'
            ORDER BY updated_at DESC
        """, (like_query, like_query, like_query))

        # Convert rows to plain dicts keyed by column name
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor]

    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        conn.close()
|
||||
|
||||
|
||||
def get_document_by_id(doc_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up one document chunk by its primary key.

    Args:
        doc_id: The document ID to fetch

    Returns:
        Dictionary keyed by column name, or None when no row has that id.
        If the query fails, {"success": False, "error": ...} is returned.
    """
    conn = get_connection()
    try:
        cursor = conn.execute("SELECT * FROM documents WHERE id = ?", (doc_id,))
        record = cursor.fetchone()
        if record is not None:
            column_names = [meta[0] for meta in cursor.description]
            return dict(zip(column_names, record))
        return None
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
|
||||
|
||||
|
||||
def get_chunks_for_library(library_id: str) -> List[Dict[str, Any]]:
    """
    Fetch all document chunks that belong to a library.

    Args:
        library_id: The library ID to fetch chunks for

    Returns:
        List of dictionaries keyed by column name, ordered by chunk_index
        descending. If the query fails, a single dict
        {"success": False, "error": ...} is returned instead of a list.
    """
    conn = get_connection()
    try:
        # NOTE(review): DESC returns the highest chunk_index first; confirm
        # that callers really want reverse document order.
        cursor = conn.execute(
            "SELECT * FROM documents WHERE library_id = ? ORDER BY chunk_index DESC",
            (library_id,)
        )
        column_names = [meta[0] for meta in cursor.description]
        return [dict(zip(column_names, record)) for record in cursor]
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    finally:
        conn.close()
|
||||
@@ -0,0 +1,181 @@
|
||||
# Local Embedding Generation using FastEmbed
|
||||
import asyncio
|
||||
from typing import List
|
||||
from functools import lru_cache
|
||||
|
||||
|
||||
# Module-level singleton for cached model instance
|
||||
_embedding_model = None
|
||||
_embedding_size = 384 # BAAI/bge-small-en-v1.5 output dimension
|
||||
|
||||
|
||||
def _load_model():
    """
    Create the FastEmbed model on first use and return the shared instance.

    The instance is stored in the module-level _embedding_model so later
    calls return it without loading again.

    Raises:
        ImportError: If fastembed cannot be imported
        RuntimeError: If constructing the model fails; disk-space related
            failures are re-raised with a more helpful message
    """
    global _embedding_model, _embedding_size

    try:
        from fastembed import TextEmbedding

        if _embedding_model is None:
            print("Loading embedding model (this may take a few minutes on first run)...")
            # Use BAAI/bge-small-en-v1.5 - lightweight (~90MB), works offline
            _embedding_model = TextEmbedding(
                model_name="BAAI/bge-small-en-v1.5",
                cache_dir=".embed_cache",
            )
            print("Embedding model loaded successfully.")

        return _embedding_model

    except ImportError as exc:
        install_hint = (
            "FastEmbed is not installed. Please install with:\n"
            " pip install fastembed\n\n"
            f"Import error details: {exc}"
        )
        raise ImportError(install_hint) from exc

    except RuntimeError as exc:
        # Model download/installation failed
        details = str(exc)
        disk_markers = ("No space left", "disk quota exceeded")
        if not any(marker in details for marker in disk_markers):
            raise
        disk_hint = (
            "Failed to load embedding model due to disk space constraints.\n\n"
            "Please free up space on your system (at least 500MB required).\n"
            "Or specify a custom cache directory with available space:\n"
            " from fastembed import TextEmbedding\n"
            " model = TextEmbedding(model_name='...', cache_dir='/path/to/large/storage')\n\n"
            f"Error: {exc}"
        )
        raise RuntimeError(disk_hint) from exc
|
||||
|
||||
|
||||
def get_embedding_model():
    """
    Return the shared embedding model, loading it on the first call.

    Returns:
        FastEmbed TextEmbedding instance (lazy-loaded on first call)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model download/load failed
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model
    _embedding_model = _load_model()
    return _embedding_model
|
||||
|
||||
|
||||
def embed_text(text: str) -> List[float]:
    """
    Generate embedding for a single text.

    Args:
        text: The text string to embed

    Returns:
        List of floats representing the embedding vector. An empty or
        non-string input yields a zero vector of get_embedding_size()
        elements without loading the model.

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not text or not isinstance(text, str):
        return [0.0] * get_embedding_size()

    model = get_embedding_model()

    # embed() returns a lazy iterable of vectors, not a list, so it cannot
    # be indexed with [0]; take the first (only) item from the iterator.
    vector = next(iter(model.embed([text])))

    # Mirror embed_texts: convert array-like vectors to plain lists.
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)
|
||||
|
||||
|
||||
def embed_texts(texts: List[str]) -> List[List[float]]:
    """
    Embed a batch of texts in one model call.

    Args:
        texts: List of text strings to embed

    Returns:
        One embedding vector per input text, in input order; an empty
        list when texts is empty (the model is not loaded in that case)

    Raises:
        ImportError: If FastEmbed is not installed
        RuntimeError: If model loading failed
    """
    if not texts:
        return []

    vectors = get_embedding_model().embed(texts)
    # Convert array-like vectors to plain lists; pass anything else through.
    return [
        vector.tolist() if hasattr(vector, 'tolist') else vector
        for vector in vectors
    ]
|
||||
|
||||
|
||||
def get_embedding_size() -> int:
    """
    Report the dimension of the embedding vectors.

    Returns:
        The module-level _embedding_size value (384 for bge-small-en-v1.5)

    Note:
        The value is a fixed module constant; it is not queried from the
        loaded model.
    """
    dimension = _embedding_size
    return dimension
|
||||
|
||||
|
||||
# Async wrapper for compatibility with existing code
|
||||
async def generate_embeddings(chunks: List[str]) -> List[List[float]]:
    """
    Async wrapper around embed_texts for compatibility.

    embed_texts is synchronous and may block for a long time (model load,
    inference), so it is run in the event loop's default executor instead
    of directly on the loop.

    Args:
        chunks: List of text strings to embed

    Returns:
        List of embedding vectors
    """
    if not chunks:
        # Nothing to embed; avoid a thread hop.
        return []

    loop = asyncio.get_running_loop()
    # NOTE(review): concurrent first calls may each trigger the lazy model
    # load in separate threads; the last one loaded wins.
    return await loop.run_in_executor(None, embed_texts, chunks)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc smoke tests, run only when this module is executed directly.
    print("Testing embeddings module...\n")

    # Vector dimension reported by the module
    dimension = get_embedding_size()
    print(f"Embedding dimension: {dimension}")

    # Embed one text
    single_input = "Hello, world! This is a test of the embedding generation."
    try:
        single_vector = embed_text(single_input)
        print(f"\nSingle text embedding shape: ({len(single_vector)},)")
        print(f"First 5 values: {single_vector[:5]}")
        print("✓ Single embedding works")
    except Exception as exc:
        print(f"✗ Single embedding failed: {exc}")

    # Embed a batch of texts
    batch_input = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Natural language processing enables computers to understand human language."
    ]
    try:
        batch_vectors = embed_texts(batch_input)
        print(f"\nBatch embedding shape: ({len(batch_vectors)}, {len(batch_vectors[0])})")
        print("✓ Batch embeddings work")
    except Exception as exc:
        print(f"✗ Batch embeddings failed: {exc}")

    # Empty inputs short-circuit without touching the model
    assert embed_text("") == [0.0] * dimension, "Empty text should return zero vector"
    assert embed_texts([]) == [], "Empty list should return empty list"
    print("✓ Empty input handling works")

    print("\n✅ All tests passed!")
|
||||
@@ -0,0 +1,389 @@
|
||||
# Git Source Operations for Repository Cloning and File Discovery
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
|
||||
def get_repos_dir() -> Path:
    """Return the directory where cloned repositories are stored.

    The location is data/repos, three directory levels above this file.
    """
    module_dir = Path(__file__).parent
    base_dir = module_dir.parent.parent
    return base_dir / "data" / "repos"
|
||||
|
||||
|
||||
def ensure_repos_dir():
    """Create the repos directory if it is missing and return its path."""
    target = get_repos_dir()
    target.mkdir(parents=True, exist_ok=True)
    return target


# Run once when the module is imported; repeating the call is harmless.
ensure_repos_dir()
|
||||
|
||||
|
||||
class GitCloneError(Exception):
    """Raised when cloning, fetching or checking out a git repository fails."""
|
||||
|
||||
|
||||
def clone_or_update_repo(
    repo_id: str,
    repo_url: str,
    branch: str,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone a git repository or update an existing clone.

    If <repos_base>/<repo_id> already exists it is fetched from origin and
    hard-reset to origin/<branch>; otherwise the repository is cloned into
    that directory with only the requested branch.

    Args:
        repo_id: Unique identifier for this repository (used in paths)
        repo_url: Git URL to clone from
        branch: Branch name to checkout
        repos_base: Base directory for repos (defaults to get_repos_dir())

    Returns:
        Dict with keys "success" (True), "repo_path", "url" and "branch"

    Raises:
        GitCloneError: If any git command exits non-zero, or git cannot be
            started at all
    """
    # Imported locally because this module's header does not import subprocess.
    import subprocess

    repos_base = repos_base or get_repos_dir()
    repo_path = repos_base / repo_id

    def run_git(action: str, *args: str) -> None:
        """Run one git command; raise GitCloneError if it exits non-zero."""
        result = subprocess.run(["git", *args], capture_output=True, text=True)
        if result.returncode != 0:
            raise GitCloneError(f"Failed to {action}: {result.stderr}")

    try:
        if repo_path.exists():
            # Update existing clone
            print(f" [Git] Updating existing clone at {repo_path}")
            run_git("fetch", "-C", str(repo_path), "fetch", "origin")
            run_git("reset", "-C", str(repo_path), "reset", "--hard", "origin/" + branch)
        else:
            # Clone new repository into repo_path (not into its parent)
            print(f" [Git] Cloning {repo_url} to {repo_path}")
            repo_path.parent.mkdir(parents=True, exist_ok=True)
            run_git(
                "clone",
                "clone",
                "--branch", branch,
                "--single-branch",
                "--",  # stop option parsing before the caller-supplied URL
                repo_url,
                str(repo_path),
            )

        print(f" [Git] Checked out branch: {branch}")

        return {
            "success": True,
            "repo_path": str(repo_path),
            "url": repo_url,
            "branch": branch
        }

    except GitCloneError:
        # Already carries a specific message; do not wrap it again.
        raise
    except Exception as e:
        raise GitCloneError(f"Failed to clone/update repo: {e}") from e
|
||||
|
||||
|
||||
def discover_files(
    repo_path: Path,
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Discover files in a git repository respecting include/exclude paths.

    The tree is walked depth-first with entries visited in name order, so the
    result is deterministic. Directories named ``.git`` and symlinked
    directories are never descended into (the latter avoids symlink loops).

    Args:
        repo_path: Path to the cloned repository
        include_paths: Paths relative to repo root to include. A path selects
            itself and everything below it. None or empty means everything.
        exclude_paths: Paths relative to repo root to exclude. A path excludes
            itself and everything below it; excludes win over includes.

    Returns:
        List of dicts with format:
        {
            "path": "docs/hooks.md",  # Relative to repo root, "/" separated
            "full_path": "/full/path/to/repo/docs/hooks.md",
            "is_binary": False
        }
    """
    text_extensions = {'.md', '.txt', '.py', '.js', '.ts', '.json',
                       '.yaml', '.yml', '.html', '.css', '.sh', '.sql'}

    def normalize(raw: Any) -> str:
        """Return a repo-relative pattern with '/' separators and no './' or edge slashes."""
        cleaned = Path(str(raw).replace("\\", "/")).as_posix().strip("/")
        return "" if cleaned == "." else cleaned

    include_patterns = [normalize(p) for p in (include_paths or [])]
    # An empty pattern ("." or "/") designates the repo root, i.e. everything.
    include_all = not include_patterns or "" in include_patterns
    exclude_patterns = {normalize(p) for p in (exclude_paths or [])} - {""}

    discovered: List[Dict[str, Any]] = []

    def is_excluded(rel: str) -> bool:
        """True when rel is an excluded path or lies below one."""
        return any(rel == exc or rel.startswith(exc + "/") for exc in exclude_patterns)

    def is_included(rel: str) -> bool:
        """True when rel is an included path or lies below one."""
        if include_all:
            return True
        return any(rel == inc or rel.startswith(inc + "/") for inc in include_patterns)

    def leads_to_include(rel: str) -> bool:
        """True when directory rel is an ancestor of some include path."""
        return any(inc.startswith(rel + "/") for inc in include_patterns)

    def is_probably_binary(filepath: str) -> bool:
        """Simple binary detection based on file extension and first bytes."""
        if Path(filepath).suffix.lower() in text_extensions:
            return False
        # Unknown extension: look for null bytes in the first 8KB.
        try:
            with open(filepath, 'rb') as f:
                return b'\x00' in f.read(8192)
        except OSError:
            # Unreadable files are reported as non-binary, as before.
            return False

    def walk_and_collect(current: Path, rel_prefix: str) -> None:
        """Recursively collect matching files below `current`."""
        try:
            with os.scandir(current) as it:
                # DirEntry objects are not orderable; sort by name explicitly.
                entries = sorted(it, key=lambda e: e.name)
        except PermissionError:
            # Skip directories we can't read
            return

        for entry in entries:
            rel = f"{rel_prefix}/{entry.name}" if rel_prefix else entry.name

            # Filter by exclude paths first
            if is_excluded(rel):
                continue

            if entry.is_dir(follow_symlinks=False):
                if entry.name == ".git":
                    continue
                # Descend when the directory is selected, or when an include
                # path lives somewhere beneath it.
                if is_included(rel) or leads_to_include(rel):
                    walk_and_collect(current / entry.name, rel)
            elif entry.is_file() and is_included(rel):
                full_path = str(current / entry.name)
                discovered.append({
                    "path": rel,
                    "full_path": full_path,
                    "is_binary": is_probably_binary(full_path)
                })

    # Walk the repository starting from repo root
    walk_and_collect(Path(repo_path), "")

    return discovered
|
||||
|
||||
|
||||
async def ingest_git_source(
    library_id: str,
    name: str,
    description: Optional[str] = None,
    repo_url: str = None,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Clone (or update) a git repository and ingest it as a library.

    Steps: make sure the repos directory exists, clone/update the repo under
    the id ``"<library_id>-git"``, discover files honouring the include and
    exclude paths, delete the checkout's ``.git`` directory, then hand over to
    ``ingest_library``.

    Args:
        library_id: Unique identifier for this library
        name: Library display name
        description: Optional description
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (None = all)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for cloned repos (defaults to get_repos_dir())

    Returns:
        Dict with "success", "library_id" and "files_discovered"; when files
        were found also "name", "chunks_created" and "vectors_added",
        otherwise a "message".

    Raises:
        GitCloneError: If git operations fail
    """
    from .db import upsert_library
    from .ingest import ingest_library

    print(f"\n[Git Ingestion] Processing library: {library_id}")
    print(f" Source: {repo_url or '(local)'}")

    # Ensure repos directory exists
    base_dir = repos_base or get_repos_dir()
    base_dir.mkdir(parents=True, exist_ok=True)

    repo_id = f"{library_id}-git"

    # Clone or update the repo
    checkout = clone_or_update_repo(
        repo_id=repo_id,
        repo_url=repo_url,
        branch=branch,
        repos_base=base_dir
    )
    repo_path = Path(checkout["repo_path"])
    print(f" [Git] Found files in {repo_path}")

    # Discover files respecting include/exclude paths.
    # NOTE(review): the discovered list is only counted here; ingest_library
    # is given repo_id and does its own scan — confirm that is intended.
    files = discover_files(
        repo_path=repo_path,
        include_paths=include_paths,
        exclude_paths=exclude_paths
    )
    file_count = len(files)
    print(f" [Git] Discovered {file_count} file(s)")

    if file_count == 0:
        return {
            "success": True,
            "library_id": library_id,
            "message": "No files found matching include/exclude criteria",
            "files_discovered": 0
        }

    # Remove .git directory if present (avoid processing it).
    # NOTE(review): without .git a later fetch/reset on this checkout
    # presumably cannot work — verify against clone_or_update_repo.
    git_dir = repo_path / ".git"
    if git_dir.exists():
        shutil.rmtree(git_dir)
        print(f" [Git] Removed .git directory")

    # Ingest using existing library ingestion pipeline; repo_id doubles as the
    # "source path" used for tracking.
    outcome = await ingest_library(
        library_id=library_id,
        name=name,
        description=description,
        source_path=repo_id
    )

    return {
        "success": outcome.get("success", False),
        "library_id": library_id,
        "name": name,
        "files_discovered": file_count,
        "chunks_created": outcome.get("chunks_created", 0),
        "vectors_added": outcome.get("vectors_added", 0)
    }
|
||||
|
||||
|
||||
async def sync_sources(
    sources_config: Optional[List[Dict[str, Any]]] = None,
    repos_base: Optional[Path] = None
) -> List[Dict[str, Any]]:
    """
    Sync all git sources defined in config.

    Each source is ingested independently: a failure is recorded in that
    source's result and the remaining sources are still processed.

    Args:
        sources_config: List of source configs (same format as the "sources"
            list in docs_sources.yaml). When None, docs_sources.yaml three
            directories above this module is loaded.
        repos_base: Base directory for repos

    Returns:
        List of results for each source. If the default config file is
        missing, a single-element list with an error entry.
    """
    if sources_config is None:
        # Load from default config file
        import yaml
        config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"

        if not config_path.exists():
            return [{"success": False, "error": f"Config not found: {config_path}"}]

        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty file.
            data = yaml.safe_load(f) or {}

        if isinstance(data, dict):
            sources_config = data.get("sources") or []
        elif isinstance(data, list):
            # Tolerate a bare top-level list of sources.
            sources_config = data
        else:
            sources_config = []

    results = []

    for source in sources_config:
        try:
            result = await ingest_git_source(
                library_id=source.get("library_id"),
                name=source.get("name"),
                description=source.get("description"),
                repo_url=source.get("repo_url"),
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
                repos_base=repos_base
            )
        except GitCloneError as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": str(e)
            }
        except Exception as e:
            result = {
                "success": False,
                "library_id": source.get("library_id", "unknown"),
                "error": f"Unexpected error: {e}"
            }

        results.append(result)

    return results
|
||||
@@ -0,0 +1,387 @@
|
||||
# Document Ingestion Logic
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, BinaryIO
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Import local modules
|
||||
from .config import settings
|
||||
from .chunking import chunk_text, estimate_tokens
|
||||
from .embeddings import embed_texts
|
||||
from .vector_store import upsert_chunks
|
||||
from .db import insert_document_chunk, upsert_library, clear_library_documents
|
||||
from .git_source import ingest_git_source
|
||||
|
||||
# File suffixes (lower-case, with leading dot) that the ingestion code accepts.
SUPPORTED_EXTENSIONS = {
    '.md', '.txt',
    '.py', '.js', '.ts',
    '.json', '.yaml', '.yml',
    '.html', '.css',
    '.pdf',
}

# Root folder for library documents: DOCS_PATH from the environment, else ./docs
DOCS_PATH = Path(os.getenv("DOCS_PATH", "./docs"))
|
||||
|
||||
|
||||
def get_file_size(path: Path) -> int:
    """Return the size of `path` in bytes, or -1 if it cannot be stat'ed."""
    try:
        size = path.stat().st_size
    except OSError:
        size = -1
    return size
|
||||
|
||||
|
||||
async def read_document_file(path: Path) -> str:
    """
    Read document content from a file.

    PDFs are read page by page with pypdf and the page texts joined with
    blank lines; every other supported type is read as UTF-8 text.

    Args:
        path: Path to the file

    Returns:
        Content as string. Empty string when the file does not exist, has an
        unsupported extension, is blank, or cannot be read.

    Raises:
        ImportError: If a PDF is given but pypdf is not installed
    """
    if not path.exists():
        return ""

    # Check extension
    suffix = path.suffix.lower()
    if suffix == '.pdf':
        # The import has to sit inside the try for the hint to be reachable.
        try:
            from pypdf import PdfReader
        except ImportError as e:
            raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf") from e

        try:
            reader = PdfReader(str(path))
            pages = []
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    pages.append(text)
            return "\n\n".join(pages)
        except Exception as e:
            # Best effort: a broken PDF must not abort the whole ingestion.
            print(f" Warning: Could not read PDF {path}: {e}")
            return ""
    elif suffix not in SUPPORTED_EXTENSIONS:
        print(f" Unsupported file type: {suffix}")
        return ""

    # Read text-based files
    try:
        content = path.read_text(encoding='utf-8')
        return content if content.strip() else ""
    except Exception as e:
        print(f" Warning: Could not read {path}: {e}")
        return ""
|
||||
|
||||
|
||||
async def ingest_library(library_id: str, name: str, description: Optional[str] = None, source_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Ingest all documents for a library.

    Upserts the library record, scans the library folder recursively for
    supported files, clears the library's previously stored chunks, then
    chunks and embeds every readable file and stores the chunks via
    insert_document_chunk and upsert_chunks.

    Args:
        library_id: Unique identifier for the library
        name: Library name
        description: Optional description
        source_path: Path to library folder (relative to DOCS_PATH).
            Defaults to `library_id` when None.

    Returns:
        Summary dict with "success", "library_id", "files_processed",
        "chunks_created" and "vectors_added"; or "success": False with an
        "error" when the library directory does not exist.
    """
    print(f"\n[Library] Processing: {library_id}")
    if source_path is None:
        # Joining DOCS_PATH with None would raise TypeError; fall back to the id.
        source_path = library_id
    if source_path:
        print(f" Source: {source_path}")

    # Ensure library record exists
    result = upsert_library(library_id, name, description, source_path)
    print(f" [{result.get('success', False)}] Library record: {'created' if not result.get('exists') else 'updated'}")

    # Get the library folder path
    library_dir = DOCS_PATH / source_path

    if not library_dir.exists():
        print(f" Error: Directory does not exist: {library_dir}")
        return {"success": False, "error": f"Directory not found: {library_dir}"}

    # Find all supported files (recursive); sorted so chunk order is stable.
    print(f" [Library] Scanning for files in: {library_dir}")
    doc_files = sorted(
        p for p in library_dir.rglob('*')
        if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
    )

    print(f" [Library] Found {len(doc_files)} document(s)")

    # Clear old chunks for this library
    print(f" [Library] Clearing existing chunks...")
    clear_result = clear_library_documents(library_id)
    if not clear_result.get('success'):
        print(f" Warning: Could not clear library docs: {clear_result}")
    else:
        print(f" [Library] Cleared {clear_result.get('deleted', 0)} existing chunks")

    # Process documents
    all_chunks = []
    processed_files = 0

    for file_path in doc_files:
        base_path = file_path.relative_to(library_dir).as_posix()

        # Read file content
        print(f" [File] Reading: {file_path.relative_to(library_dir)}")
        content = await read_document_file(file_path)

        if not content:
            continue

        # Estimate tokens and chunk
        num_tokens = estimate_tokens(content)
        chunks = chunk_text(content, max_tokens=500, overlap_tokens=80)

        if not chunks:
            print(f" [File] No valid chunks from {file_path.name}")
            continue

        # Embed chunks and prepare for storage
        print(f" Chunked into {len(chunks)} pieces (approx. {num_tokens} tokens)")

        embeddings = embed_texts(chunks)
        if len(embeddings) != len(chunks):
            # Pairing chunks with the wrong vectors would corrupt search.
            print(f" Warning: Got {len(embeddings)} embeddings for {len(chunks)} chunks in {file_path.name}; skipping file")
            continue

        # NOTE(review): ids are built from the file stem only, so files that
        # share a stem in different folders produce identical ids — confirm
        # whether the stores treat ids as unique.
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            all_chunks.append({
                "id": f"{file_path.stem}-{i}",
                "library_id": library_id,
                "path": base_path,
                "title": Path(base_path).stem,
                "content": chunk,
                "chunk_index": i,
                "embedding": embedding
            })

        processed_files += 1

    print(f" [Library] Processed {processed_files} file(s), {len(all_chunks)} total chunks")

    vectors_added = 0

    if all_chunks:
        # Save chunks to SQLite, reporting failures rather than hiding them.
        saved = 0
        for chunk in all_chunks:
            insert_result = insert_document_chunk(
                doc_id=chunk["id"],
                library_id=chunk["library_id"],
                path=chunk["path"],
                title=chunk.get("title"),
                content=chunk["content"],
                chunk_index=chunk["chunk_index"],
                token_estimate=estimate_tokens(chunk["content"])
            )
            if insert_result.get('success'):
                saved += 1
            else:
                print(f" Warning: Could not save chunk {chunk['id']}: {insert_result.get('error')}")
        print(f" [Library] Saved {saved}/{len(all_chunks)} chunks to SQLite")

        # Save vectors to Qdrant
        upsert_result = await upsert_chunks(all_chunks)
        print(f" [Library] Vector store: {upsert_result.get('success', False)} ({upsert_result.get('points_added', 0)} added)")
        vectors_added = upsert_result.get('points_added', 0)
    else:
        print(f" [Library] No chunks to save to SQLite")
        print(f" [Library] No vectors to add to Qdrant")

    return {
        "success": True,
        "library_id": library_id,
        "files_processed": processed_files,
        "chunks_created": len(all_chunks),
        "vectors_added": vectors_added
    }
|
||||
|
||||
|
||||
def clean_repo_path(repo_url: str) -> str:
    """Return the path part of a repo URL without trailing slash or '.git' suffix."""
    import urllib.parse

    path_part = urllib.parse.urlparse(repo_url).path.rstrip("/")
    # str.rstrip('.git') strips any of the characters '.', 'g', 'i', 't'
    # (turning "widget.git" into "widge"); remove the literal suffix instead.
    if path_part.endswith(".git"):
        path_part = path_part[:-len(".git")]
    return path_part


async def ingest_git_source_from_config(
    repo_url: str,
    branch: str = "main",
    include_paths: Optional[List[str]] = None,
    exclude_paths: Optional[List[str]] = None,
    repos_base: Optional[Path] = None
) -> Dict[str, Any]:
    """
    Ingest a git repository defined in sources configuration.

    The library id and display name are both taken from the last segment of
    the repository path (e.g. "https://host/org/widget.git" -> "widget"),
    falling back to "unknown" when the URL has no usable path.

    Args:
        repo_url: Git repository URL to clone from
        branch: Branch to checkout (default: main)
        include_paths: Paths relative to repo root to include (if None, all dirs considered)
        exclude_paths: Paths relative to repo root to exclude
        repos_base: Base directory for cloned repos (passed through to ingest_git_source)

    Returns:
        Dict with operation result, as returned by ingest_git_source
    """
    path_part = clean_repo_path(repo_url)
    library_id = Path(path_part).name or "unknown"

    # Name after the repository; the hostname stem ("github") used before
    # gave every repo on one host the same display name.
    name = library_id
    description = f"Documentation from {path_part}"

    return await ingest_git_source(
        library_id=library_id,
        name=name,
        description=description,
        repo_url=repo_url,
        branch=branch,
        include_paths=include_paths,
        exclude_paths=exclude_paths,
        repos_base=repos_base
    )
|
||||
|
||||
|
||||
async def detect_libraries() -> List[Dict[str, Any]]:
    """
    Detect all top-level folders under DOCS_PATH as libraries.

    Every directory is upserted as a library whose id is the lower-cased
    folder name. Folders are processed in name order so the result is stable.

    Returns:
        List of dicts with "id", "name" and "source_path"; empty when
        DOCS_PATH does not exist.
    """
    print(f"\n[Detection] Scanning for libraries in: {DOCS_PATH}")

    if not DOCS_PATH.exists():
        print(f" [Detection] Directory does not exist: {DOCS_PATH}")
        return []

    # Get top-level directories (iterdir order is arbitrary, so sort)
    dirs_only = sorted(d for d in DOCS_PATH.iterdir() if d.is_dir())

    libraries = []
    for i, lib_dir in enumerate(dirs_only, 1):
        name = lib_dir.name
        library_id = name.lower()

        # Create library record with defaults
        result = upsert_library(
            library_id=library_id,
            name=name,
            description=None,
            source_path=name
        )
        if isinstance(result, dict) and not result.get("success", True):
            # The folder is still reported; only the record is missing.
            print(f" Warning: Could not save library record for {name}: {result.get('error')}")

        libraries.append({
            "id": library_id,
            "name": name,
            "source_path": name
        })

        print(f" [{i}/{len(dirs_only)}] Library detected: {name} (id: {library_id})")

    print(f"\n[Detection] Found {len(libraries)} library(ies)")
    return libraries
|
||||
|
||||
|
||||
async def ingest_all(verbose: bool = True) -> Dict[str, Any]:
    """
    Ingest all discovered libraries.

    Libraries are ingested one after another. A library that raises is
    counted as failed and the remaining libraries are still processed.

    Args:
        verbose: Whether to print the banner and summary of this function
            (messages printed by the functions it calls are unaffected)

    Returns:
        Summary dict with "total_libraries", "successful", "failed" and
        "total_chunks" (all integers), also when no library was found.
    """
    if verbose:
        print("\n" + "=" * 60)
        print("DOCUMENT INGESTION STARTED")
        print("=" * 60)

    # Detect libraries
    libraries = await detect_libraries()

    if not libraries:
        if verbose:
            print("\n[Summary] No libraries to ingest")
        # Same keys and value types as the regular summary below.
        return {"total_libraries": 0, "successful": 0, "failed": 0, "total_chunks": 0}

    # Ingest each library
    results = []
    for lib in libraries:
        lib_id = lib["id"]

        try:
            result = await ingest_library(
                library_id=lib_id,
                name=lib["name"],
                description=None,
                source_path=lib.get("source_path")
            )
        except Exception as e:
            # One broken library must not abort the whole run.
            result = {"success": False, "library_id": lib_id, "error": f"Unexpected error: {e}"}
            if verbose:
                print(f" [Library] Failed: {lib_id} - {e}")

        if verbose and result.get('success'):
            print(f" [Library] Done: {result.get('library_id')} - {result.get('chunks_created', 0)} chunks")

        results.append(result)

    # Calculate totals
    total_chunks = sum(r.get('chunks_created', 0) for r in results)
    successful = sum(1 for r in results if r.get('success'))

    result = {
        "total_libraries": len(libraries),
        "successful": successful,
        "failed": len(results) - successful,
        "total_chunks": total_chunks
    }

    if verbose:
        print("\n" + "=" * 60)
        print("INGESTION COMPLETE")
        print("=" * 60)
        print(f" Libraries processed: {result['total_libraries']}")
        print(f" Successful: {result['successful']}")
        print(f" Failed: {result['failed']}")
        print(f" Total chunks created: {result['total_chunks']}")

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: detect libraries and try ingesting the first one.
    import asyncio

    async def test_run():
        print("Testing ingestion module...\n")

        # Test detect_libraries
        libs = await detect_libraries()
        print(f"\nDetected libraries: {len(libs)}")

        if libs:
            first = libs[0]
            # May fail if no docs exist, which is ok for this check.
            print("\nAttempting sample ingestion...")
            result = await ingest_library(
                library_id=first["id"],
                name=first["name"],
                source_path=first.get("source_path")
            )
            print(f"Result: {result}")

        print("\n✅ Tests completed!")

    asyncio.run(test_run())
|
||||
@@ -0,0 +1,299 @@
|
||||
"""Context7 Docs API."""
|
||||
import asyncio
|
||||
import shutil
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, File, Form, HTTPException, Query, Request, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .config import settings
|
||||
from .db import (
|
||||
clear_library_documents,
|
||||
delete_library,
|
||||
init_db,
|
||||
list_libraries,
|
||||
search_libraries,
|
||||
upsert_library,
|
||||
)
|
||||
from .git_source import ingest_git_source
|
||||
from .ingest import ingest_all, ingest_library
|
||||
from .search import get_library_docs, resolve_library_id, search_docs
|
||||
from .vector_store import delete_library_vectors, ensure_collection, get_client, get_collection_name
|
||||
|
||||
|
||||
# ASGI application object; the handlers below register themselves on it.
app = FastAPI(
    version="1.0.0",
    title="Context7 Docs API",
    description="Document ingestion and semantic search API for local-context7",
)
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    # Body of POST /search.
    # query: required, at least one character.
    query: str = Field(..., min_length=1)
    # library_id: optional; restricts the search to one library.
    library_id: Optional[str] = None
    # limit: number of results, 1..50, default 10.
    limit: int = Field(default=10, ge=1, le=50)
|
||||
|
||||
|
||||
class SyncSourcesRequest(BaseModel):
    # Optional body of POST /sources/sync.
    # NOTE(review): `override` is read by sync_sources_api but not acted on there.
    override: bool = False
|
||||
|
||||
|
||||
# Upload whitelist: lower-case suffixes (with dot) accepted by safe_upload_filename.
ALLOWED_EXTENSIONS = {
    ".md", ".txt",
    ".py", ".js", ".ts",
    ".json", ".yaml", ".yml",
    ".html", ".css",
    ".pdf",
}
|
||||
|
||||
|
||||
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
    """
    Require X-API-Key for non-public requests when auth is enabled.

    When settings.is_auth_enabled is false every request passes. Otherwise
    GET requests whose path starts with "/health", "/libraries" or "/docs/"
    pass without a key; every other request must carry an X-API-Key header
    equal to settings.api_key_docs_api or is answered with 401.
    """
    import hmac

    if not settings.is_auth_enabled:
        return await call_next(request)

    public_prefixes = ("/health", "/libraries", "/docs/")
    if request.method == "GET" and request.url.path.startswith(public_prefixes):
        return await call_next(request)

    provided = request.headers.get("X-API-Key") or ""
    expected = settings.api_key_docs_api or ""
    # Constant-time comparison so response timing does not leak how much of a
    # guessed key matched. Bytes are compared because compare_digest rejects
    # non-ASCII str. An empty configured key never authenticates.
    authorized = bool(expected) and hmac.compare_digest(
        provided.encode("utf-8"), expected.encode("utf-8")
    )
    if not authorized:
        return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})

    return await call_next(request)
|
||||
|
||||
|
||||
@app.on_event("startup")
async def startup() -> None:
    # Initialise SQLite, then wait for the Qdrant collection: up to 20
    # attempts with a one-second pause after each failed one.
    db_status = init_db()
    if not db_status.get("success"):
        raise RuntimeError(f"Failed to initialize SQLite database: {db_status.get('error')}")

    attempts_left = 20
    last_error = None
    while attempts_left > 0:
        attempts_left -= 1
        outcome = await ensure_collection()
        if outcome.get("success"):
            return
        last_error = outcome.get("error")
        await asyncio.sleep(1)

    raise RuntimeError(f"Failed to initialize Qdrant collection: {last_error}")
|
||||
|
||||
|
||||
def safe_library_id(library_id: str) -> str:
    """
    Validate a user-provided library ID as a single path segment.

    Returns the stripped final path component. Raises HTTPException(400) when
    the ID contains "..", "/" or a backslash, or reduces to "", "." or "..".
    """
    base = Path(library_id).name.strip()
    has_traversal = any(token in library_id for token in ("..", "/", "\\"))
    if has_traversal or base in {"", ".", ".."}:
        raise HTTPException(status_code=400, detail="Invalid library ID")
    return base
|
||||
|
||||
|
||||
def safe_upload_filename(filename: str) -> str:
    """
    Build a safe file name from an uploaded file's name.

    The lower-cased extension must be in ALLOWED_EXTENSIONS. The stem keeps
    only alphanumerics, "-", "_" and spaces, and is stripped. Raises
    HTTPException(400) for a disallowed extension or an empty cleaned stem.
    """
    parsed = Path(filename)
    ext = parsed.suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsafe extension: {ext}. Allowed extensions: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
        )

    kept = [c for c in parsed.stem if c.isalnum() or c in "-_ "]
    stem = "".join(kept).strip()
    if not stem:
        raise HTTPException(status_code=400, detail="Filename contains only unsafe characters")
    return stem + ext
|
||||
|
||||
|
||||
def docs_root() -> Path:
    """Return settings.docs_path as a Path."""
    root = Path(settings.docs_path)
    return root
|
||||
|
||||
|
||||
def sources_config_path() -> Path:
    """Return the path of docs_sources.yaml, three directory levels above this file."""
    here = Path(__file__).resolve()
    return here.parents[2] / "docs_sources.yaml"
|
||||
|
||||
|
||||
@app.get("/health")
async def health_check():
    # Liveness probe: static payload, no dependency checks.
    payload = {"status": "ok", "service": "docs-api"}
    return payload
|
||||
|
||||
|
||||
@app.get("/collections")
async def collections():
    # Report the vector count of the configured collection. Any failure is
    # returned as a warning with an empty mapping instead of an HTTP error.
    try:
        client = get_client()
        info = client.get_collection(get_collection_name())
        # Use vectors_count when set and non-zero, else points_count, else 0.
        vectors = getattr(info, "vectors_count", None)
        if not vectors:
            vectors = getattr(info, "points_count", 0)
        if not vectors:
            vectors = 0
        return {"collections": {get_collection_name(): {"vectors": vectors}}}
    except Exception as e:
        return {"collections": {}, "warning": str(e)}
|
||||
|
||||
|
||||
@app.get("/libraries")
async def list_libraries_api():
    # list_libraries() signals failure with a dict carrying success=False.
    libs = list_libraries()
    failed = isinstance(libs, dict) and not libs.get("success", True)
    if failed:
        raise HTTPException(status_code=500, detail=libs.get("error", "Failed to list libraries"))
    return {"libraries": libs, "count": len(libs)}
|
||||
|
||||
|
||||
@app.get("/libraries/search")
async def search_libraries_api(q: str = Query(..., min_length=1)):
    # Resolve a free-text query to library matches via resolve_library_id.
    matches = resolve_library_id(q)
    response = {"matches": matches, "count": len(matches)}
    return response
|
||||
|
||||
|
||||
@app.post("/search")
async def search_docs_api(payload: SearchRequest):
    # Run the search and echo the request parameters next to the hits.
    query, library_id = payload.query, payload.library_id
    results = search_docs(query, library_id=library_id, limit=payload.limit)
    return {
        "query": query,
        "library_id": library_id,
        "results": results,
        "count": len(results),
    }
|
||||
|
||||
|
||||
@app.get("/docs/{library_id}")
@app.get("/libraries/{library_id}/docs")
async def get_library_docs_api(
    library_id: str,
    topic: Optional[str] = Query(None),
    tokens: int = Query(8000, ge=1),
):
    # Served under two routes; `tokens` is forwarded as token_limit.
    content = get_library_docs(library_id=library_id, topic=topic, token_limit=tokens)
    return {"library_id": library_id, "content": content}
|
||||
|
||||
|
||||
@app.post("/ingest/all")
async def ingest_all_api():
    # Thin wrapper: the response is ingest_all()'s summary dict.
    summary = await ingest_all()
    return summary
|
||||
|
||||
|
||||
@app.post("/ingest/{library_id}")
async def ingest_library_api(library_id: str):
    # The validated id serves as library id, display name and source folder.
    safe_id = safe_library_id(library_id)
    return await ingest_library(library_id=safe_id, name=safe_id, source_path=safe_id)
|
||||
|
||||
|
||||
@app.post("/api/v1/libraries/{library_id}")
async def api_create_library(
    library_id: str,
    name: Optional[str] = Form(None),
    description: Optional[str] = Form(None),
):
    # Create the library folder under docs_root() and upsert its record.
    # The display name falls back to the library id.
    library_id = safe_library_id(library_id)
    display_name = name or library_id

    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)

    result = upsert_library(library_id, display_name, description, library_id)
    if not result.get("success"):
        raise HTTPException(status_code=500, detail=result.get("error", "Failed to create library"))

    created = not result.get("exists", False)
    return {
        "success": True,
        "created": created,
        "library_id": library_id,
        "name": display_name,
        "description": description,
        "path": str(lib_dir),
    }
|
||||
|
||||
|
||||
@app.delete("/api/v1/libraries/{library_id}")
async def api_delete_library(library_id: str):
    """
    Delete a library: its folder, stored chunks, vectors and library record.

    All three store deletions are attempted; if any reports failure the
    request fails with 500 and the collected error messages.

    Returns:
        {"success": True, "library_id": ..., "deleted_files": <files removed from disk>}
    """
    library_id = safe_library_id(library_id)
    lib_dir = docs_root() / library_id
    deleted_files = 0

    if lib_dir.exists():
        # Count files first so the response can report what was removed.
        deleted_files = sum(1 for path in lib_dir.rglob("*") if path.is_file())
        shutil.rmtree(lib_dir)

    docs_result = clear_library_documents(library_id)
    vectors_result = await delete_library_vectors(library_id)
    library_result = delete_library(library_id)

    # A failed result may lack "error"; joining None would raise TypeError
    # and hide the real failure.
    failures = [
        str(r.get("error") or "unknown error")
        for r in (docs_result, vectors_result, library_result)
        if isinstance(r, dict) and not r.get("success", True)
    ]
    if failures:
        raise HTTPException(status_code=500, detail="; ".join(failures))

    return {"success": True, "library_id": library_id, "deleted_files": deleted_files}
|
||||
|
||||
|
||||
@app.post("/api/v1/upload/{library_id}")
async def api_upload(library_id: str, file: UploadFile = File(...)):
    """
    Store an uploaded document in a library folder.

    The library id and file name are validated and the upload is limited to
    5MB before anything is written. An existing file with the same sanitized
    name is overwritten.

    Returns:
        Dict with "success", "library_id", "filename", "path" (relative to
        docs_root()) and "size_bytes".
    """
    max_bytes = 5 * 1024 * 1024

    library_id = safe_library_id(library_id)
    safe_name = safe_upload_filename(file.filename or "upload.txt")

    # Read one byte past the limit: enough to detect an oversized upload
    # without loading an arbitrarily large body into memory.
    contents = await file.read(max_bytes + 1)
    if len(contents) > max_bytes:
        raise HTTPException(status_code=400, detail="File too large (max 5MB)")

    # Create the folder only once the upload is known to be acceptable.
    lib_dir = docs_root() / library_id
    lib_dir.mkdir(parents=True, exist_ok=True)

    target = lib_dir / safe_name
    target.write_bytes(contents)

    # NOTE(review): this passes the id as name and None as description; if
    # upsert_library overwrites existing fields, an upload resets them — confirm.
    upsert_library(library_id, library_id, None, library_id)
    return {
        "success": True,
        "library_id": library_id,
        "filename": safe_name,
        "path": str(target.relative_to(docs_root())),
        "size_bytes": len(contents),
    }
|
||||
|
||||
|
||||
@app.get("/api/v1/sources")
@app.get("/sources/config")
async def api_list_sources():
    """
    Return the git sources configured in docs_sources.yaml.

    Accepts either a mapping with a "sources" list or a bare top-level list.
    A missing file, an empty file or any other shape yields an empty list.

    Returns:
        {"success": True, "sources": [...], "count": <len(sources)>}
    """
    path = sources_config_path()
    if not path.exists():
        return {"success": True, "sources": [], "count": 0}

    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}

    # Branch on the root type first: calling .get() on a list raises
    # AttributeError.
    if isinstance(data, dict):
        sources = data.get("sources", [])
    elif isinstance(data, list):
        sources = data
    else:
        sources = []
    if not isinstance(sources, list):
        sources = []
    return {"success": True, "sources": sources, "count": len(sources)}
|
||||
|
||||
|
||||
@app.post("/sources/sync")
async def sync_sources_api(payload: Optional[SyncSourcesRequest] = None):
    """
    Ingest every git source listed in the sources config.

    Sources are processed one by one. A source that is malformed or whose
    ingestion raises is reported as a failed entry and the remaining sources
    are still processed.

    Returns:
        Dict with "success" (True only if every source succeeded),
        "total_sources", "successful", "failed" and the per-source "results".
    """
    source_data = await api_list_sources()
    sources = source_data["sources"]
    # NOTE(review): `override` is accepted in the request but nothing here uses it yet.
    override = payload.override if payload else False
    results = []

    for source in sources:
        library_id = source.get("library_id") if isinstance(source, dict) else None
        try:
            result = await ingest_git_source(
                library_id=source["library_id"],
                name=source.get("name") or source["library_id"],
                description=source.get("description"),
                repo_url=source["repo_url"],
                branch=source.get("branch", "main"),
                include_paths=source.get("include_paths"),
                exclude_paths=source.get("exclude_paths"),
            )
        except Exception as e:
            # Covers git failures as well as missing keys or a non-mapping
            # entry; one bad source must not fail the whole sync.
            result = {
                "success": False,
                "library_id": library_id or "unknown",
                "error": f"{type(e).__name__}: {e}",
            }
        results.append(result)

    successful = sum(1 for r in results if r.get("success"))
    return {
        "success": successful == len(results),
        "total_sources": len(results),
        "successful": successful,
        "failed": len(results) - successful,
        "results": results,
    }
|
||||
@@ -0,0 +1,47 @@
|
||||
# Data Models for document processing and API responses
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
class DocumentChunk:
    """Represents a chunk of text to be embedded."""

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            text: The chunk content
            metadata: Optional metadata; a falsy value becomes a new empty dict
        """
        self.text = text
        self.metadata = metadata or {}

    @property
    def doc_id(self) -> str:
        """
        Return a document ID derived from the content.

        The ID is "doc-" plus the first 16 hex digits of the SHA-256 of the
        text. The built-in hash() is salted per interpreter process for str,
        so it gave a different ID for the same text on every run.
        """
        import hashlib

        # surrogatepass keeps this from raising on lone surrogates.
        digest = hashlib.sha256(self.text.encode("utf-8", "surrogatepass")).hexdigest()
        return f"doc-{digest[:16]}"
|
||||
|
||||
|
||||
class IngestResponse:
    """Outcome of a document ingestion: success flag, chunk count, optional error."""

    def __init__(
        self,
        success: bool,
        chunks_count: int = 0,
        error: Optional[str] = None
    ):
        # Plain value holder; arguments are stored unchanged.
        self.success, self.chunks_count, self.error = success, chunks_count, error
|
||||
|
||||
|
||||
class SearchResponse:
    """Container for the results of a search request.

    Holds the result dicts, the query they were produced for and the
    caller-supplied total result count.
    """

    def __init__(
        self,
        results: List[Dict[str, Any]],
        query: str,
        total_results: int,
    ):
        # Store the arguments verbatim; total_results is not derived from
        # len(results).
        self.results, self.query, self.total_results = results, query, total_results
|
||||
@@ -0,0 +1,235 @@
|
||||
# Search Operations for Semantic Query and Library Navigation
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from .config import settings
|
||||
from .vector_store import get_client, _collection_name as VECTOR_COLLECTION
|
||||
from .embeddings import embed_text, get_embedding_size
|
||||
from .db import get_chunks_for_library, list_libraries
|
||||
|
||||
|
||||
def search_docs(
    query: str,
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search documents by semantic similarity in Qdrant.

    Args:
        query: The search query string
        library_id: Optional filter to search only within a library
        limit: Maximum number of results to return

    Returns:
        List of dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        Only hits with a positive score and a payload are included. Any
        failure is printed and reported as an empty list.
    """
    try:
        # Generate embedding for the query
        query_embedding = embed_text(query)

        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            try:
                from qdrant_client.models import FieldCondition, Filter, MatchValue
                search_filter = Filter(
                    must=[
                        FieldCondition(
                            key="library_id",
                            match=MatchValue(value=library_id),
                        )
                    ]
                )
            except ImportError:
                # Kept from the original: without the filter models the
                # search proceeds unfiltered rather than failing outright.
                search_filter = None

        # Perform vector search. The filter keyword of QdrantClient.search is
        # `query_filter`; the previous `search_filter=` was rejected by the
        # client, so every search ended in the except branch below.
        results = client.search(
            collection_name=VECTOR_COLLECTION,
            query_vector=query_embedding,
            limit=limit,
            query_filter=search_filter,
        )

        # Format and return results
        return [
            {
                "id": result.payload["id"],
                "score": float(result.score),
                "library_id": result.payload.get("library_id", ""),
                "path": result.payload.get("path", ""),
                "title": result.payload.get("title", ""),
                "chunk_index": result.payload.get("chunk_index", 0),
            }
            for result in results
            if result.score > 0 and result.payload
        ]

    except Exception as e:
        print(f"Search error: {e}")
        return []
|
||||
|
||||
|
||||
def get_library_docs(
    library_id: str,
    topic: Optional[str] = None,
    token_limit: int = 8000
) -> str:
    """
    Retrieve documentation content from a library.

    With a topic, the library is searched semantically and the matching
    stored chunks are used in relevance order. Without a topic, all chunks of
    the library are used, ordered by descending chunk_index. In both cases
    chunks are added until the estimated token budget is used up; the chunk
    that crosses the budget is truncated to fill the remaining space.

    Args:
        library_id: The library ID to fetch docs from
        topic: Optional topic filter - if provided, searches for topic first
        token_limit: Maximum tokens to include in output (estimated as
            4 characters per token)

    Returns:
        Combined markdown content as string, or a human-readable message when
        nothing was found or an error occurred.
    """
    try:
        if topic:
            # Search for chunks relevant to the topic.
            print(f" [Search] Searching library '{library_id}' for topic: {topic}")
            search_results = search_docs(query=topic, library_id=library_id, limit=20)

            if not search_results:
                return f"No documents found in library '{library_id}' matching topic: {topic}"

            print(f" [Search] Found {len(search_results)} relevant chunks")

            # Search hits carry no text, so map them back to stored chunks.
            # NOTE(review): assumes rows from get_chunks_for_library expose the
            # same "id" (or "path" + "chunk_index") that was written to the
            # vector payload - confirm against the db module.
            stored = get_chunks_for_library(library_id) or []
            by_id = {row.get("id"): row for row in stored}
            by_position = {
                (row.get("path"), row.get("chunk_index")): row for row in stored
            }
            candidates = []
            for hit in search_results:
                row = by_id.get(hit["id"]) or by_position.get(
                    (hit["path"], hit["chunk_index"])
                )
                if row is not None:
                    candidates.append(row)

            if not candidates:
                return f"No documents found in library '{library_id}' matching topic: {topic}"
        else:
            # Fetch all chunks for the library.
            print(f" [Fetch] Retrieving chunks from library '{library_id}'")
            chunks_data = get_chunks_for_library(library_id)

            if not chunks_data:
                return f"No documents found in library '{library_id}'"

            # Sort by chunk_index descending (original ordering kept).
            candidates = sorted(
                chunks_data, key=lambda x: x.get("chunk_index", 0), reverse=True
            )

        # Pick chunks in order until the token budget is exhausted.
        selected_chunks = []
        total_tokens = 0

        for chunk in candidates:
            content = chunk.get("content", "")
            tokens = len(content) // 4  # Simple token estimate

            if total_tokens + tokens <= token_limit:
                selected_chunks.append(chunk)
                total_tokens += tokens
                continue

            # Take part of this chunk to fill remaining space, account for
            # it, and stop: the budget is now fully used. Previously the
            # budget was not updated here, so later chunks could push the
            # output past token_limit.
            remaining = token_limit - total_tokens
            if remaining > 0:
                selected_chunks.append(
                    {"content": content[:remaining * 4], "title": chunk.get("title", "")}
                )
                total_tokens += remaining
            break

        print(f" [Fetch] Selected {len(selected_chunks)} chunks ({total_tokens} tokens)")

        # Combine chunks into markdown
        md_parts = []
        for chunk in selected_chunks:
            title = chunk.get("title")
            content = chunk.get("content", "")

            if title and content.strip():
                # Add a heading at the start, or again when none of the last
                # five parts is a heading.
                if not md_parts or "\n\n" not in "".join(md_parts):
                    md_parts.append(f"# {title}")
                elif not any(part.startswith("#") for part in md_parts[-5:]):
                    md_parts.append(f"\n# {title}\n")

            md_parts.append(content)

        result = "\n\n".join(md_parts)

        # If no heading appears in the first three lines, prepend library title
        if not any(part.startswith("#") for part in result.split("\n")[:3]):
            result = f"# {library_id.upper().replace('_', ' ')}\n\n" + result

        return result.rstrip()

    except Exception as e:
        print(f"Error getting library docs: {e}")
        return f"Error retrieving documents from library '{library_id}': {str(e)}"
|
||||
|
||||
|
||||
def resolve_library_id(library_name: str) -> List[Dict[str, Any]]:
    """
    Look up libraries whose name or id contains the given text.

    Matching is a case-insensitive substring test against each library's
    "name" and "id". At most five candidates are returned, in the order the
    libraries were listed.

    Args:
        library_name: Partial or full library name to search for

    Returns:
        List of Context7-style candidate dicts:
        {
            "id": "/local/foundryvtt",
            "name": "foundryvtt",
            "description": "...",
            "source": "local"
        }
        An empty list when nothing matches or when an error occurs.
    """
    try:
        known = list_libraries()
        if not known:
            return []

        matches = []
        for entry in known:
            name_lc = entry.get("name", "").lower()
            id_lc = entry.get("id", "").lower()

            # Skip entries that match on neither name nor id.
            if not (library_name.lower() in name_lc or library_name.lower() in id_lc):
                continue

            matches.append({
                "id": f"/local/{entry['id']}",
                "name": entry["name"],
                "description": entry.get("description", ""),
                "source": "local"
            })

        # Cap the result at the first five matches.
        top = matches[:5]

        print(f" [Resolve] Found {len(top)} candidate(s) for: {library_name}")

        return top

    except Exception as e:
        print(f"Error resolving library ID: {e}")
        return []
|
||||
|
||||
|
||||
if __name__ == "__main__":

    def test_search():
        """Smoke-test the search module by calling its public functions.

        resolve_library_id() and search_docs() are plain synchronous
        functions that return lists, so they are called directly; awaiting
        their results would raise TypeError.
        """
        print("Testing search module...\n")

        # Test 1: Resolve a library name to candidates
        print("1. Testing resolve_library_id()...")
        results = resolve_library_id("foundryvtt")
        print(f" Results: {len(results)} candidates\n")

        # Test 2: Search with an empty query
        print("2. Testing search_docs() with empty query...")
        results = search_docs("")
        print(f" Results: {len(results)} chunks\n")

        print("✅ All tests completed!")

    test_search()
|
||||
@@ -0,0 +1,361 @@
|
||||
# Vector Store Operations for Qdrant
|
||||
import asyncio
|
||||
import uuid
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
|
||||
except ImportError:
|
||||
QdrantClient = None
|
||||
Distance = VectorParams = PointStruct = Filter = FieldCondition = MatchValue = None
|
||||
|
||||
|
||||
# Singleton client instance
# Starts as None and is filled in by get_client() on first use.
_client: Optional[Any] = None
# Resolve the collection name from the project settings; if importing or
# reading them fails for any reason, fall back to a fixed default name.
try:
    from .config import settings
    _collection_name = settings.collection_name
except Exception:
    _collection_name = "local_context7_docs"
|
||||
|
||||
|
||||
def get_client() -> Any:
    """Return the shared Qdrant client, creating it on the first call.

    The connection target is the QDRANT_URL environment variable when set;
    otherwise the host and port from the project settings are used.

    Raises:
        RuntimeError: If the qdrant-client package could not be imported.
    """
    global _client

    # Fast path: the client was already created by an earlier call.
    if _client is not None:
        return _client

    if QdrantClient is None:
        raise RuntimeError("qdrant-client is not installed")

    # Load a .env file when python-dotenv is available; it is optional.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    import os
    url_from_env = os.getenv("QDRANT_URL")

    if not url_from_env:
        from .config import settings
        configured_host = settings.vector_store_host
        configured_port = settings.vector_store_port
        _client = QdrantClient(host=configured_host, port=configured_port)
    else:
        _client = QdrantClient(url=url_from_env)

    return _client
|
||||
|
||||
|
||||
def get_collection_name() -> str:
    """Return the module-level collection name used for vector storage.

    The value is fixed when the module is imported (from the settings, or a
    built-in default if those could not be loaded).
    """
    return _collection_name
|
||||
|
||||
|
||||
def get_embedding_size() -> int:
    """Return the embedding dimension reported by the embeddings module.

    Falls back to 384 when the embeddings module cannot be imported or when
    it raises RuntimeError.
    """
    fallback_dimension = 384
    try:
        # Aliased so the imported function is not confused with this wrapper.
        from .embeddings import get_embedding_size as _embeddings_dimension
        dimension = _embeddings_dimension()
    except (ImportError, RuntimeError):
        # Default fallback if embeddings module not loaded yet
        return fallback_dimension
    return dimension
|
||||
|
||||
|
||||
async def ensure_collection(vector_size: Optional[int] = None) -> Dict[str, Any]:
    """
    Ensure the Qdrant collection exists with proper schema.

    Creates the collection when it is missing. When it exists with a
    different vector size, the collection is deleted and recreated, which
    discards all stored vectors.

    Args:
        vector_size: Override embedding dimension (uses get_embedding_size() if not provided)

    Returns:
        Dict with operation result: "success", and on success "collection",
        "vector_size", "created" and, after a recreate, "resized". On failure
        "success" is False and "error" holds the message.
    """
    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}

        client = get_client()
        size = vector_size or get_embedding_size()
        # The keyword accepted by QdrantClient.create_collection is
        # `vectors_config`; the former `vectors=` / `wait=` arguments are not
        # part of its signature.
        vectors_config = VectorParams(size=size, distance=Distance.COSINE)

        # Check if collection exists; a failing listing is treated as "no".
        try:
            collections = client.get_collections().collections
            collection_exists = any(c.name == _collection_name for c in collections)
        except Exception:
            collection_exists = False

        if not collection_exists:
            # Create new collection
            client.create_collection(
                collection_name=_collection_name,
                vectors_config=vectors_config,
            )

            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": True
            }

        # Read the current vector size. Only this lookup is best-effort: if
        # the size cannot be determined the collection is left untouched.
        try:
            collection_info = client.get_collection(_collection_name)
            current_size = collection_info.config.params.vectors.size
        except Exception:
            current_size = None

        if current_size is not None and current_size != size:
            # Collection exists with wrong size - delete and recreate.
            # Errors here propagate to the outer handler so a half-finished
            # recreate is reported as a failure instead of as success.
            client.delete_collection(_collection_name)
            client.create_collection(
                collection_name=_collection_name,
                vectors_config=vectors_config,
            )

            return {
                "success": True,
                "collection": _collection_name,
                "vector_size": size,
                "created": False,
                "resized": True
            }

        return {
            "success": True,
            "collection": _collection_name,
            "vector_size": size,
            "created": False
        }

    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
async def upsert_chunks(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Write chunks (with their embeddings) into the vector collection.

    Each point id is a UUIDv5 derived from "<library_id>:<id>", so upserting
    the same chunk again overwrites its previous point.

    Args:
        chunks: List of chunk dicts with format:
            {
                "id": "...",
                "library_id": "...",
                "path": "...",
                "title": "...",
                "chunk_index": 0,
                "content": "...",
                "embedding": [...]
            }

    Returns:
        Dict with "success" and, on success, "points_added"; on failure
        "error" holds the message.
    """

    def _as_point(chunk: Dict[str, Any]) -> Any:
        # Deterministic id so repeated ingestion replaces instead of duplicating.
        key = f"{chunk['library_id']}:{chunk['id']}"
        return PointStruct(
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, key)),
            vector=chunk["embedding"],
            payload={
                "id": chunk["id"],
                "library_id": chunk["library_id"],
                "path": chunk.get("path", ""),
                "title": chunk.get("title", ""),
                "chunk_index": chunk.get("chunk_index", 0),
                "content": chunk.get("content", "")
            }
        )

    try:
        if QdrantClient is None:
            return {"success": False, "error": "qdrant-client is not installed"}

        # Nothing to write: report success without touching the client.
        if not chunks:
            return {"success": True, "points_added": 0}

        client = get_client()
        prepared = [_as_point(chunk) for chunk in chunks]

        client.upsert(_collection_name, points=prepared)

        return {"success": True, "points_added": len(prepared)}

    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
async def search_vectors(
    query_vector: List[float],
    library_id: Optional[str] = None,
    limit: int = 10
) -> List[Dict[str, Any]]:
    """
    Search for semantically similar vectors.

    Args:
        query_vector: The embedding vector to search against
        library_id: Optional filter by library ID
        limit: Maximum results to return

    Returns:
        List of result dicts with format:
        {
            "id": "...",
            "score": 0.123,
            "library_id": "...",
            "path": "...",
            "title": "...",
            "chunk_index": 0
        }
        Only hits with a positive score and a payload are included. An empty
        list is returned when qdrant-client is missing or any error occurs.
    """
    try:
        if QdrantClient is None:
            return []

        client = get_client()

        # Build filter if library_id is specified
        search_filter = None
        if library_id:
            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="library_id",
                        match=MatchValue(value=library_id),
                    )
                ]
            )

        # Perform vector search. QdrantClient.search takes the filter as
        # `query_filter`; the previous `search_filter=` keyword was rejected,
        # which made every call fall into the except branch below.
        results = client.search(
            collection_name=_collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=search_filter,
        )

        # Format results
        return [
            {
                "id": result.payload["id"],
                "score": float(result.score),
                "library_id": result.payload["library_id"],
                "path": result.payload.get("path", ""),
                "title": result.payload.get("title", ""),
                "chunk_index": result.payload.get("chunk_index", 0),
            }
            for result in results
            if result.score > 0 and result.payload
        ]

    except Exception:
        # Best-effort search: failures are reported as "no results".
        return []
|
||||
|
||||
|
||||
async def delete_library_vectors(library_id: str) -> Dict[str, Any]:
    """
    Delete all vectors for a given library.

    Matching points are listed in batches and deleted by id. Deletion is
    best-effort: if listing or deleting a batch fails, the loop stops and the
    result still reports success, but the failure is surfaced under
    "warning" instead of being dropped silently.

    Args:
        library_id: The library ID to delete vectors for

    Returns:
        Dict with "success" and "library_id"; additionally "deleted" (number
        of points removed), "warning" (text of a batch error, if one
        occurred) or "skipped" (when qdrant-client is not installed). On an
        outer failure "success" is False and "error" holds the message.
    """
    try:
        if QdrantClient is None:
            return {"success": True, "library_id": library_id, "skipped": "qdrant-client is not installed"}

        client = get_client()

        # Use filter to delete only vectors matching the library_id
        filter_condition = Filter(
            must=[
                FieldCondition(
                    key="library_id",
                    match=MatchValue(value=library_id),
                )
            ]
        )

        batch_size = 100
        offset = None
        deleted = 0
        warning = None

        while True:
            try:
                # Only ids are needed, so payloads and vectors are not fetched.
                points, next_offset = client.scroll(
                    collection_name=_collection_name,
                    scroll_filter=filter_condition,
                    limit=batch_size,
                    offset=offset,
                    with_payload=False,
                    with_vectors=False
                )

                if not points:
                    break

                point_ids = [p.id for p in points]
                client.delete(
                    collection_name=_collection_name,
                    points_selector=point_ids
                )
                deleted += len(point_ids)

                # scroll() returns the offset of the next page, or None once
                # the last page has been read.
                if next_offset is None:
                    break
                offset = next_offset

            except Exception as e:
                # Keep the best-effort behavior but remember what went wrong.
                warning = str(e)
                break

        result = {
            "success": True,
            "library_id": library_id,
            "deleted": deleted
        }
        if warning is not None:
            result["warning"] = warning
        return result

    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test for the vector store module. It needs a reachable
    # Qdrant instance; every call reports failures in its return value
    # rather than raising.
    print("Testing vector store module...\n")

    # Test ensure_collection
    print("1. Testing ensure_collection()...")
    result = asyncio.run(ensure_collection())
    print(f" Result: {result}\n")

    # Test search with a dummy vector. Its length follows the configured
    # embedding size so it matches the collection created above.
    print("2. Testing search_vectors() with dummy vector...")
    dummy_vector = [0.1] * get_embedding_size()
    results = asyncio.run(search_vectors(dummy_vector, limit=5))
    print(f" Results count: {len(results)}\n")

    # Test delete_library_vectors (will succeed even if no vectors exist)
    print("3. Testing delete_library_vectors()...")
    result = asyncio.run(delete_library_vectors("test-library"))
    print(f" Result: {result}\n")

    print("✅ All tests completed!")
|
||||
@@ -0,0 +1 @@
|
||||
"""WebUI module for Context7 Docs."""
|
||||
@@ -0,0 +1,166 @@
|
||||
/* Stylesheet for the docs WebUI pages. */

/* Page wrapper: centered column, at most 1000px wide. */
.container {
    max-width: 1000px;
    margin: 0 auto;
    padding: 20px;
}

/* Page header, separated from the content by a bottom rule. */
header {
    border-bottom: 1px solid #ccc;
    padding-bottom: 15px;
    margin-bottom: 20px;
}

header h1 {
    margin: 0 0 10px 0;
    font-size: 1.5rem;
}

/* Navigation: links laid out in a horizontal row. */
nav {
    display: flex;
    gap: 15px;
}

nav a {
    text-decoration: none;
    color: #0066cc;
    font-size: 0.9rem;
}

/* Link carrying the "active" class is bold and underlined. */
nav a.active {
    font-weight: bold;
    text-decoration: underline;
}

main h2 {
    margin-bottom: 15px;
}

/* Page footer, separated from the content by a top rule. */
footer {
    margin-top: 40px;
    padding-top: 15px;
    border-top: 1px solid #ccc;
    font-size: 0.8rem;
    color: #666;
}

/* Status cards */
.status-card {
    background: #f5f5f5;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #00c467;
}

.status-message {
    background: #e8f4fd;
    padding: 10px;
    border-radius: 4px;
    margin: 5px 0;
}

/* Tables */
.library-table {
    width: 100%;
    border-collapse: collapse;
    margin-top: 10px;
}

.library-table th, .library-table td {
    padding: 10px;
    text-align: left;
    border-bottom: 1px solid #ddd;
}

.library-table th {
    background: #f5f5f5;
    font-weight: bold;
}

/* Forms */
form input[type="text"], form textarea, form select {
    padding: 8px;
    border: 1px solid #ccc;
    border-radius: 4px;
    margin-right: 10px;
    margin-bottom: 10px;
}

button {
    background: #0066cc;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 4px;
    cursor: pointer;
}

/* Slightly darker blue on hover. */
button:hover {
    background: #0055aa;
}

/* Pre formatting */
/* Long lines wrap; horizontal scrolling is the fallback. */
pre {
    background: #f5f5f5;
    padding: 15px;
    border-radius: 4px;
    overflow-x: auto;
    white-space: pre-wrap;
    word-break: break-word;
}

/* Search results */
.result-card {
    background: #fff;
    border: 1px solid #ddd;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.result-card h3 {
    margin: 0 0 8px 0;
}

/* Small muted helper text. */
.hint {
    color: #666;
    font-size: 0.85rem;
    margin-top: 15px;
}

/* Status colors */
.status-ok {
    color: #00c467;
    font-weight: bold;
}

/* Height-capped preview box that scrolls vertically. */
.content-preview {
    max-height: 300px;
    overflow-y: auto;
}

.results-count {
    background: #e8f4fd;
    padding: 10px;
    border-radius: 4px;
    margin-bottom: 15px;
}

.source-card {
    background: #f5f5f5;
    padding: 15px;
    margin: 10px 0;
    border-radius: 4px;
}

.actions-bar {
    margin-top: 15px;
}

.actions-bar form {
    display: inline-flex;
}

/* Height-capped document box that scrolls vertically. */
.doc-content {
    max-height: 600px;
    overflow-y: auto;
}
|
||||
@@ -0,0 +1,568 @@
|
||||
"""WebUI Views for Context7 Docs using Jinja2 templates."""
|
||||
import html
import json
import os
from pathlib import Path
from typing import Any, Optional

import requests
from fastapi import Request
# fastapi.responses exports HTMLResponse (there is no `HTML` name); it is
# imported under the alias the views in this module already use.
from fastapi.responses import HTMLResponse as HTML, JSONResponse
|
||||
|
||||
# Internal API base URL
|
||||
DOCS_API_URL = os.environ.get("DOCS_API_URL", "http://docs-api:8787")
|
||||
|
||||
|
||||
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    timeout: Any = (10.0, None),
) -> dict:
    """Make internal API request to docs-api.

    Args:
        method: HTTP method name, e.g. "GET" or "POST".
        endpoint: Path appended to DOCS_API_URL, e.g. "/health".
        data: Optional JSON body.
        timeout: Passed to requests as the timeout. The default bounds only
            the connection phase (10 seconds) and leaves reading unbounded,
            so an unreachable API fails quickly while long-running requests
            still complete.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: On connection problems or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    url = f"{DOCS_API_URL}{endpoint}"
    headers = {}
    # Forward the API key only when one is configured (non-empty).
    api_key = os.environ.get("WEBUI_API_KEY")
    if api_key:
        headers["X-API-Key"] = api_key

    resp = requests.request(method, url, headers=headers, json=data, timeout=timeout)
    return resp.json()
|
||||
|
||||
|
||||
def navbar_html(current: str) -> str:
    """Build the <nav> element shared by all pages.

    Args:
        current: Path of the page being rendered; the link whose path equals
            it gets the CSS class "active", all others an empty class.

    Returns:
        The navigation markup as a string.
    """
    nav_targets = (
        ("/health", "Health"),
        ("/libraries", "Libraries"),
        ("/upload", "Upload"),
        ("/ingest/all", "Ingest All"),
        ("/sources/git", "Git Sources"),
        ("/search", "Search"),
    )
    anchors = ' '.join(
        '<a href="{0}" class="{1}">{2}</a>'.format(
            href, "active" if current == href else "", text
        )
        for href, text in nav_targets
    )
    return ("<nav>\n" + anchors + "\n</nav>").strip()
|
||||
|
||||
|
||||
def footer_html() -> str:
    """Return the static <footer> element shared by all pages."""
    footer_text = "Context7 Docs WebUI"
    return "<footer>" + footer_text + "</footer>"
|
||||
|
||||
|
||||
def health(request: Request) -> HTML:
    """Render the system health dashboard.

    Queries the docs API's /health endpoint and shows the reported service
    name and status. If the request fails, the status is shown as "error"
    and the exception text takes the place of the service name.

    Args:
        request: The incoming request (not inspected).

    Returns:
        The HTML page.
    """
    try:
        data = api_request("GET", "/health")
        status = data.get("status", "unknown")
        service = data.get("service", "Service")
    except Exception as e:
        status = "error"
        service = str(e)

    # Both values come from the backend or from an exception message, so
    # they are escaped before being placed into markup and attributes.
    safe_status = html.escape(str(status))
    safe_service = html.escape(str(service))

    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Health</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/health")}</header>
<main><h2>System Health</h2>
<div class="status-card" data-status="{safe_status}"><h3>{safe_service}</h3>
<p>Status: <span class="status-ok">{safe_status}</span></p></div>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
|
||||
|
||||
|
||||
def libraries(request: Request) -> HTML:
    """List all libraries.

    Fetches the libraries from the docs API and renders them as a table
    with a link to each library's docs, plus a form for creating a new
    library folder. If the request fails, an empty table is shown together
    with the error text.

    Args:
        request: The incoming request (not inspected).

    Returns:
        The HTML page.
    """
    load_error = ""
    try:
        data = api_request("GET", "/libraries")
        libs = data.get("libraries", [])
    except Exception as e:
        # Show the failure explicitly instead of a hidden placeholder row
        # that would inflate the library count.
        libs = []
        load_error = str(e)

    table_rows = []
    for lib in libs:
        # Library fields originate from ingested data; escape all of them.
        lib_id = html.escape(str(lib.get('id')))
        lib_name = html.escape(str(lib.get('name', '')))
        lib_desc = html.escape(str(lib.get('description', '') or '(no description)'))
        table_rows.append(
            f"""<tr><td>{lib_id}</td>
<td>{lib_name}</td>
<td>{lib_desc}</td>
<td><a href="/docs/{lib_id}">View Docs</a></td></tr>"""
        )

    error_block = ""
    if load_error:
        error_block = (
            '<div class="status-message">Could not load libraries: '
            + html.escape(load_error)
            + '</div>\n'
        )

    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Libraries</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/libraries")}</header>
<main>
<h2>Libraries ({len(libs)})</h2>
{error_block}<div class="actions-bar">
<form action="/folders/create" method="post" style="display:inline;">
<input type="text" name="name" placeholder="New library folder name" required>
<button type="submit">Create Folder</button>
</form>
</div>
<table class="library-table">
<thead><tr><th>ID</th><th>Name</th><th>Description</th><th>Actions</th></tr></thead>
<tbody>{"".join(table_rows)}</tbody>
</table>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
|
||||
|
||||
|
||||
def upload(request: Request) -> HTML:
    """File upload form.

    Without an uploaded file the upload form is rendered. With one, up to
    5000 characters of its UTF-8 text are read, a preview of the first 1000
    characters is shown, and the text is embedded in a hidden field of a
    follow-up form that posts to /ingest/uploaded.

    Args:
        request: The incoming request.

    Returns:
        The HTML page.
    """
    nav = navbar_html("/upload")
    footer = footer_html()

    def page(main_html: str) -> HTML:
        # Shared page skeleton for the three upload views.
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
{main_html}
</main>{footer}</div>
</body></html>""", media_type="text/html")

    # NOTE(review): `request.files` is not part of the Starlette/FastAPI
    # Request API (uploads are read via `await request.form()`), so this
    # lookup looks like it will fail at runtime. Fixing it requires making
    # this view async - confirm how the route is registered before changing.
    if "file" not in request.files:
        return page(
            '<h2>Upload Documentation Files</h2>\n'
            '<form method="post" enctype="multipart/form-data">\n'
            '<label for="file">Select file:</label>\n'
            '<input type="file" name="file" id="file" accept=".txt,.md,.json,.py,.js,.html,.css,.yaml,.yml" required>\n'
            '<button type="submit">Upload</button>\n'
            '</form>\n'
            '<p class="hint">Supported formats: .txt, .md, .json, .py, .js, .html, .css, .yaml</p>'
        )

    uploaded_file = request.files["file"]
    try:
        content = uploaded_file.read().decode("utf-8")[:5000]
    except Exception:
        return page(
            '<h2>File too large!</h2>\n'
            '<p>Please upload smaller text files (limit: ~5MB).</p>'
        )

    # Truncate the raw text first and escape afterwards, so an escape
    # sequence can never be cut in half. html.escape also escapes quotes,
    # which is required for the hidden field's attribute value.
    preview_text = content[:1000] + "..." if len(content) > 1000 else content
    preview = html.escape(preview_text)
    hidden_value = html.escape(content, quote=True)

    return page(
        '<h2>Upload Complete!</h2>\n'
        f'<pre class="content-preview">{preview}</pre>\n'
        '<form method="post" action="/ingest/uploaded">\n'
        f'<input type="hidden" name="content" value="{hidden_value}">\n'
        '<label for="library_id">Library (optional):</label>\n'
        '<input type="text" id="library_id" name="library_id" placeholder="e.g., my-docs">\n'
        '<button type="submit">Ingest</button>\n'
        '</form>'
    )
|
||||
|
||||
|
||||
def ingest_all(request: Request) -> JSONResponse:
    """Ask the docs API to ingest everything and report the chunk count.

    Posts to /ingest and answers with a JSON status message; any failure is
    answered with HTTP 500 and the error text.
    """
    try:
        outcome = api_request("POST", "/ingest")
        chunk_total = outcome.get('chunks', 0)
        body = {"status": "ok", "message": f"Processed {chunk_total} chunks"}
        return JSONResponse(content=body)
    except Exception as exc:
        failure = {"error": str(exc)}
        return JSONResponse(status_code=500, content=failure)
|
||||
|
||||
|
||||
def ingest_library(request: Request, library_id: str) -> HTML:
    """Ingest for specific library.

    When the submitted form contains "content", the content entry form for
    the library is rendered. Otherwise the docs API is asked to ingest the
    library and the result (or the error) is rendered.

    Args:
        request: The incoming request.
        library_id: ID of the library, taken from the URL.

    Returns:
        The HTML page.
    """
    # library_id comes from the URL and is shown in markup and attributes.
    safe_library_id = html.escape(str(library_id), quote=True)
    nav = navbar_html(f"/ingest/{library_id}")
    footer = footer_html()

    def page(title: str, main_html: str) -> HTML:
        # Shared page skeleton for the three views of this endpoint.
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - {title}</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
{main_html}
</main>{footer}</div>
</body></html>""", media_type="text/html")

    # NOTE(review): on a Starlette/FastAPI Request, `form` is an awaitable
    # method rather than a mapping, so this membership test looks like it
    # will fail at runtime; fixing it requires an async view. Also note that
    # submitted content is read but never sent to the API, and the API is
    # only called when no content was submitted - confirm the intended flow.
    if "content" in request.form:
        content = request.form.get("content")[:10000]
        return page(
            "Ingest",
            f'<h2>Ingest for Library: {safe_library_id}</h2>\n'
            f'<form method="post" action="/ingest/{safe_library_id}">\n'
            '<label for="content">Content (text):</label>\n'
            '<textarea id="content" name="content" rows="10" maxlength="10000"></textarea>\n'
            '<button type="submit">Ingest</button>\n'
            '</form>'
        )

    try:
        result = api_request("POST", f"/ingest/{library_id}")
        # Backend-provided text is escaped before it is embedded.
        safe_msg = html.escape(str(result.get('message', '') or ''))
        safe_json = html.escape(json.dumps(result, indent=2))
        return page(
            "Ingest Result",
            '<h2>Ingestion Complete!</h2>\n'
            f'<p>{safe_msg}</p>\n'
            f'<pre>{safe_json}</pre>\n'
            '<a href="/libraries">← Back to Libraries</a>'
        )
    except Exception as e:
        safe_error = html.escape(str(e))
        return page(
            "Error",
            '<h2>Error</h2>\n'
            f'<pre>{safe_error}</pre>'
        )
|
||||
|
||||
|
||||
async def folders_create(request: Request) -> JSONResponse:
    """Create a new library folder.

    Reads the ``name`` field from the submitted form and registers a library
    whose id and display name are both that value, with ``/docs/<name>`` as
    its source path.

    Args:
        request: Incoming request carrying the form data.

    Returns:
        JSON ``{"status": "ok", "message": ...}`` on success, a 400 response
        with ``{"error": ...}`` when ``name`` is blank, or a 500 response
        carrying the exception text when anything else fails.
    """
    try:
        # NOTE(review): assumes ``Request`` is the Starlette/FastAPI request,
        # where form data is obtained by awaiting ``request.form()``.
        # ``request.form.get`` would be attribute access on the method itself.
        form = await request.form()
        name = str(form.get("name") or "").strip()
        if not name:
            # A blank name would create a library with an empty id.
            return JSONResponse(status_code=400, content={"error": "Folder name is required"})

        # Imported lazily, as in the original implementation.
        from backend.app.db import upsert_library

        await upsert_library(library_id=name, name=name, description=None, source_path=f"/docs/{name}")
        return JSONResponse(content={"status": "ok", "message": f"Created folder '{name}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
|
||||
|
||||
|
||||
async def folders_delete(request: Request) -> JSONResponse:
    """Delete a library.

    The library to remove is identified by the ``id`` query parameter.

    Args:
        request: Incoming request; ``request.query_params["id"]`` names the
            library to delete.

    Returns:
        JSON ``{"status": "ok", "message": ...}`` on success, a 400 response
        with ``{"error": ...}`` when ``id`` is missing or blank, or a 500
        response carrying the exception text when deletion fails.
    """
    library_id = request.query_params.get("id", "").strip()
    if not library_id:
        # Without this guard an empty id is passed to delete_library and the
        # handler reports a successful deletion of library ''.
        return JSONResponse(status_code=400, content={"error": "Library id is required"})

    try:
        # Imported lazily, as in the original implementation.
        from backend.app.db import delete_library

        await delete_library(library_id)
        return JSONResponse(content={"status": "ok", "message": f"Deleted library '{library_id}'"})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
|
||||
|
||||
|
||||
async def ingest_uploaded(request: Request) -> HTML:
    """Ingest uploaded file content.

    Reads ``content`` (capped at 10000 characters) and ``library_id``
    (default ``"uploaded"``) from the submitted form, posts the content to
    ``/ingest/<library_id>`` through ``api_request`` and renders the outcome.

    Args:
        request: Incoming request carrying the form data.

    Returns:
        An HTML page showing the API result, or an HTML error page when
        reading the form or calling the API raises.
    """
    from html import escape

    try:
        # NOTE(review): assumes ``Request`` is the Starlette/FastAPI request,
        # where form data is obtained by awaiting ``request.form()``.
        form = await request.form()
        content = str(form.get("content") or "")[:10000]
        # Fall back to the default when the field is present but empty, so the
        # request never targets the bare "/ingest/" path.
        library_id = str(form.get("library_id") or "uploaded")

        result = api_request("POST", f"/ingest/{library_id}", data={"content": content})
        # Both values come back from the API and are echoed into markup, so
        # they are HTML-escaped before interpolation.
        safe_msg = escape(str(result.get('message', '') or ''), quote=False)
        safe_json = escape(json.dumps(result, indent=2), quote=False)
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Upload Result</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/upload")}</header>
<main>
<h2>Ingestion Complete!</h2>
<p>{safe_msg}</p>
<pre>{safe_json}</pre>
<a href="/upload">← Upload Another</a>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = escape(str(e), quote=False)
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Upload Ingest Error</h1><pre>{safe_error}</pre><a href="/upload">← Try Again</a></body>
</html>""", media_type="text/html")
|
||||
|
||||
|
||||
def docs(request: Request, library_id: str, topic: Optional[str] = None, tokens: int = 8000) -> HTML:
    """View docs from a library.

    Fetches ``/libraries/<library_id>/docs`` through ``api_request`` and
    renders the returned ``content`` inside a ``<pre>`` block. When the API
    call raises, the exception text is shown in place of the content.

    Args:
        request: Incoming request (not inspected).
        library_id: Library whose docs are requested.
        topic: Optional topic filter forwarded to the API.
        tokens: Token budget forwarded to the API.

    Returns:
        An HTML page with the (escaped) docs content, truncated to 10000
        characters.
    """
    from html import escape

    try:
        data = api_request("GET", f"/libraries/{library_id}/docs", params={"topic": topic, "tokens": tokens})
        # ``or ""`` covers an explicit null in the response.
        content = data.get("content", "") or ""
    except Exception as e:
        content = str(e)

    # Truncate before escaping so an entity such as "&amp;" is never cut in half.
    safe_content = escape(str(content)[:10000], quote=False)
    # Caller-supplied values are escaped before being placed in markup.
    safe_library_id = escape(str(library_id), quote=False)
    safe_topic = escape(str(topic), quote=False) if topic else '(all)'
    safe_tokens = escape(str(tokens), quote=False)
    nav = navbar_html("/docs/{}".format(library_id))
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Library: {safe_library_id}</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Library: {safe_library_id}</h2>
<p><strong>Topic:</strong> {safe_topic} | <strong>Tokens:</strong> {safe_tokens}</p>
<pre class="docs-content">{safe_content}</pre>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
|
||||
|
||||
|
||||
def search_redirect(request: Request) -> JSONResponse:
    """Point the client at the search form.

    Args:
        request: Incoming request (not inspected).

    Returns:
        A JSON body whose ``redirect`` key holds the search form path. No
        HTTP redirect status is set here; the path is returned as data.
    """
    payload = {"redirect": "/search/form"}
    return JSONResponse(content=payload)
|
||||
|
||||
|
||||
def search_form(request: Request) -> HTML:
    """Render the search form page.

    Args:
        request: Incoming request (not inspected).

    Returns:
        An HTML page containing a form that POSTs ``query``, ``library_id``
        and ``limit`` to ``/search``.
    """
    # NOTE(review): the form submits a field named "query" via POST, while
    # search_results in this file reads "q" from the query string — confirm
    # the routing reconciles the two.
    nav = navbar_html("/search")
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Search Docs</h2>
<form method="post" action="/search">
<label for="query">Query:</label>
<input type="text" id="query" name="query" required placeholder="Enter your search query...">
<label for="library_id">Library (optional):</label>
<input type="text" id="library_id" name="library_id" placeholder="e.g., foundryvtt">
<label for="limit">Limit results:</label>
<select id="limit" name="limit">
<option value="5">5</option>
<option value="10" selected>10</option>
<option value="20">20</option>
<option value="50">50</option>
</select>
<button type="submit">Search</button>
</form>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
|
||||
|
||||
|
||||
def search_results(request: Request) -> HTML:
    """Display search results.

    Reads the search text and ``limit`` from the query string, posts them to
    the ``/search`` API endpoint and renders one card per returned result.

    Args:
        request: Incoming request; ``q`` (or ``query``) and ``limit`` are
            read from ``request.query_params``.

    Returns:
        An HTML results page, or an HTML error page when parsing ``limit``
        or calling the API raises.
    """
    from html import escape
    from urllib.parse import quote

    try:
        params = request.query_params
        # The search form in this file names its field "query"; accept both
        # spellings so either kind of link works.
        query = params.get("q", "") or params.get("query", "")
        limit = int(params.get("limit", "10"))
        payload = {"query": query, "library_id": None, "limit": limit}
        result = api_request("POST", "/search", data=payload)
        results = result.get("results", []) or []
    except Exception as e:
        safe_error = escape(str(e), quote=False)
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Error</h1><pre>{safe_error}</pre><a href="/search/form">← Try Again</a></body>
</html>""", media_type="text/html")

    cards = []
    for r in results:
        # ``or ""`` guards against explicit nulls, which would otherwise make
        # the slices below raise outside the try block.
        body = r.get("content") or r.get("chunk") or ""
        # A missing title shows "Untitled"; an empty one falls back to the
        # first 100 characters of the content.
        title = r.get("title", "Untitled") or ((r.get("content") or "")[:100] + "...")
        # Every API-supplied value is escaped for the context it lands in:
        # text nodes, a quoted attribute, and a URL path inside an attribute.
        safe_title = escape(str(title), quote=False)
        safe_body = escape(str(body)[:500], quote=False)
        safe_id = escape(str(r.get('id')))
        safe_href = escape(quote(f"/docs/{r.get('library_id')}", safe="/"))
        cards.append(f"""<div class="result-card" data-id="{safe_id}"><h3>{safe_title}</h3>
<p>{safe_body}...</p><a href="{safe_href}">View Full</a></div>""")

    safe_query = escape(str(query), quote=False)
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Search Results</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/search")}</header>
<main>
<h2>Search Results for "{safe_query}"</h2>
<div class="results-count">{len(results)} results found</div>
{''.join(cards)}
<a href="/search/form">← New Search</a>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
|
||||
|
||||
|
||||
def sync_sources(request: Request) -> HTML:
    """Sync git sources.

    On POST, triggers ``/sources/sync`` through ``api_request`` and renders
    the JSON result (or an error page when the call raises). On any other
    method, renders the sync form together with the ids of the libraries
    reported by ``/libraries``.

    Args:
        request: Incoming request; only ``request.method`` is inspected.

    Returns:
        An HTML page for the sync result, the sync error, or the sync form.
    """
    from html import escape

    if request.method == "POST":
        try:
            data = api_request("POST", "/sources/sync")
            # API output is escaped before being placed inside <pre>.
            safe_json = escape(json.dumps(data, indent=2), quote=False)
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Sync Result</title></head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sync/sources")}</header>
<main><h2>Git Sync Complete!</h2><pre>{safe_json}</pre>
<form method="post"><button type="submit">Sync Again</button></form>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
        except Exception as e:
            safe_error = escape(str(e), quote=False)
            return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Sync Error</h1><pre>{safe_error}</pre><a href="/sources/git">← Try Again</a></body>
</html>""", media_type="text/html")

    try:
        data = api_request("GET", "/libraries")
        # Skip the "error" placeholder and entries without an id; a None id
        # would make the join below raise outside this try block.
        libs = [
            str(entry.get("id"))
            for entry in data.get("libraries", [])
            if entry.get("id") and entry.get("id") != "error"
        ]
    except Exception:
        # Best effort: the form is still usable without the library list.
        libs = []

    lib_list = escape(", ".join(libs), quote=False) if libs else "(none)"
    return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sync</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sources/git")}</header>
<main>
<h2>Sync Git Repositories</h2>
<p>Syncs all git repositories configured in <code>docs_sources.yaml</code>.</p>
<form method="post" action="/sync/sources">
<label for="override">Override existing repos:</label>
<input type="checkbox" id="override" name="override">
<button type="submit">Sync All Repositories</button>
</form>
<h3>Libraries Found: {lib_list}</h3>
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
|
||||
|
||||
|
||||
def git_sources(request: Request) -> HTML:
    """List configured git sources.

    Loads ``docs_sources.yaml`` from three directories above this file and
    renders one card per entry under its ``sources`` key.

    Args:
        request: Incoming request (not inspected).

    Returns:
        An HTML page listing the sources, or an HTML error page when the
        file cannot be read or parsed.
    """
    import yaml
    from html import escape

    config_path = Path(__file__).parent.parent.parent / "docs_sources.yaml"

    try:
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document; treat that as
            # "no sources configured" rather than an error.
            data = yaml.safe_load(f) or {}
        sources = data.get("sources") or []

        source_blocks = []
        for src in sources:
            repo_url = str(src.get("repo_url") or "")
            # Long URLs are shortened to 50 characters plus an ellipsis.
            url = repo_url[:50] + "..." if len(repo_url) > 50 else repo_url
            branch = src.get("branch", "main")
            include = src.get("include_paths", ["*"]) or []
            exclude = src.get("exclude_paths", []) or []
            include_text = ', '.join(str(p) for p in include)
            exclude_text = ' | Exclude: ' + ', '.join(str(p) for p in exclude) if exclude else ''
            # Config values are escaped so markup characters in the YAML
            # cannot break the page.
            source_blocks.append(f"""<div class="source-card">
<strong>{escape(str(src.get('library_id', 'unknown')), quote=False)}</strong><br>
URL: {escape(url, quote=False)}<br>
Branch: {escape(str(branch), quote=False)}<br>
Include: {escape(include_text, quote=False)}{escape(exclude_text, quote=False)}
</div>""")

        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Git Sources</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{navbar_html("/sources/git")}</header>
<main>
<h2>Configured Git Sources ({len(sources)})</h2>
{''.join(source_blocks)}
</main>{footer_html()}</div>
</body></html>""", media_type="text/html")
    except Exception as e:
        safe_error = escape(str(e), quote=False)
        return HTML(f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Error</title></head>
<body><h1>Git Sources Error</h1><pre>{safe_error}</pre></body>
</html>""", media_type="text/html")
|
||||
|
||||
|
||||
def logs(request: Request) -> HTML:
    """Render the logs/status page.

    Args:
        request: Incoming request (not inspected).

    Returns:
        An HTML page showing the configured ``DOCS_API_URL`` and fixed
        status text.
    """
    # NOTE(review): the Qdrant/MCP status line below is literal text, not the
    # result of a health check performed in this function.
    nav = navbar_html("/logs")
    footer = footer_html()
    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Context7 Docs - Logs</title>
<link rel="stylesheet" href="/static/css/main.css">
</head>
<body>
<div class="container">
<header><h1>Context7 Docs UI</h1>{nav}</header>
<main>
<h2>Status Messages</h2>
<div class="status-message">Docs API: {DOCS_API_URL}</div>
<div class="status-message">Qdrant Health: healthy | MCP OK: yes</div>
<p class="hint">Logs are printed to container stdout/stderr. For full logs, inspect Docker containers directly.</p>
</main>{footer}</div>
</body></html>"""
    return HTML(page, media_type="text/html")
|
||||
|
||||
|
||||
# Public route handlers exported by this module.
# NOTE(review): __all__ only controls what `from <module> import *` exposes;
# confirm where the handlers are actually bound to URL paths.
__all__ = [
    "health", "libraries", "upload", "ingest_all", "ingest_library",
    "ingest_uploaded", "folders_create", "folders_delete", "docs",
    "search_redirect", "search_form", "search_results", "sync_sources",
    "git_sources", "logs",
]
|
||||
Reference in New Issue
Block a user