Initial DocsMCP stack
This commit is contained in:
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Tests for backend/app/chunking.py
|
||||
|
||||
These are pure unit tests that don't require any external dependencies.
|
||||
They test text chunking logic, token estimation, and heading-aware splitting.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
|
||||
class TestEstimateTokens:
    """Tests for the estimate_tokens() function.

    The expected values follow the 4-characters-per-token approximation
    with floor division, i.e. ``len(text) // 4``.
    """

    def test_empty_text(self):
        """Empty text should return 0 tokens."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("") == 0

    def test_single_char(self):
        """A single character rounds down to 0 tokens (1 // 4 == 0)."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("a") == 0  # 1 char // 4 = 0 tokens

    def test_4_chars(self):
        """4 characters = 1 token."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("abcd") == 1

    def test_400_chars(self):
        """400 characters = 100 tokens."""
        from backend.app.chunking import estimate_tokens

        text = "a" * 400
        assert estimate_tokens(text) == 100

    def test_whitespace_only(self):
        """Whitespace-only text shorter than 4 chars yields 0 tokens."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("   ") == 0  # 3 chars // 4 = 0
|
||||
|
||||
|
||||
class TestChunkText:
    """Tests for the chunk_text() function.

    None of these tests need a fixture: each one builds its own input, so
    the test methods take no arguments besides ``self``.
    """

    def test_empty_input(self):
        """Empty input should return empty list."""
        from backend.app.chunking import chunk_text

        assert chunk_text("") == []

    def test_small_text_single_chunk(self):
        """Small text under limit should be single chunk."""
        from backend.app.chunking import chunk_text

        small = "This is a very short text that should be returned as a single chunk."
        chunks = chunk_text(small, max_tokens=500)
        assert len(chunks) == 1
        assert chunks[0] == small

    def test_exact_token_limit(self):
        """Text exactly at limit should be one chunk."""
        from backend.app.chunking import chunk_text, estimate_tokens

        # Create text that is exactly 500 tokens (2000 chars)
        text = "a" * 2000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) == 1
        assert estimate_tokens(chunks[0]) == 500

    def test_over_limit_splits(self):
        """Text over limit should be split into multiple chunks."""
        from backend.app.chunking import chunk_text

        # Create text that is 2500 tokens (10000 chars)
        text = "b" * 10000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) >= 2  # Should be split

    def test_preserves_content(self):
        """All content should be preserved in chunks (combined)."""
        from backend.app.chunking import chunk_text

        original = "Hello world! This is a test of chunking functionality."
        chunks = chunk_text(original, max_tokens=100)
        combined = "".join(chunks)
        assert len(chunks) == 1
        assert combined == original

    def test_headings_split(self):
        """Heading-aware splitting should preserve heading boundaries."""
        from backend.app.chunking import chunk_text

        markdown_with_headings = """# Introduction

This is the introduction section.

## Background

Background information goes here."""

        # With very small token limit, headings should cause splits
        chunks = chunk_text(markdown_with_headings, max_tokens=20)
        heading_chunks = [c for c in chunks if c.strip().startswith('#')]
        assert len(heading_chunks) >= 1  # At least some heading preserved

    def test_paragraph_split(self):
        """Paragraph splitting should respect paragraph boundaries."""
        from backend.app.chunking import chunk_text

        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        # The whole text is 53 chars (~13 tokens at 4 chars/token), so it
        # only exceeds the limit when max_tokens is small. Each paragraph
        # is ~4 tokens and two joined paragraphs are ~8, so a 5-token limit
        # fits one paragraph per chunk but never two.
        # NOTE(review): assumes chunk_text measures the limit the same way
        # estimate_tokens does -- confirm against the implementation.
        chunks = chunk_text(text, max_tokens=5)
        assert len(chunks) >= 3  # At least as many paragraphs

    def test_no_empty_chunks(self):
        """Should not return empty chunks."""
        from backend.app.chunking import chunk_text

        text = "Hello world"
        chunks = chunk_text(text, max_tokens=10)
        for chunk in chunks:
            assert chunk.strip() != ""
|
||||
|
||||
|
||||
class TestTokenEstimationBoundaries:
    """Boundary checks for the 4-characters-per-token estimate."""

    def test_boundary_precision(self):
        """Counts step up only at multiples of four characters."""
        from backend.app.chunking import estimate_tokens

        # (input, expected token count) pairs around the 4- and 8-char edges
        expected_by_text = (
            ("abcd", 1),      # exactly 4 chars
            ("abcde", 1),     # 5 chars still 1 token
            ("abcdef", 1),    # 6 chars still 1 token
            ("abcdefg", 1),   # 7 chars still 1 token
            ("abcdefgh", 2),  # 8 chars = 2 tokens
        )
        for sample, expected in expected_by_text:
            assert estimate_tokens(sample) == expected

    def test_various_languages_chars(self):
        """Estimation is driven by character count, not unicode complexity."""
        from backend.app.chunking import estimate_tokens

        # Four Chinese characters are expected to count as one token
        assert estimate_tokens("你好世界") == 1

        # The emoji count varies by implementation; only the type is checked
        emoji_estimate = estimate_tokens("Hello 🎉 world")
        assert isinstance(emoji_estimate, int)
|
||||
|
||||
|
||||
class TestChunkOverlapBehavior:
    """Tests for overlap handling between chunks."""

    def test_overlap_not_exceeded(self):
        """Chunks should not have excessive overlap."""
        from backend.app.chunking import chunk_text

        # Text that will be split at a known boundary
        text = "The quick brown fox jumps over the lazy dog. " * 10
        chunks = chunk_text(text, max_tokens=30, overlap_tokens=5)

        # 450 chars (~112 tokens) is far above the 30-token limit, so a
        # split is required. Assert it explicitly: guarding the check with
        # an `if` would let the test pass without verifying anything.
        assert len(chunks) > 1

        # Rough check: the first chunk must not dominate the combined
        # output, which it would if overlap duplicated most of the text.
        assert len(chunks[0]) <= len("".join(chunks)) // 2
|
||||
|
||||
|
||||
class TestChunkEdgeCases:
    """Tests for edge cases and error conditions."""

    def test_whitespace_only_text(self):
        """Whitespace-only text should handle gracefully."""
        from backend.app.chunking import chunk_text

        chunks = chunk_text(" \n\n ", max_tokens=100)
        # May return empty or whitespace chunk, shouldn't crash
        assert isinstance(chunks, list)

    def test_very_long_paragraph(self):
        """Long paragraph without breaks should be split."""
        from backend.app.chunking import chunk_text

        long_para = "The quick brown fox jumps over the lazy dog. " * 100
        chunks = chunk_text(long_para, max_tokens=50)
        assert len(chunks) > 1  # Should be split

    def test_none_input_raises(self):
        """None input should raise TypeError or AssertionError."""
        from backend.app.chunking import chunk_text

        # Returning a value for None (e.g. an empty list) fails this test;
        # only one of the two listed exception types is accepted.
        with pytest.raises((TypeError, AssertionError)):
            chunk_text(None, max_tokens=100)

    def test_unicode_text(self):
        """Unicode text should be handled."""
        from backend.app.chunking import chunk_text

        unicode_text = "Hello 世界 مرحبا 🎉"
        chunks = chunk_text(unicode_text, max_tokens=50)
        assert len(chunks) == 1  # Small enough to be single chunk
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SAMPLE MARKDOWN FIXTURE
|
||||
# =============================================================================
|
||||
|
||||
@pytest.fixture
def heading_markdown():
    """Return a sample Markdown document with headings for chunking tests."""
    # Headings and body paragraphs alternate, separated by one blank line.
    sections = (
        "# Introduction",
        "This is the introduction section. It contains some introductory text here.",
        "## Background",
        "Background information goes here to make this longer and test chunking. "
        "This paragraph has more content about the background topic. "
        "It provides context.",
        "### Details",
        "Specific details about the background are provided in this subsection. "
        "More details follow here to ensure we have enough text to properly test "
        "heading preservation.",
        "## Conclusion",
        "The conclusion wraps up everything nicely.",
    )
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
class TestHeadingPreservation:
    """Heading-aware chunking checks driven by the heading_markdown fixture."""

    def test_headings_in_separate_chunks(self, heading_markdown):
        """At least one chunk should begin with a Markdown heading."""
        from backend.app.chunking import chunk_text

        # Very small token limit forces splits at headings
        pieces = chunk_text(heading_markdown, max_tokens=30)

        assert any(piece.strip().startswith('#') for piece in pieces)

    def test_all_content_present(self, heading_markdown):
        """Every section title should survive in the re-joined chunks."""
        from backend.app.chunking import chunk_text

        rejoined = "".join(chunk_text(heading_markdown, max_tokens=500))

        # Content shouldn't be truncated or corrupted
        for title in ("Introduction", "Background", "Conclusion"):
            assert title in rejoined
|
||||
Reference in New Issue
Block a user