Files
DocsMCP/tests/test_chunking.py
2026-06-05 23:02:55 +01:00

239 lines
9.1 KiB
Python

"""
Tests for backend/app/chunking.py
These are pure unit tests that don't require any external dependencies.
They test text chunking logic, token estimation, and heading-aware splitting.
"""
import pytest
class TestEstimateTokens:
    """Tests for the estimate_tokens() function.

    The expected values below are consistent with an estimate of four
    characters per token, rounded down.
    """

    def test_empty_text(self):
        """Empty text should return 0 tokens."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("") == 0

    def test_single_char(self):
        """A single character rounds down to 0 tokens."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("a") == 0  # 1 char // 4 = 0 tokens

    def test_4_chars(self):
        """4 characters = 1 token."""
        from backend.app.chunking import estimate_tokens

        assert estimate_tokens("abcd") == 1

    def test_400_chars(self):
        """400 characters = 100 tokens."""
        from backend.app.chunking import estimate_tokens

        text = "a" * 400
        assert estimate_tokens(text) == 100

    def test_whitespace_only(self):
        """Whitespace-only text shorter than 4 characters yields 0 tokens."""
        from backend.app.chunking import estimate_tokens

        # The length is spelled out explicitly so the input cannot silently
        # shrink if runs of spaces get collapsed by an editor or renderer.
        # NOTE(review): a 0-token result alone does not prove whitespace is
        # counted; a 4-space input would be needed to show that.
        assert estimate_tokens(" " * 3) == 0  # 3 chars // 4 = 0
class TestChunkText:
    """Tests for the chunk_text() function.

    Token counts quoted in the comments assume the 4-characters-per-token
    estimate that the estimate_tokens() tests pin down.
    """

    def test_empty_input(self):
        """Empty input should return empty list."""
        from backend.app.chunking import chunk_text

        assert chunk_text("") == []

    def test_small_text_single_chunk(self):
        """Small text under limit should be single chunk."""
        from backend.app.chunking import chunk_text

        small = "This is a very short text that should be returned as a single chunk."
        chunks = chunk_text(small, max_tokens=500)
        assert len(chunks) == 1
        assert chunks[0] == small

    def test_exact_token_limit(self):
        """Text exactly at limit should be one chunk."""
        from backend.app.chunking import chunk_text, estimate_tokens

        # Create text that is exactly 500 tokens (2000 chars)
        text = "a" * 2000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) == 1
        assert estimate_tokens(chunks[0]) == 500

    def test_over_limit_splits(self):
        """Text over limit should be split into multiple chunks."""
        from backend.app.chunking import chunk_text

        # Create text that is 2500 tokens (10000 chars)
        text = "b" * 10000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) >= 2  # Should be split

    def test_preserves_content(self):
        """All content should be preserved in chunks (combined)."""
        from backend.app.chunking import chunk_text

        original = "Hello world! This is a test of chunking functionality."
        chunks = chunk_text(original, max_tokens=100)
        combined = "".join(chunks)
        assert len(chunks) == 1
        assert combined == original

    def test_headings_split(self):
        """With a small limit, at least one chunk should start with a heading."""
        from backend.app.chunking import chunk_text

        markdown_with_headings = (
            "# Introduction\n"
            "This is the introduction section.\n"
            "## Background\n"
            "Background information goes here."
        )
        # With very small token limit, headings should cause splits
        chunks = chunk_text(markdown_with_headings, max_tokens=20)
        heading_chunks = [c for c in chunks if c.strip().startswith('#')]
        assert len(heading_chunks) >= 1  # At least some heading preserved

    def test_paragraph_split(self):
        """Paragraph splitting should respect paragraph boundaries."""
        from backend.app.chunking import chunk_text

        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        # Each paragraph is 16-17 chars (4 tokens) and any two joined are
        # about 8 tokens, so a limit of 7 fits one paragraph but never two.
        # The whole text is only ~13 tokens, so a larger limit (e.g. 15)
        # would not force any split at all.
        chunks = chunk_text(text, max_tokens=7)
        assert len(chunks) >= 3  # At least as many paragraphs

    def test_no_empty_chunks(self):
        """Should not return empty chunks."""
        from backend.app.chunking import chunk_text

        text = "Hello world"
        chunks = chunk_text(text, max_tokens=10)
        for chunk in chunks:
            assert chunk.strip() != ""
class TestTokenEstimationBoundaries:
    """Boundary checks for estimate_tokens()."""

    def test_boundary_precision(self):
        """Inputs of 4 to 7 characters estimate to 1 token; 8 characters to 2."""
        from backend.app.chunking import estimate_tokens

        expected_by_sample = (
            ("abcd", 1),      # exactly 4 chars
            ("abcde", 1),     # 5 chars still 1 token
            ("abcdef", 1),    # 6 chars still 1 token
            ("abcdefg", 1),   # 7 chars still 1 token
            ("abcdefgh", 2),  # 8 chars = 2 tokens
        )
        for sample, expected in expected_by_sample:
            assert estimate_tokens(sample) == expected

    def test_various_languages_chars(self):
        """Non-ASCII input is estimated per character, like ASCII input."""
        from backend.app.chunking import estimate_tokens

        # Four Chinese characters are expected to count as four characters.
        cjk_sample = "你好世界"
        assert estimate_tokens(cjk_sample) == 1

        # The exact count for emoji is left open; only the return type is
        # checked here.
        emoji_sample = "Hello 🎉 world"
        emoji_estimate = estimate_tokens(emoji_sample)
        assert isinstance(emoji_estimate, int)
class TestChunkOverlapBehavior:
    """Checks on how much text neighbouring chunks may share."""

    def test_overlap_not_exceeded(self):
        """The first chunk should be at most half of all chunk text combined."""
        from backend.app.chunking import chunk_text

        sentence = "The quick brown fox jumps over the lazy dog. "
        pieces = chunk_text(sentence * 10, max_tokens=30, overlap_tokens=5)

        # Nothing to compare unless the text was actually split.
        if len(pieces) <= 1:
            return

        # Rough check: heavy duplication would inflate the first chunk
        # relative to the combined length of every chunk.
        combined_length = len("".join(pieces))
        assert len(pieces[0]) <= combined_length // 2
class TestChunkEdgeCases:
    """Tests for edge cases and error conditions."""

    def test_whitespace_only_text(self):
        """Whitespace-only text should be handled without crashing."""
        from backend.app.chunking import chunk_text

        chunks = chunk_text(" \n\n ", max_tokens=100)
        # May return empty or whitespace chunk, shouldn't crash
        assert isinstance(chunks, list)

    def test_very_long_paragraph(self):
        """Long paragraph without breaks should be split."""
        from backend.app.chunking import chunk_text

        long_para = "The quick brown fox jumps over the lazy dog. " * 100
        chunks = chunk_text(long_para, max_tokens=50)
        assert len(chunks) > 1  # Should be split

    def test_none_input_raises(self):
        """None input should raise TypeError or AssertionError."""
        from backend.app.chunking import chunk_text

        # Returning an empty list for None would fail this test: only an
        # exception of one of these two types is accepted.
        with pytest.raises((TypeError, AssertionError)):
            chunk_text(None, max_tokens=100)

    def test_unicode_text(self):
        """Unicode text should be handled."""
        from backend.app.chunking import chunk_text

        unicode_text = "Hello 世界 مرحبا 🎉"
        chunks = chunk_text(unicode_text, max_tokens=50)
        assert len(chunks) == 1  # Small enough to be single chunk
# =============================================================================
# SAMPLE TEXT FIXTURE
# =============================================================================
@pytest.fixture
def heading_markdown():
    """Return a small markdown document with H1, H2 and H3 headings."""
    document_lines = [
        "# Introduction",
        "This is the introduction section. It contains some introductory text here.",
        "## Background",
        "Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context.",
        "### Details",
        "Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation.",
        "## Conclusion",
        "The conclusion wraps up everything nicely.",
    ]
    # Lines are joined with single newlines and there is no trailing newline.
    return "\n".join(document_lines)
class TestHeadingPreservation:
    """Heading-aware chunking checks driven by the heading_markdown fixture."""

    def test_headings_in_separate_chunks(self, heading_markdown):
        """With a small limit, at least one chunk should begin with a heading."""
        from backend.app.chunking import chunk_text

        # Very small token limit forces splits at headings
        pieces = chunk_text(heading_markdown, max_tokens=30)
        heading_count = sum(
            1 for piece in pieces if piece.strip().startswith('#')
        )
        assert heading_count >= 1

    def test_all_content_present(self, heading_markdown):
        """Key section titles should still be present once chunks are rejoined."""
        from backend.app.chunking import chunk_text

        rejoined = "".join(chunk_text(heading_markdown, max_tokens=500))
        # Content shouldn't be truncated or corrupted
        for section_title in ("Introduction", "Background", "Conclusion"):
            assert section_title in rejoined