""" Tests for backend/app/chunking.py These are pure unit tests that don't require any external dependencies. They test text chunking logic, token estimation, and heading-aware splitting. """ import pytest class TestEstimateTokens: """Tests for the estimate_tokens() function.""" def test_empty_text(self): """Empty text should return 0 tokens.""" from backend.app.chunking import estimate_tokens assert estimate_tokens("") == 0 def test_single_char(self): """Single character = 1 token (using 4 chars per token approximation).""" from backend.app.chunking import estimate_tokens assert estimate_tokens("a") == 0 # 1 char // 4 = 0 tokens def test_4_chars(self): """4 characters = 1 token.""" from backend.app.chunking import estimate_tokens assert estimate_tokens("abcd") == 1 def test_400_chars(self): """400 characters = 100 tokens.""" from backend.app.chunking import estimate_tokens text = "a" * 400 assert estimate_tokens(text) == 100 def test_whitespace_only(self): """Whitespace-only text should be counted.""" from backend.app.chunking import estimate_tokens assert estimate_tokens(" ") == 0 # 3 chars // 4 = 0 class TestChunkText: """Tests for the chunk_text() function.""" def test_empty_input(self, sample_text): """Empty input should return empty list.""" from backend.app.chunking import chunk_text assert chunk_text("") == [] def test_small_text_single_chunk(self, sample_text): """Small text under limit should be single chunk.""" from backend.app.chunking import chunk_text small = "This is a very short text that should be returned as a single chunk." chunks = chunk_text(small, max_tokens=500) assert len(chunks) == 1 assert chunks[0] == small def test_exact_token_limit(self, sample_text): """Text exactly at limit should be one chunk.""" from backend.app.chunking import chunk_text, estimate_tokens # Create text that is exactly 500 tokens (2000 chars) text = "a" * 2000 chunks = chunk_text(text, max_tokens=500) assert len(chunks) == 1 assert estimate_tokens(chunks[0]) == 500 def test_over_limit_splits(self, sample_text): """Text over limit should be split into multiple chunks.""" from backend.app.chunking import chunk_text, estimate_tokens # Create text that is 2500 tokens (10000 chars) text = "b" * 10000 chunks = chunk_text(text, max_tokens=500) assert len(chunks) >= 2 # Should be split def test_preserves_content(self, sample_text): """All content should be preserved in chunks (combined).""" from backend.app.chunking import chunk_text original = "Hello world! This is a test of chunking functionality." chunks = chunk_text(original, max_tokens=100) combined = "".join(chunks) assert len(chunks) == 1 assert combined == original def test_headings_split(self, sample_text): """Heading-aware splitting should preserve heading boundaries.""" from backend.app.chunking import chunk_text markdown_with_headings = """# Introduction This is the introduction section. ## Background Background information goes here.""" # With very small token limit, headings should cause splits chunks = chunk_text(markdown_with_headings, max_tokens=20) heading_chunks = [c for c in chunks if c.strip().startswith('#')] assert len(heading_chunks) >= 1 # At least some heading preserved def test_paragraph_split(self): """Paragraph splitting should respect paragraph boundaries.""" from backend.app.chunking import chunk_text text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph." chunks = chunk_text(text, max_tokens=15) # Small limit forces splits assert len(chunks) >= 3 # At least as many paragraphs def test_no_empty_chunks(self): """Should not return empty chunks.""" from backend.app.chunking import chunk_text text = "Hello world" chunks = chunk_text(text, max_tokens=10) for chunk in chunks: assert chunk.strip() != "" class TestTokenEstimationBoundaries: """Tests for token estimation boundaries.""" def test_boundary_precision(self): """Test boundary conditions around the 4-char-per-token limit.""" from backend.app.chunking import estimate_tokens # Edge cases around boundary assert estimate_tokens("abcd") == 1 # exactly 4 chars assert estimate_tokens("abcde") == 1 # 5 chars still 1 token assert estimate_tokens("abcdef") == 1 # 6 chars still 1 token assert estimate_tokens("abcdefg") == 1 # 7 chars still 1 token assert estimate_tokens("abcdefgh") == 2 # 8 chars = 2 tokens def test_various_languages_chars(self): """Token estimation uses character count, not unicode complexity.""" from backend.app.chunking import estimate_tokens # Chinese characters (each counts as 1 char) chinese = "你好世界" # 4 characters assert estimate_tokens(chinese) == 1 # Emoji emoji = "Hello 🎉 world" # Spaces + letters + emoji # emoji count varies by implementation, just check it's counted assert isinstance(estimate_tokens(emoji), int) class TestChunkOverlapBehavior: """Tests for overlap handling between chunks.""" def test_overlap_not_exceeded(self): """Chunks should not have excessive overlap.""" from backend.app.chunking import chunk_text # Text that will be split at a known boundary text = "The quick brown fox jumps over the lazy dog. " * 10 chunks = chunk_text(text, max_tokens=30, overlap_tokens=5) if len(chunks) > 1: # Last few chars of first chunk shouldn't duplicate excessively assert len(chunks[0]) <= len("".join(chunks)) // 2 # Rough check class TestChunkEdgeCases: """Tests for edge cases and error conditions.""" def test_whitespace_only_text(self): """Whitespace-only text should handle gracefully.""" from backend.app.chunking import chunk_text chunks = chunk_text(" \n\n ", max_tokens=100) # May return empty or whitespace chunk, shouldn't crash assert isinstance(chunks, list) def test_very_long_paragraph(self): """Long paragraph without breaks should be split.""" from backend.app.chunking import chunk_text long_para = "The quick brown fox jumps over the lazy dog. " * 100 chunks = chunk_text(long_para, max_tokens=50) assert len(chunks) > 1 # Should be split def test_none_input_raises(self): """None input should be handled (return empty or raise).""" from backend.app.chunking import chunk_text with pytest.raises((TypeError, AssertionError)): chunk_text(None, max_tokens=100) def test_unicode_text(self): """Unicode text should be handled.""" from backend.app.chunking import chunk_text unicode_text = "Hello 世界 مرحبا 🎉" chunks = chunk_text(unicode_text, max_tokens=50) assert len(chunks) == 1 # Small enough to be single chunk # ============================================================================= # SAMPLE TEXT FIXTURE # ============================================================================= @pytest.fixture def heading_markdown(): """Sample markdown with headings for chunking tests.""" return """# Introduction This is the introduction section. It contains some introductory text here. ## Background Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context. ### Details Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation. ## Conclusion The conclusion wraps up everything nicely.""" class TestHeadingPreservation: """Tests for heading-aware chunking with sample text.""" def test_headings_in_separate_chunks(self, heading_markdown): """Headings should appear in their own chunks when possible.""" from backend.app.chunking import chunk_text # Very small token limit forces splits at headings chunks = chunk_text(heading_markdown, max_tokens=30) heading_sections = [c for c in chunks if c.strip().startswith('#')] assert len(heading_sections) >= 1 def test_all_content_present(self, heading_markdown): """All content should be preserved when combined.""" from backend.app.chunking import chunk_text original = heading_markdown chunks = chunk_text(original, max_tokens=500) combined = "".join(chunks) # Content shouldn't be truncated or corrupted assert "Introduction" in combined assert "Background" in combined assert "Conclusion" in combined