Initial DocsMCP stack

2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
@@ -0,0 +1,238 @@
+"""
+Tests for backend/app/chunking.py
+
+These are pure unit tests that don't require any external dependencies.
+They test text chunking logic, token estimation, and heading-aware splitting.
+"""
+import pytest
+
+
+class TestEstimateTokens:
+    """Checks estimate_tokens() against the 4-characters-per-token rule."""
+
+    def test_empty_text(self):
+        """An empty string is expected to yield 0 tokens."""
+        from backend.app.chunking import estimate_tokens
+        assert estimate_tokens("") == 0
+
+    def test_single_char(self):
+        """One character is expected to round down to 0 tokens (1 // 4 == 0)."""
+        from backend.app.chunking import estimate_tokens
+        assert estimate_tokens("a") == 0
+
+    def test_4_chars(self):
+        """Four characters are expected to make exactly 1 token."""
+        from backend.app.chunking import estimate_tokens
+        assert estimate_tokens("abcd") == 1
+
+    def test_400_chars(self):
+        """Four hundred characters are expected to make 100 tokens."""
+        from backend.app.chunking import estimate_tokens
+        token_count = estimate_tokens("a" * 400)
+        assert token_count == 100
+
+    def test_whitespace_only(self):
+        """Three spaces are expected to round down to 0 tokens (3 // 4 == 0)."""
+        from backend.app.chunking import estimate_tokens
+        assert estimate_tokens(" " * 3) == 0
+
+
+class TestChunkText:
+    """Tests for chunk_text(): size limits, content preservation and split points."""
+
+    def test_empty_input(self):
+        """Empty input should return an empty list."""
+        from backend.app.chunking import chunk_text
+        assert chunk_text("") == []
+
+    def test_small_text_single_chunk(self):
+        """Text well under the limit should come back as one unchanged chunk."""
+        from backend.app.chunking import chunk_text
+        small = "This is a very short text that should be returned as a single chunk."
+        chunks = chunk_text(small, max_tokens=500)
+        assert len(chunks) == 1
+        assert chunks[0] == small
+
+    def test_exact_token_limit(self):
+        """Text exactly at the limit should stay in one chunk."""
+        from backend.app.chunking import chunk_text, estimate_tokens
+        # 2000 chars is exactly 500 tokens under the 4-chars-per-token estimate
+        text = "a" * 2000
+        chunks = chunk_text(text, max_tokens=500)
+        assert len(chunks) == 1
+        assert estimate_tokens(chunks[0]) == 500
+
+    def test_over_limit_splits(self):
+        """Text over the limit should be split into multiple chunks."""
+        from backend.app.chunking import chunk_text
+        # 10000 chars is 2500 estimated tokens, five times the limit
+        text = "b" * 10000
+        chunks = chunk_text(text, max_tokens=500)
+        assert len(chunks) >= 2  # Should be split
+
+    def test_preserves_content(self):
+        """A short text should survive chunking unchanged when re-joined."""
+        from backend.app.chunking import chunk_text
+        original = "Hello world! This is a test of chunking functionality."
+        chunks = chunk_text(original, max_tokens=100)
+        combined = "".join(chunks)
+        assert len(chunks) == 1
+        assert combined == original
+
+    def test_headings_split(self):
+        """With headings present, at least one chunk should start with a heading."""
+        from backend.app.chunking import chunk_text
+        markdown_with_headings = """# Introduction
+
+This is the introduction section.
+
+## Background
+
+Background information goes here."""
+
+        # 99 chars (~24 estimated tokens) exceed the 20-token limit, so a split is required
+        chunks = chunk_text(markdown_with_headings, max_tokens=20)
+        heading_chunks = [c for c in chunks if c.strip().startswith('#')]
+        assert len(heading_chunks) >= 1  # At least some heading preserved
+
+    def test_paragraph_split(self):
+        """Each paragraph should land in its own chunk when two cannot share one."""
+        from backend.app.chunking import chunk_text
+        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
+        chunks = chunk_text(text, max_tokens=5)  # One paragraph (~4 tokens) fits, two (~8) do not
+        assert len(chunks) >= 3  # At least as many chunks as paragraphs
+
+    def test_no_empty_chunks(self):
+        """No returned chunk should be empty or whitespace-only."""
+        from backend.app.chunking import chunk_text
+        text = "Hello world"
+        chunks = chunk_text(text, max_tokens=10)
+        for chunk in chunks:
+            assert chunk.strip() != ""
+
+
+class TestTokenEstimationBoundaries:
+    """Boundary checks for the token estimate around multiples of four characters."""
+
+    def test_boundary_precision(self):
+        """Lengths from 4 to 8 characters map to the expected token counts."""
+        from backend.app.chunking import estimate_tokens
+
+        expectations = {
+            "abcd": 1, "abcde": 1, "abcdef": 1, "abcdefg": 1,  # 4-7 chars stay at one token
+            "abcdefgh": 2,  # 8 chars is the first length expected to give two tokens
+        }
+        for sample, expected in expectations.items():
+            assert estimate_tokens(sample) == expected
+
+    def test_various_languages_chars(self):
+        """Non-ASCII text is expected to be estimated per character."""
+        from backend.app.chunking import estimate_tokens
+
+        # Four CJK characters are expected to count as four chars, i.e. one token.
+        four_cjk_chars = "你好世界"
+        assert estimate_tokens(four_cjk_chars) == 1
+
+        # No exact count is pinned for text with an emoji; only the return type is checked.
+        with_emoji = "Hello 🎉 world"
+        estimated = estimate_tokens(with_emoji)
+        assert isinstance(estimated, int)
+
+
+class TestChunkOverlapBehavior:
+    """Tests for overlap handling between chunks."""
+
+    def test_overlap_not_exceeded(self):
+        """A split text should not concentrate most of its content in the first chunk."""
+        from backend.app.chunking import chunk_text
+
+        # 450 chars (~112 estimated tokens) against a 30-token limit must be split
+        text = "The quick brown fox jumps over the lazy dog. " * 10
+        chunks = chunk_text(text, max_tokens=30, overlap_tokens=5)
+
+        assert len(chunks) > 1  # Fail loudly rather than silently skipping the check below
+        # Rough proxy only: the first chunk may hold at most half of all emitted text
+        assert len(chunks[0]) <= len("".join(chunks)) // 2
+
+
+class TestChunkEdgeCases:
+    """Edge cases for chunk_text(): blank, oversized, None and mixed-script input."""
+
+    def test_whitespace_only_text(self):
+        """Whitespace-only input must not crash and must return a list."""
+        from backend.app.chunking import chunk_text
+        result = chunk_text("   \n\n   ", max_tokens=100)
+        # The contents are deliberately not pinned: an empty list or a blank chunk both pass
+        assert isinstance(result, list)
+
+    def test_very_long_paragraph(self):
+        """A long run of sentences without paragraph breaks is expected to be split."""
+        from backend.app.chunking import chunk_text
+        sentence = "The quick brown fox jumps over the lazy dog. "
+        pieces = chunk_text(sentence * 100, max_tokens=50)
+        assert len(pieces) > 1  # Should be split
+
+    def test_none_input_raises(self):
+        """Passing None is expected to raise TypeError or AssertionError."""
+        from backend.app.chunking import chunk_text
+        expected_errors = (TypeError, AssertionError)
+        with pytest.raises(expected_errors):
+            chunk_text(None, max_tokens=100)
+
+    def test_unicode_text(self):
+        """A short mixed-script string is expected to stay in a single chunk."""
+        from backend.app.chunking import chunk_text
+        mixed_scripts = "Hello 世界 مرحبا 🎉"
+        pieces = chunk_text(mixed_scripts, max_tokens=50)
+        assert len(pieces) == 1  # Small enough to be single chunk
+
+
+# =============================================================================
+# HEADING MARKDOWN FIXTURE
+# =============================================================================
+
+@pytest.fixture
+def heading_markdown():
+    """Provide a markdown document with four headings for the chunking tests.
+
+    Each heading is followed by one paragraph of body text.
+    """
+    blocks = (
+        "# Introduction",
+        "This is the introduction section. It contains some introductory text here.",
+        "## Background",
+        "Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context.",
+        "### Details",
+        "Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation.",
+        "## Conclusion",
+        "The conclusion wraps up everything nicely.",
+    )
+    # A blank line separates every heading and paragraph; there is no trailing newline.
+    return "\n\n".join(blocks)
+
+
+class TestHeadingPreservation:
+    """Heading-aware chunking checks driven by the heading_markdown fixture."""
+
+    def test_headings_in_separate_chunks(self, heading_markdown):
+        """At least one chunk is expected to begin with a markdown heading."""
+        from backend.app.chunking import chunk_text
+
+        # The 30-token limit is far below the fixture's size, so the text has to split
+        pieces = chunk_text(heading_markdown, max_tokens=30)
+
+        starts_with_heading = [p for p in pieces if p.strip().startswith('#')]
+        assert len(starts_with_heading) >= 1
+
+    def test_all_content_present(self, heading_markdown):
+        """Key heading words are expected to still be present after chunking."""
+        from backend.app.chunking import chunk_text
+
+        expected_words = ("Introduction", "Background", "Conclusion")
+        pieces = chunk_text(heading_markdown, max_tokens=500)
+        rejoined = "".join(pieces)
+
+        # Spot check for truncation or corruption: only these three words are
+        # looked up in the re-joined chunks, not the complete original text.
+        for word in expected_words:
+            assert word in rejoined