# DocsMCP/tests/test_chunking.py

"""
Tests for backend/app/chunking.py

These are pure unit tests that don't require any external dependencies.
They test text chunking logic, token estimation, and heading-aware splitting.
"""
import pytest


class TestEstimateTokens:
    """Check estimate_tokens() against short, hand-countable inputs.

    The expected values below correspond to the character count divided
    by 4, rounded down.
    """

    def test_empty_text(self):
        """An empty string is estimated at 0 tokens."""
        from backend.app.chunking import estimate_tokens

        estimate = estimate_tokens("")
        assert estimate == 0

    def test_single_char(self):
        """A single character rounds down to 0 tokens."""
        from backend.app.chunking import estimate_tokens

        estimate = estimate_tokens("a")
        assert estimate == 0  # 1 char // 4 = 0 tokens

    def test_4_chars(self):
        """Exactly four characters make 1 token."""
        from backend.app.chunking import estimate_tokens

        estimate = estimate_tokens("abcd")
        assert estimate == 1

    def test_400_chars(self):
        """Four hundred characters make 100 tokens."""
        from backend.app.chunking import estimate_tokens

        estimate = estimate_tokens("a" * 400)
        assert estimate == 100

    def test_whitespace_only(self):
        """Three spaces are measured like any other characters: 0 tokens."""
        from backend.app.chunking import estimate_tokens

        estimate = estimate_tokens("   ")
        assert estimate == 0  # 3 chars // 4 = 0


class TestChunkText:
    """Tests for the chunk_text() function.

    None of these tests request a fixture: every input is built inline, so
    the class does not depend on fixtures defined outside this module.
    """

    def test_empty_input(self):
        """Empty input should return an empty list."""
        from backend.app.chunking import chunk_text
        assert chunk_text("") == []

    def test_small_text_single_chunk(self):
        """Text well under the limit should come back as one unchanged chunk."""
        from backend.app.chunking import chunk_text
        small = "This is a very short text that should be returned as a single chunk."
        chunks = chunk_text(small, max_tokens=500)
        assert len(chunks) == 1
        assert chunks[0] == small

    def test_exact_token_limit(self):
        """Text exactly at the limit should be one chunk."""
        from backend.app.chunking import chunk_text, estimate_tokens
        # 2000 chars is exactly 500 tokens at 4 chars per token.
        text = "a" * 2000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) == 1
        assert estimate_tokens(chunks[0]) == 500

    def test_over_limit_splits(self):
        """Text over the limit should be split into multiple chunks."""
        from backend.app.chunking import chunk_text
        # 10000 chars is 2500 tokens, five times the limit.
        text = "b" * 10000
        chunks = chunk_text(text, max_tokens=500)
        assert len(chunks) >= 2  # Should be split

    def test_preserves_content(self):
        """A text that fits in one chunk should round-trip unchanged."""
        from backend.app.chunking import chunk_text
        original = "Hello world! This is a test of chunking functionality."
        chunks = chunk_text(original, max_tokens=100)
        combined = "".join(chunks)
        assert len(chunks) == 1
        assert combined == original

    def test_headings_split(self):
        """With a small limit, at least one chunk should start with a heading."""
        from backend.app.chunking import chunk_text
        markdown_with_headings = """# Introduction

This is the introduction section.

## Background

Background information goes here."""

        # With very small token limit, headings should cause splits
        chunks = chunk_text(markdown_with_headings, max_tokens=20)
        heading_chunks = [c for c in chunks if c.strip().startswith('#')]
        assert len(heading_chunks) >= 1  # At least some heading preserved

    def test_paragraph_split(self):
        """Three blank-line-separated paragraphs should yield at least 3 chunks."""
        from backend.app.chunking import chunk_text
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        # NOTE(review): the whole text is 53 chars, i.e. about 13 estimated
        # tokens at 4 chars per token, which is below max_tokens=15. This
        # assertion therefore only holds if chunk_text splits on paragraph
        # boundaries even when everything fits in one chunk - confirm.
        chunks = chunk_text(text, max_tokens=15)
        assert len(chunks) >= 3  # At least as many chunks as paragraphs

    def test_no_empty_chunks(self):
        """No returned chunk should be empty or whitespace-only."""
        from backend.app.chunking import chunk_text
        text = "Hello world"
        chunks = chunk_text(text, max_tokens=10)
        for chunk in chunks:
            assert chunk.strip() != ""


class TestTokenEstimationBoundaries:
    """Boundary checks for estimate_tokens()."""

    def test_boundary_precision(self):
        """Lengths 4 through 7 give 1 token; length 8 gives 2."""
        from backend.app.chunking import estimate_tokens

        # (input, expected tokens), walking across the 4-char boundary.
        expectations = (
            ("abcd", 1),      # exactly 4 chars
            ("abcde", 1),     # 5 chars still 1 token
            ("abcdef", 1),    # 6 chars still 1 token
            ("abcdefg", 1),   # 7 chars still 1 token
            ("abcdefgh", 2),  # 8 chars = 2 tokens
        )
        for sample, expected_tokens in expectations:
            assert estimate_tokens(sample) == expected_tokens

    def test_various_languages_chars(self):
        """Non-ASCII text is measured by character count."""
        from backend.app.chunking import estimate_tokens

        # Four Chinese characters are expected to count as four chars.
        assert estimate_tokens("你好世界") == 1

        # The exact count for emoji may vary by implementation, so only
        # the return type is checked here.
        emoji_estimate = estimate_tokens("Hello 🎉 world")
        assert isinstance(emoji_estimate, int)


class TestChunkOverlapBehavior:
    """Checks on how much text neighbouring chunks share."""

    def test_overlap_not_exceeded(self):
        """The first chunk should be at most half of all chunk text combined."""
        from backend.app.chunking import chunk_text

        sentence = "The quick brown fox jumps over the lazy dog. "
        chunks = chunk_text(sentence * 10, max_tokens=30, overlap_tokens=5)

        # Nothing to compare when the text was not split.
        if len(chunks) <= 1:
            return

        # Rough check: the first chunk must not dominate the joined output.
        combined_length = len("".join(chunks))
        first_length = len(chunks[0])
        assert first_length <= combined_length // 2


class TestChunkEdgeCases:
    """Edge cases and error conditions for chunk_text()."""

    def test_whitespace_only_text(self):
        """Whitespace-only input must not crash and must return a list."""
        from backend.app.chunking import chunk_text

        result = chunk_text("   \n\n   ", max_tokens=100)
        # The contents are not constrained: an empty list or a whitespace
        # chunk are both accepted.
        assert isinstance(result, list)

    def test_very_long_paragraph(self):
        """A long paragraph with no blank lines is still split."""
        from backend.app.chunking import chunk_text

        sentence = "The quick brown fox jumps over the lazy dog. "
        result = chunk_text(sentence * 100, max_tokens=50)
        assert len(result) > 1  # Should be split

    def test_none_input_raises(self):
        """Passing None must raise TypeError or AssertionError."""
        from backend.app.chunking import chunk_text

        accepted_errors = (TypeError, AssertionError)
        with pytest.raises(accepted_errors):
            chunk_text(None, max_tokens=100)

    def test_unicode_text(self):
        """Short mixed-script text with an emoji stays in one chunk."""
        from backend.app.chunking import chunk_text

        result = chunk_text("Hello 世界 مرحبا 🎉", max_tokens=50)
        assert len(result) == 1  # Small enough to be single chunk


# =============================================================================
# SAMPLE TEXT FIXTURE
# =============================================================================

@pytest.fixture
def heading_markdown():
    """Return a markdown document with four headings for chunking tests."""
    # Headings and paragraphs alternate; each is separated from the next by
    # one blank line, and there is no trailing newline.
    sections = [
        "# Introduction",
        "This is the introduction section. It contains some introductory text here.",
        "## Background",
        "Background information goes here to make this longer and test chunking. This paragraph has more content about the background topic. It provides context.",
        "### Details",
        "Specific details about the background are provided in this subsection. More details follow here to ensure we have enough text to properly test heading preservation.",
        "## Conclusion",
        "The conclusion wraps up everything nicely.",
    ]
    return "\n\n".join(sections)


class TestHeadingPreservation:
    """Heading-aware chunking checks driven by the heading_markdown fixture."""

    def test_headings_in_separate_chunks(self, heading_markdown):
        """With a small limit, at least one chunk should begin with a heading."""
        from backend.app.chunking import chunk_text

        # A very small token limit is meant to force splits at headings.
        pieces = chunk_text(heading_markdown, max_tokens=30)

        starts_with_heading = [
            piece for piece in pieces if piece.strip().startswith('#')
        ]
        assert len(starts_with_heading) >= 1

    def test_all_content_present(self, heading_markdown):
        """The section titles should survive chunking and rejoining."""
        from backend.app.chunking import chunk_text

        pieces = chunk_text(heading_markdown, max_tokens=500)
        rejoined = "".join(pieces)

        # Content shouldn't be truncated or corrupted.
        for title in ("Introduction", "Background", "Conclusion"):
            assert title in rejoined