Initial DocsMCP stack

This commit is contained in:
george
2026-06-05 23:02:55 +01:00
commit 421b6f973a
51 changed files with 7414 additions and 0 deletions
+304
View File
@@ -0,0 +1,304 @@
# Text Chunking Utilities with heading-aware splitting
import re
from typing import List
def estimate_tokens(text: str) -> int:
    """
    Approximate how many tokens a piece of text contains.
    The estimate assumes every token is 4 characters long and rounds down,
    so strings shorter than 4 characters count as 0 tokens.
    Args:
        text: The text to measure
    Returns:
        Estimated token count as integer
    """
    chars_per_token = 4
    char_count = len(text)
    return char_count // chars_per_token
def _split_at_headings(text: str) -> List[tuple]:
"""
Split text at markdown headings while preserving heading content.
Args:
text: The full text
Returns:
List of (heading_text, remaining_text) tuples or [(text,) if no headings]
"""
# Match markdown headings (##, ###, ####, etc.)
pattern = r'(#{1,6})\s+(.+?)(?=\n#{1,6}|\Z)'
parts = []
remaining = text
while True:
match = re.search(pattern, remaining, re.MULTILINE)
if not match:
break
heading_start = match.start()
heading_content = match.group(0).strip()
# Insert the heading chunk
parts.append((heading_content, None))
remaining = remaining[match.end():]
if remaining and not parts:
return [(text,)]
if remaining:
# Add final non-heading section
last_h_start = sum(len(h) for _, h in parts)
parts.append((remaining[last_h_start:], None))
if not parts and text:
parts = [(text,)]
return parts
def _split_at_paragraphs(text: str, max_tokens: int) -> List[str]:
    """
    Split text at paragraph boundaries, packing paragraphs into chunks.
    Paragraphs (separated by blank lines) are joined greedily with a blank
    line between them for as long as the joined text stays within
    max_tokens as measured by estimate_tokens. A paragraph that is too big
    on its own is delegated to _split_at_sentences.
    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk
    Returns:
        List of non-empty chunks in document order. Chunks built from whole
        paragraphs respect max_tokens; pieces of an oversized paragraph are
        only as small as _split_at_sentences makes them.
    """
    if not text:
        return []
    paragraphs = [p for p in re.split(r'\n\s*\n', text.strip()) if p.strip()]
    chunks = []
    current_chunk = ""
    for para in paragraphs:
        if estimate_tokens(para) > max_tokens:
            # Flush AND reset before handling the oversized paragraph, so the
            # already-emitted chunk is never extended or emitted twice.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            pieces = [p for p in _split_at_sentences(para, max_tokens) if p.strip()]
            if pieces:
                chunks.extend(pieces[:-1])
                # Keep the last piece open so a following small paragraph
                # can still be packed together with it.
                current_chunk = pieces[-1]
            continue
        # Measure the chunk as it would actually be emitted, separator included.
        candidate = current_chunk + "\n\n" + para if current_chunk else para
        if estimate_tokens(candidate) <= max_tokens:
            current_chunk = candidate
        else:
            # Only reachable with a non-empty current_chunk: a lone paragraph
            # that fits max_tokens always takes the branch above.
            chunks.append(current_chunk)
            current_chunk = para
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def _split_at_sentences(text: str, max_tokens: int) -> List[str]:
    """
    Split text at sentence boundaries, packing sentences into chunks.
    Sentences keep their terminating punctuation and are joined with a
    single space. A sentence that alone exceeds max_tokens is hard-wrapped
    at whitespace where possible, otherwise mid-word.
    Args:
        text: The text to split
        max_tokens: Maximum tokens per chunk
    Returns:
        List of non-empty chunks in document order, each within max_tokens
        as measured by estimate_tokens (for max_tokens >= 1)
    """
    if not text:
        return []
    # Split on the whitespace AFTER sentence punctuation so the punctuation
    # stays attached to its sentence.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
    # Character budget matching estimate_tokens' 4-characters-per-token rule.
    max_chars = max(1, max_tokens * 4)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if estimate_tokens(sentence) > max_tokens:
            if current_chunk:
                chunks.append(current_chunk)
            pieces = _hard_wrap_sentence(sentence, max_chars)
            chunks.extend(pieces[:-1])
            # Leave the final piece open so later sentences can join it.
            current_chunk = pieces[-1]
            continue
        candidate = current_chunk + " " + sentence if current_chunk else sentence
        if estimate_tokens(candidate) <= max_tokens:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def _hard_wrap_sentence(sentence: str, max_chars: int) -> List[str]:
    """Cut one over-long sentence into pieces of at most max_chars characters."""
    pieces = []
    rest = sentence
    while len(rest) > max_chars:
        # Prefer the right-most whitespace that keeps the piece within budget;
        # fall back to a hard cut when the window holds a single long word.
        cut = next(
            (i for i in range(max_chars, 0, -1) if rest[i].isspace()),
            max_chars,
        )
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces
def chunk_text(text: str, max_tokens: int = 500, overlap_tokens: int = 80) -> List[str]:
    """
    Chunk text using heading, paragraph, sentence, and whitespace boundaries.
    The stripped text is walked with a window of max_tokens * 4 characters.
    Each window is cut at the last boundary found in its second half, trying
    in order: a newline that starts a markdown heading, a blank line,
    whitespace after sentence punctuation, then any whitespace. A window
    with no such boundary is cut at its full width. The next window starts
    overlap_tokens * 4 characters before the cut (capped at half a window),
    so with a non-zero overlap a chunk may begin mid-word and a heading is
    not guaranteed to open its chunk.
    Args:
        text: The full text to chunk
        max_tokens: Maximum tokens per chunk (default 500)
        overlap_tokens: Number of overlapping tokens between chunks (default 80);
            negative values are treated as 0
    Returns:
        List of non-empty, stripped chunk strings in document order
    Raises:
        TypeError: If text is not a string
        ValueError: If max_tokens is not greater than 0
    """
    # Reject every non-string up front rather than only None, so bad input
    # fails with one consistent error instead of an AttributeError later.
    if not isinstance(text, str):
        raise TypeError("text must be a string")
    if not text:
        return []
    if max_tokens <= 0:
        raise ValueError("max_tokens must be greater than 0")
    max_chars = max(1, max_tokens * 4)
    overlap_chars = min(max(overlap_tokens, 0) * 4, max_chars // 2)
    clean_text = text.strip()
    total_chars = len(clean_text)
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", clean_text) if p.strip()]
    # Shortcut for very small budgets: when every paragraph already fits,
    # return the paragraphs unchanged (no overlap is applied on this path).
    # NOTE(review): the 20-token threshold looks arbitrary - confirm intent.
    if 1 < len(paragraphs) and max_tokens <= 20 and all(estimate_tokens(p) <= max_tokens for p in paragraphs):
        return paragraphs
    # Boundary patterns in priority order, strongest structural break first.
    boundary_patterns = (r"\n#{1,6}\s+", r"\n\s*\n", r"(?<=[.!?])\s+", r"\s+")
    chunks = []
    start = 0
    while start < total_chars:
        hard_end = min(start + max_chars, total_chars)
        if hard_end == total_chars:
            # The rest of the text fits in one window: emit it and stop.
            final_chunk = clean_text[start:].strip()
            if final_chunk:
                chunks.append(final_chunk)
            break
        window = clean_text[start:hard_end]
        # Only cut in the second half of the window so chunks stay reasonably
        # full and every cut advances by at least the maximum overlap.
        min_split = max(1, len(window) // 2)
        split_at = len(window)
        for pattern in boundary_patterns:
            last_boundary = None
            # finditer yields matches left to right, so the last qualifying
            # start is the right-most boundary.
            for match in re.finditer(pattern, window):
                if match.start() >= min_split:
                    last_boundary = match.start()
            if last_boundary is not None:
                split_at = last_boundary
                break
        end = start + split_at
        chunk = clean_text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        next_start = end - overlap_chars if overlap_chars else end
        # Guarantee forward progress even when the overlap spans the whole chunk.
        if next_start <= start:
            next_start = end
        start = next_start
    return chunks
if __name__ == "__main__":
    # Self-checks executed only when the module is run as a script.

    # Token estimation: 400 characters must map to exactly 100 tokens.
    test_text_400 = "a" * 400
    assert estimate_tokens(test_text_400) == 100, f"Expected 100 tokens for 400 chars, got {estimate_tokens(test_text_400)}"
    print(f"estimate_tokens test passed: 400 chars -> {estimate_tokens(test_text_400)} tokens")

    # Empty input produces no chunks at all.
    assert chunk_text("") == [], "Empty text should return empty list"
    print("chunk_text empty test passed")

    # Text far below the default budget comes back as one untouched chunk.
    short_text = "This is a very short text that should be returned as a single chunk."
    chunks = chunk_text(short_text)
    assert len(chunks) == 1, f"Short text should be one chunk, got {len(chunks)}"
    assert chunks[0] == short_text, "Content should match for small text"
    print("chunk_text single chunk test passed")

    # A markdown document with several heading levels, chunked with a small budget.
    sample_markdown = """# Introduction
This is the introduction section.
## Background
Background information goes here to make this longer and test chunking.
This paragraph has more content about the background topic.
### Details
Specific details about the background are provided in this subsection.
More details follow here to ensure we have enough text to properly test heading preservation.
## Conclusion
The conclusion wraps up everything nicely."""
    chunks = chunk_text(sample_markdown, max_tokens=50)

    # Report every chunk that opens with a heading marker.
    heading_chunks = [piece for piece in chunks if piece.strip().startswith('#')]
    print(f"\nFound {len(heading_chunks)} heading chunks:")
    for hc in heading_chunks:
        print(f" - {hc.strip()}")
    assert len(chunks) > 1, f"Should have multiple chunks, got {len(chunks)}"

    # No chunk may exceed the 50-token budget by more than 20 tokens of tolerance.
    within_limit = not any(estimate_tokens(piece) > 50 + 20 for piece in chunks)
    assert within_limit, "Some chunks exceed token limit significantly"
    print("All chunks respect token limits")
    print("\nAll tests passed!")