How you split documents determines retrieval quality. Chunks that are too large lose precision; chunks that are too small lose context.
# 1. Fixed-size with overlap (simple baseline)
def fixed_chunks(text: str, size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared by neighbouring chunks.
            Must satisfy ``0 <= overlap < size``.

    Returns:
        A list of substrings of *text* in document order. Empty input
        yields an empty list. The final chunk may be shorter than *size*.

    Raises:
        ValueError: If *size* is not positive or *overlap* is outside
            ``[0, size)``.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    # overlap == size would give a zero step (range() error), overlap > size
    # a negative step (silently no chunks), and a negative overlap would
    # skip text between chunks.
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    step = size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + size])
        # Once a window reaches the end of the text, any further window
        # would contain only characters already present in this chunk.
        if start + size >= total:
            break
    return chunks
# 2. Recursive character splitter (LangChain -- recommended)
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Build a recursive splitter with the same size/overlap as the baseline above.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    # Separators run from coarsest to finest: paragraph break, line break,
    # sentence end, word gap, and finally the empty string. Per the LangChain
    # docs the splitter tries them in this order.
    # NOTE(review): the newline escapes were lost in the original paste;
    # "\n\n" then "\n" is assumed -- confirm against the original article.
    separators=["\n\n", "\n", ". ", " ", ""],
)
# `text` is the document string to split; it is not defined in this snippet
# and must be supplied by the surrounding code.
chunks = splitter.split_text(text)
# 3. Semantic chunking (split at topic boundaries)
# Compute embedding similarity between sentences
# Split when similarity drops below threshold
# 4. Document-aware chunking
# Markdown: split by heading (# ## ###)
# Code: split by class or function
# PDF: split by page or visual section break
# Best practice guidelines
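# chunk_size: 200-500 tokens for most use cases
#   (note: the splitters above measure size in characters, not tokens,
#   unless a token-based length function is configured)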
# chunk_overlap: 10-20% of chunk_size
# Add metadata: source, page, section title to each chunk
# Test retrieval quality with diverse questions