Treat prompts like code — version them, build test suites, and measure improvement rigorously.
# Evaluation framework
import anthropic, json
# Module-level API client; evaluate() sends every test prompt through it.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment -- confirm against the SDK configuration in use.
client = anthropic.Anthropic()
def evaluate(prompt_template: str, test_cases: list) -> float:
    """Return the fraction of test cases whose check passes for a prompt.

    Each test case is a mapping with two keys:

    * ``'input'`` -- a dict of keyword arguments substituted into
      ``prompt_template`` with ``str.format``.
    * ``'check'`` -- a callable that receives the model's response text and
      returns a truthy value when the response is acceptable.

    Args:
        prompt_template: A ``str.format``-style template for the user message.
        test_cases: The cases to run, in order. One API request is made per
            case.

    Returns:
        ``passed / len(test_cases)``, a value in ``[0.0, 1.0]``. An empty
        ``test_cases`` yields ``0.0``.
    """
    # An empty suite would otherwise divide by zero below; report 0.0 so a
    # missing suite can never look like an improvement.
    if not test_cases:
        return 0.0

    passed = 0
    for case in test_cases:
        prompt = prompt_template.format(**case['input'])
        message = client.messages.create(
            model='claude-opus-4-5',
            max_tokens=500,
            messages=[{'role': 'user', 'content': prompt}],
        )
        # Concatenate the text of every content block instead of indexing
        # content[0]: an empty content list no longer raises IndexError and
        # text split across several blocks is still checked in full.
        # NOTE(review): assumes blocks without a `text` attribute carry no
        # response text -- confirm against the SDK's content block types.
        response = ''.join(
            getattr(block, 'text', '') for block in message.content
        )
        if case['check'](response):
            passed += 1
    return passed / len(test_cases)
# Test cases: each pairs the template inputs with a predicate applied to the
# model's response text.
def _mentions(label):
    """Build a check that passes when ``label`` occurs in the lowercased response."""
    def check(r):
        return label in r.lower()
    return check


tests = [
    {'input': {'text': sample}, 'check': _mentions(label)}
    for sample, label in (
        ('I love this!', 'positive'),
        ('This is terrible', 'negative'),
    )
]
# Score both prompt versions against the same suite so the two results can be
# compared directly.
score_v1 = evaluate(prompt_template=prompt_v1, test_cases=tests)
score_v2 = evaluate(prompt_template=prompt_v2, test_cases=tests)
# Key metrics to track for each prompt version:
# - Accuracy on the test set
# - Format compliance rate
# - Hallucination rate
# - Token count (a proxy for cost)
# - Latency (p50 / p95)