# Evaluating LLM output requires several complementary approaches, since there is rarely a single correct answer.
# 1. LLM-as-Judge (most flexible)
def _parse_judge_reply(raw) -> dict:
    """Decode the judge model's reply, tolerating text around the JSON."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Models frequently wrap the JSON in Markdown fences or add a
        # sentence of commentary. Retry on the outermost {...} span and
        # re-raise the original error if there is no such span.
        if not isinstance(raw, str):
            raise
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw[start:end + 1])


def llm_judge(question: str, answer: str) -> dict:
    """Ask an LLM to grade *answer* to *question* and return its verdict.

    The prompt requests 1-5 ratings for accuracy, completeness, clarity and
    hallucination (5 = none, 1 = many), returned as a JSON object with
    ``scores``, ``reasoning`` and ``issues`` entries.

    Args:
        question: The question that was posed.
        answer: The answer to be graded.

    Returns:
        The decoded JSON reply. Its shape is whatever the model produced;
        the keys requested in the prompt are not validated here.

    Raises:
        json.JSONDecodeError: If no JSON can be decoded from the reply,
            even after discarding text around the outermost braces.

    Note:
        ``llm`` must be provided by the surrounding module; its
        ``complete`` method is assumed to return the model's text reply.
    """
    # Built from adjacent literals so the prompt text is independent of the
    # function's indentation. Only the two f-string lines interpolate.
    prompt = (
        "Rate this answer on:\n"
        "- Accuracy (1-5): factually correct?\n"
        "- Completeness (1-5): fully answers the question?\n"
        "- Clarity (1-5): clearly written?\n"
        "- Hallucination (1-5): 5=none, 1=many\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n"
        'Return JSON: {"scores": {...}, "reasoning": str, "issues": list}\n'
    )
    return _parse_judge_reply(llm.complete(prompt))
# 2. RAGAS (RAG evaluation)
# pip install ragas
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
# faithfulness: is answer grounded in context?
# answer_relevancy: does answer address the question?
# 3. DeepEval (unit tests for LLMs)
from deepeval import assert_test
from deepeval.metrics import HallucinationMetric
# Module-level hallucination metric configured with a 0.5 threshold.
# NOTE(review): the threshold is presumably the pass/fail cut-off for the
# hallucination score — confirm its direction (max vs. min) against the
# installed DeepEval version.
# NOTE(review): `assert_test` is imported above but not called in this
# snippet; presumably this metric is meant to be passed to it — verify.
metric = HallucinationMetric(threshold=0.5)
# 4. Human evaluation (ground truth for important tasks)
# A/B test prompt versions with real users
# Collect thumbs up/down in production