Evaluate agents on task success rate, tool accuracy, latency, and cost by building a labelled test dataset and scoring each response against it.
Evaluating AI Agents
from langsmith import Client
from langchain.smith import RunEvalConfig
# Labelled evaluation cases. Each case pairs a prompt ("input") with its
# reference answer ("expected") and a predicate ("check") that decides
# whether a response string counts as correct.
dataset = [
    {
        "input": "What is the capital of France?",
        "expected": "Paris",
        "check": lambda reply: "Paris" in reply,
    },
    {
        "input": "Calculate 15% tip on $84.50",
        "expected": "12.68",
        # 15% of 84.50 is 12.675, so either rounding direction is accepted.
        "check": lambda reply: any(
            amount in reply for amount in ("12.68", "12.67")
        ),
    },
]
def evaluate_agent(agent, dataset):
    """Run *agent* on every case in *dataset* and print its pass rate.

    Args:
        agent: Object exposing ``invoke(payload)``. It is called with
            ``{"input": <prompt>}`` and must return a mapping supporting
            ``.get("output", "")``.
        dataset: Iterable of cases. Each case is a mapping with an
            ``"input"`` prompt and a ``"check"`` callable that receives the
            agent's output and returns a truthy value when it is acceptable.

    Returns:
        A list with one dict per case, in dataset order, holding the
        ``"input"``, the agent's ``"output"`` and the ``"passed"`` verdict
        returned by the case's check. Empty when the dataset is empty.
    """
    results = []
    for case in dataset:
        response = agent.invoke({"input": case["input"]})
        # A response without an "output" entry is scored as an empty answer.
        output = response.get("output", "")
        results.append({
            "input": case["input"],
            "output": output,
            "passed": case["check"](output),
        })

    passed_count = sum(1 for result in results if result["passed"])
    # Guard the division: an empty dataset reports 0.0% instead of raising
    # ZeroDivisionError.
    score = passed_count / len(results) if results else 0.0
    print(f"Agent score: {score:.1%}")
    return results
# Key metrics to track when evaluating an agent:
# - Task success rate and steps to completion
# - Tool-use accuracy and hallucination rate
# - Latency per task and cost per task (tokens used)