Sanitise tool outputs, apply least privilege, cap iterations, audit-log every tool call, and require approval for destructive actions.
Agent Security and Safety
# Key threats for AI agents
# 1. Prompt injection via tool results
# 2. Excessive permissions (agent does too much)
# 3. Runaway loops (infinite tool calls)
# 4. Data exfiltration via tool chaining
# 5. Code injection in code execution tools
# Defences
# 1. Sanitise tool outputs before feeding to LLM
def safe_tool_result(result: str, *, max_chars: int = 5000) -> str:
    """Screen a tool's output before it is fed back to the LLM.

    The whole text is scanned case-insensitively for a short denylist of
    phrases. If any phrase is present the output is replaced entirely by a
    placeholder; otherwise the output is truncated to ``max_chars``.

    This is a heuristic filter: it only recognises the literal phrases
    listed below.

    Args:
        result: Raw text returned by a tool.
        max_chars: Maximum number of characters passed through when the
            text is not flagged. Defaults to 5000.

    Returns:
        ``"[Tool output sanitised: possible injection]"`` when a denylisted
        phrase is found, otherwise the first ``max_chars`` characters of
        ``result``.

    Raises:
        ValueError: If ``max_chars`` is negative (a negative slice bound
            would silently drop characters from the end instead of capping
            the length).
    """
    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    # Phrases are stored lower-case so they can be compared directly against
    # the lower-cased text.
    suspicious = ("ignore previous", "new instructions", "system:")

    # Lower-case the text once rather than once per pattern.
    lowered = result.lower()
    if any(pattern in lowered for pattern in suspicious):
        return "[Tool output sanitised: possible injection]"

    return result[:max_chars]  # truncate large outputs
# 2. Principle of least privilege for tools
# Give agents ONLY the tools they need for the task
# 3. Hard limits on iterations
def run_agent(goal, max_turns: int = 10):  # hard cap
    """Placeholder for the agent entry point; the body is elided here.

    Args:
        goal: The task given to the agent. Its type is not shown in this
            snippet.
        max_turns: Described by the inline comment as a hard cap on
            iterations; defaults to 10.
            NOTE(review): enforcement of the cap is not visible in this
            stub -- confirm the real loop stops once it is reached.
    """
    ...
# 4. Audit log every tool call
# Records the tool name, its arguments, and the first 100 characters of the
# result under the event name "tool_called".
# NOTE(review): passing fields as keyword arguments suggests a structlog-style
# logger; the stdlib logging.Logger.info() does not accept arbitrary keyword
# fields -- confirm which logger `log` is.
# NOTE(review): `args` is logged untruncated -- confirm it cannot carry
# secrets or very large payloads.
log.info("tool_called",tool=name,args=args,result=result[:100])
# 5. Confirmation for destructive actions
DESTRUCTIVE = ["delete","drop","rm","wipe","send_email"]
if any(d in tool_name for d in DESTRUCTIVE):
if not get_human_approval(f"Confirm: {tool_name}({args})"):
return "Action cancelled by safety guard"