Add input validation, output checking, Guardrails AI, and content moderation to prevent harmful LLM responses.
LLM Safety and Guardrails
# Input validation
def validate_input(text: str) -> tuple[bool, str]:
    """Screen user text before it is forwarded to the LLM.

    Two checks run in order: a length cap on the raw text, then a
    case-insensitive search for known prompt-injection phrases.

    Args:
        text: Raw user input.

    Returns:
        ``(True, '')`` when the input is accepted, otherwise
        ``(False, reason)`` where ``reason`` is a short user-facing message.
    """
    # The cap applies to the text exactly as received, before normalisation.
    if len(text) > 2000:
        return False, 'Input too long (max 2000 chars)'
    injection_patterns = [
        'ignore previous', 'new instructions',
        'system prompt', 'jailbreak',
    ]
    # Collapse every whitespace run to one space so that padding or a line
    # break between the words ("ignore   previous") cannot slip past the
    # phrase match. Lower-case once here instead of once per pattern.
    normalized = ' '.join(text.split()).lower()
    if any(pattern.lower() in normalized for pattern in injection_patterns):
        return False, 'Invalid input detected'
    return True, ''
# Output validation
def validate_output(response: str, allowed_topics: list) -> str:
    """Pass the response through only if it mentions an allowed topic.

    Relevance is a case-insensitive substring test of each topic against
    the response text.

    Args:
        response: Text produced by the model.
        allowed_topics: Topic strings; any one of them appearing in the
            response is enough to accept it.

    Returns:
        ``response`` unchanged when a topic matches, otherwise a fixed
        refusal message.
    """
    # Lower-case the response once rather than once per topic.
    haystack = response.lower()
    for topic in allowed_topics:
        # An empty or whitespace-only topic would act as a wildcard (the
        # empty string is a substring of every response), so skip it.
        if not topic.strip():
            continue
        if topic.lower() in haystack:
            return response
    return 'I can only help with questions about our products.'
# Guardrails AI library
from guardrails import Guard
from guardrails.hub import ToxicLanguage, DetectPII
# Build a guard by chaining two validators imported from the Guardrails hub:
# ToxicLanguage and DetectPII. Both are passed as classes with no arguments,
# so whatever defaults the library applies are in effect.
# NOTE(review): DetectPII may need to be told which entity types to look for
# (constructor argument or validation metadata) - confirm against the
# installed Guardrails version.
guard = Guard().use(ToxicLanguage).use(DetectPII)
# Run the guard over the model's text. `llm_output` is not defined in this
# snippet; presumably it is the raw LLM response string - TODO confirm.
# `result` is not inspected in the visible lines; the surrounding code
# presumably checks the outcome before the output is used - verify.
result = guard.validate(llm_output)
# Content moderation API
# OpenAI moderation endpoint (free)
response = client.moderations.create(input=user_message)
if response.results[0].flagged:
return 'Content policy violation'