Semantic caching reuses LLM responses for semantically similar questions, cutting cost and latency.
# Exact cache (Redis -- for identical queries)
import redis, hashlib
# Shared Redis client used by the exact-match cache helpers in this module.
# No host/port is passed, so redis-py's connection defaults apply.
# decode_responses=True makes the client return str values instead of bytes.
r = redis.Redis(decode_responses=True)
def exact_cache(prompt: str) -> str | None:
    """Fetch the cached response stored for exactly this prompt.

    The Redis key is ``'llm:'`` followed by the SHA-256 hex digest of the
    encoded prompt, so only an identical prompt produces a hit.

    Args:
        prompt: The prompt text to look up.

    Returns:
        Whatever ``r.get`` yields for the derived key: the cached
        response, or ``None`` if nothing is cached for this prompt.
    """
    digest = hashlib.sha256(prompt.encode()).hexdigest()
    return r.get(f"llm:{digest}")
def exact_cache_set(prompt: str, response: str, ttl=3600):
    """Store a response under the key derived from the exact prompt.

    The key is ``'llm:'`` followed by the SHA-256 hex digest of the encoded
    prompt, and the value is written with ``r.setex`` so it expires.

    Args:
        prompt: The prompt text the response belongs to.
        response: The response text to cache.
        ttl: Expiry handed to ``r.setex`` (seconds); defaults to 3600.
    """
    prompt_hash = hashlib.sha256(prompt.encode()).hexdigest()
    cache_key = f"llm:{prompt_hash}"
    r.setex(cache_key, ttl, response)
# Semantic cache (similar questions return cached answer)
class SemanticCache:
def __init__(self, threshold=0.95):
self.threshold = threshold
self.store = [] # list of (embedding, response)
def get(self, query: str) -> str | None:
q_embed = embed(query)
for cached_embed, response in self.store:
sim = cosine_similarity(q_embed, cached_embed)
if sim >= self.threshold:
return response
return None
def set(self, query: str, response: str):
self.store.append((embed(query), response))
# GPTCache library (production semantic cache)
# pip install gptcache