Extract structured data from PDFs, images, and documents using Claude vision, and batch-process thousands of files.
Document Processing Automation
import anthropic
from pathlib import Path
import json
# Module-level Anthropic API client, created once at import time and used by
# the invoice-processing code in this file.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. ANTHROPIC_API_KEY) — confirm against deployment.
client = anthropic.Anthropic()
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    # Drop an optional language tag ("json") following the opening fence;
    # a JSON document itself can never start with these letters.
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    return cleaned.strip()


def process_invoice(pdf_path: str) -> dict:
    """Extract structured data from a PDF invoice.

    The PDF is read from disk, base64-encoded, and sent to the model as a
    document block together with a prompt asking for the invoice fields
    (invoice_number, date, vendor, line_items, subtotal, tax, total) as JSON.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The JSON object returned by the model, parsed into a dict.

    Raises:
        OSError: If the PDF file cannot be opened or read.
        ValueError: If the reply was truncated, contains no text block, is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``
            subclass), or is valid JSON but not an object.
    """
    import base64

    with open(pdf_path, "rb") as f:
        pdf_data = base64.b64encode(f.read()).decode()

    resp = client.messages.create(
        model="claude-opus-4-5",
        max_tokens=1000,
        system="Extract invoice data. Return ONLY valid JSON.",
        messages=[{"role": "user", "content": [
            {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": pdf_data}},
            {"type": "text", "text": "Extract: invoice_number, date, vendor, line_items[{description,qty,unit_price,total}], subtotal, tax, total. Return JSON only."},
        ]}],
    )

    # A reply cut off at the token limit cannot be complete JSON; report that
    # directly instead of surfacing a confusing decode error further down.
    if getattr(resp, "stop_reason", None) == "max_tokens":
        raise ValueError(
            f"Model response for {pdf_path} was truncated at max_tokens; "
            "cannot parse incomplete JSON"
        )

    # Do not assume the first content block is the text block.
    text = next(
        (block.text for block in resp.content if getattr(block, "type", None) == "text"),
        None,
    )
    if text is None:
        raise ValueError(f"Model response for {pdf_path} contained no text block")

    # Models sometimes wrap JSON in a Markdown fence despite the instructions.
    data = json.loads(_strip_code_fence(text))
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object for {pdf_path}, got {type(data).__name__}"
        )
    return data
def batch_process_invoices(folder: str) -> list:
    """Process every ``*.pdf`` file directly inside *folder*.

    Files are handled in sorted path order so the result order is
    deterministic across runs and platforms. Processing is best-effort: a
    failure on one file is recorded and the batch continues.

    Args:
        folder: Directory to scan (non-recursive) for PDF invoices.

    Returns:
        One dict per PDF found. On success it is the extracted invoice data
        plus a ``"source_file"`` key. On failure it is an error record with
        ``"error"`` (the exception message), ``"file"``, and ``"source_file"``
        (both the PDF path), so every record can be traced back to its file
        through the same key.
    """
    results = []
    # Path.glob yields entries in arbitrary order; sort for reproducibility.
    for pdf in sorted(Path(folder).glob("*.pdf")):
        pdf_path = str(pdf)
        try:
            data = process_invoice(pdf_path)
            data["source_file"] = pdf_path
            results.append(data)
        except Exception as e:
            # Deliberate catch-all at the batch boundary: one bad invoice
            # must not abort the remaining files.
            results.append({"error": str(e), "file": pdf_path, "source_file": pdf_path})
    return results