Full load vs. incremental ingestion — CDC with Debezium captures row-level changes from the database's binlog.
Data Ingestion Patterns
# Full Load vs Incremental
# Full load: read all source data every time
# Good for: small tables, no change tracking on source
# PySpark's jdbc() expects a table name (or a parenthesised, aliased subquery)
# as its second argument, not a bare SELECT statement. Connection properties
# are passed by keyword because the third positional parameter is `column`.
spark.read.jdbc(url, "products", properties=properties)
# Incremental: only load new/changed records
# Method 1: Timestamp-based
# The filter is pushed to the source database by wrapping it in an aliased
# subquery. Connection properties are passed by keyword: the third positional
# parameter of PySpark's jdbc() is `column`, not `properties`.
spark.read.jdbc(
    url,
    "(SELECT * FROM orders WHERE updated_at > '2025-01-14') t",
    properties=properties,
)
# Method 2: CDC (Change Data Capture) with Debezium
# Debezium captures MySQL binlog and publishes to Kafka
# topic: <topic.prefix>.mydb.orders -> {op: "c" | "r" | "u" | "d", before: {...}, after: {...}}
# op codes: c = create, r = snapshot read, u = update, d = delete
# Airbyte (open-source EL tool; self-hosted or cloud)
# 350+ connectors: Postgres, Salesforce, Stripe, etc.
# Shell command (not Python): start a container from the airbyte/airbyte-server image.
# NOTE(review): serving the UI mentioned below may require additional Airbyte
# services beyond this single container — confirm against the Airbyte deployment docs.
docker run airbyte/airbyte-server
# UI at http://localhost:8000
# Configure: source (Postgres) -> destination (Snowflake)
# Select: full refresh or incremental+dedup
# Fivetran (managed, paid)
# Zero-maintenance connectors with automatic schema migration
# Custom API ingestion
import httpx, pandas as pd
def _fetch_all_pages(client, endpoint, since, page_size):
    """Collect the "records" list of every page until an empty page is returned."""
    collected = []
    page = 1
    while True:
        resp = client.get(
            endpoint,
            params={"since": since, "page": page, "limit": page_size},
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error body.
        resp.raise_for_status()
        records = resp.json()["records"]
        if not records:
            break
        collected.extend(records)
        page += 1
    return collected


def ingest_api(
    endpoint: str,
    since: str,
    *,
    client: "httpx.Client | None" = None,
    page_size: int = 1000,
) -> pd.DataFrame:
    """Download every page of records from a paginated API into a DataFrame.

    Pages are requested starting at ``page=1`` with the query parameters
    ``since``, ``page`` and ``limit``; paging stops at the first response
    whose ``"records"`` list is empty.

    Args:
        endpoint: URL of the paginated endpoint.
        since: Value sent as the ``since`` query parameter on every request.
        client: Optional pre-configured client (e.g. with auth headers or a
            custom timeout). It must provide ``get(url, params=...)``. When
            omitted, a temporary ``httpx.Client`` is created and closed, so
            all pages share one connection pool.
        page_size: Value sent as the ``limit`` query parameter.

    Returns:
        A DataFrame built from all collected records; empty when the first
        page has no records.

    Raises:
        httpx.HTTPStatusError: If any page responds with a 4xx/5xx status.
        KeyError: If a response body has no ``"records"`` key.
    """
    if client is not None:
        return pd.DataFrame(_fetch_all_pages(client, endpoint, since, page_size))

    # No client supplied: own one for the duration of the call so it is
    # always closed, even if a request fails midway.
    with httpx.Client() as owned_client:
        return pd.DataFrame(
            _fetch_all_pages(owned_client, endpoint, since, page_size)
        )