# Apache Iceberg is a table format that adds ACID transactions, schema evolution,
# time travel, and partition evolution on top of data lake files.
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, LongType, TimestampType
# Load the catalog named "glue", configured as a Glue-type catalog with the
# S3 region set to us-east-1.
catalog_properties = {
    "type": "glue",
    "s3.region": "us-east-1",
}
catalog = load_catalog("glue", **catalog_properties)
# Declare the columns of the orders table with explicit field IDs.
# Every column is required except "status".
order_fields = (
    NestedField(field_id=1, name="order_id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="customer_id", field_type=LongType(), required=True),
    NestedField(field_id=3, name="status", field_type=StringType(), required=False),
    NestedField(field_id=4, name="created_at", field_type=TimestampType(), required=True),
)
schema = Schema(*order_fields)

# Register the table "orders" in database "mydb" through the catalog.
catalog.create_table("mydb.orders", schema=schema)
# Spark + Iceberg: create the orders table through Spark SQL if it does not
# exist yet, partitioned by the month of created_at.
create_orders_sql = """
CREATE TABLE IF NOT EXISTS glue.mydb.orders (
order_id BIGINT,
customer_id BIGINT,
status STRING,
created_at TIMESTAMP
) USING iceberg
PARTITIONED BY (months(created_at))
"""
spark.sql(create_orders_sql)
# Time travel: read the table as it existed at an earlier point.
#
# "as-of-timestamp" expects epoch MILLISECONDS, not a formatted date string.
# 1735689600000 is 2025-01-01 00:00:00 UTC.
# NOTE(review): the original literal carried no time zone; UTC is assumed here -- confirm.
spark.read.option("as-of-timestamp", "1735689600000").table("glue.mydb.orders")

# "snapshot-id" selects one specific table snapshot (the ID below is a placeholder).
spark.read.option("snapshot-id", "123456789").table("glue.mydb.orders")
# Schema evolution (safe -- no rewrite needed): add a new DOUBLE column
# named total_amount to the existing orders table.
add_total_amount_sql = "ALTER TABLE glue.mydb.orders ADD COLUMN total_amount DOUBLE"
spark.sql(add_total_amount_sql)
# MERGE INTO (upsert): rows from "updates" that match an existing order_id
# overwrite that row; rows with no match are inserted.
upsert_orders_sql = """
MERGE INTO glue.mydb.orders t USING updates s ON t.order_id = s.order_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""
spark.sql(upsert_orders_sql)