Spark processes data at scale with lazy transformations — groupBy, agg, filter, withColumn, and joins.
Apache Spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
# Create (or reuse) the Spark session for this job.
# The builder chain is wrapped in parentheses so it can span several lines;
# without them, a line starting with "." is a syntax error in Python.
spark = (
    SparkSession.builder
    .appName("SalesAnalysis")
    # Number of partitions Spark SQL uses when shuffling for joins/aggregations.
    .config("spark.sql.shuffle.partitions", "200")
    # Memory requested per executor process.
    .config("spark.executor.memory", "4g")
    # Returns the already-active session if one exists, otherwise builds a new one.
    .getOrCreate()
)
# Load the sales data stored as Parquet under the 2025 prefix on S3.
df = spark.read.parquet("s3a://datalake/sales/2025/")

# Everything below is a lazy transformation: each step only extends the query
# plan, and nothing is computed until an action is invoked on `result`.

# Keep rows whose status is "completed".
completed = df.filter(F.col("status") == "completed")

# Derive the per-row revenue and a "yyyy-MM" month key from order_date.
with_revenue = completed.withColumn("revenue", F.col("qty") * F.col("unit_price"))
with_month = with_revenue.withColumn("month", F.date_format("order_date", "yyyy-MM"))

# Aggregate revenue and order counts per (month, product_id).
monthly_by_product = with_month.groupBy("month", "product_id").agg(
    F.sum("revenue").alias("total_revenue"),
    F.count("order_id").alias("order_count"),
    F.avg("revenue").alias("avg_order_value"),
)

# Months ascending; within each month, highest total revenue first.
result = monthly_by_product.orderBy("month", F.desc("total_revenue"))
# Action: writing `result` is what actually executes the lazy query plan.
# mode="overwrite" replaces any data already present at the destination.
result.write.parquet("s3a://datalake/marts/monthly_sales/", mode="overwrite")

# Print the query's execution plan in "formatted" mode (a plan outline
# followed by per-node details). This does not run the query again.
result.explain(mode="formatted")