Data catalogs enable discovery, lineage, documentation, and governance — for example DataHub or Apache Atlas, complemented by data-observability tools such as Monte Carlo.
Data Catalog and Data Lineage
# DataHub (open source) or Apache Atlas
# Why data catalog?
# - Discovery: "where is our orders data?"
# - Lineage: "what broke when source table changed?"
# - Documentation: "what does this column mean?"
# - Governance: "who owns this data?"
# DataHub Python SDK
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
DatasetPropertiesClass, SchemaMetadataClass
)
# The wrapper class is not among the imports above; without this import the
# emit_mcp call below fails with NameError before anything is sent.
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# REST emitter pointed at a DataHub GMS endpoint on localhost:8080.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# Emit dataset metadata: send an UPSERT proposal carrying the
# datasetProperties aspect (a description plus free-form key/value
# properties) for the dataset URN snowflake / fct_orders / PROD.
emitter.emit_mcp(
    MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType="UPSERT",
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,fct_orders,PROD)",
        aspectName="datasetProperties",
        aspect=DatasetPropertiesClass(
            description="One row per completed order",
            # NOTE(review): owner and PII status are stored here as plain
            # strings; confirm whether dedicated ownership/tag metadata is
            # expected instead.
            customProperties={"owner": "data-team", "sla": "6am daily", "pii": "yes"},
        ),
    )
)
# dbt lineage is automatic via dbt docs
# Column-level lineage shows: raw_orders.amount -> fct_orders.revenue
# Monte Carlo (SaaS) -- data observability
# Auto-detects anomalies: volume drops, freshness delays, schema changes
# Sends Slack alert when fct_orders has 50% fewer rows than yesterday