Adversarial examples, model inversion, backdoor attacks — understand ML-specific attack vectors and defences.
Adversarial Machine Learning Attacks
# Attacks targeting ML models
# 1. Adversarial Examples
# Imperceptible perturbations that fool classifiers
import numpy as np
def fgsm_attack(model, image, true_label, epsilon=0.01,
                clip_min=None, clip_max=None):
    """Craft an adversarial example with the Fast Gradient Sign Method.

    The input is moved one step of size ``epsilon`` in the direction of the
    sign of the loss gradient taken with respect to the input itself.

    Args:
        model: Callable mapping a float32 tensor to class scores.
            NOTE(review): assumed to return raw logits of shape
            ``(1, num_classes)`` (or ``(num_classes,)``) -- confirm against
            the model actually used.
        image: Array-like input (NumPy array, nested list, or tensor); it is
            copied, never modified in place.
        true_label: Integer index of the correct class for ``image``.
        epsilon: Step size of the perturbation.
        clip_min: Optional lower bound applied to the perturbed input.
        clip_max: Optional upper bound applied to the perturbed input.

    Returns:
        numpy.ndarray with the same shape as ``image`` holding the perturbed
        input.
    """
    import torch
    import torch.nn.functional as F

    # Fresh float32 leaf tensor: keeps the caller's data untouched and lets
    # autograd differentiate with respect to the input.
    image_tensor = (
        torch.as_tensor(image, dtype=torch.float32)
        .clone()
        .detach()
        .requires_grad_(True)
    )

    logits = model(image_tensor)
    if logits.dim() == 1:
        # Unbatched output: promote to a batch of one to match the target.
        logits = logits.unsqueeze(0)

    target = torch.tensor(
        [int(true_label)], dtype=torch.long, device=logits.device
    )
    # The original referenced an undefined ``criterion``; cross-entropy is
    # the standard classification loss used for FGSM.
    loss = F.cross_entropy(logits, target)

    # Differentiate w.r.t. the input only, so no gradients are accumulated
    # into the model's parameters as a side effect.
    (input_grad,) = torch.autograd.grad(loss, image_tensor)

    perturbed = image_tensor.detach() + epsilon * input_grad.sign()
    if clip_min is not None or clip_max is not None:
        perturbed = torch.clamp(perturbed, min=clip_min, max=clip_max)
    return perturbed.cpu().numpy()
# Defence: adversarial training
# Include adversarial examples in training data
# 2. Model Inversion Attack
# Reconstruct training data from model outputs
# Defence: differential privacy in training
# 3. Backdoor/Trojan Attack
# Poison training data with trigger patterns
# Model behaves normally except when trigger present
# Defence: data provenance, Neural Cleanse, spectral signatures
# 4. Evasion Attack (on classifiers)
# Craft malware that bypasses an ML-based antivirus scanner
# Modify features so the sample looks benign while keeping its malicious functionality
# 5. Model Stealing
# Query model repeatedly to train surrogate
# Defence: rate limiting, prediction rounding, watermarking