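Use vectorised operations instead of Python loops, choose the smallest dtype that safely holds your data, understand views vs copies, and know the difference between row-major and column-major memory layout.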
NumPy Performance Tips
import numpy as np
import timeit
# Prefer vectorised operations over explicit Python loops.
data = np.random.rand(1_000_000)

# SLOW: Python loop -- each element is handed to the interpreter as a
# separate scalar object and added one iteration at a time.
total = 0
for x in data:
    total += x

# FAST: NumPy vectorised -- the whole reduction runs inside NumPy.
total = np.sum(data)  # often ~100x faster than the loop above

# Use appropriate dtypes (saves memory): footprint = element count * itemsize.
a = np.array([1, 2, 3], dtype=np.int8)   # 3 elements * 1 byte  = 3 bytes
b = np.array([1, 2, 3], dtype=np.int64)  # 3 elements * 8 bytes = 24 bytes

# Views vs copies
a = np.arange(10)
view = a[2:5]         # basic slice: no copy (fast), shares memory with `a`
copy = a[2:5].copy()  # explicit copy: owns its data, independent of `a`

# Memory layout
a = np.ones((1000, 1000), order='C')  # row-major (default)
b = np.ones((1000, 1000), order='F')  # column-major