SGD Regression with Gaussian NLL: Mean & Variance (Aleatoric) Estimation

This notebook replaces MSE with a Gaussian negative log-likelihood (NLL) so that SGD jointly learns the predictive mean μ(x) and variance σ²(x). It includes both a heteroscedastic model (input-dependent variance) and an optional homoscedastic variant (a single shared variance), along with a visualization of 95% prediction intervals.

Gaussian NLL utilities

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)


def gaussian_nll(y, mu, log_var):
    """
    Per-sample Gaussian negative log-likelihood (up to an additive constant):
        0.5 * [ log σ^2(x) + (y - μ(x))^2 / σ^2(x) ]
    y, mu, log_var: (B,)
    """
    return 0.5 * (log_var + (y - mu) ** 2 * torch.exp(-log_var))
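A quick, optional sanity check (not from the original notebook): on recent PyTorch versions, the per-sample loss above should agree with the built-in nn.GaussianNLLLoss when the variance is passed as exp(log_var), since both drop the constant 0.5·log(2π) term by default.

# Optional sanity check (illustrative): compare against PyTorch's built-in loss.
# nn.GaussianNLLLoss expects the variance itself (not its log) and clamps it to eps.
_y, _mu, _log_var = torch.randn(8), torch.randn(8), torch.randn(8)
_ours = gaussian_nll(_y, _mu, _log_var)
_builtin = nn.GaussianNLLLoss(reduction="none")(_mu, _y, _log_var.exp())
assert torch.allclose(_ours, _builtin, atol=1e-5)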
Data

Uses existing x_train, y_train if they are already defined in the kernel; otherwise generates a sinusoidal dataset with heteroscedastic noise for a meaningful demo.
import numpy as np


def _to_tensor(x):
    x = torch.as_tensor(x, dtype=torch.float32)
    return x


globals_exist = all(name in globals() for name in ["x_train", "y_train"])
if not globals_exist:
    n_train = 256
    n_test = 400
    x = np.random.uniform(-3.0, 3.0, size=(n_train, 1)).astype(np.float32)
    # heteroscedastic noise: sigma grows with |x|
    sigma = 0.15 + 0.25 * (np.abs(x).squeeze())
    y = np.sin(1.3 * x).squeeze() + np.random.normal(0.0, sigma)
    x_train = _to_tensor(x)
    y_train = _to_tensor(y)

    # grid for visualization
    xs = np.linspace(-3.5, 3.5, n_test).reshape(-1, 1).astype(np.float32)
    x_test = _to_tensor(xs)
else:
    if "x_test" not in globals():
        x_min = float(torch.min(x_train))
        x_max = float(torch.max(x_train))
        xs = (
            np.linspace(x_min - 0.25 * (x_max - x_min), x_max + 0.25 * (x_max - x_min), 400)
            .reshape(-1, 1)
            .astype(np.float32)
        )
        x_test = _to_tensor(xs)

x_train.shape, y_train.shape, x_test.shape
(torch.Size([256, 1]), torch.Size([256]), torch.Size([400, 1]))
Heteroscedastic Regressor: predicts μ(x) and log σ²(x)
class MeanVarRegressor(nn.Module):
    def __init__(self, in_dim=1, hidden=128):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        )
        self.mu_head = nn.Linear(hidden, 1)
        self.logvar_head = nn.Linear(hidden, 1)
        # Initialize logvar head bias to a small negative value
        nn.init.constant_(self.logvar_head.bias, -1.0)

    def forward(self, x):
        h = self.shared(x)
        mu = self.mu_head(h).squeeze(-1)  # (B,)
        # Ensure positive variance via softplus, then take log
        log_var = torch.log(F.softplus(self.logvar_head(h).squeeze(-1)) + 1e-6)
        return mu, log_var


model = MeanVarRegressor(in_dim=x_train.shape[1], hidden=128)
model
MeanVarRegressor(
(shared): Sequential(
(0): Linear(in_features=1, out_features=128, bias=True)
(1): ReLU()
(2): Linear(in_features=128, out_features=128, bias=True)
(3): ReLU()
)
(mu_head): Linear(in_features=128, out_features=1, bias=True)
(logvar_head): Linear(in_features=128, out_features=1, bias=True)
)
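A quick shape check (illustrative, not part of the original notebook): a forward pass returns one mean and one log-variance per sample, and the softplus construction keeps the implied variance strictly positive.

# Illustrative shape check on a few test points.
with torch.no_grad():
    _mu, _lv = model(x_test[:5])
print(_mu.shape, _lv.shape)  # torch.Size([5]) torch.Size([5])
print(_lv.exp())             # variances are positive by construction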
Train with SGD on Gaussian NLL
from torch.utils.data import TensorDataset, DataLoader

batch_size = 64
epochs = 800
lr = 5e-3
weight_decay = 1e-4  # curbs variance inflation

ds = TensorDataset(x_train, y_train)
dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)

model.train()
for epoch in range(1, epochs + 1):
    running = 0.0
    for xb, yb in dl:
        mu, log_var = model(xb)
        loss = gaussian_nll(yb, mu, log_var).mean()

        opt.zero_grad()
        loss.backward()
        opt.step()
        running += loss.item() * xb.size(0)
    if epoch % 100 == 0:
        print(f"epoch {epoch:4d} | nll: {running / len(ds):.4f}")
epoch 100 | nll: -0.2359
epoch 200 | nll: -0.2312
epoch 300 | nll: -0.2350
epoch 400 | nll: -0.2295
epoch 500 | nll: -0.2332
epoch 600 | nll: -0.2457
epoch 700 | nll: -0.2408
epoch 800 | nll: -0.2491
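An optional diagnostic (not in the original notebook, and only meaningful if the synthetic-data branch above was used): the learned σ(x) can be compared against the true noise level 0.15 + 0.25·|x| that generated the targets.

# Optional diagnostic: does the learned sigma(x) track the generating noise level?
# Assumes x_train came from the synthetic branch above (sigma = 0.15 + 0.25 * |x|).
model.eval()
with torch.no_grad():
    _mu_tr, _lv_tr = model(x_train)
    _sig_learned = torch.exp(0.5 * _lv_tr)
_sig_true = 0.15 + 0.25 * x_train.abs().squeeze(-1)
print("mean |learned σ - true σ|:", (_sig_learned - _sig_true).abs().mean().item())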
Predictive mean and 95% intervals
model.eval()
with torch.no_grad():
    mu_pred, log_var_pred = model(x_test)
    sigma_pred = torch.exp(0.5 * log_var_pred)
    lo = mu_pred - 1.96 * sigma_pred
    hi = mu_pred + 1.96 * sigma_pred

mu_pred.shape, sigma_pred.shape
(torch.Size([400]), torch.Size([400]))
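A rough calibration check (illustrative, not from the original notebook): with a well-fit Gaussian model, about 95% of the training targets should fall inside the ±1.96σ band.

# Illustrative calibration check: empirical coverage of the 95% interval on train data.
with torch.no_grad():
    mu_tr, lv_tr = model(x_train)
    sig_tr = torch.exp(0.5 * lv_tr)
    inside = ((y_train >= mu_tr - 1.96 * sig_tr) & (y_train <= mu_tr + 1.96 * sig_tr)).float().mean()
print(f"empirical train coverage: {inside.item():.3f} (nominal 0.95)")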
Plot
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
plt.scatter(x_train.numpy().squeeze(), y_train.numpy().squeeze(), s=12, alpha=0.6, label="train")
plt.plot(x_test.numpy().squeeze(), mu_pred.numpy().squeeze(), linewidth=2, label="μ(x)")
plt.fill_between(
    x_test.numpy().squeeze(), lo.numpy().squeeze(), hi.numpy().squeeze(), alpha=0.25, label="95% interval"
)
plt.legend()
plt.title("Heteroscedastic Gaussian NLL with SGD")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Optional: Homoscedastic Variant (single learned σ²)
class MeanOnlyRegressor(nn.Module):
    def __init__(self, in_dim=1, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden), nn.ReLU(), nn.Linear(hidden, 1)
        )
        self.log_var = nn.Parameter(torch.tensor(0.0))

    def forward(self, x):
        mu = self.net(x).squeeze(-1)
        log_var = self.log_var.expand_as(mu)
        return mu, log_var
# Example quick train:
model_h = MeanOnlyRegressor(in_dim=x_train.shape[1], hidden=128)
opt_h = torch.optim.SGD(model_h.parameters(), lr=5e-3, momentum=0.9, weight_decay=1e-4)

for epoch in range(300):
    mu, log_var = model_h(x_train)
    loss = gaussian_nll(y_train, mu, log_var).mean()

    opt_h.zero_grad()
    loss.backward()
    opt_h.step()

with torch.no_grad():
    mu_h, logv_h = model_h(x_test)
    sig_h = torch.exp(0.5 * logv_h)
    lo_h, hi_h = mu_h - 1.96 * sig_h, mu_h + 1.96 * sig_h
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
plt.scatter(x_train.numpy().squeeze(), y_train.numpy().squeeze(), s=12, alpha=0.6, label="train")
plt.plot(x_test.numpy().squeeze(), mu_h.numpy().squeeze(), linewidth=2, label="μ(x)")
plt.fill_between(
    x_test.numpy().squeeze(), lo_h.numpy().squeeze(), hi_h.numpy().squeeze(), alpha=0.25, label="95% interval"
)
plt.legend()
plt.title("Homoscedastic Gaussian NLL with SGD")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
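A final, optional comparison (not from the original notebook): because the true noise level varies with x, one would expect the heteroscedastic model to reach a lower (better) average NLL than the homoscedastic variant on this data.

# Optional comparison of average train NLL for the two variants (lower is better).
model.eval()
model_h.eval()
with torch.no_grad():
    nll_het = gaussian_nll(y_train, *model(x_train)).mean().item()
    nll_hom = gaussian_nll(y_train, *model_h(x_train)).mean().item()
print(f"train NLL | heteroscedastic: {nll_het:.4f} | homoscedastic: {nll_hom:.4f}")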