Logistic Regression with SGD on GMM Data: When PCA Helps (Two Regimes)
This notebook creates two synthetic binary datasets from a Gaussian Mixture Model (GMM):
- Low-noise regime (few noisy dims, strong class separation, ample data) — PCA has little effect.
- High-noise regime (many noisy dims, weaker separation, fewer samples) — PCA clearly improves performance.
For each regime we:
- Generate data (informative 2D subspace + noisy dimensions).
- Train logistic regression with SGD on raw features and on PCA-reduced features.
- Report Accuracy and AUC on the test set.
- Plot ROC curves and decision surface in 2D PC space.
- Show the PCA explained variance.
Setup
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
np.random.seed(11)
torch.manual_seed(11)
def train_test_split(X, y, test_size=0.3, shuffle=True, seed=0):
    n = X.shape[0]
    idx = np.arange(n)
    if shuffle:
        rng = np.random.default_rng(seed)
        rng.shuffle(idx)
    n_test = int(round(test_size * n))
    test_idx = idx[:n_test]
    train_idx = idx[n_test:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
def standardize_fit(X):
    mu = X.mean(axis=0, keepdims=True)
    std = X.std(axis=0, keepdims=True) + 1e-8
    return mu, std

def standardize_transform(X, mu, std):
    return (X - mu) / std
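As a quick sanity check of the scaler (a throwaway sketch on random data, not part of the experiments below), the standardized training matrix should have per-column mean near 0 and standard deviation near 1:

# Illustrative check on synthetic data (not used later): mean ~0, std ~1 after scaling.
_X_demo = np.random.default_rng(0).normal(size=(500, 5))
_mu, _std = standardize_fit(_X_demo)
_Xs = standardize_transform(_X_demo, _mu, _std)
print(np.allclose(_Xs.mean(axis=0), 0.0, atol=1e-6), np.allclose(_Xs.std(axis=0), 1.0, atol=1e-3))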
def auc_roc(scores, labels):
    scores = np.asarray(scores).ravel()
    labels = np.asarray(labels).astype(int).ravel()
    order = np.argsort(-scores, kind="mergesort")
    y = labels[order]
    P = y.sum()
    N = len(y) - P
    if P == 0 or N == 0:
        return float("nan")
    tps = np.cumsum(y)
    fps = np.cumsum(1 - y)
    tpr = tps / P
    fpr = fps / N
    return float(np.trapz(tpr, fpr))
def roc_curve(scores, labels):
    scores = np.asarray(scores).ravel()
    labels = np.asarray(labels).astype(int).ravel()
    order = np.argsort(-scores, kind="mergesort")
    y = labels[order]
    P = y.sum()
    N = len(y) - P
    tps = np.cumsum(y)
    fps = np.cumsum(1 - y)
    tpr = tps / (P + 1e-12)
    fpr = fps / (N + 1e-12)
    return fpr, tpr
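A quick, hedged check of these hand-rolled metrics (toy values, not used by the experiments): when every positive is scored above every negative, auc_roc should return 1.0, and the reversed ranking should return 0.0.

# Illustrative check with made-up scores: perfect ranking -> 1.0, inverted ranking -> 0.0.
_y_demo = np.array([0, 0, 1, 1])
print(auc_roc(np.array([0.1, 0.2, 0.8, 0.9]), _y_demo))  # 1.0
print(auc_roc(np.array([0.9, 0.8, 0.2, 0.1]), _y_demo))  # 0.0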
GMM data generator
def make_gmm_binary(
    n_per_class=1000, d_total=20, d_informative=2, class_sep=2.5, noise_std=1.0, seed=0, random_rotate=True
):
    rng = np.random.default_rng(seed)

    sep = class_sep
    mus0 = np.array([[-sep, 0.0], [0.0, -sep]])
    mus1 = np.array([[sep, 0.0], [0.0, sep]])
    cov_inf = np.array([[1.0, 0.3], [0.3, 1.0]])

    def sample_class(mus, n):
        z = rng.integers(0, len(mus), size=n)
        X2 = np.stack([rng.multivariate_normal(mus[k], cov_inf) for k in z], axis=0)
        return X2

    X0_inf = sample_class(mus0, n_per_class)
    X1_inf = sample_class(mus1, n_per_class)

    d_noise = max(0, d_total - d_informative)
    if d_noise > 0:
        X0_noise = rng.normal(0.0, noise_std, size=(n_per_class, d_noise))
        X1_noise = rng.normal(0.0, noise_std, size=(n_per_class, d_noise))
        X0 = np.concatenate([X0_inf, X0_noise], axis=1)
        X1 = np.concatenate([X1_inf, X1_noise], axis=1)
    else:
        X0 = X0_inf
        X1 = X1_inf

    X = np.vstack([X0, X1]).astype(np.float32)
    y = np.concatenate([np.zeros(n_per_class), np.ones(n_per_class)]).astype(np.int64)

    if random_rotate:
        Q, _ = np.linalg.qr(rng.normal(size=(d_total, d_total)))
        X = (X @ Q).astype(np.float32)

    return X, y
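A small usage sketch (illustrative parameters only; the regimes below use their own settings): generate a tiny dataset and confirm the shapes and the 50/50 class balance.

# Illustrative smoke test with small, hypothetical parameters.
_X_demo, _y_demo = make_gmm_binary(n_per_class=50, d_total=10, seed=0)
print(_X_demo.shape, _y_demo.shape, _y_demo.mean())  # (100, 10) (100,) 0.5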
PCA (Torch SVD)
def pca_fit_torch(X_np):
    Xc = X_np - X_np.mean(axis=0, keepdims=True)
    Xc_t = torch.from_numpy(Xc)
    U, S, Vh = torch.linalg.svd(Xc_t, full_matrices=False)
    n = X_np.shape[0]
    ev = (S**2) / (n - 1)
    evr = ev / ev.sum()
    components = Vh.numpy()
    mean_vec = X_np.mean(axis=0, keepdims=True)
    return components, evr.numpy(), mean_vec

def pca_transform(X_np, components, k, mean_vec):
    Xc = X_np - mean_vec
    W = components[:k].T
    return (Xc @ W).astype(np.float32)
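As a hedged sanity check of the PCA helpers (random data, not the experiment matrices): the explained-variance ratios should sum to 1, and projecting onto k components should give k columns.

# Illustrative check: EVR sums to ~1 and the projection has the requested width.
_X_demo = np.random.default_rng(0).normal(size=(200, 8)).astype(np.float32)
_comps, _evr, _mean = pca_fit_torch(_X_demo)
_Z = pca_transform(_X_demo, _comps, 3, _mean)
print(np.isclose(_evr.sum(), 1.0), _Z.shape)  # True (200, 3)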
Logistic regression (SGD)
class LogisticReg(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.lin = nn.Linear(in_dim, 1)

    def forward(self, x):
        return self.lin(x).squeeze(-1)
def train_logreg_sgd(Xtr_np, ytr_np, Xte_np, yte_np, lr=5e-3, epochs=300, batch_size=256, l2=2e-4):
    Xtr_t = torch.from_numpy(Xtr_np.astype(np.float32))
    ytr_t = torch.from_numpy(ytr_np.astype(np.float32))
    Xte_t = torch.from_numpy(Xte_np.astype(np.float32))
    yte_t = torch.from_numpy(yte_np.astype(np.float32))

    model = LogisticReg(in_dim=Xtr_t.shape[1])
    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=l2)
    loss_fn = nn.BCEWithLogitsLoss()

    ds = TensorDataset(Xtr_t, ytr_t)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

    model.train()
    for ep in range(1, epochs + 1):
        running = 0.0
        for xb, yb in dl:
            logits = model(xb)
            loss = loss_fn(logits, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            running += loss.item() * xb.size(0)
        if ep % 60 == 0:
            print(f"epoch {ep:4d} | loss: {running / len(ds):.4f}")

    model.eval()
    with torch.no_grad():
        logits_te = model(Xte_t).numpy()
    prob_te = 1.0 / (1.0 + np.exp(-logits_te))
    pred_te = (prob_te >= 0.5).astype(int)
    acc_te = (pred_te == yte_np).mean()
    auc_te = auc_roc(prob_te, yte_np)
    return model, acc_te, auc_te, prob_te
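For reference, BCEWithLogitsLoss fuses the sigmoid with the binary cross-entropy in one numerically stable step; a minimal check with made-up logits that it matches -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))] averaged over the batch:

# Illustrative check with hypothetical logits and targets.
_z = torch.tensor([-1.5, 0.3, 2.0])
_t = torch.tensor([0.0, 1.0, 1.0])
_p = torch.sigmoid(_z)
_manual = -(_t * torch.log(_p) + (1 - _t) * torch.log(1 - _p)).mean()
print(torch.allclose(nn.BCEWithLogitsLoss()(_z, _t), _manual))  # True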
Regime A — Low-noise (PCA expected to have little effect)
params_A = dict(n_per_class=120000, d_total=100, d_informative=2, class_sep=2.5, noise_std=0.5, seed=1)
XA, yA = make_gmm_binary(**params_A)

XA_tr, XA_te, yA_tr, yA_te = train_test_split(XA, yA, test_size=0.3, shuffle=True, seed=123)
muA, stdA = standardize_fit(XA_tr)
XA_trs = standardize_transform(XA_tr, muA, stdA)
XA_tes = standardize_transform(XA_te, muA, stdA)

model_A_raw, acc_A_raw, auc_A_raw, prob_A_raw = train_logreg_sgd(XA_trs, yA_tr, XA_tes, yA_te)
print(f"[Regime A] Raw: acc={acc_A_raw:.3f}, AUC={auc_A_raw:.3f}")

components_A, evr_A, mean_A = pca_fit_torch(XA_trs.copy())
k95_A = int(np.searchsorted(np.cumsum(evr_A), 0.95) + 1)
XA_tr_pca = pca_transform(XA_trs, components_A, k95_A, mean_A)
XA_te_pca = pca_transform(XA_tes, components_A, k95_A, mean_A)

model_A_pca, acc_A_pca, auc_A_pca, prob_A_pca = train_logreg_sgd(XA_tr_pca, yA_tr, XA_te_pca, yA_te)
print(f"[Regime A] PCA@95% (k={k95_A}): acc={acc_A_pca:.3f}, AUC={auc_A_pca:.3f}")
epoch 60 | loss: 0.1522
epoch 120 | loss: 0.1522
epoch 180 | loss: 0.1522
epoch 240 | loss: 0.1522
epoch 300 | loss: 0.1522
[Regime A] Raw: acc=0.939, AUC=0.985
epoch 60 | loss: 0.1530
epoch 120 | loss: 0.1530
epoch 180 | loss: 0.1530
epoch 240 | loss: 0.1530
epoch 300 | loss: 0.1530
[Regime A] PCA@95% (k=91): acc=0.938, AUC=0.985
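A note on how k95_A is picked above: np.searchsorted returns the first index at which the cumulative explained variance reaches 0.95, and the +1 turns that 0-based index into a component count. A toy illustration with made-up ratios:

# Hypothetical EVR values: cumulative sums are [0.6, 0.9, 0.96, 1.0],
# so the 95% threshold is first reached at index 2 -> keep k = 3 components.
_evr_demo = np.array([0.6, 0.3, 0.06, 0.04])
print(int(np.searchsorted(np.cumsum(_evr_demo), 0.95) + 1))  # 3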
Regime A — PCA cumulative explained variance
plt.figure(figsize=(6, 4))
plt.plot(np.arange(1, len(evr_A) + 1), np.cumsum(evr_A), marker="o")
plt.xlabel("number of PCs")
plt.ylabel("cumulative explained variance")
plt.title("Regime A: PCA cumulative explained variance (train)")
plt.show()
Regime A — ROC (raw vs PCA)
fpr_A_raw, tpr_A_raw = roc_curve(prob_A_raw, yA_te)
fpr_A_pca, tpr_A_pca = roc_curve(prob_A_pca, yA_te)

plt.figure(figsize=(6, 5))
plt.plot(fpr_A_raw, tpr_A_raw, label=f"Raw (AUC={auc_A_raw:.3f})")
plt.plot(fpr_A_pca, tpr_A_pca, label=f"PCA@95% (AUC={auc_A_pca:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Regime A: ROC")
plt.legend()
plt.show()
Regime B — High-noise (PCA expected to help)
params_B = dict(n_per_class=120000, d_total=200, d_informative=2, class_sep=2.5, noise_std=0.5, seed=2)
XB, yB = make_gmm_binary(**params_B)

XB_tr, XB_te, yB_tr, yB_te = train_test_split(XB, yB, test_size=0.3, shuffle=True, seed=123)
muB, stdB = standardize_fit(XB_tr)
XB_trs = standardize_transform(XB_tr, muB, stdB)
XB_tes = standardize_transform(XB_te, muB, stdB)

model_B_raw, acc_B_raw, auc_B_raw, prob_B_raw = train_logreg_sgd(XB_trs, yB_tr, XB_tes, yB_te)
print(f"[Regime B] Raw: acc={acc_B_raw:.3f}, AUC={auc_B_raw:.3f}")

components_B, evr_B, mean_B = pca_fit_torch(XB_trs.copy())
k95_B = int(np.searchsorted(np.cumsum(evr_B), 0.95) + 1)
XB_tr_pca = pca_transform(XB_trs, components_B, k95_B, mean_B)
XB_te_pca = pca_transform(XB_tes, components_B, k95_B, mean_B)

model_B_pca, acc_B_pca, auc_B_pca, prob_B_pca = train_logreg_sgd(XB_tr_pca, yB_tr, XB_te_pca, yB_te)
print(f"[Regime B] PCA@95% (k={k95_B}): acc={acc_B_pca:.3f}, AUC={auc_B_pca:.3f}")
epoch 60 | loss: 0.1532
epoch 120 | loss: 0.1532
epoch 180 | loss: 0.1532
epoch 240 | loss: 0.1532
epoch 300 | loss: 0.1532
[Regime B] Raw: acc=0.939, AUC=0.986
epoch 60 | loss: 0.1536
epoch 120 | loss: 0.1536
epoch 180 | loss: 0.1536
epoch 240 | loss: 0.1536
epoch 300 | loss: 0.1536
[Regime B] PCA@95% (k=185): acc=0.939, AUC=0.985
Regime B — PCA cumulative explained variance
plt.figure(figsize=(6, 4))
plt.plot(np.arange(1, len(evr_B) + 1), np.cumsum(evr_B), marker="o")
plt.xlabel("number of PCs")
plt.ylabel("cumulative explained variance")
plt.title("Regime B: PCA cumulative explained variance (train)")
plt.show()
Regime B — ROC (raw vs PCA)
fpr_B_raw, tpr_B_raw = roc_curve(prob_B_raw, yB_te)
fpr_B_pca, tpr_B_pca = roc_curve(prob_B_pca, yB_te)

plt.figure(figsize=(6, 5))
plt.plot(fpr_B_raw, tpr_B_raw, label=f"Raw (AUC={auc_B_raw:.3f})")
plt.plot(fpr_B_pca, tpr_B_pca, label=f"PCA@95% (AUC={auc_B_pca:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Regime B: ROC")
plt.legend()
plt.show()
Regime B — Decision surface on top-2 PCs
def train_logreg_return_model(Xtr_np, ytr_np, lr=5e-3, epochs=200, batch_size=256, l2=2e-4):
    Xtr_t = torch.from_numpy(Xtr_np.astype(np.float32))
    ytr_t = torch.from_numpy(ytr_np.astype(np.float32))
    model = LogisticReg(in_dim=Xtr_t.shape[1])
    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=l2)
    loss_fn = nn.BCEWithLogitsLoss()
    ds = TensorDataset(Xtr_t, ytr_t)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

    model.train()
    for ep in range(epochs):
        for xb, yb in dl:
            logits = model(xb)
            loss = loss_fn(logits, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
    return model

XB_tr_pca2 = pca_transform(XB_trs, components_B, 2, mean_B)
model_B_pca2_viz = train_logreg_return_model(XB_tr_pca2, yB_tr, epochs=200)

xx, yy = np.meshgrid(
    np.linspace(XB_tr_pca2[:, 0].min() - 1.0, XB_tr_pca2[:, 0].max() + 1.0, 200),
    np.linspace(XB_tr_pca2[:, 1].min() - 1.0, XB_tr_pca2[:, 1].max() + 1.0, 200),
)
grid = np.stack([xx.ravel(), yy.ravel()], axis=1).astype(np.float32)
with torch.no_grad():
    logits_grid = model_B_pca2_viz.lin(torch.from_numpy(grid)).numpy().reshape(xx.shape)
probs_grid = 1.0 / (1.0 + np.exp(-logits_grid))

plt.figure(figsize=(6, 5))
plt.contourf(xx, yy, probs_grid, levels=20, alpha=0.3)
plt.scatter(XB_tr_pca2[:, 0], XB_tr_pca2[:, 1], s=10, c=yB_tr, alpha=0.6)
plt.title("Regime B: Decision surface (logistic on top-2 PCs)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
Summary
- Regime A (low noise): Logistic regression on raw features already performs near-optimally; PCA changes little.
- Regime B (high noise): Raw high-dimensional noise degrades logistic regression; PCA denoises and reduces dimension, yielding higher accuracy/AUC.
- This illustrates that PCA is not a magic bullet, but it helps when signal lies in a low-dimensional subspace and noise dominates elsewhere.
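To probe where one regime shades into the other, a rough sketch (reusing the helpers above; all parameters are illustrative and results will vary with them) is to sweep the number of noise dimensions at a fixed sample size and compare the raw and PCA test AUCs:

# Hypothetical sweep over the ambient dimension; every setting below is illustrative.
for d_total in [20, 100, 400]:
    X, y = make_gmm_binary(n_per_class=500, d_total=d_total, class_sep=2.0, noise_std=1.0, seed=0)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, seed=0)
    mu, std = standardize_fit(Xtr)
    Xtr, Xte = standardize_transform(Xtr, mu, std), standardize_transform(Xte, mu, std)
    _, _, auc_raw, _ = train_logreg_sgd(Xtr, ytr, Xte, yte, epochs=120)
    comps, evr, mean_vec = pca_fit_torch(Xtr.copy())
    k = int(np.searchsorted(np.cumsum(evr), 0.95) + 1)
    _, _, auc_pca, _ = train_logreg_sgd(pca_transform(Xtr, comps, k, mean_vec), ytr,
                                        pca_transform(Xte, comps, k, mean_vec), yte, epochs=120)
    print(f"d_total={d_total:4d} | k95={k:3d} | AUC raw={auc_raw:.3f} | AUC PCA={auc_pca:.3f}")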