Eigenfaces + PCA for Logistic Regression (SGD)

This notebook uses the Olivetti faces (AT&T) dataset to compare a logistic regression classifier trained with SGD on raw pixels against one trained on PCA (eigenface) features.

  • Load faces (64×64 grayscale) and build a binary task: Person A vs. Person B (or A vs. rest).
  • Train logistic regression on raw pixels vs. on PCA features (eigenfaces).
  • Compare accuracy and ROC/AUC.
  • Visualize eigenfaces and cumulative explained variance.

Setup & data loading

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

torch.manual_seed(0)
np.random.seed(0)

# Try to fetch the Olivetti faces dataset (will download if missing)
try:
    faces = fetch_olivetti_faces(shuffle=True, random_state=42, download_if_missing=True)
    X = faces.images  # (n_samples, 64,64)
    y_person = faces.target  # [0..39], 10 images per person
    print("Loaded Olivetti faces:", X.shape, "unique persons:", len(np.unique(y_person)))
except Exception as e:
    print("❌ Could not fetch Olivetti faces.")
    print("Reason:", e)
    print("Please ensure internet access, or pre-download the dataset via scikit-learn:")
    print("from sklearn.datasets import fetch_olivetti_faces; fetch_olivetti_faces(download_if_missing=True)")
    raise
downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /home/vscode/scikit_learn_data
Loaded Olivetti faces: (400, 64, 64) unique persons: 40
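
Before building the task, it helps to eyeball a few samples; a quick sketch using the X and y_person arrays loaded above:

# Show the first 10 faces with their person IDs
fig, axes = plt.subplots(2, 5, figsize=(8, 4))
for ax, img, lab in zip(axes.ravel(), X[:10], y_person[:10]):
    ax.imshow(img, cmap="gray")
    ax.set_title(f"person {lab}")
    ax.axis("off")
plt.tight_layout()
plt.show()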

Build a binary task

# Choose two persons for a balanced, clear binary task. Each person has 10 images.
person_A, person_B = 3, 7  # feel free to change (0..39)
idx = np.where((y_person == person_A) | (y_person == person_B))[0]
X_bin = X[idx]  # (n, 64, 64)
y_bin = (y_person[idx] == person_B).astype(np.int64)  # label: person_B = 1, person_A = 0

n_samples, H, W = X_bin.shape
D = H * W
X_flat = X_bin.reshape(n_samples, -1).astype(np.float32)

# Train/Test split (stratified)
X_tr, X_te, y_tr, y_te = train_test_split(X_flat, y_bin, test_size=0.4, stratify=y_bin, random_state=123)

# Standardize raw features (common best practice for logistic/SGD)
scaler = StandardScaler(with_mean=True, with_std=True)
Xtr_std = scaler.fit_transform(X_tr)
Xte_std = scaler.transform(X_te)

Xtr_std.shape, Xte_std.shape, y_tr.shape, y_te.shape
((12, 4096), (8, 4096), (12,), (8,))
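
The task list in the intro also mentions an "A vs. rest" variant; a minimal sketch of that labeling (heavily imbalanced at 10 positives vs. 390 negatives, so stratification matters even more):

# Alternative task: person_A vs. everyone else (sketch; reuses X, y_person from above)
y_rest = (y_person == person_A).astype(np.int64)
X_rest = X.reshape(len(X), -1).astype(np.float32)
Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(
    X_rest, y_rest, test_size=0.4, stratify=y_rest, random_state=123
)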

PCA (Eigenfaces) on training set

# Fit PCA on TRAIN ONLY (Eigenfaces)
n_components = 12  # capped by the 12 training samples: k <= min(n_samples, n_features)
pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True, random_state=42)
Xtr_pca = pca.fit_transform(Xtr_std)  # (n_train, k)
Xte_pca = pca.transform(Xte_std)  # (n_test, k)

print("PCA components:", pca.components_.shape)
explained = np.cumsum(pca.explained_variance_ratio_)
print("Cumulative explained variance @k=50:", explained[-1])

# Visualize top eigenfaces
n_show = 12
plt.figure(figsize=(8, 6))
for i in range(n_show):
    ax = plt.subplot(3, 4, i + 1)
    ax.imshow(pca.components_[i].reshape(64, 64), cmap="gray")
    ax.set_title(f"PC{i + 1}")
    ax.axis("off")
plt.suptitle("Top Eigenfaces (principal components)")
plt.tight_layout()
plt.show()

# Plot cumulative explained variance
plt.figure(figsize=(6, 4))
plt.plot(np.arange(1, n_components + 1), explained, marker="o")
plt.xlabel("number of PCs")
plt.ylabel("cumulative explained variance")
plt.title("Cumulative explained variance (train)")
plt.show()
PCA components: (12, 4096)
Cumulative explained variance @k=12: 0.9999999
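
To see what the k components actually retain, one can round-trip a training face through the PCA; a small sketch (PCA.inverse_transform un-whitens automatically, and scaler.inverse_transform maps back to raw pixel space):

# Reconstruct the first training face from its k eigenface coefficients
recon_std = pca.inverse_transform(Xtr_pca[:1])               # back to standardized pixels
recon = scaler.inverse_transform(recon_std).reshape(64, 64)  # back to raw pixels
orig = scaler.inverse_transform(Xtr_std[:1]).reshape(64, 64)

fig, axes = plt.subplots(1, 2, figsize=(6, 3))
for ax, img, title in zip(axes, [orig, recon], ["original", f"reconstruction (k={n_components})"]):
    ax.imshow(img, cmap="gray")
    ax.set_title(title)
    ax.axis("off")
plt.show()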

Logistic regression (PyTorch, SGD)
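
The model below is plain logistic regression: a single linear layer produces one logit per sample, and the sigmoid maps it to a probability,

$$p(y{=}1 \mid x) = \sigma(w^\top x + b), \qquad \mathcal{L} = -\frac{1}{n}\sum_{i=1}^{n}\big[y_i \log p_i + (1 - y_i)\log(1 - p_i)\big].$$

BCEWithLogitsLoss fuses the sigmoid with this binary cross-entropy for numerical stability, which is why forward returns raw logits.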

class LogisticReg(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.lin = nn.Linear(in_dim, 1)

    def forward(self, x):
        return self.lin(x).squeeze(-1)  # logits


def train_logreg_sgd(Xtr, ytr, Xte, yte, lr=5e-3, epochs=200, batch=16, l2=1e-4):
    Xtr_t = torch.from_numpy(Xtr.astype(np.float32))
    ytr_t = torch.from_numpy(ytr.astype(np.float32))
    Xte_t = torch.from_numpy(Xte.astype(np.float32))

    model = LogisticReg(in_dim=Xtr_t.shape[1])
    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=l2)
    loss_fn = nn.BCEWithLogitsLoss()

    ds = TensorDataset(Xtr_t, ytr_t)
    dl = DataLoader(ds, batch_size=batch, shuffle=True)

    model.train()
    for ep in range(1, epochs + 1):
        running = 0.0
        for xb, yb in dl:
            logits = model(xb)
            loss = loss_fn(logits, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            running += loss.item() * xb.size(0)
        if ep % 50 == 0:
            print(f"epoch {ep:4d} | loss: {running / len(ds):.4f}")

    model.eval()
    with torch.no_grad():
        prob_te = torch.sigmoid(model(Xte_t)).numpy()  # stable sigmoid; avoids np.exp overflow
    pred_te = (prob_te >= 0.5).astype(int)
    acc_te = (pred_te == yte).mean()
    # Simple AUC: sort by descending score, trace the empirical ROC, integrate
    order = np.argsort(-prob_te)
    y_sorted = yte[order]
    P = y_sorted.sum()
    N = len(y_sorted) - P
    tpr = np.concatenate([[0.0], np.cumsum(y_sorted) / max(P, 1)])      # prepend (0, 0) so the
    fpr = np.concatenate([[0.0], np.cumsum(1 - y_sorted) / max(N, 1)])  # curve starts at the origin
    auc = float(np.trapz(tpr, fpr))
    return model, acc_te, auc, (fpr, tpr)
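
The hand-rolled trapezoid AUC above can be sanity-checked against scikit-learn's roc_auc_score; a self-contained sketch on synthetic scores (with no tied scores the two constructions agree):

from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y_demo = rng.integers(0, 2, size=50)
p_demo = 0.6 * y_demo + rng.normal(0.0, 0.4, size=50)  # higher scores for positives, no ties

order = np.argsort(-p_demo)
ys = y_demo[order]
tpr = np.concatenate([[0.0], np.cumsum(ys) / max(ys.sum(), 1)])
fpr = np.concatenate([[0.0], np.cumsum(1 - ys) / max(len(ys) - ys.sum(), 1)])
print("trapezoid AUC:", float(np.trapz(tpr, fpr)))
print("sklearn AUC:  ", roc_auc_score(y_demo, p_demo))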

Train & compare: raw pixels vs PCA features

# RAW PIXELS
model_raw, acc_raw, auc_raw, roc_raw = train_logreg_sgd(
    Xtr_std, y_tr, Xte_std, y_te, lr=5e-3, epochs=250, batch=16, l2=1e-4
)
print(f"[Raw pixels]  acc={acc_raw:.3f}, AUC={auc_raw:.3f}")

# PCA FEATURES
model_pca, acc_pca, auc_pca, roc_pca = train_logreg_sgd(
    Xtr_pca, y_tr, Xte_pca, y_te, lr=5e-3, epochs=250, batch=16, l2=1e-4
)
print(f"[PCA features] acc={acc_pca:.3f}, AUC={auc_pca:.3f}")

# Plot ROC
fpr_raw, tpr_raw = roc_raw
fpr_pca, tpr_pca = roc_pca
plt.figure(figsize=(6, 5))
plt.plot(fpr_raw, tpr_raw, label=f"Raw (AUC={auc_raw:.3f})")
plt.plot(fpr_pca, tpr_pca, label=f"PCA (AUC={auc_pca:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC — Logistic (SGD) on Eigenfaces vs Raw")
plt.legend()
plt.show()
epoch   50 | loss: 0.0000
epoch  100 | loss: 0.0000
epoch  150 | loss: 0.0000
epoch  200 | loss: 0.0000
epoch  250 | loss: 0.0000
[Raw pixels]  acc=1.000, AUC=1.000
epoch   50 | loss: 0.3613
epoch  100 | loss: 0.2105
epoch  150 | loss: 0.1461
epoch  200 | loss: 0.1113
epoch  250 | loss: 0.0896
[PCA features] acc=0.250, AUC=0.250
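
As a cross-check on the PyTorch training loop, one can fit scikit-learn's LogisticRegression on the same feature matrices (a sketch; the import is not part of the setup cell above):

from sklearn.linear_model import LogisticRegression

for name, (Xtr_f, Xte_f) in {"raw pixels": (Xtr_std, Xte_std), "PCA": (Xtr_pca, Xte_pca)}.items():
    clf = LogisticRegression(max_iter=1000)
    clf.fit(Xtr_f, y_tr)
    print(f"[sklearn {name}] test acc = {clf.score(Xte_f, y_te):.3f}")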

(Optional) Sweep the number of PCs to see performance vs. dimensionality

# PCA can keep at most min(n_samples, n_features) components — here only 12 training images.
max_k = min(Xtr_std.shape)
k_list = [k for k in (2, 4, 6, 8, 10, 12) if k <= max_k]
accs, aucs = [], []

for k in k_list:
    pca_k = PCA(n_components=k, svd_solver="randomized", whiten=True, random_state=42)
    Xtr_k = pca_k.fit_transform(Xtr_std)
    Xte_k = pca_k.transform(Xte_std)
    _, acc_k, auc_k, _ = train_logreg_sgd(Xtr_k, y_tr, Xte_k, y_te, epochs=150, batch=16)
    accs.append(acc_k)
    aucs.append(auc_k)

plt.figure(figsize=(6, 4))
plt.plot(k_list, accs, marker="o")
plt.xlabel("#PCs")
plt.ylabel("Accuracy")
plt.title("Accuracy vs #PCs")
plt.show()

plt.figure(figsize=(6, 4))
plt.plot(k_list, aucs, marker="o")
plt.xlabel("#PCs")
plt.ylabel("AUC")
plt.title("AUC vs #PCs")
plt.show()
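
With only 20 images in the binary task, a single 60/40 split is noisy. A more robust (if slower) alternative is a cross-validated sweep with a scikit-learn pipeline, which refits the scaler and PCA inside each fold and so avoids leakage; a sketch assuming X_flat and y_bin from the task-building cell:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline

for k in (2, 4, 6, 8):
    pipe = make_pipeline(
        StandardScaler(),
        PCA(n_components=k, whiten=True, random_state=42),
        LogisticRegression(max_iter=1000),
    )
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(pipe, X_flat, y_bin, cv=cv)
    print(f"k={k}: CV accuracy = {scores.mean():.3f} ± {scores.std():.3f}")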