Decision tree from scratch

This example illustrates a classification tree for continuous features: every unique value of each feature is tried as a candidate threshold, and splits are chosen greedily by the information gain they yield, measured with Shannon entropy.

# https://medium.com/swlh/decision-tree-from-scratch-a72069240293
import numpy as np
from collections import Counter
def entropy(y):
    # Shannon entropy (in bits) of a vector of non-negative integer class labels
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])
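A quick sanity check of entropy (note that np.bincount requires the labels to be non-negative integers): a pure label vector scores zero bits, while a 50/50 binary mix scores one bit.

print(entropy(np.array([0, 0, 0, 0])))  # 0 bits: all labels identical (may display as -0.0)
print(entropy(np.array([0, 0, 1, 1])))  # 1.0 bit: perfectly mixed binary labels
print(entropy(np.array([0, 0, 0, 1])))  # ~0.811 bits: mostly one class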
class Node:
    # A single tree node: internal nodes hold a split (feature index, threshold) and
    # two children; leaf nodes hold the predicted class in `value`.

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None
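Because a leaf is simply a Node whose value is set, small trees can be assembled by hand. A minimal illustrative stump (the feature index, threshold, and classes below are made up):

# Hand-built stump: split feature 0 at 2.5, predict class 0 on the left and class 1 on the right.
stump = Node(feature=0, threshold=2.5, left=Node(value=0), right=Node(value=1))
print(stump.is_leaf_node())       # False: the root is an internal node
print(stump.left.is_leaf_node())  # True: the left child is a leaf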
class DecisionTree:
    # Binary classification tree: splits are chosen greedily by information gain (entropy).

    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats  # number of features considered per split (all if None)
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (depth >= self.max_depth
                or n_labels == 1
                or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        
        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # degenerate split (every candidate threshold had zero gain): stop with a leaf
            return Node(value=self._most_common_label(y))
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common
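Before turning to real data, a small smoke test on made-up one-dimensional data (the toy arrays below are illustrative and not part of the original notebook) shows the tree recovering a simple threshold rule:

# Toy check: one feature, labels determined by whether the value exceeds 5.
X_toy = np.arange(10, dtype=float).reshape(-1, 1)
y_toy = (X_toy[:, 0] > 5).astype(int)
toy_tree = DecisionTree(max_depth=3)
toy_tree.fit(X_toy, y_toy)
print(toy_tree.predict(X_toy))  # expected: [0 0 0 0 0 0 1 1 1 1]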
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
def accuracy(y_true, y_pred):
    # fraction of predictions that match the true labels
    return np.sum(y_true == y_pred) / len(y_true)
data = datasets.load_breast_cancer()
X = data.data
y = data.target
data_df = pd.DataFrame(data.data, columns=data.feature_names)
data_df.head()
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst radius worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 30 columns
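The target is binary; the class names and their counts can be read from the loaded data bunch:

# Label meanings and class balance of the target vector.
print(data.target_names)
print(np.bincount(y))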

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
clf = DecisionTree(max_depth=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
acc = accuracy(y_train, y_pred)
print(acc)
1.0
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)
acc
0.9736842105263158
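Training accuracy of 1.0 against 97.4% test accuracy suggests the tree fits the training set perfectly yet still generalizes well here. Purely as a reference point (a sketch, not part of the original notebook), scikit-learn's DecisionTreeClassifier with the entropy criterion can be fit on the same split and scored with the same helper:

# Reference run with scikit-learn's implementation on the identical split.
from sklearn.tree import DecisionTreeClassifier
sk_tree = DecisionTreeClassifier(criterion="entropy", max_depth=15, random_state=123)
sk_tree.fit(X_train, y_train)
print(accuracy(y_test, sk_tree.predict(X_test)))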

Hyperparameter Tuning with Optuna

import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    dt_classifier = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
[I 2023-09-26 16:43:41,813] A new study created in memory with name: no-name-c2eb4bb2-dabb-4971-b565-23e962d9bf15
[I 2023-09-26 16:43:41,819] Trial 0 finished with value: 0.9736842105263158 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,823] Trial 1 finished with value: 0.9736842105263158 and parameters: {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,828] Trial 2 finished with value: 0.956140350877193 and parameters: {'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,833] Trial 3 finished with value: 0.9736842105263158 and parameters: {'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,838] Trial 4 finished with value: 0.9736842105263158 and parameters: {'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,842] Trial 5 finished with value: 0.9736842105263158 and parameters: {'max_depth': 19, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,847] Trial 6 finished with value: 0.9736842105263158 and parameters: {'max_depth': 32, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,851] Trial 7 finished with value: 0.9736842105263158 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,856] Trial 8 finished with value: 0.9649122807017544 and parameters: {'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,860] Trial 9 finished with value: 0.9649122807017544 and parameters: {'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
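Once the study finishes, the best hyperparameters and the corresponding accuracy can be read from the study's standard attributes:

# Best trial found by the search.
print(study.best_params)
print(study.best_value)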