Decision tree from scratch

This example illustrates a classification tree for continuous features: every unique value of each feature is tried as a candidate threshold, and splits are chosen greedily by the information gain they yield, measured with Shannon entropy.

# https://medium.com/swlh/decision-tree-from-scratch-a72069240293
import numpy as np
from collections import Counter
def entropy(y):
    # Shannon entropy (in bits) of a vector of non-negative integer class labels
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])
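A quick sanity check of entropy (note that np.bincount requires the labels to be non-negative integers): a pure label vector scores zero bits, while a 50/50 binary mix scores one bit.

print(entropy(np.array([0, 0, 0, 0])))  # 0 bits: all labels identical (may display as -0.0)
print(entropy(np.array([0, 0, 1, 1])))  # 1.0 bit: perfectly mixed binary labels
print(entropy(np.array([0, 0, 0, 1])))  # ~0.811 bits: mostly one class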
class Node:
    # A single tree node: internal nodes hold a split (feature index, threshold) and
    # two children; leaf nodes hold the predicted class in `value`.

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None
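Because a leaf is simply a Node whose value is set, small trees can be assembled by hand. A minimal illustrative stump (the feature index, threshold, and classes below are made up):

# Hand-built stump: split feature 0 at 2.5, predict class 0 on the left and class 1 on the right.
stump = Node(feature=0, threshold=2.5, left=Node(value=0), right=Node(value=1))
print(stump.is_leaf_node())       # False: the root is an internal node
print(stump.left.is_leaf_node())  # True: the left child is a leaf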
class DecisionTree:
    # Binary classification tree: splits are chosen greedily by information gain (entropy).

    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats  # number of features considered per split (all if None)
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (depth >= self.max_depth
                or n_labels == 1
                or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        
        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # degenerate split (every candidate threshold had zero gain): stop with a leaf
            return Node(value=self._most_common_label(y))
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common
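Before turning to real data, a small smoke test on made-up one-dimensional data (the toy arrays below are illustrative and not part of the original notebook) shows the tree recovering a simple threshold rule:

# Toy check: one feature, labels determined by whether the value exceeds 5.
X_toy = np.arange(10, dtype=float).reshape(-1, 1)
y_toy = (X_toy[:, 0] > 5).astype(int)
toy_tree = DecisionTree(max_depth=3)
toy_tree.fit(X_toy, y_toy)
print(toy_tree.predict(X_toy))  # expected: [0 0 0 0 0 0 1 1 1 1]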
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
def accuracy(y_true, y_pred):
    # fraction of predictions that match the true labels
    return np.sum(y_true == y_pred) / len(y_true)
data = datasets.load_breast_cancer()
X = data.data
y = data.target
data_df = pd.DataFrame(data.data, columns=data.feature_names)
data_df.head()
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst radius worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 30 columns
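The target is binary; the class names and their counts can be read from the loaded data bunch:

# Label meanings and class balance of the target vector.
print(data.target_names)
print(np.bincount(y))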

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
clf = DecisionTree(max_depth=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
acc = accuracy(y_train, y_pred)
print(acc)
1.0
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)
acc
0.9736842105263158
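Training accuracy of 1.0 against 97.4% test accuracy suggests the tree fits the training set perfectly yet still generalizes well here. Purely as a reference point (a sketch, not part of the original notebook), scikit-learn's DecisionTreeClassifier with the entropy criterion can be fit on the same split and scored with the same helper:

# Reference run with scikit-learn's implementation on the identical split.
from sklearn.tree import DecisionTreeClassifier
sk_tree = DecisionTreeClassifier(criterion="entropy", max_depth=15, random_state=123)
sk_tree.fit(X_train, y_train)
print(accuracy(y_test, sk_tree.predict(X_test)))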

Hyperparameter Tuning with Optuna

import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)

    dt_classifier = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )
    dt_classifier.fit(X_train, y_train)
    y_pred = dt_classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
[I 2023-09-26 16:43:41,813] A new study created in memory with name: no-name-c2eb4bb2-dabb-4971-b565-23e962d9bf15
[I 2023-09-26 16:43:41,819] Trial 0 finished with value: 0.9736842105263158 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,823] Trial 1 finished with value: 0.9736842105263158 and parameters: {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,828] Trial 2 finished with value: 0.956140350877193 and parameters: {'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,833] Trial 3 finished with value: 0.9736842105263158 and parameters: {'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,838] Trial 4 finished with value: 0.9736842105263158 and parameters: {'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,842] Trial 5 finished with value: 0.9736842105263158 and parameters: {'max_depth': 19, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,847] Trial 6 finished with value: 0.9736842105263158 and parameters: {'max_depth': 32, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,851] Trial 7 finished with value: 0.9736842105263158 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,856] Trial 8 finished with value: 0.9649122807017544 and parameters: {'max_depth': 12, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
[I 2023-09-26 16:43:41,860] Trial 9 finished with value: 0.9649122807017544 and parameters: {'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9736842105263158.
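Once the study finishes, the best hyperparameters and the corresponding accuracy can be read from the study's standard attributes:

# Best trial found by the search.
print(study.best_params)
print(study.best_value)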