» Code examples / Structured Data / Classification with Neural Decision Forests

Classification with Neural Decision Forests

Author: Khalid Salama
Date created: 2021/01/15
Last modified: 2021/01/15
Description: How to train differentiable decision trees for end-to-end learning in deep neural networks.

View in Colab GitHub source


Introduction

This example provides an implementation of the Deep Neural Decision Forest model introduced by P. Kontschieder et al. for structured data classification. It demonstrates how to build a stochastic and differentiable decision tree model, train it end-to-end, and unify decision trees with deep representation learning.


The dataset

This example uses the United States Census Income Dataset provided by the UC Irvine Machine Learning Repository. The task is binary classification to predict whether a person is likely to be making over USD 50,000 a year.

The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features and 9 categorical features.


Setup

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import math

Prepare the data

CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)
test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")
Train dataset shape: (32561, 15)
Test dataset shape: (16282, 15)

Remove the first record (because it is not a valid data example) and a trailing 'dot' in the class labels.

test_data = test_data[1:]
test_data.income_bracket = test_data.income_bracket.apply(
    lambda value: value.replace(".", "")
)

We store the training and test data splits locally as CSV files.

train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

Define dataset metadata

Here, we define the metadata of the dataset that will be useful for reading and parsing and encoding input features.

# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sorted(list(train_data["workclass"].unique())),
    "education": sorted(list(train_data["education"].unique())),
    "marital_status": sorted(list(train_data["marital_status"].unique())),
    "occupation": sorted(list(train_data["occupation"].unique())),
    "relationship": sorted(list(train_data["relationship"].unique())),
    "race": sorted(list(train_data["race"].unique())),
    "gender": sorted(list(train_data["gender"].unique())),
    "native_country": sorted(list(train_data["native_country"].unique())),
}
# A list of the columns to ignore from the dataset.
IGNORE_COLUMN_NAMES = ["fnlwgt"]
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
    for feature_name in CSV_HEADER
]
# The name of the target feature.
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target features.
TARGET_LABELS = [" <=50K", " >50K"]

Create tf.data.Dataset objects for training and validation

We create an input function to read and parse the file, and convert features and labels into a tf.data.Dataset for training and validation. We also preprocess the input by mapping the target label to an index.

from tensorflow.keras.layers.experimental.preprocessing import StringLookup

target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
    return dataset.cache()

Create model inputs

def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

Encode input features

from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup


def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert a string values to an integer indices.
            # Since we are not using a mask token, nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and num_oov_indices to 0.
            index = StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
            if use_embedding:
                embedding_dims = int(math.sqrt(len(vocabulary)))
                # Create an embedding layer with the specified dimensions.
                embedding_ecoder = layers.Embedding(
                    input_dim=len(vocabulary), output_dim=embedding_dims
                )
                # Convert the index values to embedding representations.
                encoded_feature = embedding_ecoder(value_index)
            else:
                # Create a one-hot encoder.
                onehot_encoder = CategoryEncoding(output_mode="binary")
                onehot_encoder.adapt(index(vocabulary))
                # Convert the index values to a one-hot representation.
                encoded_feature = onehot_encoder(value_index)
        else:
            # Use the numerical features as-is.
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = tf.expand_dims(encoded_feature, -1)

        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features

Deep Neural Decision Tree

A neural decision tree model has two sets of weights to learn. The first set is pi, which represents the probability distribution of the classes in the tree leaves. The second set is the weights of the routing layer decision_fn, which represents the probability of going to each leave. The forward pass of the model works as follows:

  1. The model expects input features as a single vector encoding all the features of an instance in the batch. This vector can be generated from a Convolution Neural Network (CNN) applied to images or dense transformations applied to structured data features.
  2. The model first applies a used_features_mask to randomly select a subset of input features to use.
  3. Then, the model computes the probabilities (mu) for the input instances to reach the tree leaves by iteratively performing a stochastic routing throughout the tree levels.
  4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the leaves to produce the final outputs.
class NeuralDecisionTree(keras.Model):
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionTree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Initialize the weights of the classes in leaves.
        self.pi = tf.Variable(
            initial_value=tf.random_normal_initializer()(
                shape=[self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = tf.shape(features)[0]

        # Apply the feature mask to the input features.
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )  # [batch_size, num_used_features]
        # Compute the routing probabilities.
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )  # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )  # [batch_size, num_leaves, 2]

        mu = tf.ones([batch_size, 1, 1])

        begin_idx = 1
        end_idx = 2
        # Traverse the tree in breadth-first order.
        for level in range(self.depth):
            mu = tf.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = tf.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = tf.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        outputs = tf.matmul(mu, probabilities)  # [batch_size, num_classes]
        return outputs

Deep Neural Decision Forest

The neural decision forest model consists of a set of neural decision trees that are trained simultaneously. The output of the forest model is the average outputs of its trees.

class NeuralDecisionForest(keras.Model):
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionForest, self).__init__()
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = tf.shape(inputs)[0]
        outputs = tf.zeros([batch_size, num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs

Finally, let's set up the code that will train and evaluate the model.

learning_rate = 0.01
batch_size = 265
num_epochs = 10
hidden_units = [64, 64]


def run_experiment(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    train_dataset = get_dataset_from_csv(
        train_data_file, shuffle=True, batch_size=batch_size
    )

    model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    print("Evaluating the model on the test data...")
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)

    _, accuracy = model.evaluate(test_dataset)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

Experiment 1: train a decision tree model

In this experiment, we train a single neural decision tree model where we use all input features.

num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs, use_embedding=True)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tree_model = create_tree_model()
run_experiment(tree_model)
Start training the model...
Epoch 1/10

/Users/khalidsalama/Technology/python-venvs/keras-env/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:595: UserWarning: Input dict contained keys ['fnlwgt'] which did not match any model input. They will be ignored by the model.
  [n for n in tensors.keys() if n not in ref_input_names])

123/123 [==============================] - 3s 15ms/step - loss: 0.5351 - sparse_categorical_accuracy: 0.7858
Epoch 2/10
123/123 [==============================] - 2s 17ms/step - loss: 0.3359 - sparse_categorical_accuracy: 0.8499
Epoch 3/10
123/123 [==============================] - 2s 19ms/step - loss: 0.3200 - sparse_categorical_accuracy: 0.8547
Epoch 4/10
123/123 [==============================] - 2s 19ms/step - loss: 0.3138 - sparse_categorical_accuracy: 0.8570
Epoch 5/10
123/123 [==============================] - 2s 20ms/step - loss: 0.3096 - sparse_categorical_accuracy: 0.8594
Epoch 6/10
123/123 [==============================] - 2s 18ms/step - loss: 0.3059 - sparse_categorical_accuracy: 0.8601
Epoch 7/10
123/123 [==============================] - 2s 20ms/step - loss: 0.3023 - sparse_categorical_accuracy: 0.8625
Epoch 8/10
123/123 [==============================] - 3s 21ms/step - loss: 0.2982 - sparse_categorical_accuracy: 0.8659
Epoch 9/10
123/123 [==============================] - 3s 22ms/step - loss: 0.2934 - sparse_categorical_accuracy: 0.8698
Epoch 10/10
123/123 [==============================] - 3s 27ms/step - loss: 0.2887 - sparse_categorical_accuracy: 0.8720
Model training finished
Evaluating the model on the test data...
62/62 [==============================] - 1s 6ms/step - loss: 0.3213 - sparse_categorical_accuracy: 0.8490
Test accuracy: 84.9%

Experiment 2: train a forest model

In this experiment, we train a neural decision forest with num_trees trees where each tree uses randomly selected 50% of the input features. You can control the number of features to be used in each tree by setting the used_features_rate variable. In addition, we set the depth to 5 instead of 10 compared to the previous experiment.

num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs, use_embedding=True)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    forest_model = NeuralDecisionForest(
        num_trees, depth, num_features, used_features_rate, num_classes
    )

    outputs = forest_model(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


forest_model = create_forest_model()

run_experiment(forest_model)
Start training the model...
Epoch 1/10
123/123 [==============================] - 10s 6ms/step - loss: 0.5519 - sparse_categorical_accuracy: 0.7654
Epoch 2/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3468 - sparse_categorical_accuracy: 0.8432
Epoch 3/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3301 - sparse_categorical_accuracy: 0.8469
Epoch 4/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3240 - sparse_categorical_accuracy: 0.8497
Epoch 5/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3204 - sparse_categorical_accuracy: 0.8505
Epoch 6/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3177 - sparse_categorical_accuracy: 0.8521
Epoch 7/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3154 - sparse_categorical_accuracy: 0.8529
Epoch 8/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3133 - sparse_categorical_accuracy: 0.8531
Epoch 9/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3114 - sparse_categorical_accuracy: 0.8542
Epoch 10/10
123/123 [==============================] - 1s 5ms/step - loss: 0.3097 - sparse_categorical_accuracy: 0.8546
Model training finished
Evaluating the model on the test data...
62/62 [==============================] - 2s 4ms/step - loss: 0.3092 - sparse_categorical_accuracy: 0.8582
Test accuracy: 85.82%