Examples

Stacked classifiers (naive protocol)

Similar to the example in the quick-start guide, a (naive) stack of classifiers (or regressors) can be built as shown below. Note that you can specify the function a step should use for computation, in this case compute_func='predict_proba' so that the label probabilities are used as the features of the meta-classifier.

x = Input()
y_t = Input()
y_p1 = LogisticRegression()(x, y_t, compute_func="predict_proba")
y_p2 = RandomForestClassifier()(x, y_t, compute_func="predict_proba")
# predict_proba returns arrays whose columns sum to one, so we drop one column
drop_first_col = Lambda(lambda array: array[:, 1:])
y_p1 = drop_first_col(y_p1)
y_p2 = drop_first_col(y_p2)
ensemble_features = ColumnStack()([y_p1, y_p2])
y_p = ExtraTreesClassifier()(ensemble_features, y_t)

model = Model(x, y_p, y_t)

The complete example:

import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import ColumnStack, Lambda


# ------- Define steps
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# ------- Load dataset
data = sklearn.datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# ------- Build model
x = Input()
y_t = Input()
y_p1 = LogisticRegression(solver="liblinear", random_state=0)(
    x, y_t, compute_func="predict_proba"
)
y_p2 = RandomForestClassifier(random_state=0)(x, y_t, compute_func="predict_proba")
# predict_proba returns arrays whose columns sum to one, so we drop one column
drop_first_col = Lambda(lambda array: array[:, 1:])
y_p1 = drop_first_col(y_p1)
y_p2 = drop_first_col(y_p2)
stacked_features = ColumnStack()([y_p1, y_p2])
y_p = ExtraTreesClassifier(random_state=0)(stacked_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers_naive.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("F1 score on train data:", f1_score(y_train, y_train_pred))
print("F1 score on test data:", f1_score(y_test, y_test_pred))

Stacked classifiers (standard protocol)

In the naive stack above, each classifier in the 1st level computes the predictions for the 2nd level using the same data it was fit on. This is prone to overfitting, as the 2nd level classifier will tend to give more weight to an overfit classifier in the 1st level. To avoid this, the standard protocol recommends that, during fit, the 1st level classifiers are still trained on the original data but provide out-of-fold (OOF) predictions to the 2nd level classifier. To achieve this special behavior we leverage the fit_compute_func API: we define a fit_predict method that does the fitting and the OOF predictions, and add it as a method of the 1st level classifiers (LogisticRegression and RandomForestClassifier, in this example) when making the steps. baikal will then detect and use this method during fit.

from sklearn.model_selection import cross_val_predict


def fit_predict(self, X, y):
    self.fit(X, y)
    return cross_val_predict(self, X, y, method="predict_proba")


attr_dict = {"fit_predict": fit_predict}

# 1st level classifiers
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression, attr_dict)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier, attr_dict)

# 2nd level classifier
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

The rest of the stack is built exactly as in the naive example. The complete example:

import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict, train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import ColumnStack, Lambda


# ------- Define steps
# During fit, the 1st level classifiers must be trained on the original data, but must
# provide out-of-fold (OOF) predictions to the 2nd level classifier. To achieve this we
# leverage the fit_compute_func API to configure this behavior. In this case we define
# a fit_predict method that does the fitting and the OOF predictions, and add it as a
# method of the 1st level classifiers (LogisticRegression and RandomForestClassifier)
# when making the steps. baikal will then detect and use this method during fit.


def fit_predict(self, X, y):
    self.fit(X, y)
    return cross_val_predict(self, X, y, method="predict_proba")


attr_dict = {"fit_predict": fit_predict}
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression, attr_dict)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier, attr_dict)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# ------- Load dataset
data = sklearn.datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# ------- Build model
# The model is built in the same way as in the naive case. The difference is that
# during fit baikal will detect and use the fit_predict method defined above.
x = Input()
y_t = Input()
y_p1 = LogisticRegression(solver="liblinear", random_state=0)(
    x, y_t, compute_func="predict_proba"
)
y_p2 = RandomForestClassifier(random_state=0)(x, y_t, compute_func="predict_proba")
# predict_proba returns arrays whose columns sum to one, so we drop one column
drop_first_col = Lambda(lambda array: array[:, 1:])
y_p1 = drop_first_col(y_p1)
y_p2 = drop_first_col(y_p2)
stacked_features = ColumnStack()([y_p1, y_p2])
y_p = ExtraTreesClassifier(random_state=0)(stacked_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers_standard.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("F1 score on train data:", f1_score(y_train, y_train_pred))
print("F1 score on test data:", f1_score(y_test, y_test_pred))

Classifier chain

The API also lends itself to more interesting configurations, such as classifier chains. By leveraging the API and Python's own control flow, a classifier chain model can be built as follows:

x = Input()
y_t = Input()
order = list(range(n_targets))
random.shuffle(order)

squeeze = Lambda(np.squeeze, axis=1)

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
for j, k in enumerate(order):
    x_stacked = ColumnStack()([x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression()(x_stacked, ys_t[k]))

ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(x, y_p, y_t)

Sure, scikit-learn already has ClassifierChain and RegressorChain classes for this. But with baikal you could, for example, mix classifiers and regressors in the chain to predict multilabels that contain both categorical and continuous labels (a sketch of this idea is shown after the complete example below).

The complete example:

import numpy as np
import random

import sklearn.linear_model
from sklearn.datasets import fetch_openml
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import ColumnStack, Split, Lambda

# ------- Define steps
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)

# ------- Load a multi-label dataset
# (from https://www.openml.org/d/40597)
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
Y = Y == "TRUE"
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

n_targets = Y.shape[1]
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
x = Input()
y_t = Input()

squeeze = Lambda(np.squeeze, axis=1)

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
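# Build the chain: each classifier is fed the original features stacked with the
# predictions of the previous classifiers, and is trained on the target picked by `order`.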
for j, k in enumerate(order):
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)
# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print(
    "Jaccard score on train data:",
    jaccard_score(Y_train, Y_train_pred, average="samples"),
)
print(
    "Jaccard score on test data:",
    jaccard_score(Y_test, Y_test_pred, average="samples"),
)
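
To make the mixing idea concrete, below is a minimal sketch (not part of the original example) of a two-link chain over a hypothetical two-column target whose first column is a binary label and whose second column is continuous. Ridge is an arbitrary choice of regressor; any regressor step would do.

import numpy as np
import sklearn.linear_model

from baikal import Input, Model, make_step
from baikal.steps import ColumnStack, Lambda, Split

LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
Ridge = make_step(sklearn.linear_model.Ridge)

x = Input()
y_t = Input()  # two-column target: [:, 0] is a binary label, [:, 1] is continuous

squeeze = Lambda(np.squeeze, axis=1)
ys_t = Split(2, axis=1)(y_t)

# 1st link: a classifier predicts the binary label from the original features
y_p_class = LogisticRegression(solver="lbfgs")(x, squeeze(ys_t[0]))
# 2nd link: a regressor predicts the continuous label from the features plus the
# predicted class
y_p_reg = Ridge()(ColumnStack()([x, y_p_class]), squeeze(ys_t[1]))

y_p = ColumnStack()([y_p_class, y_p_reg])
model = Model(x, y_p, y_t)
# model.fit(X, Y) would then expect Y to be an array of shape (n_samples, 2)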

Transformed target

You can also call steps on the targets to apply transformations on them. Note that by making the transformer a shared step, you can re-use learned parameters to apply the inverse transform later in the pipeline.

transformer = QuantileTransformer(n_quantiles=300, output_distribution="normal")

x = Input()
y_t = Input()
# QuantileTransformer requires an explicit feature dimension, hence the Lambda step
y_t_trans = Lambda(np.reshape, newshape=(-1, 1))(y_t)
y_t_trans = transformer(y_t_trans)
y_p_trans = RidgeCV()(x, y_t_trans)
y_p = transformer(y_p_trans, compute_func="inverse_transform", trainable=False)
# Note that transformer is a shared step since it was called twice

model = Model(x, y_p, y_t)

The complete example:

# Adapted from the scikit-learn example in:
# https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py

import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.datasets import load_boston
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from baikal import make_step, Input, Model
from baikal.plot import plot_model
from baikal.steps import Lambda


# ------- Define steps
RidgeCV = make_step(sklearn.linear_model.RidgeCV)
QuantileTransformer = make_step(sklearn.preprocessing.QuantileTransformer)

# ------- Load dataset
dataset = load_boston()
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# ------- Build model
transformer = QuantileTransformer(n_quantiles=300, output_distribution="normal")

x = Input()
y_t = Input()
# QuantileTransformer requires an explicit feature dimension, hence the Lambda step
y_t_trans = Lambda(np.reshape, newshape=(-1, 1))(y_t)
y_t_trans = transformer(y_t_trans)
y_p_trans = RidgeCV()(x, y_t_trans)
y_p = transformer(y_p_trans, compute_func="inverse_transform", trainable=False)
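# Note that transformer is a shared step since it was called twice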

model = Model(x, y_p, y_t)
plot_model(model, filename="transformed_target.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = median_absolute_error(y_test, y_pred)
print("R^2={}\nMAE={}".format(r2, mae))

Tune a model with GridSearchCV

Below is an example showing how to use the scikit-learn wrapper to tune the parameters of a baikal model using GridSearchCV.

import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.sklearn import SKLearnWrapper


LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
PCA = make_step(sklearn.decomposition.PCA)


def build_fn():
    x = Input()
    y_t = Input()
    h = PCA(random_state=random_state, name="pca")(x)
    y_p = LogisticRegression(random_state=random_state, name="classifier")(h, y_t)
    model = Model(x, y_p, y_t)
    return model


iris = datasets.load_iris()
x_data = iris.data
y_data = iris.target
random_state = 123
verbose = 0

# GridSearchCV's cv would default to (unstratified) KFold when the estimator is a
# baikal Model, so we pass StratifiedKFold explicitly.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

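# The parameter names refer to the step names given in build_fn ("pca" and "classifier");
# nested parameters use scikit-learn's double-underscore convention.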
param_grid = [
    {
        "classifier": [
            LogisticRegression(
                random_state=random_state, solver="lbfgs", multi_class="multinomial"
            )
        ],
        "classifier__C": [0.01, 0.1, 1],
        "pca__n_components": [1, 2, 3, 4],
    },
    {
        "classifier": [RandomForestClassifier(random_state=random_state)],
        "classifier__n_estimators": [10, 50, 100],
        "pca__n_components": [1, 2, 3, 4],
    },
]

sk_model = SKLearnWrapper(build_fn)
gscv_baikal = GridSearchCV(
    sk_model,
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    verbose=verbose,
)
gscv_baikal.fit(x_data, y_data)
print("Best score:", gscv_baikal.best_score_)
print("Best parameters", gscv_baikal.best_params_)
# The model with the best parameters can be accessed via:
# gscv_baikal.best_estimator_.model
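
# For example (a sketch, assuming GridSearchCV's default refit=True), the refit model
# with the best parameters can be retrieved and reused like any other baikal model:
best_model = gscv_baikal.best_estimator_.model
y_pred_best = best_model.predict(x_data)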