Note
Go to the end to download the full example code.
Wrapping H2O Models#
This example requires a full licence; the program will fail if you run it with the trial licence.
Installation
# To install the required package, use the following command:
# !pip install modeva
Authentication
# To authenticate, use the following command (for full access, replace the token with your own token):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')
Import required modules
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from modeva import DataSet
from modeva import TestSuite
from modeva.models.wrappers.api import modeva_arbitrary_classifier
Scripts for building an H2O model#
Initialize H2O
# Best-effort cleanup: ask any H2O cluster we are still attached to (e.g. from
# a previous run of this example) to shut down before initialising again.
try:
    h2o.shutdown()
except Exception:
    # Shutdown failed, most likely because no cluster is attached; that is
    # fine for this example.  Catching ``Exception`` instead of using a bare
    # ``except:`` lets KeyboardInterrupt / SystemExit propagate.
    pass
h2o.init()
h2o.no_progress()  # Turn off H2O progress bars to keep the output readable
Load a sample binary classification dataset
# Name of the response column used throughout the example
y_column = "CAPSULE"

# Import the sample dataset and turn the response into a factor so that H2O
# treats the task as classification
data = h2o.import_file(
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
)
data[y_column] = data[y_column].asfactor()

# 80/20 train/test split with a fixed seed
train, test = data.split_frame(ratios=[0.8], seed=1234)

# Feature columns: the slice skips the first two columns and the LAST column.
# NOTE(review): presumably the first two are the row ID and the target; the
# final column is excluded as well -- confirm that this is intended.
X_columns = data.columns[2:-1]
Train H2O model
# Gradient boosting estimator constructed with no arguments, fitted on the
# training split only
h2o_model = H2OGradientBoostingEstimator()
h2o_model.train(
    x=X_columns,
    y=y_column,
    training_frame=train,
)
Wrap the data into Modeva#
ds = DataSet()

# Hand Modeva a pandas copy of the selected feature columns plus the target
full_pandas_frame = data.as_data_frame()
ds.load_dataframe(data=full_pandas_frame[X_columns + [y_column]])

# Reuse the H2O split: the "ID" values minus one are passed as row indices.
# NOTE(review): assumes IDs are 1-based and match the row order -- confirm.
train_row_idx = train["ID"].as_data_frame().values.flatten() - 1
ds.set_train_idx(train_row_idx)
test_row_idx = test["ID"].as_data_frame().values.flatten() - 1
ds.set_test_idx(test_row_idx)

ds.set_task_type("Classification")
Wrap the model into Modeva#
def predict_func(X):
    """Return the H2O model's ``"predict"`` output for ``X`` as a flat array.

    ``X`` is converted to an ``H2OFrame`` and its columns are relabelled
    positionally with ``X_columns``, so the columns of ``X`` must be in the
    same order as ``X_columns``.
    """
    frame = h2o.H2OFrame(X)
    frame.col_names = X_columns
    label_column = h2o_model.predict(frame)["predict"]
    label_df = label_column.as_data_frame(use_multi_thread=True)
    return label_df.values.flatten()
def predict_proba_func(X):
    """Return the H2O model's prediction output for ``X`` minus its first column.

    ``X`` is converted to an ``H2OFrame`` and its columns are relabelled
    positionally with ``X_columns``.  The result is a 2-D array.
    """
    frame = h2o.H2OFrame(X)
    frame.col_names = X_columns
    prediction_df = h2o_model.predict(frame).as_data_frame(use_multi_thread=True)
    # Drop the leading column -- presumably the predicted label, leaving the
    # per-class probability columns (confirm against the H2O output layout).
    return prediction_df.values[:, 1:]
# Expose the two prediction callables to Modeva under a display name
model = modeva_arbitrary_classifier(
    name="H2O-BinaryClassifier",
    predict_function=predict_func,
    predict_proba_function=predict_proba_func,
)
Create test suite for diagnostics#
# Pair the wrapped dataset with the wrapped model for the diagnostics below
ts = TestSuite(ds, model)
Basic accuracy analysis
# Run the accuracy diagnostic and keep the returned result object
results = ts.diagnose_accuracy_table()
# Bare expression: presumably displays the table when run in a notebook cell
results.table