Reliability Analysis (Regression)#

This example demonstrates how to analyze model reliability and calibration for regression problems using various methods and metrics.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import required modules

from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoLGBMRegressor
from modeva.models import MoXGBRegressor
from modeva.testsuite.utils.slicing_utils import get_data_info

Load and prepare dataset

# Create an empty DataSet and load the data registered under the name "BikeSharing".
ds = DataSet()
ds.load(name="BikeSharing")
# Randomly split the data (the train/test parts are used below); the fixed
# random_state keeps the split reproducible across runs.
ds.set_random_split(random_state=0)

# Apply the "log1p" scaling method to the "cnt" column.
# NOTE(review): "cnt" is presumably the regression target -- confirm against the dataset schema.
ds.scale_numerical(features=("cnt",), method="log1p")
# NOTE(review): preprocess() presumably executes the preprocessing steps
# registered above -- confirm in the modeva DataSet documentation.
ds.preprocess()

Train models

# First model: MoXGBRegressor (by its name, an XGBoost-based regressor) with max_depth=2,
# fitted on the training split.
model1 = MoXGBRegressor(max_depth=2)
model1.fit(ds.train_x, ds.train_y)

# Second model: MoLGBMRegressor with the same depth limit; verbose=-1 and a
# fixed random_state are passed to the constructor.
# Unlike model1, the target is flattened with ravel() and cast to float before fitting.
model2 = MoLGBMRegressor(max_depth=2, verbose=-1, random_state=0)
model2.fit(ds.train_x, ds.train_y.ravel().astype(float))
MoLGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=2,
                min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=100, n_jobs=None,
                num_leaves=31, objective=None, random_state=0, reg_alpha=0.0,
                reg_lambda=0.0, subsample=1.0, subsample_for_bin=200000,
                subsample_freq=0, verbose=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


Basic reliability analysis#

# Bind the dataset and the first model (model1) into a TestSuite; the
# single-model ts.diagnose_* calls in this example run against this pair.
ts = TestSuite(ds, model1)

Because train_dataset and test_dataset refer to the same split here, the test data is divided into two parts: one for training (computing the nonconformity scores) and the other for evaluation. The test_size argument (0.5) is the proportion of the test data used for training.

# Run the reliability diagnostic with both train_dataset and test_dataset set
# to the "test" split; test_size=0.5 controls how that split is divided.
# NOTE(review): alpha=0.1 presumably requests ~90% target coverage, and
# max_depth=5 presumably configures the auxiliary model used to build the
# prediction intervals -- confirm both in the modeva documentation.
results = ts.diagnose_reliability(
    train_dataset="test",
    test_dataset="test",
    test_size=0.5,
    alpha=0.1,
    max_depth=5,
    random_state=0
)
# Last expression of the cell: shows the result table (Avg.Width / Avg.Coverage).
results.table
Avg.Width Avg.Coverage
0 1.1589 0.8907


Analyze data drift

# Unpack the "data_info" entry of the reliability result as keyword arguments
# for the drift test, so the compared sample groups come from that result.
# The distance metric is PSI, computed with the "uniform" method and 10 bins.
data_results = ds.data_drift_test(
    **results.value["data_info"],
    distance_metric="PSI",
    psi_method="uniform",
    psi_bins=10
)

Summary PSI of each feature

# Render the "summary" view of the drift-test result held in data_results.
data_results.plot("summary")


Single feature density plot

# The plot key is a (view, feature) tuple: the "density" view for feature "hr".
data_results.plot(("density", "hr"))


Slicing reliability#

Single feature reliability analysis

# Slice the reliability diagnostic along the single feature "hr", using the
# "train" split as train_dataset and the "test" split as test_dataset, and
# report the "coverage" metric per slice.
# Note: this rebinds `results`, replacing the earlier diagnostic result.
results = ts.diagnose_slicing_reliability(
    features="hr",
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    metric="coverage",
    random_state=0
)
# With a single feature, plot() is called without a key.
results.plot()


Multiple 1D feature reliability analysis

# Several independent 1D analyses in one call: `features` is a tuple of
# one-element tuples, one per feature ("hr", "temp", "season").
results = ts.diagnose_slicing_reliability(
    features=(("hr",), ("temp",), ("season",)),
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    metric="coverage",
    random_state=0
)
# Select which of the analyzed features to plot by name.
results.plot("hr")


Batch-mode 1D slicing (analyze all features by setting features=None)

# Batch mode: features=None requests 1D slicing over every feature.
results = ts.diagnose_slicing_reliability(
    features=None,
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    metric="coverage",
    random_state=0
)
# Last expression of the cell: shows the per-segment table
# (Feature, Segment, Size, Coverage, Threshold, Weak).
results.table
Feature Segment Size Coverage Threshold Weak
0 windspeed [0.68, 0.77) 2 1.0000 0.8924 True
1 windspeed [0.60, 0.68) 16 1.0000 0.8924 True
2 weathersit 4.0 1 1.0000 0.8924 True
3 temp [0.90, 1.00] 7 1.0000 0.8924 True
4 holiday 1.0 78 0.9359 0.8924 True
... ... ... ... ... ... ...
79 mnth [10.90, 12.00] 0 NaN 0.8924 False
80 weekday [1.20, 1.80) 0 NaN 0.8924 False
81 weekday [2.40, 3.00) 0 NaN 0.8924 False
82 weekday [4.20, 4.80) 0 NaN 0.8924 False
83 atemp [0.90, 1.00] 0 NaN 0.8924 False

84 rows × 6 columns



Analyze data drift between samples above and below the threshold

# Convert the batch slicing result into drift-test inputs; the returned
# mapping is indexed by feature name below.
data_info = get_data_info(res_value=results.value)
# Run the PSI drift test (uniform method, 10 bins) using the entry for feature "hr".
data_results = ds.data_drift_test(
    **data_info["hr"],
    distance_metric="PSI",
    psi_method="uniform",
    psi_bins=10
)
# Render the "summary" view of the drift-test result.
data_results.plot("summary")


Single feature density plot

# "density" view for feature "hr", drawn from the drift test built on data_info["hr"].
data_results.plot(("density", "hr"))


2D feature interaction reliability analysis

# 2D analysis: a flat tuple of two feature names slices on "hr" and "temp" jointly.
# No `metric` is passed here, so the library's default metric applies.
results = ts.diagnose_slicing_reliability(
    features=("hr", "temp"),
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    random_state=0
)
results.plot()


Model reliability comparison#

# A TestSuite built with a list of models enables the compare_* methods,
# which evaluate model1 and model2 on the same dataset.
tsc = TestSuite(ds, models=[model1, model2])
# Same settings as the single-model diagnostic above, except that the "train"
# split is used as train_dataset.
results = tsc.compare_reliability(
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    alpha=0.1,
    max_depth=5,
    random_state=0
)
# Last expression of the cell: shows Avg.Width / Avg.Coverage per model.
results.table
MoXGBRegressor MoLGBMRegressor
Avg.Width Avg.Coverage Avg.Width Avg.Coverage
0 0.9803 0.8924 1.04 0.9042


Model slicing reliability comparison

# Compare both models slice by slice along "hr"; metric="width" is requested
# here, where the earlier slicing calls requested "coverage".
results = tsc.compare_slicing_reliability(
    features="hr",
    train_dataset="train",
    test_dataset="test",
    test_size=0.5,
    alpha=0.1,
    max_depth=5,
    metric="width",
    random_state=0
)
results.plot()


Total running time of the script: (2 minutes 12.890 seconds)

Gallery generated by Sphinx-Gallery