Residual Analysis (Classification)#
Evaluate model residuals.
Installation
# To install the required package, use the following command:
# !pip install modeva
Authentication
# To authenticate, use the following commands (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')
Import modeva modules
from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoLGBMClassifier
Load the TaiwanCredit dataset
ds = DataSet()
ds.load(name="TaiwanCredit")
ds.set_random_split()
ds.set_target("FlagDefault")
Fit an LGBM model
model = MoLGBMClassifier(name="LGBM-2", max_depth=2, verbose=-1, random_state=0)
model.fit(ds.train_x, ds.train_y.ravel())
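For orientation, the raw residual of a binary classifier here is the observed label minus the predicted probability of the positive class. The quick check below is a minimal sketch, assuming MoLGBMClassifier exposes a scikit-learn style predict_proba; adjust to the actual Modeva API if it differs.

import numpy as np

# Predicted probability of FlagDefault = 1 (assumes a sklearn-style predict_proba)
proba = model.predict_proba(ds.train_x)[:, 1]
# Raw and absolute residuals; the diagnostics below work with quantities of this kind
residual = ds.train_y.ravel() - proba
abs_residual = np.abs(residual)
print(abs_residual.mean(), abs_residual.max())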
Analyze residual feature importance#
ts = TestSuite(ds, model)
results = ts.diagnose_residual_interpret(dataset="train")
results.plot()
Visualize the residual against a predictor#
results = ts.diagnose_residual_analysis(features="PAY_1", dataset="train")
results.plot()
Visualize the residual against the response variable#
results = ts.diagnose_residual_analysis(features="FlagDefault", dataset="train")
results.plot()
Visualize the residual against the model prediction (predicted probability)#
results = ts.diagnose_residual_analysis(use_prediction=True, dataset="train")
results.plot()
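The same view can be reproduced by hand with matplotlib (assumed available), using the proba and residual arrays from the sketch above:

import matplotlib.pyplot as plt

plt.scatter(proba, residual, s=5, alpha=0.3)  # residual vs. predicted probability
plt.xlabel("Predicted probability of FlagDefault")
plt.ylabel("Residual (observed - predicted)")
plt.title("Residual vs. model prediction (train)")
plt.show()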
Interpret residuals with an XGB depth-2 model#
results = ts.diagnose_residual_interpret(dataset='test', n_estimators=100, max_depth=2)
XGB-2 feature importance
results.plot("feature_importance")
XGB-2 effect importance
results.plot("effect_importance")
Further interpretation (main effect plot)
ts_residual = results.value["TestSuite"]
ts_residual.interpret_effects("PAY_1", dataset="test").plot()
Further interpretation (local interpretation)
ts_residual.interpret_local_fi(sample_index=20).plot()
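Conceptually, the interpretation above fits a small, interpretable booster to the residuals and reads off its importances. The stand-alone sketch below illustrates that idea on the training split; it is not Modeva's actual implementation, and the use of XGBRegressor plus the column handling are assumptions.

import numpy as np
import pandas as pd
from xgboost import XGBRegressor

# Absolute residuals of the LGBM model on the training split
abs_res_train = np.abs(ds.train_y.ravel() - model.predict_proba(ds.train_x)[:, 1])

# Depth-2 booster as a residual surrogate, mirroring the n_estimators/max_depth used above
surrogate = XGBRegressor(n_estimators=100, max_depth=2, random_state=0)
surrogate.fit(ds.train_x, abs_res_train)

# Which features best explain where the LGBM model makes large errors?
cols = getattr(ds.train_x, "columns", range(ds.train_x.shape[1]))
print(pd.Series(surrogate.feature_importances_, index=cols).sort_values(ascending=False).head())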
Random forest-based residual clustering analysis (absolute residual)#
results = ts.diagnose_residual_cluster(
    dataset="test",
    response_type="abs_residual",
    metric="AUC",
    n_clusters=10,
    cluster_method="pam",
    sample_size=2000,
    rf_n_estimators=100,
    rf_max_depth=5,
)
results.table
Residual value for each cluster
results.plot("cluster_residual")
Performance metric for each cluster
results.plot("cluster_performance")
Feature importance of the random forest model
results.plot("feature_importance")
Analyze data drift for a specific cluster
data_results = ds.data_drift_test(
    **results.value["clusters"][0]["data_info"],
    distance_metric="PSI",
    psi_method="uniform",
    psi_bins=10
)
data_results.plot("summary")
data_results.plot(name=('density', 'PAY_1'))
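For reference, the PSI requested above with psi_method="uniform" compares a feature's distribution in two samples over equal-width bins. A minimal NumPy sketch of that computation (the exact contents of data_info are not shown here, so the inputs are illustrative):

import numpy as np

def psi_uniform(expected, actual, bins=10, eps=1e-6):
    # Equal-width bins over the range of the reference ("expected") sample
    edges = np.linspace(np.min(expected), np.max(expected), bins + 1)
    p = np.histogram(expected, bins=edges)[0] / len(expected) + eps
    q = np.histogram(actual, bins=edges)[0] / len(actual) + eps
    return float(np.sum((p - q) * np.log(p / q)))

# e.g. psi_uniform(pay1_full_test, pay1_cluster0, bins=10), with hypothetical arrays
# holding PAY_1 for the full test set and for cluster 0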
Random forest-based residual clustering analysis (perturbed residual)#
results = ts.diagnose_residual_cluster(
    dataset="test",
    response_type="abs_residual_perturb",
    metric="AUC",
    n_clusters=10,
    cluster_method="pam",
    sample_size=2000,
    rf_n_estimators=100,
    rf_max_depth=5,
)
results.table
Random forest-based residual clustering analysis (prediction interval width)#
results = ts.diagnose_residual_cluster(
    dataset="test",
    response_type="pi_width",
    metric="AUC",
    n_clusters=10,
    cluster_method="pam",
    sample_size=2000,
    rf_n_estimators=100,
    rf_max_depth=5,
)
results.table
Compare residual clusters of multiple models#
benchmark = MoLGBMClassifier(name="LGBM-5", max_depth=5, verbose=-1, random_state=0)
benchmark.fit(ds.train_x, ds.train_y.ravel())
tsc = TestSuite(ds, models=[model, benchmark])
results = tsc.compare_residual_cluster(dataset="test")
results.table
results.plot("cluster_performance")
Total running time of the script: (2 minutes 36.199 seconds)