Note

Go to the end to download the full example code.

Data with Model Predictions#

This example requires a full licence; it will fail if run with a trial licence.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following command (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

import numpy as np
from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoXGBRegressor
from modeva.models import MoScoredRegressor

Load data

ds = DataSet()
ds.load("BikeSharing")
ds.set_random_split()

Fit an XGB model

model = MoXGBRegressor(max_depth=2)
model.fit(ds.train_x, ds.train_y)

MoXGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=2, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, objective='reg:squarederror', ...)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Get the XGB predictions and add them to the original dataframe

data = ds.to_df()
data["prediction"] = model.predict(ds.x)
data

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt	prediction
0	1	0	1	0	0	6	0	1	0.24	0.2879	0.81	0.0000	16	36.931103
1	1	0	1	1	0	6	0	1	0.22	0.2727	0.80	0.0000	40	-1.341274
2	1	0	1	2	0	6	0	1	0.22	0.2727	0.80	0.0000	32	-26.962809
3	1	0	1	3	0	6	0	1	0.24	0.2879	0.75	0.0000	13	-72.667824
4	1	0	1	4	0	6	0	1	0.24	0.2879	0.75	0.0000	1	-72.667824
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
17374	1	1	12	19	0	1	1	2	0.26	0.2576	0.60	0.1642	119	237.800262
17375	1	1	12	20	0	1	1	2	0.26	0.2576	0.60	0.1642	89	137.656616
17376	1	1	12	21	0	1	1	1	0.26	0.2576	0.60	0.1642	90	93.125214
17377	1	1	12	22	0	1	1	1	0.26	0.2727	0.56	0.1343	61	57.753876
17378	1	1	12	23	0	1	1	1	0.26	0.2727	0.65	0.1343	49	26.636641

17379 rows × 14 columns

Next, we use this combined data for model validation.

new_ds = DataSet(name="scored-test-demo")
new_ds.load_dataframe(data)
new_ds.set_train_idx(train_idx=np.array(ds.train_idx))
new_ds.set_test_idx(test_idx=np.array(ds.test_idx))
new_ds.set_target(feature="cnt")
new_ds.register(override=True)
new_ds.set_inactive_features(features=("prediction", ))

Reload the registered dataset (optional)

reload_ds = DataSet(name="scored-test-demo")
reload_ds.load_registered_data(name="scored-test-demo")

Run tests without the model object. Note that the robustness test is not available for a scored model.

model = MoScoredRegressor(dataset=new_ds, prediction_name="prediction")
ts = TestSuite(ds, model)

Run accuracy test without the model object

results = ts.diagnose_accuracy_table()
results.table

	MSE	MAE	R2
train	6124.369609	55.853476	0.813179
test	6557.755482	57.549641	0.803478
GAP	433.385873	1.696166	-0.009701

Run residual analysis test without the model object

results = ts.diagnose_residual_analysis(features="hr")
results.table

	hr	Residual
0	11.0	-38.097382
1	17.0	59.346985
2	3.0	-32.558922
3	15.0	-51.624382
4	7.0	-12.996155
...	...	...
1995	6.0	-7.306258
1996	10.0	14.951874
1997	6.0	-50.298096
1998	10.0	-115.138412
1999	21.0	-68.111206

2000 rows × 2 columns

Run reliability test without the model object

results = ts.diagnose_reliability()
results.table

	Avg.Width	Avg.Coverage
0	189.687531	0.901036

Run resilience test without the model object

results = ts.diagnose_resilience()
results.table

	MSE
0.1	39957.515154
0.2	25754.177892
0.3	19179.696219
0.4	15260.727856
0.5	12634.640444
0.6	10743.632617
0.7	9303.855556
0.8	8183.324067
0.9	7285.629466
1.0	6557.755482

Run slicing accuracy test without the model object

results = ts.diagnose_slicing_accuracy(features="hr", dataset="main", metric="MAE", threshold=0)
results.table

	Feature	Segment	Size	MAE	Weak
7	hr	[16.10, 18.40)	1458	115.177261	True
3	hr	[6.90, 9.20)	2181	100.083151	True
6	hr	[13.80, 16.10)	2188	57.893123	True
5	hr	[11.50, 13.80)	1457	56.588372	True
4	hr	[9.20, 11.50)	1454	55.421357	True
8	hr	[18.40, 20.70)	1456	48.355448	True
9	hr	[20.70, 23.00]	2184	40.399835	True
1	hr	[2.30, 4.60)	1394	39.957594	True
0	hr	[0.00, 2.30)	2165	24.055148	True
2	hr	[4.60, 6.90)	1442	23.746540	True

Run slicing overfit test without the model object

results = ts.diagnose_slicing_overfit(features="hr", train_dataset="train", test_dataset="test", metric="MAE")
results.table

	Feature	Segment	train-Size	train-MAE	test-Size	test-MAE	GAP	Threshold	Weak
8	hr	[6.90, 9.20)	1736	99.093091	445	103.945496	4.852405	1.696166	True
4	hr	[18.40, 20.70)	1171	47.751288	285	50.837806	3.086518	1.696166	True
2	hr	[13.80, 16.10)	1743	57.266649	445	60.346930	3.080281	1.696166	True
0	hr	[0.00, 2.30)	1737	23.726041	428	25.390801	1.664760	1.696166	False
1	hr	[11.50, 13.80)	1166	56.310554	291	57.701555	1.391002	1.696166	False
9	hr	[9.20, 11.50)	1160	55.224693	294	56.197313	0.972620	1.696166	False
6	hr	[20.70, 23.00]	1737	40.227978	447	41.067656	0.839678	1.696166	False
7	hr	[4.60, 6.90)	1175	23.756332	267	23.703447	-0.052885	1.696166	False
5	hr	[2.30, 4.60)	1110	40.279756	284	38.698437	-1.581320	1.696166	False
3	hr	[16.10, 18.40)	1168	115.874692	290	112.368296	-3.506395	1.696166	False

Total running time of the script: (0 minutes 3.151 seconds)

Gallery generated by Sphinx-Gallery