Diagnostics Analysis with Date

Evaluate a model using a date column.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

from modeva import DataSet
from modeva import TestSuite
from modeva.models import MoLGBMRegressor

Load BikeSharing Dataset

import pandas as pd
from modeva.data.utils.loading import load_builtin_data

# Load the built-in BikeSharing data.
data = load_builtin_data("BikeSharing")
# Synthesize a calendar-date column: each row is assigned 2011-01-01 plus
# (index / 24) days, then reduced to a date, so with a 0..n-1 index every
# 24 consecutive rows share one date.
# NOTE(review): this presumes rows are ordered hourly records starting on
# 2011-01-01 -- confirm against the dataset.
data['Date'] = (pd.to_datetime('2011-01-01') + pd.to_timedelta(data.index / 24, unit='D')).date
data.head()
season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed cnt Date
0 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0.0 16 2011-01-01
1 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0.0 40 2011-01-01
2 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0.0 32 2011-01-01
3 1 0 1 3 0 6 0 1 0.24 0.2879 0.75 0.0 13 2011-01-01
4 1 0 1 4 0 6 0 1 0.24 0.2879 0.75 0.0 1 2011-01-01


Load the data into Modeva DataSet

ds = DataSet()
ds.load_dataframe(data)
# 'cnt' is the response the models are fitted to.
ds.set_target("cnt")
# Keep 'Date' out of the model inputs; the diagnostics below still refer to
# it by name when slicing.
ds.set_inactive_features(features=('Date', ))
# Split into train/test sets at random using the library defaults.
ds.set_random_split()

Fit two LGBM models

# Two small boosters that differ in tree depth (1 vs. 2) and name; both are
# fitted on the same training split.
model1 = MoLGBMRegressor(name="LGBM1", max_depth=1, n_estimators=20)
model1.fit(ds.train_x, ds.train_y)

model2 = MoLGBMRegressor(name="LGBM2", max_depth=2, n_estimators=20)
model2.fit(ds.train_x, ds.train_y)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 283
[LightGBM] [Info] Number of data points in the train set: 13903, number of used features: 12
[LightGBM] [Info] Start training from score 189.263324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 283
[LightGBM] [Info] Number of data points in the train set: 13903, number of used features: 12
[LightGBM] [Info] Start training from score 189.263324
MoLGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=2,
                min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=20, n_jobs=None, num_leaves=31,
                objective=None, random_state=None, reg_alpha=0.0,
                reg_lambda=0.0, subsample=1.0, subsample_for_bin=200000,
                subsample_freq=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.


Visualize the residual against date

ts = TestSuite(ds, model1)
results = ts.diagnose_residual_analysis(features="Date", dataset="train")
results.plot()


Slicing accuracy diagnostics against date

results = ts.diagnose_slicing_accuracy(features="Date",
                                       method="uniform")
results.plot(figsize=(5, 4))


Custom dates as split points

dates = pd.to_datetime(["2011-06-30", "2011-12-31", "2012-06-30"])
results = ts.diagnose_slicing_accuracy(features="Date",
                                       method="precompute",
                                       bins= {"Date": dates.tolist()})
results.plot(figsize=(5, 4))


2D slicing with date

results = ts.diagnose_slicing_accuracy(features=("Date", "hr"),
                                       method="uniform")
results.plot()


Compare slicing performance with date

# Build the comparison suite from the two distinct fitted models
# (LGBM1 with max_depth=1 and LGBM2 with max_depth=2). Passing the same model
# twice would make every compare_* result a comparison of a model with itself.
tsc = TestSuite(dataset=ds, models=[model1, model2])
results = tsc.compare_slicing_accuracy(features="Date",
                                       method="uniform")
results.plot(figsize=(5, 4))


Slicing overfit with date

results = ts.diagnose_slicing_overfit(features="Date",
                                      method="uniform")
results.plot(figsize=(5, 4))


Compare slicing overfit with date

results = tsc.compare_slicing_overfit(features="Date",
                                      method="uniform")
results.plot(figsize=(5, 4))


Slicing reliability with date

results = ts.diagnose_slicing_reliability(features="Date",
                                          method="uniform")
results.plot(figsize=(5, 4))


Compare slicing reliability with date

results = tsc.compare_slicing_reliability(features="Date",
                                          method="uniform")
results.plot(figsize=(5, 4))


Slicing robustness with date

results = ts.diagnose_slicing_robustness(features="Date",
                                         method="uniform")
results.plot(figsize=(5, 4))


Compare slicing robustness with date

results = tsc.compare_slicing_robustness(features="Date",
                                         method="uniform")
results.plot(figsize=(5, 4))


Total running time of the script: (0 minutes 8.118 seconds)

Gallery generated by Sphinx-Gallery