Dealing with Extra Data Sets

This example requires a full licence; it will fail if you run it with the trial licence.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following command (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

from modeva import DataSet
from modeva.data.utils.loading import load_builtin_data

Load BikeSharing dataset as pandas dataframe

data = load_builtin_data("BikeSharing")
data

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt
0	1	0	1	0	0	6	0	1	0.24	0.2879	0.81	0.0000	16
1	1	0	1	1	0	6	0	1	0.22	0.2727	0.80	0.0000	40
2	1	0	1	2	0	6	0	1	0.22	0.2727	0.80	0.0000	32
3	1	0	1	3	0	6	0	1	0.24	0.2879	0.75	0.0000	13
4	1	0	1	4	0	6	0	1	0.24	0.2879	0.75	0.0000	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...
17374	1	1	12	19	0	1	1	2	0.26	0.2576	0.60	0.1642	119
17375	1	1	12	20	0	1	1	2	0.26	0.2576	0.60	0.1642	89
17376	1	1	12	21	0	1	1	1	0.26	0.2576	0.60	0.1642	90
17377	1	1	12	22	0	1	1	1	0.26	0.2727	0.56	0.1343	61
17378	1	1	12	23	0	1	1	1	0.26	0.2727	0.65	0.1343	49

17379 rows × 13 columns

Load the first 5000 rows into Modeva

ds = DataSet()
ds.load_dataframe(data.iloc[:5000])
ds.set_random_split()
ds.set_inactive_features(features=['yr', 'temp'])
ds.set_target("cnt")

Load the samples indexed from 5000 to 7999 (3000 rows) as the “oot1” data split

ds.set_raw_extra_data(name="oot1", data=data.iloc[5000:8000])
ds.raw_extra_data['oot1']

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt
0	3	0	8	6	0	1	1	1	0.66	0.6061	0.83	0.0896	100
1	3	0	8	7	0	1	1	1	0.74	0.6818	0.62	0.0000	282
2	3	0	8	8	0	1	1	1	0.80	0.7273	0.43	0.1940	382
3	3	0	8	9	0	1	1	1	0.82	0.7424	0.41	0.0000	166
4	3	0	8	10	0	1	1	1	0.86	0.7576	0.36	0.1642	97
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2995	4	0	12	19	0	0	0	1	0.38	0.3939	0.76	0.0000	173
2996	4	0	12	20	0	0	0	1	0.36	0.3485	0.76	0.1343	124
2997	4	0	12	21	0	0	0	1	0.36	0.3636	0.81	0.1045	72
2998	4	0	12	22	0	0	0	1	0.34	0.3485	0.87	0.1045	64
2999	4	0	12	23	0	0	0	1	0.34	0.3485	0.87	0.1045	45

3000 rows × 13 columns

Load the samples indexed from 8000 to 8999 (1000 rows) as the “oot2” data split

ds.set_raw_extra_data(name="oot2", data=data.iloc[8000:9000])
ds.raw_extra_data['oot2']

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt
0	4	0	12	0	0	1	1	1	0.32	0.3333	0.87	0.0896	24
1	4	0	12	1	0	1	1	1	0.32	0.3485	0.87	0.0000	12
2	4	0	12	2	0	1	1	1	0.32	0.3485	0.87	0.0000	8
3	4	0	12	3	0	1	1	1	0.32	0.3485	0.87	0.0000	2
4	4	0	12	4	0	1	1	1	0.30	0.3182	0.87	0.1045	7
...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	1	1	1	16	0	0	0	1	0.22	0.2121	0.29	0.2985	205
996	1	1	1	17	0	0	0	1	0.22	0.2121	0.25	0.2537	146
997	1	1	1	18	0	0	0	1	0.18	0.1818	0.29	0.2239	120
998	1	1	1	19	0	0	0	1	0.16	0.1667	0.37	0.1642	110
999	1	1	1	20	0	0	0	1	0.16	0.1818	0.37	0.1045	95

1000 rows × 13 columns

Load the samples indexed from 9000 through the last row as the “oot3” data split

ds.set_raw_extra_data(name="oot3", data=data.iloc[9000:])
ds.raw_extra_data['oot3']

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt
0	1	1	1	21	0	0	0	1	0.16	0.1667	0.40	0.1642	72
1	1	1	1	22	0	0	0	1	0.18	0.2121	0.37	0.0896	60
2	1	1	1	23	0	0	0	1	0.16	0.1970	0.43	0.0896	29
3	1	1	1	0	1	1	0	1	0.14	0.1515	0.46	0.1343	25
4	1	1	1	1	1	1	0	1	0.14	0.1667	0.43	0.1045	20
...	...	...	...	...	...	...	...	...	...	...	...	...	...
8374	1	1	12	19	0	1	1	2	0.26	0.2576	0.60	0.1642	119
8375	1	1	12	20	0	1	1	2	0.26	0.2576	0.60	0.1642	89
8376	1	1	12	21	0	1	1	1	0.26	0.2576	0.60	0.1642	90
8377	1	1	12	22	0	1	1	1	0.26	0.2727	0.56	0.1343	61
8378	1	1	12	23	0	1	1	1	0.26	0.2727	0.65	0.1343	49

8379 rows × 13 columns

Show the available data splits

ds.get_data_list()

['main', 'train', 'test', 'oot1', 'oot2', 'oot3']

Delete a data split (if needed)

ds.delete_extra_data("oot3")
ds.get_data_list()

['main', 'train', 'test', 'oot1', 'oot2']

Get a data split by name

ds.get_data("oot1")

array([[3.000e+00, 0.000e+00, 8.000e+00, ..., 8.300e-01, 8.960e-02,
        1.000e+02],
       [3.000e+00, 0.000e+00, 8.000e+00, ..., 6.200e-01, 0.000e+00,
        2.820e+02],
       [3.000e+00, 0.000e+00, 8.000e+00, ..., 4.300e-01, 1.940e-01,
        3.820e+02],
       ...,
       [4.000e+00, 0.000e+00, 1.200e+01, ..., 8.100e-01, 1.045e-01,
        7.200e+01],
       [4.000e+00, 0.000e+00, 1.200e+01, ..., 8.700e-01, 1.045e-01,
        6.400e+01],
       [4.000e+00, 0.000e+00, 1.200e+01, ..., 8.700e-01, 1.045e-01,
        4.500e+01]])

Total running time of the script: (0 minutes 0.072 seconds)

Gallery generated by Sphinx-Gallery