Data Processing and Feature Engineering#

This example requires a full licence; the program will fail if you run it with the trial licence.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands. (For full access, replace the token with your own token.)
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

import pandas as pd
from modeva import DataSet
from modeva.data.utils.loading import load_builtin_data

Manually create data with special and missing values

data = load_builtin_data("TaiwanCredit")

# Inject the markers with a single positional ``.iloc[rows, column]`` write.
# The chained form ``data[col].iloc[rows] = value`` assigns into an
# intermediate Series: pandas warns about it, and with Copy-on-Write enabled
# the original DataFrame is left unchanged.
column_position = data.columns.get_loc

# The special-value markers are strings, so the two target columns are cast to
# object first; writing a string into a numeric column is deprecated in pandas.
for mixed_column in ("LIMIT_BAL", "PAY_1"):
    data[mixed_column] = data[mixed_column].astype(object)

# Special values: rows 0-9 of LIMIT_BAL and rows 10-14 of PAY_1.
data.iloc[:10, column_position("LIMIT_BAL")] = "SV1"
data.iloc[10:15, column_position("PAY_1")] = "SV2"

# Missing values: rows 5-19 of EDUCATION and rows 0-4 of AGE.
data.iloc[5:20, column_position("EDUCATION")] = pd.NA
data.iloc[0:5, column_position("AGE")] = pd.NA
data

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	FlagDefault
0	SV1	2.0	2.0	1.0	NaN	2.0	2.0	-1.0	-1.0	0.0	0.0	3.592621	3.491782	2.838849	0.000000	0.000000	0.000000	0.000000	2.838849	0.000000	0.000000	0.000000	0.000000	1.0
1	SV1	2.0	2.0	2.0	NaN	-1.0	2.0	0.0	0.0	0.0	2.0	3.428621	3.237041	3.428621	3.514946	3.538574	3.513484	0.000000	3.000434	3.000434	3.000434	0.000000	3.301247	1.0
2	SV1	2.0	2.0	2.0	NaN	0.0	0.0	0.0	0.0	0.0	0.0	4.465977	4.146996	4.132260	4.156307	4.174612	4.191731	3.181558	3.176381	3.000434	3.000434	3.000434	3.699057	0.0
3	SV1	2.0	2.0	1.0	NaN	0.0	0.0	0.0	0.0	0.0	0.0	4.672015	4.683353	4.692776	4.452016	4.461799	4.470528	3.301247	3.305351	3.079543	3.041787	3.029384	3.000434	0.0
4	SV1	1.0	2.0	1.0	NaN	-1.0	0.0	-1.0	0.0	0.0	0.0	3.935406	3.753660	4.554319	4.320997	4.282101	4.281760	3.301247	4.564453	4.000043	3.954291	2.838849	2.832509	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
29995	220000.0	1.0	3.0	1.0	39.0	0.0	0.0	0.0	0.0	0.0	0.0	5.276345	5.285143	5.318827	4.944507	4.494683	4.203604	3.929470	4.301052	3.699317	3.484015	3.699057	3.000434	0.0
29996	150000.0	1.0	3.0	2.0	43.0	-1.0	-1.0	-1.0	-1.0	0.0	0.0	3.226342	3.262214	3.544440	3.953276	3.715251	0.000000	3.264345	3.547405	3.954194	2.113943	0.000000	0.000000	0.0
29997	30000.0	1.0	2.0	2.0	37.0	4.0	3.0	2.0	-1.0	0.0	0.0	3.552181	3.525951	3.440752	4.319710	4.313509	4.286861	0.000000	0.000000	4.342442	3.623353	3.301247	3.491502	1.0
29998	80000.0	1.0	3.0	1.0	41.0	1.0	-1.0	0.0	0.0	0.0	-1.0	-3.216430	4.894205	4.882553	4.722428	4.073938	4.689708	4.933998	3.532754	3.071514	3.284882	4.723989	3.256477	1.0
29999	50000.0	1.0	2.0	1.0	46.0	0.0	0.0	0.0	0.0	0.0	0.0	4.680607	4.689362	4.696924	4.562721	4.510933	4.185089	3.317854	3.255514	3.155640	3.000434	3.000434	3.000434	1.0

30000 rows × 24 columns

Data load and summary#

Load the dataframe into Modeva

# Wrap the modified DataFrame in a named Modeva DataSet.
ds = DataSet(name="TW-Credit")
ds.load_dataframe(data)
# NOTE(review): presumably assigns a random train/test partition using the
# library defaults -- confirm the split ratio in the Modeva documentation.
ds.set_random_split()
# Preview the first 20 rows and first 10 columns, which covers every row that
# received an "SV1"/"SV2" marker or a missing value above.
ds.data.head(20).iloc[:, :10]

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_1	PAY_2	PAY_3	PAY_4	PAY_5
0	SV1	2.0	2.0	1.0	NaN	2.0	2.0	-1.0	-1.0	0.0
1	SV1	2.0	2.0	2.0	NaN	-1.0	2.0	0.0	0.0	0.0
2	SV1	2.0	2.0	2.0	NaN	0.0	0.0	0.0	0.0	0.0
3	SV1	2.0	2.0	1.0	NaN	0.0	0.0	0.0	0.0	0.0
4	SV1	1.0	2.0	1.0	NaN	-1.0	0.0	-1.0	0.0	0.0
5	SV1	1.0	NaN	2.0	37.0	0.0	0.0	0.0	0.0	0.0
6	SV1	1.0	NaN	2.0	29.0	0.0	0.0	0.0	0.0	0.0
7	SV1	2.0	NaN	2.0	23.0	0.0	-1.0	-1.0	0.0	0.0
8	SV1	2.0	NaN	1.0	28.0	0.0	0.0	2.0	0.0	0.0
9	SV1	1.0	NaN	2.0	35.0	0.0	0.0	0.0	0.0	-1.0
10	200000.0	2.0	NaN	2.0	34.0	SV2	0.0	2.0	0.0	0.0
11	260000.0	2.0	NaN	2.0	51.0	SV2	-1.0	-1.0	-1.0	-1.0
12	630000.0	2.0	NaN	2.0	41.0	SV2	0.0	-1.0	-1.0	-1.0
13	70000.0	1.0	NaN	2.0	30.0	SV2	2.0	2.0	0.0	0.0
14	250000.0	1.0	NaN	2.0	29.0	SV2	0.0	0.0	0.0	0.0
15	50000.0	2.0	NaN	0.0	23.0	1.0	2.0	0.0	0.0	0.0
16	20000.0	1.0	NaN	2.0	24.0	0.0	0.0	2.0	2.0	2.0
17	320000.0	1.0	NaN	1.0	49.0	0.0	0.0	0.0	-1.0	-1.0
18	360000.0	2.0	NaN	1.0	49.0	1.0	0.0	0.0	0.0	0.0
19	180000.0	2.0	NaN	2.0	29.0	1.0	0.0	0.0	0.0	0.0

Check if the data has missing values

# Compute the dataset summary once and keep it; several tables are read from it.
results = ds.summary()
# The "summary" table reports overall counts such as feature types per kind,
# duplicated rows, and missing/infinite cells.
results.table["summary"]

	samples	features	numerical	categorical	mixed	date	duplicated	missing cells	missing cells (%)	infinite cells	infinite cells (%)
0	30000	24	19	3	2	0	68	20	0.000028	0	0.0

Check the features with special values.

# The "mixed" table lists features holding both numeric and non-numeric
# entries; its "categories" column shows the non-numeric values found.
results.table["mixed"]

	missing	inf	unique	mean	std	min	25%	median	75%	max	categories
name
LIMIT_BAL	0	0	81	167502.156719	129740.266958	10000.0	50000.0	140000.0	240000.0	1000000.0	[SV1]
PAY_1	0	0	10	0.167295	0.931323	-1.0	0.0	0.0	0.0	8.0	[SV2]

Reset preprocessing#

# NOTE(review): presumably clears any previously registered preprocessing
# steps so the pipeline below starts from a clean state -- confirm.
ds.reset_preprocess()

Set the data steps#

Impute numerical features, and add an indicator column for missing values

# Impute numerical features with the mean, and add an indicator column for missing values
# NOTE(review): the preprocessed output contains a one-hot column named
# "EDUCATION_1.7798899449724863", which suggests EDUCATION was mean-imputed by
# this step before being one-hot encoded -- confirm whether EDUCATION should be
# excluded here and imputed with 'most_frequent' instead.
ds.impute_missing(features=ds.feature_names_numerical, method='mean',
                  add_indicators=True)

# Impute categorical features with the most frequent value, and add an indicator column for missing values
ds.impute_missing(features=ds.feature_names_categorical, method='most_frequent',
                  add_indicators=True)

# Impute mixed features, and add an indicator column for missing and special values.
# The list of special values needs to be configured here manually; it must
# contain every non-numeric marker present in the mixed columns.
ds.impute_missing(features=ds.feature_names_mixed, method='mean',
                  add_indicators=True, special_values=["SV1", "SV2"])

Encoding categorical features

# One-hot encode EDUCATION and SEX; each is replaced by "<feature>_<value>"
# columns in the preprocessed frame.
ds.encode_categorical(features=("EDUCATION", "SEX"), method="onehot")

Encoding categorical features by target encoding. (Note that this will use y, so it’s better to use training data.)

# Target-encode MARRIAGE against FlagDefault using only the "train" partition,
# since the encoding reads the response. The trailing comma keeps `features` a
# one-element tuple rather than a plain string.
ds.encode_categorical(dataset="train", features=("MARRIAGE", ), method="target", target="FlagDefault")

Scaling numerical features

# Register one scaling step per call; a feature may appear in several steps.
ds.scale_numerical(features=("PAY_1", "PAY_2"), method="minmax")
ds.scale_numerical(features=("LIMIT_BAL", ), method="log1p")
ds.scale_numerical(features=("AGE", ), method="square")
ds.scale_numerical(features=("PAY_AMT1", ), method="quantile")
# PAY_1 and PAY_2 are registered a second time, now with log1p, in addition to
# the minmax step above.
# NOTE(review): presumably steps run in registration order, so log1p sees the
# minmax-scaled values -- confirm.
ds.scale_numerical(features=("PAY_1", "PAY_2",), method="log1p")

Binning numerical features

# Discretise AGE and PAY_3 with bins=10. AGE also has a "square" scaling step
# registered earlier.
# NOTE(review): the default binning strategy is not visible here -- confirm
# in the Modeva documentation.
ds.bin_numerical(features=("AGE", "PAY_3", ), bins=10)

Execute the preprocessing steps defined above#

# Run every registered imputation, encoding, scaling and binning step.
ds.preprocess()
# Export the preprocessed data as a DataFrame, including the added
# "*_missing_*" / "*_special_*" indicator columns and one-hot columns.
ds.to_df()

	LIMIT_BAL	LIMIT_BAL_special_SV1	SEX_2.0	EDUCATION_1.0	EDUCATION_1.7798899449724863	EDUCATION_2.0	EDUCATION_3.0	EDUCATION_missing_nan	MARRIAGE	AGE	AGE_missing_nan	PAY_1	PAY_1_special_SV2	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	FlagDefault
0	12.028757	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.234716	1	1.0	0.287682	0.0	0.287682	0	-1.0	0.0	0.0	3.592621	3.491782	2.838849	0.000000	0.000000	0.000000	0.000000	2.838849	0.000000	0.000000	0.000000	0.000000	1.0
1	12.028757	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.209284	1	1.0	0.000000	0.0	0.287682	1	0.0	0.0	2.0	3.428621	3.237041	3.428621	3.514946	3.538574	3.513484	0.000000	3.000434	3.000434	3.000434	0.000000	3.301247	1.0
2	12.028757	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.209284	1	1.0	0.105361	0.0	0.105361	1	0.0	0.0	0.0	4.465977	4.146996	4.132260	4.156307	4.174612	4.191731	0.358358	3.176381	3.000434	3.000434	3.000434	3.699057	0.0
3	12.028757	1.0	1.0	0.0	0.0	1.0	0.0	0.0	0.234716	1	1.0	0.105361	0.0	0.105361	1	0.0	0.0	0.0	4.672015	4.683353	4.692776	4.452016	4.461799	4.470528	0.451952	3.305351	3.079543	3.041787	3.029384	3.000434	0.0
4	12.028757	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.234716	1	1.0	0.000000	0.0	0.105361	0	0.0	0.0	0.0	3.935406	3.753660	4.554319	4.320997	4.282101	4.281760	0.451952	4.564453	4.000043	3.954291	2.838849	2.832509	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
29995	12.301387	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.234716	1	0.0	0.105361	0.0	0.105361	1	0.0	0.0	0.0	5.276345	5.285143	5.318827	4.944507	4.494683	4.203604	0.854855	4.301052	3.699317	3.484015	3.699057	3.000434	0.0
29996	11.918397	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.209284	2	0.0	0.000000	0.0	0.000000	0	-1.0	0.0	0.0	3.226342	3.262214	3.544440	3.953276	3.715251	0.000000	0.415164	3.547405	3.954194	2.113943	0.000000	0.000000	0.0
29997	10.308986	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.209284	1	0.0	0.441833	0.0	0.367725	3	-1.0	0.0	0.0	3.552181	3.525951	3.440752	4.319710	4.313509	4.286861	0.000000	0.000000	4.342442	3.623353	3.301247	3.491502	1.0
29998	11.289794	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.234716	2	0.0	0.200671	0.0	0.000000	1	0.0	0.0	-1.0	-3.216430	4.894205	4.882553	4.722428	4.073938	4.689708	0.993234	3.532754	3.071514	3.284882	4.723989	3.256477	1.0
29999	10.819798	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.234716	2	0.0	0.105361	0.0	0.105361	1	0.0	0.0	0.0	4.680607	4.689362	4.696924	4.562721	4.510933	4.185089	0.487169	3.255514	3.155640	3.000434	3.000434	3.000434	1.0

30000 rows × 31 columns

Total running time of the script: (0 minutes 4.055 seconds)

Gallery generated by Sphinx-Gallery