Data Processing and Feature Engineering
This example requires a full licence; the program will fail if you run it with the trial licence.
Installation
# To install the required package, use the following command:
# !pip install modeva
Authentication
# To authenticate, use the following command (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')
Import modeva modules
import pandas as pd
from modeva import DataSet
from modeva.data.utils.loading import load_builtin_data
Manually create data with special and missing values
# Build a demo frame that contains both special-value markers and missing cells.
data = load_builtin_data("TaiwanCredit")

# LIMIT_BAL and PAY_1 are about to hold string markers next to their numeric
# values. Cast them to object explicitly instead of relying on pandas' silent
# upcast when a string is written into a numeric column (deprecated behaviour).
data = data.astype({"LIMIT_BAL": object, "PAY_1": object})

# Write through ONE positional indexer on the frame itself. The chained form
# `data["COL"].iloc[...] = value` assigns into a temporary Series and is not
# guaranteed to reach `data` (it never does under pandas Copy-on-Write).
data.iloc[:10, data.columns.get_loc("LIMIT_BAL")] = "SV1"   # rows 0-9: special value SV1
data.iloc[10:15, data.columns.get_loc("PAY_1")] = "SV2"     # rows 10-14: special value SV2
data.iloc[5:20, data.columns.get_loc("EDUCATION")] = pd.NA  # rows 5-19: missing
data.iloc[0:5, data.columns.get_loc("AGE")] = pd.NA         # rows 0-4: missing
data
|
LIMIT_BAL |
SEX |
EDUCATION |
MARRIAGE |
AGE |
PAY_1 |
PAY_2 |
PAY_3 |
PAY_4 |
PAY_5 |
PAY_6 |
BILL_AMT1 |
BILL_AMT2 |
BILL_AMT3 |
BILL_AMT4 |
BILL_AMT5 |
BILL_AMT6 |
PAY_AMT1 |
PAY_AMT2 |
PAY_AMT3 |
PAY_AMT4 |
PAY_AMT5 |
PAY_AMT6 |
FlagDefault |
0 |
SV1 |
2.0 |
2.0 |
1.0 |
NaN |
2.0 |
2.0 |
-1.0 |
-1.0 |
0.0 |
0.0 |
3.592621 |
3.491782 |
2.838849 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
2.838849 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.0 |
1 |
SV1 |
2.0 |
2.0 |
2.0 |
NaN |
-1.0 |
2.0 |
0.0 |
0.0 |
0.0 |
2.0 |
3.428621 |
3.237041 |
3.428621 |
3.514946 |
3.538574 |
3.513484 |
0.000000 |
3.000434 |
3.000434 |
3.000434 |
0.000000 |
3.301247 |
1.0 |
2 |
SV1 |
2.0 |
2.0 |
2.0 |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4.465977 |
4.146996 |
4.132260 |
4.156307 |
4.174612 |
4.191731 |
3.181558 |
3.176381 |
3.000434 |
3.000434 |
3.000434 |
3.699057 |
0.0 |
3 |
SV1 |
2.0 |
2.0 |
1.0 |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4.672015 |
4.683353 |
4.692776 |
4.452016 |
4.461799 |
4.470528 |
3.301247 |
3.305351 |
3.079543 |
3.041787 |
3.029384 |
3.000434 |
0.0 |
4 |
SV1 |
1.0 |
2.0 |
1.0 |
NaN |
-1.0 |
0.0 |
-1.0 |
0.0 |
0.0 |
0.0 |
3.935406 |
3.753660 |
4.554319 |
4.320997 |
4.282101 |
4.281760 |
3.301247 |
4.564453 |
4.000043 |
3.954291 |
2.838849 |
2.832509 |
0.0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
29995 |
220000.0 |
1.0 |
3.0 |
1.0 |
39.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
5.276345 |
5.285143 |
5.318827 |
4.944507 |
4.494683 |
4.203604 |
3.929470 |
4.301052 |
3.699317 |
3.484015 |
3.699057 |
3.000434 |
0.0 |
29996 |
150000.0 |
1.0 |
3.0 |
2.0 |
43.0 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
0.0 |
0.0 |
3.226342 |
3.262214 |
3.544440 |
3.953276 |
3.715251 |
0.000000 |
3.264345 |
3.547405 |
3.954194 |
2.113943 |
0.000000 |
0.000000 |
0.0 |
29997 |
30000.0 |
1.0 |
2.0 |
2.0 |
37.0 |
4.0 |
3.0 |
2.0 |
-1.0 |
0.0 |
0.0 |
3.552181 |
3.525951 |
3.440752 |
4.319710 |
4.313509 |
4.286861 |
0.000000 |
0.000000 |
4.342442 |
3.623353 |
3.301247 |
3.491502 |
1.0 |
29998 |
80000.0 |
1.0 |
3.0 |
1.0 |
41.0 |
1.0 |
-1.0 |
0.0 |
0.0 |
0.0 |
-1.0 |
-3.216430 |
4.894205 |
4.882553 |
4.722428 |
4.073938 |
4.689708 |
4.933998 |
3.532754 |
3.071514 |
3.284882 |
4.723989 |
3.256477 |
1.0 |
29999 |
50000.0 |
1.0 |
2.0 |
1.0 |
46.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4.680607 |
4.689362 |
4.696924 |
4.562721 |
4.510933 |
4.185089 |
3.317854 |
3.255514 |
3.155640 |
3.000434 |
3.000434 |
3.000434 |
1.0 |
30000 rows × 24 columns
Data load and summary
Load the dataframe into Modeva
# Create a Modeva DataSet called "TW-Credit" and hand it the prepared frame.
ds = DataSet(name="TW-Credit")
ds.load_dataframe(data)
# Randomly split the samples (presumably train/test; the ratio is the library
# default and is not shown here - confirm before relying on it).
ds.set_random_split()
# Preview the first 20 rows and first 10 columns as held by the DataSet; the
# SV1/SV2 markers and NaN cells injected earlier should be visible.
ds.data.head(20).iloc[:, :10]
|
LIMIT_BAL |
SEX |
EDUCATION |
MARRIAGE |
AGE |
PAY_1 |
PAY_2 |
PAY_3 |
PAY_4 |
PAY_5 |
0 |
SV1 |
2.0 |
2.0 |
1.0 |
NaN |
2.0 |
2.0 |
-1.0 |
-1.0 |
0.0 |
1 |
SV1 |
2.0 |
2.0 |
2.0 |
NaN |
-1.0 |
2.0 |
0.0 |
0.0 |
0.0 |
2 |
SV1 |
2.0 |
2.0 |
2.0 |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
3 |
SV1 |
2.0 |
2.0 |
1.0 |
NaN |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
4 |
SV1 |
1.0 |
2.0 |
1.0 |
NaN |
-1.0 |
0.0 |
-1.0 |
0.0 |
0.0 |
5 |
SV1 |
1.0 |
NaN |
2.0 |
37.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
6 |
SV1 |
1.0 |
NaN |
2.0 |
29.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
7 |
SV1 |
2.0 |
NaN |
2.0 |
23.0 |
0.0 |
-1.0 |
-1.0 |
0.0 |
0.0 |
8 |
SV1 |
2.0 |
NaN |
1.0 |
28.0 |
0.0 |
0.0 |
2.0 |
0.0 |
0.0 |
9 |
SV1 |
1.0 |
NaN |
2.0 |
35.0 |
0.0 |
0.0 |
0.0 |
0.0 |
-1.0 |
10 |
200000.0 |
2.0 |
NaN |
2.0 |
34.0 |
SV2 |
0.0 |
2.0 |
0.0 |
0.0 |
11 |
260000.0 |
2.0 |
NaN |
2.0 |
51.0 |
SV2 |
-1.0 |
-1.0 |
-1.0 |
-1.0 |
12 |
630000.0 |
2.0 |
NaN |
2.0 |
41.0 |
SV2 |
0.0 |
-1.0 |
-1.0 |
-1.0 |
13 |
70000.0 |
1.0 |
NaN |
2.0 |
30.0 |
SV2 |
2.0 |
2.0 |
0.0 |
0.0 |
14 |
250000.0 |
1.0 |
NaN |
2.0 |
29.0 |
SV2 |
0.0 |
0.0 |
0.0 |
0.0 |
15 |
50000.0 |
2.0 |
NaN |
0.0 |
23.0 |
1.0 |
2.0 |
0.0 |
0.0 |
0.0 |
16 |
20000.0 |
1.0 |
NaN |
2.0 |
24.0 |
0.0 |
0.0 |
2.0 |
2.0 |
2.0 |
17 |
320000.0 |
1.0 |
NaN |
1.0 |
49.0 |
0.0 |
0.0 |
0.0 |
-1.0 |
-1.0 |
18 |
360000.0 |
2.0 |
NaN |
1.0 |
49.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
19 |
180000.0 |
2.0 |
NaN |
2.0 |
29.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
Check if the data has missing values
# summary() returns a result object whose `table` mapping is indexed by name;
# the "summary" entry is the one-row overview shown below (sample and feature
# counts, feature types, duplicated rows, missing and infinite cells).
results = ds.summary()
results.table["summary"]
|
samples |
features |
numerical |
categorical |
mixed |
date |
duplicated |
missing cells |
missing cells (%) |
infinite cells |
infinite cells (%) |
0 |
30000 |
24 |
19 |
3 |
2 |
0 |
68 |
20 |
0.000028 |
0 |
0.0 |
Check the features with special values.
|
missing |
inf |
unique |
mean |
std |
min |
25% |
median |
75% |
max |
categories |
name |
|
|
|
|
|
|
|
|
|
|
|
LIMIT_BAL |
0 |
0 |
81 |
167502.156719 |
129740.266958 |
10000.0 |
50000.0 |
140000.0 |
240000.0 |
1000000.0 |
[SV1] |
PAY_1 |
0 |
0 |
10 |
0.167295 |
0.931323 |
-1.0 |
0.0 |
0.0 |
0.0 |
8.0 |
[SV2] |
Set the data steps
Impute numerical features, and add an indicator column for missing values
# Each call below only registers an imputation step; the steps are executed
# later by ds.preprocess().
# NOTE(review): the final table contains a column named
# EDUCATION_1.7798899449724863, which suggests EDUCATION was mean-imputed
# before being one-hot encoded - confirm which feature list EDUCATION is in.
ds.impute_missing(features=ds.feature_names_numerical, method='mean',
add_indicators=True)
# Impute categorical features with the most frequent value, and add an indicator column for missing values
ds.impute_missing(features=ds.feature_names_categorical, method='most_frequent',
add_indicators=True)
# Impute mixed features, and add an indicator column for missing and special values.
# The list of special values needs to be configured here manually.
ds.impute_missing(features=ds.feature_names_mixed, method='mean',
add_indicators=True, special_values=["SV1", "SV2"])
Encoding categorical features
# Register one-hot encoding for EDUCATION and SEX; after ds.preprocess() these
# appear as indicator columns such as SEX_2.0 and EDUCATION_1.0 in the output.
ds.encode_categorical(features=("EDUCATION", "SEX"), method="onehot")
Encoding categorical features by target encoding. (Note that target encoding uses the response y, so it is better to fit it on the training data only.)
# Register target encoding of MARRIAGE against FlagDefault. dataset="train"
# presumably limits the fit to the training split - confirm against the
# Modeva API. In the final table MARRIAGE holds numeric codes (e.g. 0.234716).
ds.encode_categorical(dataset="train", features=("MARRIAGE", ), method="target", target="FlagDefault")
Scaling numerical features
# Register the scaling steps. PAY_1 and PAY_2 appear twice on purpose: they
# are min-max scaled first and log1p-transformed by the last call, so the two
# transforms are chained on the same columns.
ds.scale_numerical(features=("PAY_1", "PAY_2"), method="minmax")
ds.scale_numerical(features=("LIMIT_BAL", ), method="log1p")
# AGE is squared here and is additionally binned by a later bin_numerical step.
ds.scale_numerical(features=("AGE", ), method="square")
ds.scale_numerical(features=("PAY_AMT1", ), method="quantile")
ds.scale_numerical(features=("PAY_1", "PAY_2",), method="log1p")
Binning numerical features
# Register binning of AGE and PAY_3 with bins=10; in the final table both
# columns hold small integer bin indices instead of the original values.
ds.bin_numerical(features=("AGE", "PAY_3", ), bins=10)
Execute the preprocessing steps defined above
# Run every step registered above (imputation, encoding, scaling, binning).
ds.preprocess()
# Export the processed data as a DataFrame; it has 31 columns instead of the
# original 24 because of the added indicator and one-hot columns.
ds.to_df()
|
LIMIT_BAL |
LIMIT_BAL_special_SV1 |
SEX_2.0 |
EDUCATION_1.0 |
EDUCATION_1.7798899449724863 |
EDUCATION_2.0 |
EDUCATION_3.0 |
EDUCATION_missing_nan |
MARRIAGE |
AGE |
AGE_missing_nan |
PAY_1 |
PAY_1_special_SV2 |
PAY_2 |
PAY_3 |
PAY_4 |
PAY_5 |
PAY_6 |
BILL_AMT1 |
BILL_AMT2 |
BILL_AMT3 |
BILL_AMT4 |
BILL_AMT5 |
BILL_AMT6 |
PAY_AMT1 |
PAY_AMT2 |
PAY_AMT3 |
PAY_AMT4 |
PAY_AMT5 |
PAY_AMT6 |
FlagDefault |
0 |
12.028757 |
1.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.234716 |
1 |
1.0 |
0.287682 |
0.0 |
0.287682 |
0 |
-1.0 |
0.0 |
0.0 |
3.592621 |
3.491782 |
2.838849 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
2.838849 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.0 |
1 |
12.028757 |
1.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.209284 |
1 |
1.0 |
0.000000 |
0.0 |
0.287682 |
1 |
0.0 |
0.0 |
2.0 |
3.428621 |
3.237041 |
3.428621 |
3.514946 |
3.538574 |
3.513484 |
0.000000 |
3.000434 |
3.000434 |
3.000434 |
0.000000 |
3.301247 |
1.0 |
2 |
12.028757 |
1.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.209284 |
1 |
1.0 |
0.105361 |
0.0 |
0.105361 |
1 |
0.0 |
0.0 |
0.0 |
4.465977 |
4.146996 |
4.132260 |
4.156307 |
4.174612 |
4.191731 |
0.358358 |
3.176381 |
3.000434 |
3.000434 |
3.000434 |
3.699057 |
0.0 |
3 |
12.028757 |
1.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.234716 |
1 |
1.0 |
0.105361 |
0.0 |
0.105361 |
1 |
0.0 |
0.0 |
0.0 |
4.672015 |
4.683353 |
4.692776 |
4.452016 |
4.461799 |
4.470528 |
0.451952 |
3.305351 |
3.079543 |
3.041787 |
3.029384 |
3.000434 |
0.0 |
4 |
12.028757 |
1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.234716 |
1 |
1.0 |
0.000000 |
0.0 |
0.105361 |
0 |
0.0 |
0.0 |
0.0 |
3.935406 |
3.753660 |
4.554319 |
4.320997 |
4.282101 |
4.281760 |
0.451952 |
4.564453 |
4.000043 |
3.954291 |
2.838849 |
2.832509 |
0.0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
29995 |
12.301387 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.234716 |
1 |
0.0 |
0.105361 |
0.0 |
0.105361 |
1 |
0.0 |
0.0 |
0.0 |
5.276345 |
5.285143 |
5.318827 |
4.944507 |
4.494683 |
4.203604 |
0.854855 |
4.301052 |
3.699317 |
3.484015 |
3.699057 |
3.000434 |
0.0 |
29996 |
11.918397 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.209284 |
2 |
0.0 |
0.000000 |
0.0 |
0.000000 |
0 |
-1.0 |
0.0 |
0.0 |
3.226342 |
3.262214 |
3.544440 |
3.953276 |
3.715251 |
0.000000 |
0.415164 |
3.547405 |
3.954194 |
2.113943 |
0.000000 |
0.000000 |
0.0 |
29997 |
10.308986 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.209284 |
1 |
0.0 |
0.441833 |
0.0 |
0.367725 |
3 |
-1.0 |
0.0 |
0.0 |
3.552181 |
3.525951 |
3.440752 |
4.319710 |
4.313509 |
4.286861 |
0.000000 |
0.000000 |
4.342442 |
3.623353 |
3.301247 |
3.491502 |
1.0 |
29998 |
11.289794 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.234716 |
2 |
0.0 |
0.200671 |
0.0 |
0.000000 |
1 |
0.0 |
0.0 |
-1.0 |
-3.216430 |
4.894205 |
4.882553 |
4.722428 |
4.073938 |
4.689708 |
0.993234 |
3.532754 |
3.071514 |
3.284882 |
4.723989 |
3.256477 |
1.0 |
29999 |
10.819798 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.234716 |
2 |
0.0 |
0.105361 |
0.0 |
0.105361 |
1 |
0.0 |
0.0 |
0.0 |
4.680607 |
4.689362 |
4.696924 |
4.562721 |
4.510933 |
4.185089 |
0.487169 |
3.255514 |
3.155640 |
3.000434 |
3.000434 |
3.000434 |
1.0 |
30000 rows × 31 columns
Total running time of the script: (0 minutes 4.055 seconds)
Gallery generated by Sphinx-Gallery