Data Processing and Feature Engineering#

This example requires a full licence; the program will fail if you run it with the trial licence.

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following command (for full access, replace the token with your own token):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

import pandas as pd
from modeva import DataSet
from modeva.data.utils.loading import load_builtin_data

Manually create data with special and missing values

# Build a demo frame containing both special (sentinel) values and missing values.
data = load_builtin_data("TaiwanCredit")

# The sentinel strings are written into otherwise numeric columns, so make those
# columns object-typed explicitly instead of relying on an implicit upcast
# (pandas deprecates setting values of an incompatible dtype).
data = data.astype({"LIMIT_BAL": object, "PAY_1": object})

# Assign with a single .loc call per column. Chained indexing
# (data[col].iloc[...] = value) is not guaranteed to write back to `data`
# and does nothing under pandas Copy-on-Write.
data.loc[data.index[:10], "LIMIT_BAL"] = "SV1"     # special value in rows 0-9
data.loc[data.index[10:15], "PAY_1"] = "SV2"       # special value in rows 10-14
data.loc[data.index[5:20], "EDUCATION"] = pd.NA    # missing values in rows 5-19
data.loc[data.index[0:5], "AGE"] = pd.NA           # missing values in rows 0-4
data
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 FlagDefault
0 SV1 2.0 2.0 1.0 NaN 2.0 2.0 -1.0 -1.0 0.0 0.0 3.592621 3.491782 2.838849 0.000000 0.000000 0.000000 0.000000 2.838849 0.000000 0.000000 0.000000 0.000000 1.0
1 SV1 2.0 2.0 2.0 NaN -1.0 2.0 0.0 0.0 0.0 2.0 3.428621 3.237041 3.428621 3.514946 3.538574 3.513484 0.000000 3.000434 3.000434 3.000434 0.000000 3.301247 1.0
2 SV1 2.0 2.0 2.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 4.465977 4.146996 4.132260 4.156307 4.174612 4.191731 3.181558 3.176381 3.000434 3.000434 3.000434 3.699057 0.0
3 SV1 2.0 2.0 1.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 4.672015 4.683353 4.692776 4.452016 4.461799 4.470528 3.301247 3.305351 3.079543 3.041787 3.029384 3.000434 0.0
4 SV1 1.0 2.0 1.0 NaN -1.0 0.0 -1.0 0.0 0.0 0.0 3.935406 3.753660 4.554319 4.320997 4.282101 4.281760 3.301247 4.564453 4.000043 3.954291 2.838849 2.832509 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29995 220000.0 1.0 3.0 1.0 39.0 0.0 0.0 0.0 0.0 0.0 0.0 5.276345 5.285143 5.318827 4.944507 4.494683 4.203604 3.929470 4.301052 3.699317 3.484015 3.699057 3.000434 0.0
29996 150000.0 1.0 3.0 2.0 43.0 -1.0 -1.0 -1.0 -1.0 0.0 0.0 3.226342 3.262214 3.544440 3.953276 3.715251 0.000000 3.264345 3.547405 3.954194 2.113943 0.000000 0.000000 0.0
29997 30000.0 1.0 2.0 2.0 37.0 4.0 3.0 2.0 -1.0 0.0 0.0 3.552181 3.525951 3.440752 4.319710 4.313509 4.286861 0.000000 0.000000 4.342442 3.623353 3.301247 3.491502 1.0
29998 80000.0 1.0 3.0 1.0 41.0 1.0 -1.0 0.0 0.0 0.0 -1.0 -3.216430 4.894205 4.882553 4.722428 4.073938 4.689708 4.933998 3.532754 3.071514 3.284882 4.723989 3.256477 1.0
29999 50000.0 1.0 2.0 1.0 46.0 0.0 0.0 0.0 0.0 0.0 0.0 4.680607 4.689362 4.696924 4.562721 4.510933 4.185089 3.317854 3.255514 3.155640 3.000434 3.000434 3.000434 1.0

30000 rows × 24 columns



Data load and summary#

Load the dataframe into Modeva

# Wrap the prepared frame in a Modeva DataSet named "TW-Credit".
ds = DataSet(name="TW-Credit")
ds.load_dataframe(data)

# Split the data using set_random_split's default settings.
ds.set_random_split()

# Preview the first 20 rows and the first 10 columns.
ds.data.iloc[:20, :10]
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 PAY_5
0 SV1 2.0 2.0 1.0 NaN 2.0 2.0 -1.0 -1.0 0.0
1 SV1 2.0 2.0 2.0 NaN -1.0 2.0 0.0 0.0 0.0
2 SV1 2.0 2.0 2.0 NaN 0.0 0.0 0.0 0.0 0.0
3 SV1 2.0 2.0 1.0 NaN 0.0 0.0 0.0 0.0 0.0
4 SV1 1.0 2.0 1.0 NaN -1.0 0.0 -1.0 0.0 0.0
5 SV1 1.0 NaN 2.0 37.0 0.0 0.0 0.0 0.0 0.0
6 SV1 1.0 NaN 2.0 29.0 0.0 0.0 0.0 0.0 0.0
7 SV1 2.0 NaN 2.0 23.0 0.0 -1.0 -1.0 0.0 0.0
8 SV1 2.0 NaN 1.0 28.0 0.0 0.0 2.0 0.0 0.0
9 SV1 1.0 NaN 2.0 35.0 0.0 0.0 0.0 0.0 -1.0
10 200000.0 2.0 NaN 2.0 34.0 SV2 0.0 2.0 0.0 0.0
11 260000.0 2.0 NaN 2.0 51.0 SV2 -1.0 -1.0 -1.0 -1.0
12 630000.0 2.0 NaN 2.0 41.0 SV2 0.0 -1.0 -1.0 -1.0
13 70000.0 1.0 NaN 2.0 30.0 SV2 2.0 2.0 0.0 0.0
14 250000.0 1.0 NaN 2.0 29.0 SV2 0.0 0.0 0.0 0.0
15 50000.0 2.0 NaN 0.0 23.0 1.0 2.0 0.0 0.0 0.0
16 20000.0 1.0 NaN 2.0 24.0 0.0 0.0 2.0 2.0 2.0
17 320000.0 1.0 NaN 1.0 49.0 0.0 0.0 0.0 -1.0 -1.0
18 360000.0 2.0 NaN 1.0 49.0 1.0 0.0 0.0 0.0 0.0
19 180000.0 2.0 NaN 2.0 29.0 1.0 0.0 0.0 0.0 0.0


Check if the data has missing values

# Profile the dataset; the result exposes its tables through `results.table`.
results = ds.summary()
# Dataset-level overview: sample/feature counts, feature types, and
# duplicated, missing and infinite cell counts.
results.table["summary"]
samples features numerical categorical mixed date duplicated missing cells missing cells (%) infinite cells infinite cells (%)
0 30000 24 19 3 2 0 68 20 0.000028 0 0.0


Check the features with special values.

# Per-feature statistics for the mixed-type columns; the `categories` column
# lists the non-numeric tokens found in each (here SV1 and SV2).
results.table["mixed"]
missing inf unique mean std min 25% median 75% max categories
name
LIMIT_BAL 0 0 81 167502.156719 129740.266958 10000.0 50000.0 140000.0 240000.0 1000000.0 [SV1]
PAY_1 0 0 10 0.167295 0.931323 -1.0 0.0 0.0 0.0 8.0 [SV2]


Reset preprocessing#

# Reset preprocessing before registering the steps below.
# NOTE(review): presumably this discards any previously registered steps — confirm against the Modeva docs.
ds.reset_preprocess()

Set the data steps#

Impute numerical features, and add an indicator column for missing values

# Numerical features: impute with the mean and add an indicator column for missing values.
ds.impute_missing(
    features=ds.feature_names_numerical,
    method="mean",
    add_indicators=True,
)

# Categorical features: impute with the most frequent value and add an
# indicator column for missing values.
ds.impute_missing(
    features=ds.feature_names_categorical,
    method="most_frequent",
    add_indicators=True,
)

# Mixed features: impute with the mean and add indicator columns for missing
# and special values. The list of special values needs to be configured
# manually here.
ds.impute_missing(
    features=ds.feature_names_mixed,
    method="mean",
    add_indicators=True,
    special_values=["SV1", "SV2"],
)

Encoding categorical features

# One-hot encode EDUCATION and SEX.
# NOTE(review): the processed output further down contains an
# EDUCATION_1.7798899449724863 column, which suggests EDUCATION was mean-imputed
# as a numerical feature before being encoded — confirm this is intended.
ds.encode_categorical(features=("EDUCATION", "SEX"), method="onehot")

Encoding categorical features with target encoding. (Note that this uses the target y, so it is better to fit it on the training data only.)

# Target-encode MARRIAGE against FlagDefault. dataset="train" is passed because
# this encoding depends on the target values.
ds.encode_categorical(dataset="train", features=("MARRIAGE", ), method="target", target="FlagDefault")

Scaling numerical features

# Register one scaling step per call; a feature may appear in several steps.
# Min-max scale PAY_1 and PAY_2.
ds.scale_numerical(features=("PAY_1", "PAY_2"), method="minmax")
# log1p-transform LIMIT_BAL.
ds.scale_numerical(features=("LIMIT_BAL", ), method="log1p")
# Square AGE.
ds.scale_numerical(features=("AGE", ), method="square")
# Quantile-transform PAY_AMT1.
ds.scale_numerical(features=("PAY_AMT1", ), method="quantile")
# PAY_1 and PAY_2 get a second step: the processed output is consistent with
# log1p being applied on top of the min-max scaled values registered above.
ds.scale_numerical(features=("PAY_1", "PAY_2",), method="log1p")

Binning numerical features

# Discretise AGE and PAY_3 with bins=10.
ds.bin_numerical(features=("AGE", "PAY_3", ), bins=10)

Execute the preprocessing steps defined above#

# Run the preprocessing steps registered above.
ds.preprocess()
# Show the processed data as a table.
ds.to_df()
LIMIT_BAL LIMIT_BAL_special_SV1 SEX_2.0 EDUCATION_1.0 EDUCATION_1.7798899449724863 EDUCATION_2.0 EDUCATION_3.0 EDUCATION_missing_nan MARRIAGE AGE AGE_missing_nan PAY_1 PAY_1_special_SV2 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 FlagDefault
0 12.028757 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.234716 1 1.0 0.287682 0.0 0.287682 0 -1.0 0.0 0.0 3.592621 3.491782 2.838849 0.000000 0.000000 0.000000 0.000000 2.838849 0.000000 0.000000 0.000000 0.000000 1.0
1 12.028757 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.209284 1 1.0 0.000000 0.0 0.287682 1 0.0 0.0 2.0 3.428621 3.237041 3.428621 3.514946 3.538574 3.513484 0.000000 3.000434 3.000434 3.000434 0.000000 3.301247 1.0
2 12.028757 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.209284 1 1.0 0.105361 0.0 0.105361 1 0.0 0.0 0.0 4.465977 4.146996 4.132260 4.156307 4.174612 4.191731 0.358358 3.176381 3.000434 3.000434 3.000434 3.699057 0.0
3 12.028757 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.234716 1 1.0 0.105361 0.0 0.105361 1 0.0 0.0 0.0 4.672015 4.683353 4.692776 4.452016 4.461799 4.470528 0.451952 3.305351 3.079543 3.041787 3.029384 3.000434 0.0
4 12.028757 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.234716 1 1.0 0.000000 0.0 0.105361 0 0.0 0.0 0.0 3.935406 3.753660 4.554319 4.320997 4.282101 4.281760 0.451952 4.564453 4.000043 3.954291 2.838849 2.832509 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29995 12.301387 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.234716 1 0.0 0.105361 0.0 0.105361 1 0.0 0.0 0.0 5.276345 5.285143 5.318827 4.944507 4.494683 4.203604 0.854855 4.301052 3.699317 3.484015 3.699057 3.000434 0.0
29996 11.918397 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.209284 2 0.0 0.000000 0.0 0.000000 0 -1.0 0.0 0.0 3.226342 3.262214 3.544440 3.953276 3.715251 0.000000 0.415164 3.547405 3.954194 2.113943 0.000000 0.000000 0.0
29997 10.308986 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.209284 1 0.0 0.441833 0.0 0.367725 3 -1.0 0.0 0.0 3.552181 3.525951 3.440752 4.319710 4.313509 4.286861 0.000000 0.000000 4.342442 3.623353 3.301247 3.491502 1.0
29998 11.289794 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.234716 2 0.0 0.200671 0.0 0.000000 1 0.0 0.0 -1.0 -3.216430 4.894205 4.882553 4.722428 4.073938 4.689708 0.993234 3.532754 3.071514 3.284882 4.723989 3.256477 1.0
29999 10.819798 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.234716 2 0.0 0.105361 0.0 0.105361 1 0.0 0.0 0.0 4.680607 4.689362 4.696924 4.562721 4.510933 4.185089 0.487169 3.255514 3.155640 3.000434 3.000434 3.000434 1.0

30000 rows × 31 columns



Total running time of the script: (0 minutes 4.055 seconds)

Gallery generated by Sphinx-Gallery