Exploratory Data Analysis#

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Load TaiwanCredit Dataset

# Create a DataSet object and load the TaiwanCredit dataset by name.
# `ds` is reused by every EDA call in the rest of this script.
from modeva import DataSet
ds = DataSet()
ds.load("TaiwanCredit")

Data summary#

# Compute the data summary once; the later table lookups ("numerical",
# "categorical", "mixed") read from this same `result` object.
result = ds.summary()
# Overall dataset-level table (sample/feature counts, missing and infinite cells).
result.table["summary"]
samples features numerical categorical mixed date duplicated missing cells missing cells (%) infinite cells infinite cells (%)
0 30000 24 20 4 0 0 70 0 0.0 0 0.0


Data summary results for numerical variables

# Per-feature statistics for the numerical variables, taken from the summary result.
result.table["numerical"]
missing inf unique mean std min 25% median 75% max
name
LIMIT_BAL 0 0 81 167484.322667 129745.499088 10000.000000 50000.000000 140000.000000 240000.000000 1000000.000000
AGE 0 0 56 35.485500 9.217750 21.000000 28.000000 34.000000 41.000000 79.000000
PAY_1 0 0 10 0.167233 0.931307 -1.000000 0.000000 0.000000 0.000000 8.000000
PAY_2 0 0 10 0.118367 0.965827 -1.000000 0.000000 0.000000 0.000000 8.000000
PAY_3 0 0 10 0.106133 0.950861 -1.000000 0.000000 0.000000 0.000000 8.000000
PAY_4 0 0 10 0.069200 0.911598 -1.000000 0.000000 0.000000 0.000000 8.000000
PAY_5 0 0 9 0.036867 0.864547 -1.000000 0.000000 0.000000 0.000000 8.000000
PAY_6 0 0 9 0.035233 0.867905 -1.000000 0.000000 0.000000 0.000000 8.000000
BILL_AMT1 0 0 22723 3.885958 1.547011 -5.219010 3.551419 4.349909 4.826671 5.984308
BILL_AMT2 0 0 22346 3.795420 1.654506 -4.843718 3.475053 4.326357 4.806229 5.992965
BILL_AMT3 0 0 22026 3.739384 1.696812 -5.196632 3.426064 4.302969 4.779349 6.221177
BILL_AMT4 0 0 21548 3.670062 1.733875 -5.230452 3.366936 4.279964 4.736452 5.950164
BILL_AMT5 0 0 21010 3.599323 1.762511 -4.910277 3.246499 4.257811 4.700630 5.967160
BILL_AMT6 0 0 20604 3.507501 1.833031 -5.530973 3.099335 4.232285 4.691958 5.983024
PAY_AMT1 0 0 7943 2.879374 1.411566 0.000000 3.000434 3.322426 3.699578 5.941289
PAY_AMT2 0 0 7899 2.850124 1.423921 0.000000 2.921166 3.303196 3.699057 6.226409
PAY_AMT3 0 0 7518 2.728754 1.454741 0.000000 2.592177 3.255514 3.653791 5.952328
PAY_AMT4 0 0 6937 2.639519 1.475099 0.000000 2.472756 3.176381 3.603604 5.793092
PAY_AMT5 0 0 6897 2.619748 1.495683 0.000000 2.403976 3.176381 3.605574 5.629950
PAY_AMT6 0 0 6939 2.576596 1.532205 0.000000 2.074631 3.176381 3.602169 5.723182


Data summary results for categorical variables

# Per-feature statistics for the categorical variables (unique count and top categories with counts).
result.table["categorical"]
missing unique top1 top1-counts top2 top2-counts top3 top3-counts
name
SEX 0 2 2.0 18112 1.0 11888 NaN NaN
EDUCATION 0 4 2.0 14030 1.0 10585 3.0 4917.0
MARRIAGE 0 3 2.0 15964 1.0 13659 0.0 377.0
FlagDefault 0 2 0.0 23364 1.0 6636 NaN NaN


Data summary results for mixed numerical and categorical variables

# Table for mixed numerical/categorical variables; the summary reports 0 mixed
# features for this dataset, so only the header is displayed.
result.table["mixed"]
missing inf unique mean std min 25% median 75% max categories
name


EDA 1D#

EDA 1D by density

# 1D view of PAY_1. No plot_type is passed, so the library default is used
# (the section heading indicates this is the density plot).
result = ds.eda_1d(feature="PAY_1")
result.plot()


EDA 1D by histogram

# 1D view of BILL_AMT1, explicitly requesting a histogram.
result = ds.eda_1d(feature="BILL_AMT1", plot_type="histogram")
result.plot()


EDA 2D#

EDA 2D with 2 numerical features

# 2D view of two numerical features: BILL_AMT1 on x, PAY_1 on y.
# NOTE(review): sample_size=1000 presumably limits how many rows are used — confirm in the modeva docs.
result = ds.eda_2d(feature_x="BILL_AMT1", feature_y="PAY_1", sample_size=1000)
result.plot()


EDA 2D with color and smoothing curve

# 2D view of BILL_AMT1 vs BILL_AMT2, with SEX passed as the colour feature.
# NOTE(review): smoother_order=2 presumably sets the order of the smoothing curve
# mentioned in the section heading — confirm in the modeva docs.
result = ds.eda_2d(feature_x="BILL_AMT1", feature_y="BILL_AMT2", feature_color="SEX", sample_size=1000,
                   smoother_order=2)
# Explicit figure size for this plot; the other plots use the default.
result.plot(figsize=(6, 5))


EDA 2D between numerical and categorical variables

# 2D view of a categorical feature (SEX, x-axis) against a numerical one (BILL_AMT1, y-axis).
result = ds.eda_2d(feature_x="SEX", feature_y="BILL_AMT1")
result.plot()


EDA 2D between two categorical variables

# 2D view of two features that the data summary lists as categorical: MARRIAGE (x) and SEX (y).
result = ds.eda_2d(feature_x="MARRIAGE", feature_y="SEX")
result.plot()


EDA 3D#

# 3D view: SEX, PAY_1 and BILL_AMT1 on the x/y/z axes, with EDUCATION as the colour feature.
# NOTE(review): sample_size=1000 presumably limits how many rows are used — confirm in the modeva docs.
result = ds.eda_3d(feature_x="SEX", feature_y="PAY_1", feature_z="BILL_AMT1", feature_color="EDUCATION",
                   sample_size=1000)
result.plot()


Correlation#

# Correlation analysis restricted to the six PAY_* features, run against the
# "main" dataset with sample_size=10000.
result = ds.eda_correlation(features=('PAY_1',
                                      'PAY_2',
                                      'PAY_3',
                                      'PAY_4',
                                      'PAY_5',
                                      'PAY_6'),
                            dataset="main", sample_size=10000)
result.plot()


PCA#

# PCA over EDUCATION, MARRIAGE and the six PAY_* features on the "main" dataset.
# sample_size=None is passed here, unlike the other calls that pass an integer.
# NOTE(review): n_components=10 is larger than the 8 features listed; confirm that
# modeva caps the component count instead of raising, or lower it to 8.
result = ds.eda_pca(features=("EDUCATION",
                              "MARRIAGE",
                              'PAY_1',
                              'PAY_2',
                              'PAY_3',
                              'PAY_4',
                              'PAY_5',
                              'PAY_6'),
                    n_components=10, dataset="main", sample_size=None)
result.plot()


UMAP#

# UMAP embedding of the six PAY_* features into 2 components on the "main" dataset.
result = ds.eda_umap(features=('PAY_1',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6'), n_components=2, dataset="main", sample_size=1000)
# Show the embedding as a table instead of plotting it; the displayed output has
# columns D1 and D2 and 1000 rows, matching sample_size=1000.
result.table
D1 D2
0 -1.251380 -14.637231
1 3.678666 0.151090
2 -4.441707 -3.906131
3 3.061519 17.337599
4 3.056328 17.484224
... ... ...
995 2.250372 16.981730
996 -14.782492 4.023606
997 1.817664 17.244024
998 8.220098 0.437721
999 -8.766402 11.237566

1000 rows × 2 columns



Total running time of the script: (0 minutes 43.831 seconds)

Gallery generated by Sphinx-Gallery