Note

Go to the end to download the full example code.

Exploratory Data Analysis#

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following command (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Load TaiwanCredit Dataset

from modeva import DataSet
# Create the DataSet object that every later snippet on this page calls into.
ds = DataSet()
# Load the data by name only (no file path is given).
# NOTE(review): "TaiwanCredit" is presumably a dataset bundled with modeva — confirm.
ds.load("TaiwanCredit")

Data summary#

# Compute the dataset summary once; `result` is reused by the lookups that follow.
result = ds.summary()
# result.table is indexed by section name; "summary" is the one-row overview
# (sample/feature counts, duplicates, missing and infinite cells).
result.table["summary"]

	samples	features	numerical	categorical	mixed	date	duplicated	missing cells	missing cells (%)	infinite cells	infinite cells (%)
0	30000	24	20	4	0	0	70	0	0.0	0	0.0

Data summary results for numerical variables

# Per-feature statistics for the numerical columns, taken from the summary computed earlier.
result.table["numerical"]

	missing	inf	unique	mean	std	min	25%	median	75%	max
name
LIMIT_BAL	0	0	81	167484.322667	129745.499088	10000.000000	50000.000000	140000.000000	240000.000000	1000000.000000
AGE	0	0	56	35.485500	9.217750	21.000000	28.000000	34.000000	41.000000	79.000000
PAY_1	0	0	10	0.167233	0.931307	-1.000000	0.000000	0.000000	0.000000	8.000000
PAY_2	0	0	10	0.118367	0.965827	-1.000000	0.000000	0.000000	0.000000	8.000000
PAY_3	0	0	10	0.106133	0.950861	-1.000000	0.000000	0.000000	0.000000	8.000000
PAY_4	0	0	10	0.069200	0.911598	-1.000000	0.000000	0.000000	0.000000	8.000000
PAY_5	0	0	9	0.036867	0.864547	-1.000000	0.000000	0.000000	0.000000	8.000000
PAY_6	0	0	9	0.035233	0.867905	-1.000000	0.000000	0.000000	0.000000	8.000000
BILL_AMT1	0	0	22723	3.885958	1.547011	-5.219010	3.551419	4.349909	4.826671	5.984308
BILL_AMT2	0	0	22346	3.795420	1.654506	-4.843718	3.475053	4.326357	4.806229	5.992965
BILL_AMT3	0	0	22026	3.739384	1.696812	-5.196632	3.426064	4.302969	4.779349	6.221177
BILL_AMT4	0	0	21548	3.670062	1.733875	-5.230452	3.366936	4.279964	4.736452	5.950164
BILL_AMT5	0	0	21010	3.599323	1.762511	-4.910277	3.246499	4.257811	4.700630	5.967160
BILL_AMT6	0	0	20604	3.507501	1.833031	-5.530973	3.099335	4.232285	4.691958	5.983024
PAY_AMT1	0	0	7943	2.879374	1.411566	0.000000	3.000434	3.322426	3.699578	5.941289
PAY_AMT2	0	0	7899	2.850124	1.423921	0.000000	2.921166	3.303196	3.699057	6.226409
PAY_AMT3	0	0	7518	2.728754	1.454741	0.000000	2.592177	3.255514	3.653791	5.952328
PAY_AMT4	0	0	6937	2.639519	1.475099	0.000000	2.472756	3.176381	3.603604	5.793092
PAY_AMT5	0	0	6897	2.619748	1.495683	0.000000	2.403976	3.176381	3.605574	5.629950
PAY_AMT6	0	0	6939	2.576596	1.532205	0.000000	2.074631	3.176381	3.602169	5.723182

Data summary results for categorical variables

# Per-feature missing/unique counts and top categories for the categorical columns.
result.table["categorical"]

	missing	unique	top1	top1-counts	top2	top2-counts	top3	top3-counts
name
SEX	0	2	2.0	18112	1.0	11888	NaN	NaN
EDUCATION	0	4	2.0	14030	1.0	10585	3.0	4917.0
MARRIAGE	0	3	2.0	15964	1.0	13659	0.0	377.0
FlagDefault	0	2	0.0	23364	1.0	6636	NaN	NaN

Data summary results for mixed numerical and categorical variables

# The overview reports 0 mixed features for this dataset, so this table has headers but no rows.
result.table["mixed"]

	missing	inf	unique	mean	std	min	25%	median	75%	max	categories
name

EDA 1D#

EDA 1D by density

# No plot_type is passed here; per the section title this produces the density view.
result = ds.eda_1d(feature="PAY_1")
# Render the figure for the result computed above.
result.plot()

EDA 1D by histogram

# Same 1D analysis, but explicitly requesting a histogram via plot_type.
result = ds.eda_1d(feature="BILL_AMT1", plot_type="histogram")
# Render the figure for the result computed above.
result.plot()

EDA 2D#

EDA 2D with 2 numerical features

# Both BILL_AMT1 and PAY_1 are listed among the numerical features in the summary.
# NOTE(review): sample_size=1000 presumably limits the plot to 1000 sampled rows — confirm.
result = ds.eda_2d(feature_x="BILL_AMT1", feature_y="PAY_1", sample_size=1000)
# Render the figure for the result computed above.
result.plot()

EDA 2D with color and smoothing curve

# Two numerical features on the axes, with a third feature (SEX) passed as the color.
# NOTE(review): smoother_order=2 presumably sets the order of the smoothing curve
# named in the section title — confirm against the Modeva documentation.
result = ds.eda_2d(
    feature_x="BILL_AMT1",
    feature_y="BILL_AMT2",
    feature_color="SEX",
    sample_size=1000,
    smoother_order=2,
)
# figsize is passed to plot() here, unlike the other snippets on this page.
result.plot(figsize=(6, 5))

EDA 2D between numerical and categorical variables

# SEX is listed as categorical and BILL_AMT1 as numerical in the summary,
# so this is the categorical-vs-numerical case. No sample_size is passed here.
result = ds.eda_2d(feature_x="SEX", feature_y="BILL_AMT1")
# Render the figure for the result computed above.
result.plot()

EDA 2D between two categorical variables

# MARRIAGE and SEX are both listed as categorical in the summary.
result = ds.eda_2d(feature_x="MARRIAGE", feature_y="SEX")
# Render the figure for the result computed above.
result.plot()

EDA 3D#

# Three features on the x/y/z arguments plus a fourth (EDUCATION) passed as the color.
# NOTE(review): sample_size=1000 presumably limits the plot to 1000 sampled rows — confirm.
result = ds.eda_3d(
    feature_x="SEX",
    feature_y="PAY_1",
    feature_z="BILL_AMT1",
    feature_color="EDUCATION",
    sample_size=1000,
)
# Render the figure for the result computed above.
result.plot()

Correlation#

# Correlation analysis restricted to the six PAY_* features.
# NOTE(review): dataset="main" presumably selects the full (unsplit) data and
# sample_size=10000 presumably caps the rows used — confirm both against the Modeva docs.
result = ds.eda_correlation(
    features=("PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"),
    dataset="main",
    sample_size=10000,
)
# Render the figure for the result computed above.
result.plot()

PCA#

# PCA over two categorical features (EDUCATION, MARRIAGE) and the six PAY_* features.
# NOTE(review): n_components=10 exceeds the eight features listed here; this presumably
# works because the categorical features are expanded before PCA — confirm.
# sample_size=None is passed explicitly, unlike the other snippets, which pass a number.
result = ds.eda_pca(
    features=(
        "EDUCATION",
        "MARRIAGE",
        "PAY_1",
        "PAY_2",
        "PAY_3",
        "PAY_4",
        "PAY_5",
        "PAY_6",
    ),
    n_components=10,
    dataset="main",
    sample_size=None,
)
# Render the figure for the result computed above.
result.plot()

UMAP#

# UMAP embedding of the six PAY_* features.
result = ds.eda_umap(
    features=("PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"),
    n_components=2,
    dataset="main",
    sample_size=1000,
)
# Show the embedding as a table instead of plotting it. The rendered output has
# 1000 rows and two columns (D1, D2), matching sample_size and n_components.
result.table

	D1	D2
0	-1.251380	-14.637231
1	3.678666	0.151090
2	-4.441707	-3.906131
3	3.061519	17.337599
4	3.056328	17.484224
...	...	...
995	2.250372	16.981730
996	-14.782492	4.023606
997	1.817664	17.244024
998	8.220098	0.437721
999	-8.766402	11.237566

1000 rows × 2 columns

Total running time of the script: (0 minutes 43.831 seconds)

Gallery generated by Sphinx-Gallery