Outlier Detection#

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

from modeva import DataSet

Load the BikeSharing dataset

from sklearn.datasets import make_friedman1

# Create an empty DataSet and load the dataset named "BikeSharing" into it.
# NOTE(review): `make_friedman1` imported above is never used in this script.
ds = DataSet()
ds.load("BikeSharing")

Outlier detection by CBLOF#

# Run CBLOF outlier detection on the "main" dataset using the "kmeans" method
# and a threshold of 0.9, then plot the result.
# NOTE(review): the exact meaning of `threshold` is not shown here — confirm
# against the Modeva API reference.
results = ds.detect_outlier_cblof(dataset="main", method="kmeans", threshold=0.9)
results.plot()

Outlier detection by Isolation forest#

# Run Isolation Forest outlier detection with the library's default arguments
# (no dataset or threshold is passed explicitly), then plot the result.
results = ds.detect_outlier_isolation_forest()
results.plot()

Outlier detection by PCA#

# Run PCA-based outlier detection on the "main" dataset with the
# "reconst_error" method. `results` now refers to the PCA run, which is the
# one inspected in the rest of this script.
results = ds.detect_outlier_pca(dataset="main", method="reconst_error")
# Keep the row index of the flagged samples so they can be deactivated later.
outliers_sample_index = results.table['outliers'].index
results.plot()

View and use outlier detection results#

Outliers table

# Display the rows flagged as outliers by the PCA run above.
results.table['outliers']

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed
115	1.0	0.0	1.0	0.0	0.0	4.0	1.0	1.0	0.18	0.2424	0.55	0.0000
479	1.0	0.0	1.0	0.0	0.0	6.0	0.0	1.0	0.04	0.0303	0.45	0.2537
585	1.0	0.0	1.0	16.0	0.0	3.0	1.0	4.0	0.22	0.1970	0.93	0.3284
685	1.0	0.0	1.0	21.0	0.0	1.0	1.0	3.0	0.16	0.1970	0.59	0.0896
966	1.0	0.0	2.0	21.0	0.0	6.0	0.0	1.0	0.26	0.3030	0.41	0.0000
...	...	...	...	...	...	...	...	...	...	...	...	...
17041	4.0	1.0	12.0	20.0	0.0	1.0	1.0	2.0	0.42	0.4242	0.94	0.2537
17042	4.0	1.0	12.0	21.0	0.0	1.0	1.0	2.0	0.42	0.4242	0.94	0.1343
17043	4.0	1.0	12.0	22.0	0.0	1.0	1.0	2.0	0.42	0.4242	0.94	0.1343
17319	1.0	1.0	12.0	12.0	0.0	6.0	0.0	3.0	0.20	0.2424	1.00	0.0000
17320	1.0	1.0	12.0	13.0	0.0	6.0	0.0	3.0	0.20	0.2424	1.00	0.0000

174 rows × 12 columns

Non-outliers table

# Display the rows that the PCA run above did not flag as outliers.
results.table['non-outliers']

	season	yr	mnth	hr	holiday	weekday	workingday	weathersit	temp	atemp	hum	windspeed
0	1.0	0.0	1.0	0.0	0.0	6.0	0.0	1.0	0.24	0.2879	0.81	0.0000
1	1.0	0.0	1.0	1.0	0.0	6.0	0.0	1.0	0.22	0.2727	0.80	0.0000
2	1.0	0.0	1.0	2.0	0.0	6.0	0.0	1.0	0.22	0.2727	0.80	0.0000
3	1.0	0.0	1.0	3.0	0.0	6.0	0.0	1.0	0.24	0.2879	0.75	0.0000
4	1.0	0.0	1.0	4.0	0.0	6.0	0.0	1.0	0.24	0.2879	0.75	0.0000
...	...	...	...	...	...	...	...	...	...	...	...	...
17374	1.0	1.0	12.0	19.0	0.0	1.0	1.0	2.0	0.26	0.2576	0.60	0.1642
17375	1.0	1.0	12.0	20.0	0.0	1.0	1.0	2.0	0.26	0.2576	0.60	0.1642
17376	1.0	1.0	12.0	21.0	0.0	1.0	1.0	1.0	0.26	0.2576	0.60	0.1642
17377	1.0	1.0	12.0	22.0	0.0	1.0	1.0	1.0	0.26	0.2727	0.56	0.1343
17378	1.0	1.0	12.0	23.0	0.0	1.0	1.0	1.0	0.26	0.2727	0.65	0.1343

17205 rows × 12 columns

Evaluate outlier scores of the outlier samples

# Score the flagged samples with the detector's scoring function; the result
# is an array with one score per row.
# NOTE(review): presumably these are PCA reconstruction errors, given
# method="reconst_error" — confirm.
results.func(results.table['outliers'])

array([ 148.03891696,  182.56906283, 5733.14875539,  147.3346861 ,
        200.58798145,  222.27130596,  225.13066686,  238.00511972,
        233.38877612,  619.40173483,  609.04067592,  611.7782731 ,
        311.7196138 ,  438.32731152,  332.00499824,  184.66824562,
        211.1884769 ,  159.07236131,  150.96001879,  163.1747162 ,
        188.34190655,  200.1725592 ,  203.00823353,  372.98914508,
        385.99496903,  317.80464866,  204.14198544,  153.78091513,
        248.65313364,  163.84309127,  167.41267808,  227.3825739 ,
        252.54057249,  207.73576964,  187.31771278,  150.99470963,
        200.81036737,  228.12994237,  153.46568324,  602.66311172,
        391.07455473,  184.60372308,  442.60986068,  358.40079126,
        393.88811712,  357.35708636,  351.64117468,  443.70141498,
        377.7231485 ,  374.34549761,  422.17742966,  422.05759116,
        387.85911047,  353.30378332,  351.26452012,  363.76191292,
        363.64293911,  179.70876671,  251.23268937,  285.1596886 ,
        212.87638926,  394.78831236,  310.6595109 ,  281.4878481 ,
        218.32294844,  172.19668267,  164.58744649,  167.08304836,
        222.10344295,  219.2831581 ,  412.37398868,  640.51749181,
        460.34941552,  196.73937898,  145.35911069,  169.49080441,
        310.94301852, 5922.47391497,  218.1552055 ,  254.43645358,
        163.11877264,  171.19284549, 5854.69896608,  208.05251747,
        605.61302157,  157.45396535,  374.45206945, 1202.14583421,
        166.81561248,  166.85092333,  790.93370274,  292.11345341,
        248.81689633,  739.86580032,  287.31663391,  392.15324403,
        359.93651168,  332.3668836 ,  161.1044807 ,  155.14108088,
        371.34619687,  200.4952176 ,  150.2883102 ,  192.29962235,
        254.38169601,  266.89852583,  160.85635531,  171.68506386,
        165.67196976,  195.51036978,  320.16373902,  268.41498684,
        323.42231476,  322.12240358,  352.76589893,  486.33972852,
        440.95855359,  383.3770237 ,  424.15060264,  529.01028866,
        249.01428879,  384.63209229,  582.22424265,  248.81719511,
        280.18544136, 1626.54575816,  210.95042713,  178.06005889,
        202.55334894,  270.07630248,  240.4117694 ,  261.73172633,
        292.55287443,  207.15718208,  302.34597341,  405.35430714,
        171.19940979,  422.21508346,  418.21208736,  442.26635944,
        145.82406478,  165.715107  ,  340.0305032 ,  233.62129865,
        523.01853939, 1377.14582492,  149.22122524,  164.43404303,
        273.88344987,  261.56381725,  606.87680055,  576.58060728,
        214.98300947,  246.86689458,  217.96020776,  478.52340825,
        225.77957888,  373.75513729,  231.50986901,  202.01845475,
        901.71343716,  217.86173277,  486.64728517,  228.90490433,
        858.10862531,  598.08544422,  354.51526166,  520.43600182,
       1235.21974921,  383.07592173,  432.90398368,  912.23565667,
        150.57501659,  334.92212328])

Evaluate outlier scores of the non-outlier samples

# Score the non-flagged samples with the same scoring function, for
# comparison with the scores of the flagged samples.
results.func(results.table['non-outliers'])

array([ 6.89082292,  5.52420093, 13.43309994, ..., 16.4259185 ,
       17.02398097, 15.1060883 ])

Apply outlier removal

# Mark the samples flagged by the PCA detector as inactive in the "main"
# dataset, using the index saved earlier.
ds.set_inactive_samples(dataset="main", sample_idx=outliers_sample_index)
# Show the shape of the feature matrix after removal; the row count should
# equal the number of non-outlier rows.
ds.x.shape

(17205, 12)

Total running time of the script: (0 minutes 21.624 seconds)

Gallery generated by Sphinx-Gallery