Outlier Detection

Installation

# To install the required package, use the following command:
# !pip install modeva

Authentication

# To authenticate, use the following commands. (For full access, replace the token with your own.)
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')

Import modeva modules

from modeva import DataSet

Load the BikeSharing dataset

from sklearn.datasets import make_friedman1

# Create an empty DataSet container, then load the "BikeSharing" data into it by name.
ds = DataSet()
ds.load("BikeSharing")

Outlier detection by CBLOF

# Run CBLOF outlier detection on the "main" dataset with method="kmeans" and
# threshold=0.9, then plot the returned result object.
# NOTE(review): presumably `threshold` is a quantile cutoff on the outlier
# score -- confirm against the modeva documentation.
results = ds.detect_outlier_cblof(dataset="main", method="kmeans", threshold=0.9)
results.plot()


Outlier detection by Isolation Forest

# Run Isolation Forest outlier detection with all-default arguments, then plot
# the returned result object. This rebinds `results` from the previous detector.
results = ds.detect_outlier_isolation_forest()
results.plot()


Outlier detection by PCA

# Run PCA-based outlier detection on the "main" dataset with
# method="reconst_error".
results = ds.detect_outlier_pca(dataset="main", method="reconst_error")
# Keep the row index of the detected outliers; it is passed to
# ds.set_inactive_samples() at the end of the script to remove them.
outliers_sample_index = results.table['outliers'].index
results.plot()


View and use outlier detection results

Outliers table

# Display the rows flagged as outliers by the most recent detector (PCA).
results.table['outliers']
season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed
115 1.0 0.0 1.0 0.0 0.0 4.0 1.0 1.0 0.18 0.2424 0.55 0.0000
479 1.0 0.0 1.0 0.0 0.0 6.0 0.0 1.0 0.04 0.0303 0.45 0.2537
585 1.0 0.0 1.0 16.0 0.0 3.0 1.0 4.0 0.22 0.1970 0.93 0.3284
685 1.0 0.0 1.0 21.0 0.0 1.0 1.0 3.0 0.16 0.1970 0.59 0.0896
966 1.0 0.0 2.0 21.0 0.0 6.0 0.0 1.0 0.26 0.3030 0.41 0.0000
... ... ... ... ... ... ... ... ... ... ... ... ...
17041 4.0 1.0 12.0 20.0 0.0 1.0 1.0 2.0 0.42 0.4242 0.94 0.2537
17042 4.0 1.0 12.0 21.0 0.0 1.0 1.0 2.0 0.42 0.4242 0.94 0.1343
17043 4.0 1.0 12.0 22.0 0.0 1.0 1.0 2.0 0.42 0.4242 0.94 0.1343
17319 1.0 1.0 12.0 12.0 0.0 6.0 0.0 3.0 0.20 0.2424 1.00 0.0000
17320 1.0 1.0 12.0 13.0 0.0 6.0 0.0 3.0 0.20 0.2424 1.00 0.0000

174 rows × 12 columns



Non-outliers table

# Display the rows NOT flagged as outliers by the most recent detector (PCA).
results.table['non-outliers']
season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed
0 1.0 0.0 1.0 0.0 0.0 6.0 0.0 1.0 0.24 0.2879 0.81 0.0000
1 1.0 0.0 1.0 1.0 0.0 6.0 0.0 1.0 0.22 0.2727 0.80 0.0000
2 1.0 0.0 1.0 2.0 0.0 6.0 0.0 1.0 0.22 0.2727 0.80 0.0000
3 1.0 0.0 1.0 3.0 0.0 6.0 0.0 1.0 0.24 0.2879 0.75 0.0000
4 1.0 0.0 1.0 4.0 0.0 6.0 0.0 1.0 0.24 0.2879 0.75 0.0000
... ... ... ... ... ... ... ... ... ... ... ... ...
17374 1.0 1.0 12.0 19.0 0.0 1.0 1.0 2.0 0.26 0.2576 0.60 0.1642
17375 1.0 1.0 12.0 20.0 0.0 1.0 1.0 2.0 0.26 0.2576 0.60 0.1642
17376 1.0 1.0 12.0 21.0 0.0 1.0 1.0 1.0 0.26 0.2576 0.60 0.1642
17377 1.0 1.0 12.0 22.0 0.0 1.0 1.0 1.0 0.26 0.2727 0.56 0.1343
17378 1.0 1.0 12.0 23.0 0.0 1.0 1.0 1.0 0.26 0.2727 0.65 0.1343

17205 rows × 12 columns



Evaluate outlier scores of the outlier samples

# Apply the detector's scoring function to the outlier rows; it returns one
# score per row (printed below as a NumPy array).
results.func(results.table['outliers'])
array([ 148.03891696,  182.56906283, 5733.14875539,  147.3346861 ,
        200.58798145,  222.27130596,  225.13066686,  238.00511972,
        233.38877612,  619.40173483,  609.04067592,  611.7782731 ,
        311.7196138 ,  438.32731152,  332.00499824,  184.66824562,
        211.1884769 ,  159.07236131,  150.96001879,  163.1747162 ,
        188.34190655,  200.1725592 ,  203.00823353,  372.98914508,
        385.99496903,  317.80464866,  204.14198544,  153.78091513,
        248.65313364,  163.84309127,  167.41267808,  227.3825739 ,
        252.54057249,  207.73576964,  187.31771278,  150.99470963,
        200.81036737,  228.12994237,  153.46568324,  602.66311172,
        391.07455473,  184.60372308,  442.60986068,  358.40079126,
        393.88811712,  357.35708636,  351.64117468,  443.70141498,
        377.7231485 ,  374.34549761,  422.17742966,  422.05759116,
        387.85911047,  353.30378332,  351.26452012,  363.76191292,
        363.64293911,  179.70876671,  251.23268937,  285.1596886 ,
        212.87638926,  394.78831236,  310.6595109 ,  281.4878481 ,
        218.32294844,  172.19668267,  164.58744649,  167.08304836,
        222.10344295,  219.2831581 ,  412.37398868,  640.51749181,
        460.34941552,  196.73937898,  145.35911069,  169.49080441,
        310.94301852, 5922.47391497,  218.1552055 ,  254.43645358,
        163.11877264,  171.19284549, 5854.69896608,  208.05251747,
        605.61302157,  157.45396535,  374.45206945, 1202.14583421,
        166.81561248,  166.85092333,  790.93370274,  292.11345341,
        248.81689633,  739.86580032,  287.31663391,  392.15324403,
        359.93651168,  332.3668836 ,  161.1044807 ,  155.14108088,
        371.34619687,  200.4952176 ,  150.2883102 ,  192.29962235,
        254.38169601,  266.89852583,  160.85635531,  171.68506386,
        165.67196976,  195.51036978,  320.16373902,  268.41498684,
        323.42231476,  322.12240358,  352.76589893,  486.33972852,
        440.95855359,  383.3770237 ,  424.15060264,  529.01028866,
        249.01428879,  384.63209229,  582.22424265,  248.81719511,
        280.18544136, 1626.54575816,  210.95042713,  178.06005889,
        202.55334894,  270.07630248,  240.4117694 ,  261.73172633,
        292.55287443,  207.15718208,  302.34597341,  405.35430714,
        171.19940979,  422.21508346,  418.21208736,  442.26635944,
        145.82406478,  165.715107  ,  340.0305032 ,  233.62129865,
        523.01853939, 1377.14582492,  149.22122524,  164.43404303,
        273.88344987,  261.56381725,  606.87680055,  576.58060728,
        214.98300947,  246.86689458,  217.96020776,  478.52340825,
        225.77957888,  373.75513729,  231.50986901,  202.01845475,
        901.71343716,  217.86173277,  486.64728517,  228.90490433,
        858.10862531,  598.08544422,  354.51526166,  520.43600182,
       1235.21974921,  383.07592173,  432.90398368,  912.23565667,
        150.57501659,  334.92212328])

Evaluate outlier scores of the non-outlier samples

# Apply the same scoring function to the non-outlier rows for comparison with
# the outlier scores above.
results.func(results.table['non-outliers'])
array([ 6.89082292,  5.52420093, 13.43309994, ..., 16.4259185 ,
       17.02398097, 15.1060883 ])

Apply outlier removal

# Mark the PCA-detected outlier rows (index captured earlier in
# outliers_sample_index) as inactive in the "main" dataset.
ds.set_inactive_samples(dataset="main", sample_idx=outliers_sample_index)
# Check the remaining data: the row count shown below (17205) matches the size
# of the non-outliers table.
ds.x.shape
(17205, 12)

Total running time of the script: (0 minutes 21.624 seconds)

Gallery generated by Sphinx-Gallery