Note
Go to the end to download the full example code.
Outlier Detection#
Installation
# To install the required package, use the following command:
# !pip install modeva
Authentication
# To authenticate, use the following command (for full access, replace the token with your own):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')
Import modeva modules
from modeva import DataSet
Load the BikeSharing dataset
from sklearn.datasets import make_friedman1
# Create a DataSet object and load the data registered under the name "BikeSharing".
# Every detector below is called on this same `ds` instance.
ds = DataSet()
ds.load("BikeSharing")
Outlier detection by CBLOF#
# Run CBLOF outlier detection on the "main" dataset, clustering with k-means.
# NOTE(review): threshold=0.9 is presumably the score quantile above which a
# sample is flagged as an outlier -- confirm against the modeva API reference.
results = ds.detect_outlier_cblof(dataset="main", method="kmeans", threshold=0.9)
# Trailing expression: renders the detector's plot in the gallery output.
results.plot()
Outlier detection by Isolation forest#
# Run isolation-forest outlier detection with the method's default arguments.
# This rebinds `results`, replacing the CBLOF result from the previous section.
results = ds.detect_outlier_isolation_forest()
results.plot()
Outlier detection by PCA#
# Run PCA-based outlier detection on the "main" dataset with method="reconst_error".
# `results` now refers to the PCA result; the sections that follow reuse it.
results = ds.detect_outlier_pca(dataset="main", method="reconst_error")
# Save the index of the flagged samples; it is passed to
# ds.set_inactive_samples() in the "Apply outlier removal" step.
outliers_sample_index = results.table['outliers'].index
results.plot()
View and use outlier detection results#
Outliers table
# Display the 'outliers' entry of the result tables (here `results` is the PCA result).
results.table['outliers']
Non-outliers table
# Display the 'non-outliers' entry of the result tables (here `results` is the PCA result).
results.table['non-outliers']
Evaluate outlier scores of the outlier samples
# Apply the detector's scoring function to the samples flagged as outliers;
# the call returns an array of scores, shown in the output below.
results.func(results.table['outliers'])
array([ 148.03891696, 182.56906283, 5733.14875539, 147.3346861 ,
200.58798145, 222.27130596, 225.13066686, 238.00511972,
233.38877612, 619.40173483, 609.04067592, 611.7782731 ,
311.7196138 , 438.32731152, 332.00499824, 184.66824562,
211.1884769 , 159.07236131, 150.96001879, 163.1747162 ,
188.34190655, 200.1725592 , 203.00823353, 372.98914508,
385.99496903, 317.80464866, 204.14198544, 153.78091513,
248.65313364, 163.84309127, 167.41267808, 227.3825739 ,
252.54057249, 207.73576964, 187.31771278, 150.99470963,
200.81036737, 228.12994237, 153.46568324, 602.66311172,
391.07455473, 184.60372308, 442.60986068, 358.40079126,
393.88811712, 357.35708636, 351.64117468, 443.70141498,
377.7231485 , 374.34549761, 422.17742966, 422.05759116,
387.85911047, 353.30378332, 351.26452012, 363.76191292,
363.64293911, 179.70876671, 251.23268937, 285.1596886 ,
212.87638926, 394.78831236, 310.6595109 , 281.4878481 ,
218.32294844, 172.19668267, 164.58744649, 167.08304836,
222.10344295, 219.2831581 , 412.37398868, 640.51749181,
460.34941552, 196.73937898, 145.35911069, 169.49080441,
310.94301852, 5922.47391497, 218.1552055 , 254.43645358,
163.11877264, 171.19284549, 5854.69896608, 208.05251747,
605.61302157, 157.45396535, 374.45206945, 1202.14583421,
166.81561248, 166.85092333, 790.93370274, 292.11345341,
248.81689633, 739.86580032, 287.31663391, 392.15324403,
359.93651168, 332.3668836 , 161.1044807 , 155.14108088,
371.34619687, 200.4952176 , 150.2883102 , 192.29962235,
254.38169601, 266.89852583, 160.85635531, 171.68506386,
165.67196976, 195.51036978, 320.16373902, 268.41498684,
323.42231476, 322.12240358, 352.76589893, 486.33972852,
440.95855359, 383.3770237 , 424.15060264, 529.01028866,
249.01428879, 384.63209229, 582.22424265, 248.81719511,
280.18544136, 1626.54575816, 210.95042713, 178.06005889,
202.55334894, 270.07630248, 240.4117694 , 261.73172633,
292.55287443, 207.15718208, 302.34597341, 405.35430714,
171.19940979, 422.21508346, 418.21208736, 442.26635944,
145.82406478, 165.715107 , 340.0305032 , 233.62129865,
523.01853939, 1377.14582492, 149.22122524, 164.43404303,
273.88344987, 261.56381725, 606.87680055, 576.58060728,
214.98300947, 246.86689458, 217.96020776, 478.52340825,
225.77957888, 373.75513729, 231.50986901, 202.01845475,
901.71343716, 217.86173277, 486.64728517, 228.90490433,
858.10862531, 598.08544422, 354.51526166, 520.43600182,
1235.21974921, 383.07592173, 432.90398368, 912.23565667,
150.57501659, 334.92212328])
Evaluate outlier scores of the non-outlier samples
# Apply the same scoring function to the samples that were not flagged,
# for comparison with the outlier scores.
results.func(results.table['non-outliers'])
array([ 6.89082292, 5.52420093, 13.43309994, ..., 16.4259185 ,
17.02398097, 15.1060883 ])
Apply outlier removal
# Mark the PCA-flagged samples as inactive in the "main" dataset, using the
# index saved in `outliers_sample_index` right after the PCA detection call.
ds.set_inactive_samples(dataset="main", sample_idx=outliers_sample_index)
# Trailing expression: show the shape of ds.x after the samples were deactivated.
ds.x.shape
(17205, 12)
Total running time of the script: (0 minutes 21.624 seconds)