Dealing with Date Variables
Installation
# To install the required package, use the following command:
# !pip install modeva
Authentication
# To authenticate, use the following commands (for full access, replace the token with your own token):
# from modeva.utils.authenticate import authenticate
# authenticate(auth_code='eaaa4301-b140-484c-8e93-f9f633c8bacb')
Load BikeSharing Dataset
import pandas as pd
from modeva import DataSet
from modeva.data.utils.loading import load_builtin_data
# Load the built-in BikeSharing table.
data = load_builtin_data("BikeSharing")

# Build a synthetic calendar date per row: the row index is read as a number
# of hours elapsed since 2011-01-01 (index / 24 days), and `.date` drops the
# time-of-day part so only the date remains.
data["Date"] = (
    pd.to_datetime("2011-01-01")
    + pd.to_timedelta(data.index / 24, unit="D")
).date

# Preview the first rows, including the new "Date" column.
data.head()
|
season |
yr |
mnth |
hr |
holiday |
weekday |
workingday |
weathersit |
temp |
atemp |
hum |
windspeed |
cnt |
Date |
0 |
1 |
0 |
1 |
0 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.81 |
0.0 |
16 |
2011-01-01 |
1 |
1 |
0 |
1 |
1 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
40 |
2011-01-01 |
2 |
1 |
0 |
1 |
2 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
32 |
2011-01-01 |
3 |
1 |
0 |
1 |
3 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
13 |
2011-01-01 |
4 |
1 |
0 |
1 |
4 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
1 |
2011-01-01 |
Create some missing and special values for demonstration purposes
# Write through one positional indexer on the DataFrame itself. The chained
# form (data["Date"].iloc[...] = ...) assigns into an intermediate Series:
# it raises SettingWithCopy / ChainedAssignment warnings and, with pandas
# Copy-on-Write enabled, never modifies `data` at all.
date_pos = data.columns.get_loc("Date")

# Special-value codes: "SV1" for rows 0-9, "SV2" for rows 10-14.
data.iloc[:10, date_pos] = "SV1"
data.iloc[10:15, date_pos] = "SV2"

# Missing values for rows 5-19. This runs last, so it overwrites rows 5-9 of
# the "SV1" range and the whole "SV2" range: only rows 0-4 keep "SV1".
data.iloc[5:20, date_pos] = pd.NA

data.head()
|
season |
yr |
mnth |
hr |
holiday |
weekday |
workingday |
weathersit |
temp |
atemp |
hum |
windspeed |
cnt |
Date |
0 |
1 |
0 |
1 |
0 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.81 |
0.0 |
16 |
SV1 |
1 |
1 |
0 |
1 |
1 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
40 |
SV1 |
2 |
1 |
0 |
1 |
2 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0 |
32 |
SV1 |
3 |
1 |
0 |
1 |
3 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
13 |
SV1 |
4 |
1 |
0 |
1 |
4 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0 |
1 |
SV1 |
Load the data into Modeva DataSet
# Wrap the pandas DataFrame in a Modeva DataSet and declare "cnt" as the target.
ds = DataSet()
ds.load_dataframe(data)
ds.set_target("cnt")
# Mark "Date" as an inactive feature (presumably it stays in the data but is
# not used as a model input -- confirm against the Modeva documentation).
ds.set_inactive_features(features=('Date', ))
# shuffle=False: presumably keeps the original row order when splitting -- TODO confirm.
ds.set_random_split(shuffle=False)
# Start from a clean preprocessing configuration (presumably the steps
# declared below are only applied once ds.preprocess() is called -- confirm).
ds.reset_preprocess()
# Fill missing dates and the special codes "SV1"/"SV2" with a constant date.
# With add_indicators=True, the output shown below gains the indicator
# columns Date_missing_nan and Date_special_SV1.
ds.impute_missing(features="Date", method='constant', fill_value="2011-01-01",
add_indicators=True, special_values=["SV1", "SV2"])
# Uncomment the following to convert the date into binned integers.
# ds.encode_categorical(features=("Date", ), method="ordinal")
# ds.bin_numerical(features=("Date", ), bins=5)
ds.preprocess()
# Show the preprocessed data as a DataFrame.
ds.to_df()
|
season |
yr |
mnth |
hr |
holiday |
weekday |
workingday |
weathersit |
temp |
atemp |
hum |
windspeed |
cnt |
Date |
Date_missing_nan |
Date_special_SV1 |
0 |
1 |
0 |
1 |
0 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.81 |
0.0000 |
16 |
2011-01-01 |
0 |
True |
1 |
1 |
0 |
1 |
1 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0000 |
40 |
2011-01-01 |
0 |
True |
2 |
1 |
0 |
1 |
2 |
0 |
6 |
0 |
1 |
0.22 |
0.2727 |
0.80 |
0.0000 |
32 |
2011-01-01 |
0 |
True |
3 |
1 |
0 |
1 |
3 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0000 |
13 |
2011-01-01 |
0 |
True |
4 |
1 |
0 |
1 |
4 |
0 |
6 |
0 |
1 |
0.24 |
0.2879 |
0.75 |
0.0000 |
1 |
2011-01-01 |
0 |
True |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
17374 |
1 |
1 |
12 |
19 |
0 |
1 |
1 |
2 |
0.26 |
0.2576 |
0.60 |
0.1642 |
119 |
2012-12-24 |
0 |
False |
17375 |
1 |
1 |
12 |
20 |
0 |
1 |
1 |
2 |
0.26 |
0.2576 |
0.60 |
0.1642 |
89 |
2012-12-24 |
0 |
False |
17376 |
1 |
1 |
12 |
21 |
0 |
1 |
1 |
1 |
0.26 |
0.2576 |
0.60 |
0.1642 |
90 |
2012-12-25 |
0 |
False |
17377 |
1 |
1 |
12 |
22 |
0 |
1 |
1 |
1 |
0.26 |
0.2727 |
0.56 |
0.1343 |
61 |
2012-12-25 |
0 |
False |
17378 |
1 |
1 |
12 |
23 |
0 |
1 |
1 |
1 |
0.26 |
0.2727 |
0.65 |
0.1343 |
49 |
2012-12-25 |
0 |
False |
17379 rows × 16 columns
Data summary
# Compute the dataset summary; result.table is indexed by section name,
# and "summary" is the dataset-level overview.
result = ds.summary()
result.table["summary"]
|
samples |
features |
numerical |
categorical |
mixed |
date |
duplicated |
missing cells |
missing cells (%) |
infinite cells |
infinite cells (%) |
0 |
17379 |
16 |
8 |
7 |
0 |
1 |
0 |
0 |
0.0 |
0 |
0.0 |
Data summary results for numerical variables
# Per-feature statistics for the numerical columns, from the summary computed above.
result.table["numerical"]
|
missing |
inf |
unique |
mean |
std |
min |
25% |
median |
75% |
max |
name |
|
|
|
|
|
|
|
|
|
|
mnth |
0 |
0 |
12 |
6.537775 |
3.438677 |
1.00 |
4.0000 |
7.0000 |
10.0000 |
12.0000 |
hr |
0 |
0 |
24 |
11.546752 |
6.914206 |
0.00 |
6.0000 |
12.0000 |
18.0000 |
23.0000 |
weekday |
0 |
0 |
7 |
3.003683 |
2.005714 |
0.00 |
1.0000 |
3.0000 |
5.0000 |
6.0000 |
temp |
0 |
0 |
50 |
0.496987 |
0.192551 |
0.02 |
0.3400 |
0.5000 |
0.6600 |
1.0000 |
atemp |
0 |
0 |
65 |
0.475775 |
0.171845 |
0.00 |
0.3333 |
0.4848 |
0.6212 |
1.0000 |
hum |
0 |
0 |
89 |
0.627229 |
0.192924 |
0.00 |
0.4800 |
0.6300 |
0.7800 |
1.0000 |
windspeed |
0 |
0 |
30 |
0.190098 |
0.122337 |
0.00 |
0.1045 |
0.1940 |
0.2537 |
0.8507 |
cnt |
0 |
0 |
869 |
189.463088 |
181.382380 |
1.00 |
40.0000 |
142.0000 |
281.0000 |
977.0000 |
Data summary results for categorical variables
# Per-feature statistics for the categorical columns, from the summary computed above.
result.table["categorical"]
|
missing |
unique |
top1 |
top1-counts |
top2 |
top2-counts |
top3 |
top3-counts |
name |
|
|
|
|
|
|
|
|
season |
0 |
4 |
3 |
4496 |
2 |
4409 |
1.0 |
4242.0 |
yr |
0 |
2 |
1 |
8734 |
0 |
8645 |
NaN |
NaN |
holiday |
0 |
2 |
0 |
16879 |
1 |
500 |
NaN |
NaN |
workingday |
0 |
2 |
1 |
11865 |
0 |
5514 |
NaN |
NaN |
weathersit |
0 |
4 |
1 |
11413 |
2 |
4544 |
3.0 |
1419.0 |
Date_missing_nan |
0 |
2 |
0 |
17364 |
1 |
15 |
NaN |
NaN |
Date_special_SV1 |
0 |
2 |
False |
17374 |
True |
5 |
NaN |
NaN |
Data summary results for mixed numerical and categorical variables
|
missing |
inf |
unique |
mean |
std |
min |
25% |
median |
75% |
max |
categories |
name |
|
|
|
|
|
|
|
|
|
|
|
Data summary results for date type variables
|
missing |
unique |
min |
max |
range_days |
name |
|
|
|
|
|
Date |
0 |
725 |
2011-01-01 |
2012-12-25 |
724 |
EDA 2D
EDA 2D between Date and a numerical feature
# Two-variable EDA: the "Date" feature on x against the target "cnt" on y.
result = ds.eda_2d(feature_x="Date", feature_y="cnt")
result.plot()
EDA 3D
# Three-variable EDA over "Date", "hr" and "cnt".
# sample_size=1000 presumably caps how many rows are plotted -- TODO confirm.
result = ds.eda_3d(feature_x="Date", feature_y="hr", feature_z="cnt", sample_size=1000)
result.plot()
Total running time of the script: (0 minutes 1.739 seconds)
Gallery generated by Sphinx-Gallery