-
Notifications
You must be signed in to change notification settings - Fork 110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding a periodic encoder to the DatetimeEncoder #1235
base: main
Are you sure you want to change the base?
Changes from 5 commits
bc9a5bc
176b18f
cc6455c
99b2d37
ee24931
1a53da2
156d397
3004442
9e6e05f
542522e
3804dc9
a981526
3380f83
ed5eb46
ea209d6
af9f363
d0ab8bf
1ac5caf
5d98ee3
c96d3fc
4b660e8
4dcae67
e160dcb
7be8721
2ab6471
aa88cd1
72ae07b
5bd2ffa
2d28a18
b2349f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# %% | ||
import pandas as pd | ||
|
||
from skrub import DatetimeEncoder, TableVectorizer, datasets | ||
|
||
data = datasets.fetch_bike_sharing().bike_sharing | ||
|
||
# Extract our input data (X) and the target column (y) | ||
y = data["cnt"] | ||
X = data[["date", "holiday", "temp", "hum", "windspeed", "weathersit"]] | ||
|
||
date = pd.to_datetime(X["date"]) | ||
|
||
# %% | ||
# %% | ||
# %% | ||
de = DatetimeEncoder(add_ordinal_day=True, add_periodic="day") | ||
|
||
r = de.fit_transform(date) | ||
print(r.columns) | ||
# %% | ||
tv = TableVectorizer() | ||
|
||
# X_t = tv.fit_transform(X) | ||
|
||
# %% |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
from datetime import datetime, timezone | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.preprocessing import FunctionTransformer, SplineTransformer | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
try: | ||
|
@@ -26,6 +28,8 @@ | |
"nanosecond", | ||
] | ||
|
||
_DEFAULT_ENCODING_PERIODS = {"year": 4, "month": 30, "weekday": 7, "hour": 24} | ||
|
||
|
||
@dispatch | ||
def _is_date(col): | ||
|
@@ -58,6 +62,9 @@ def _get_dt_feature_pandas(col, feature): | |
return ((col - epoch) / pd.Timedelta("1s")).astype("float32") | ||
if feature == "weekday": | ||
return col.dt.day_of_week + 1 | ||
if feature == "day_of_year": | ||
return col.dt.day_of_year | ||
|
||
assert feature in _TIME_LEVELS | ||
return getattr(col.dt, feature) | ||
|
||
|
@@ -66,6 +73,8 @@ def _get_dt_feature_pandas(col, feature): | |
def _get_dt_feature_polars(col, feature): | ||
if feature == "total_seconds": | ||
return (col.dt.timestamp(time_unit="ms") / 1000).cast(pl.Float32) | ||
if feature == "day_of_year": | ||
return getattr(col.dt, "ordinal_day")() | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert feature in _TIME_LEVELS + ["weekday"] | ||
return getattr(col.dt, feature)() | ||
|
||
|
@@ -256,10 +265,28 @@ class DatetimeEncoder(SingleColumnTransformer): | |
timezone used during ``fit`` and that we get the same result for "hour". | ||
""" # noqa: E501 | ||
|
||
def __init__(self, resolution="hour", add_weekday=False, add_total_seconds=True): | ||
def __init__( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we not generate the values when we are doing the periodic encoding? eg if we have spline encoding of the time of day, maybe we shouldn't by default output the "hour" feature There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe users still need the additional columns as features for some reason? Though I guess that by the point they get to vectorizing the table they're already done with exploring the data which is what keeping those features would help with 🤔 I'll remove the old features There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i'm not sure what would the best interface to make the skrub code simple and to make it easy for users to specify which features they want / figure out which features they will get 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A parameter like |
||
self, | ||
resolution="hour", | ||
add_weekday=False, | ||
add_total_seconds=True, | ||
add_day_of_year=False, | ||
add_periodic=False, | ||
year_encoding="circular", | ||
month_encoding="circular", | ||
weekday_encoding="circular", | ||
hour_encoding="circular", | ||
): | ||
self.resolution = resolution | ||
self.add_weekday = add_weekday | ||
self.add_total_seconds = add_total_seconds | ||
self.add_day_of_year = add_day_of_year | ||
self.add_periodic = add_periodic | ||
|
||
self.year_encoding = year_encoding | ||
self.month_encoding = month_encoding | ||
self.weekday_encoding = weekday_encoding | ||
self.hour_encoding = hour_encoding | ||
|
||
def fit_transform(self, column, y=None): | ||
"""Fit the encoder and transform a column. | ||
|
@@ -272,7 +299,7 @@ def fit_transform(self, column, y=None): | |
y : None | ||
Ignored. | ||
|
||
Returns | ||
ReturnsHistGradientBoostingRegressor() | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
------- | ||
transformed : DataFrame | ||
The extracted features. | ||
|
@@ -294,6 +321,44 @@ def fit_transform(self, column, y=None): | |
self.extracted_features_.append("total_seconds") | ||
if self.add_weekday: | ||
self.extracted_features_.append("weekday") | ||
if self.add_day_of_year: | ||
self.extracted_features_.append("day_of_year") | ||
|
||
# Adding transformers for periodic encoding | ||
self._required_transformers = {} | ||
|
||
# Iterating over all attributes that end with _encoding to use the default | ||
# parameters | ||
if self.add_periodic: | ||
_enc_attr = [attr for attr in self.__dict__ if attr.endswith("_encoding")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there are only 4 so I think an explicit list with all 4 names would be fine here. If you really want to go with the inspection to retrieve the attributes using |
||
for _enc_name in _enc_attr: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. detail: local variable names don't often start with an underscore. no special reason but it's one more character and I don't see what it adds |
||
_enc = self.__getattribute__(_enc_name) | ||
_enc_case = _enc_name.split("_")[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why "case"? |
||
|
||
# The user provided a specific instance of the encoder for this case | ||
if isinstance(_enc, (SplineEncoder, CircularEncoder)): | ||
self._required_transformers[_enc_case] = _enc | ||
else: | ||
# This encoder has been disabled | ||
if _enc is None: | ||
continue | ||
if _enc not in ["circular", "spline"]: | ||
raise ValueError(f"Unsupported option {_enc} for {_enc_name}") | ||
if _enc == "circular": | ||
self._required_transformers[_enc_case] = CircularEncoder( | ||
period=_DEFAULT_ENCODING_PERIODS[_enc_case] | ||
) | ||
elif _enc == "spline": | ||
self._required_transformers[_enc_case] = SplineEncoder( | ||
period=_DEFAULT_ENCODING_PERIODS[_enc_case] | ||
) | ||
|
||
for _case, t in self._required_transformers.items(): | ||
_feat = _get_dt_feature(column, _case) | ||
_feat_name = sbd.name(_feat) + "_" + _case | ||
_feat = sbd.rename(_feat, _feat_name) | ||
t.fit(_feat) | ||
|
||
return self.transform(column) | ||
|
||
def transform(self, column): | ||
|
@@ -316,7 +381,17 @@ def transform(self, column): | |
extracted = _get_dt_feature(column, feature).rename(f"{name}_{feature}") | ||
extracted = sbd.to_float32(extracted) | ||
all_extracted.append(extracted) | ||
return sbd.make_dataframe_like(column, all_extracted) | ||
|
||
_new_features = [] | ||
for _case, t in self._required_transformers.items(): | ||
_feat = _get_dt_feature(column, _case) | ||
_new_features.append(t.transform(_feat)) | ||
|
||
X_out = sbd.make_dataframe_like(column, all_extracted) | ||
|
||
X_out = sbd.concat_horizontal(X_out, *_new_features) | ||
|
||
return X_out | ||
|
||
def _check_params(self): | ||
allowed = _TIME_LEVELS + [None] | ||
|
@@ -325,10 +400,118 @@ def _check_params(self): | |
f"'resolution' options are {allowed}, got {self.resolution!r}." | ||
) | ||
|
||
if isinstance(self.add_periodic, list): | ||
if not all([_ in ["day", "year"] for _ in self.add_periodic]): | ||
raise ValueError(f"Unsupported value found in {self.add_periodic}.") | ||
|
||
if isinstance(self.add_periodic, str): | ||
if self.add_periodic not in ["day", "year"]: | ||
raise ValueError( | ||
f"Unsupported value {self.add_periodic} for add_periodic" | ||
) | ||
self.add_periodic = [self.add_periodic] | ||
|
||
if self.add_periodic is None: | ||
self.add_periodic = [] | ||
|
||
def _more_tags(self): | ||
return {"preserves_dtype": []} | ||
|
||
def __sklearn_tags__(self): | ||
tags = super().__sklearn_tags__() | ||
tags.transformer_tags = TransformerTags(preserves_dtype=[]) | ||
return tags | ||
|
||
|
||
class SplineEncoder(SingleColumnTransformer): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From a design perspective, this class seems to me really like a super thin wrapper on the scikit-learn SplineTransformer. Would it not be possible to get rid of it, and use the SplineTransformer? I would like to minimize almost-redundant functionnality There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should I just fold the logic of both encoders back in the main DatetimeEncoder then? CircularEncoder is just wrapping a call to np.sin/np.cos I was thinking that it could be useful to have periodic encoders for non-datetime features, but maybe I'm wrong |
||
def __init__(self, period=24, n_splines=None, degree=3): | ||
self.period = period | ||
self.n_splines = n_splines | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where you had year period = 4 before I think maybe what you wanted is period of 366 (or 12 if we use the month) and number of splines = 4 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, correct |
||
self.degree = degree | ||
|
||
def fit_transform(self, X, y=None): | ||
del y | ||
|
||
self.transformer_ = self._periodic_spline_transformer( | ||
period=self.period, n_splines=self.n_splines, degree=self.degree | ||
) | ||
|
||
X_out = self.transformer_.fit_transform(sbd.to_numpy(X).reshape(-1, 1)) | ||
|
||
self.is_fitted = True | ||
self.n_components_ = X_out.shape[1] | ||
|
||
name = sbd.name(X) | ||
self.all_outputs_ = [ | ||
f"{name}_spline_{idx}" for idx in range(self.n_components_) | ||
] | ||
|
||
return self._post_process(X, X_out) | ||
|
||
def transform(self, X): | ||
X_out = self.transformer_.transform(sbd.to_numpy(X).reshape(-1, 1)) | ||
|
||
return self._post_process(X, X_out) | ||
|
||
def _post_process(self, X, result): | ||
result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T))) | ||
result = sbd.copy_index(X, result) | ||
|
||
return result | ||
|
||
# This comes from https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html#periodic-spline-features | ||
def _periodic_spline_transformer(self, period, n_splines=None, degree=3): | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if n_splines is None: | ||
n_splines = period | ||
n_knots = n_splines + 1 # periodic and include_bias is True | ||
return SplineTransformer( | ||
degree=degree, | ||
n_knots=n_knots, | ||
knots=np.linspace(0, period, n_knots).reshape(n_knots, 1), | ||
extrapolation="periodic", | ||
include_bias=True, | ||
) | ||
|
||
|
||
class CircularEncoder(SingleColumnTransformer): | ||
def __init__(self, period=24): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe we shouldn't have a default for the period. eg 24 only makes sense for hours There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. makes sense, I'll remove it |
||
self.period = period | ||
|
||
def fit_transform(self, X, y): | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
del y | ||
|
||
self._sin_transformer = FunctionTransformer( | ||
lambda x: np.sin(x / self.period * 2 * np.pi) | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
self._cos_transformer = FunctionTransformer( | ||
lambda x: np.cos(x / self.period * 2 * np.pi) | ||
) | ||
|
||
_new_features = [ | ||
self._sin_transformer.fit_transform(X), | ||
self._cos_transformer.fit_transform(X), | ||
] | ||
|
||
self.is_fitted = True | ||
rcap107 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.n_components_ = 2 | ||
|
||
name = sbd.name(X) | ||
self.all_outputs_ = [ | ||
f"{name}_circular_{idx}" for idx in range(self.n_components_) | ||
] | ||
|
||
return self._post_process(X, _new_features) | ||
|
||
def transform(self, X): | ||
_new_features = [ | ||
self._sin_transformer.transform(X), | ||
self._cos_transformer.transform(X), | ||
] | ||
|
||
return self._post_process(X, _new_features) | ||
|
||
def _post_process(self, X, result): | ||
result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result))) | ||
result = sbd.copy_index(X, result) | ||
|
||
return result |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why 4 for year? shouldn't we set the period to 12 and use the month number or something like that? or set the period to 366 if we use the day of year?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At the beginning I had 365 and the spline as default, but having 365 features for each year was too much, and I didn't have any better idea than setting it to a random number (4 seasons I guess)
Now that circular is the default, we can set it to 366