Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding a periodic encoder to the DatetimeEncoder #1235

Open
wants to merge 30 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
bc9a5bc
Initial commit for periodic encoder
rcap107 Feb 7, 2025
176b18f
Refactoring code to split Spline and Circular encoders
rcap107 Feb 10, 2025
cc6455c
WIP Refactoring code to split Spline and Circular encoders
rcap107 Feb 10, 2025
99b2d37
Renaming a constant
rcap107 Feb 10, 2025
ee24931
Finished implementation of the circular encoder
rcap107 Feb 10, 2025
1a53da2
Updating changelog
rcap107 Feb 10, 2025
156d397
✅ WIP Improving test coverage
rcap107 Feb 11, 2025
3004442
Fixing tests
rcap107 Feb 12, 2025
9e6e05f
Fixing more bugs with testing
rcap107 Feb 12, 2025
542522e
Fixing a typo
rcap107 Feb 12, 2025
3804dc9
Fixing all tests and docstring shenanigans
rcap107 Feb 12, 2025
a981526
Fixing bugs, adding tests, adding docstrings
rcap107 Feb 13, 2025
3380f83
Delete example_circular_encoding.py
rcap107 Feb 13, 2025
ed5eb46
Adding doc classes
rcap107 Feb 13, 2025
ea209d6
Implementing fixes from comments
rcap107 Feb 13, 2025
af9f363
Update skrub/_datetime_encoder.py
rcap107 Feb 17, 2025
d0ab8bf
Update skrub/_datetime_encoder.py
rcap107 Feb 17, 2025
1ac5caf
Fixing a bug with how the first value is selected
rcap107 Feb 18, 2025
5d98ee3
Fixing a fix that didn't fix
rcap107 Feb 18, 2025
c96d3fc
Adding back default values
rcap107 Feb 18, 2025
4b660e8
Merge remote-tracking branch 'upstream/main' into circular_encoding
rcap107 Mar 10, 2025
4dcae67
Simplifying code and hardcoding defaults
rcap107 Mar 10, 2025
e160dcb
fixing an import
rcap107 Mar 10, 2025
7be8721
Updating the datetime encoder example to add periodic features
rcap107 Mar 10, 2025
2ab6471
Fixing example
rcap107 Mar 10, 2025
aa88cd1
wip debugging
rcap107 Mar 11, 2025
72ae07b
Fixing a bug that was caused by pandas indexin
rcap107 Mar 11, 2025
5bd2ffa
Fixing a bug with pandas indexing, renaming variables
rcap107 Mar 11, 2025
2d28a18
Updating example to add new datetimeencoder features
rcap107 Mar 11, 2025
b2349f5
Removing a debugging script
rcap107 Mar 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions example_circular_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# %%
import pandas as pd

from skrub import DatetimeEncoder, TableVectorizer, datasets

data = datasets.fetch_bike_sharing().bike_sharing

# Extract our input data (X) and the target column (y)
y = data["cnt"]
X = data[["date", "holiday", "temp", "hum", "windspeed", "weathersit"]]

date = pd.to_datetime(X["date"])

# %%
# %%
# %%
de = DatetimeEncoder(add_ordinal_day=True, add_periodic="day")

r = de.fit_transform(date)
print(r.columns)
# %%
tv = TableVectorizer()

# X_t = tv.fit_transform(X)

# %%
4 changes: 3 additions & 1 deletion skrub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ._agg_joiner import AggJoiner, AggTarget
from ._check_dependencies import check_dependencies
from ._column_associations import column_associations
from ._datetime_encoder import DatetimeEncoder
from ._datetime_encoder import CircularEncoder, DatetimeEncoder, SplineEncoder
from ._deduplicate import compute_ngram_distance, deduplicate
from ._fuzzy_join import fuzzy_join
from ._gap_encoder import GapEncoder
Expand Down Expand Up @@ -56,4 +56,6 @@
"TextEncoder",
"StringEncoder",
"column_associations",
"SplineEncoder",
"CircularEncoder",
]
189 changes: 186 additions & 3 deletions skrub/_datetime_encoder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer, SplineTransformer
from sklearn.utils.validation import check_is_fitted

try:
Expand All @@ -26,6 +28,8 @@
"nanosecond",
]

_DEFAULT_ENCODING_PERIODS = {"year": 4, "month": 30, "weekday": 7, "hour": 24}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why 4 for year? shouldn't we set the period to 12 and use the month number or something like that? or set the period to 366 if we use the day of year?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the beginning I had 365 and the spline as default, but having 365 features for each year was too much, and I didn't have any better idea than setting it to a random number (4 seasons I guess)

Now that circular is the default, we can set it to 366



@dispatch
def _is_date(col):
Expand Down Expand Up @@ -58,6 +62,9 @@ def _get_dt_feature_pandas(col, feature):
return ((col - epoch) / pd.Timedelta("1s")).astype("float32")
if feature == "weekday":
return col.dt.day_of_week + 1
if feature == "day_of_year":
return col.dt.day_of_year

assert feature in _TIME_LEVELS
return getattr(col.dt, feature)

Expand All @@ -66,6 +73,8 @@ def _get_dt_feature_pandas(col, feature):
def _get_dt_feature_polars(col, feature):
if feature == "total_seconds":
return (col.dt.timestamp(time_unit="ms") / 1000).cast(pl.Float32)
if feature == "day_of_year":
return getattr(col.dt, "ordinal_day")()
assert feature in _TIME_LEVELS + ["weekday"]
return getattr(col.dt, feature)()

Expand Down Expand Up @@ -256,10 +265,28 @@ class DatetimeEncoder(SingleColumnTransformer):
timezone used during ``fit`` and that we get the same result for "hour".
""" # noqa: E501

def __init__(self, resolution="hour", add_weekday=False, add_total_seconds=True):
def __init__(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we not generate the values when we are doing the periodic encoding? eg if we have spline encoding of the time of day, maybe we shouldn't by default output the "hour" feature

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe users still need the additional columns as features for some reason? Though I guess that by the point they get to vectorizing the table they're already done with exploring the data which is what keeping those features would help with 🤔

I'll remove the old features

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i'm not sure what would the best interface to make the skrub code simple and to make it easy for users to specify which features they want / figure out which features they will get 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A parameter like "keep_original" would make sense to me. This reminds me somehow of coalesce in polars join https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html, which pandas doesn't have.

self,
resolution="hour",
add_weekday=False,
add_total_seconds=True,
add_day_of_year=False,
add_periodic=False,
year_encoding="circular",
month_encoding="circular",
weekday_encoding="circular",
hour_encoding="circular",
):
self.resolution = resolution
self.add_weekday = add_weekday
self.add_total_seconds = add_total_seconds
self.add_day_of_year = add_day_of_year
self.add_periodic = add_periodic

self.year_encoding = year_encoding
self.month_encoding = month_encoding
self.weekday_encoding = weekday_encoding
self.hour_encoding = hour_encoding

def fit_transform(self, column, y=None):
"""Fit the encoder and transform a column.
Expand All @@ -272,7 +299,7 @@ def fit_transform(self, column, y=None):
y : None
Ignored.

Returns
ReturnsHistGradientBoostingRegressor()
-------
transformed : DataFrame
The extracted features.
Expand All @@ -294,6 +321,44 @@ def fit_transform(self, column, y=None):
self.extracted_features_.append("total_seconds")
if self.add_weekday:
self.extracted_features_.append("weekday")
if self.add_day_of_year:
self.extracted_features_.append("day_of_year")

# Adding transformers for periodic encoding
self._required_transformers = {}

# Iterating over all attributes that end with _encoding to use the default
# parameters
if self.add_periodic:
_enc_attr = [attr for attr in self.__dict__ if attr.endswith("_encoding")]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there are only 4 so I think an explicit list with all 4 names would be fine here. If you really want to go with the inspection to retrieve the attributes using getattr(self, name) is more common than self.__getattribute__, and use name.removesuffix('_encoding') rather than split('_') because we could have underscores in the feature name eg if we end up with day_of_year_encoding

for _enc_name in _enc_attr:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

detail: local variable names don't often start with an underscore. no special reason but it's one more character and I don't see what it adds

_enc = self.__getattribute__(_enc_name)
_enc_case = _enc_name.split("_")[0]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why "case"?


# The user provided a specific instance of the encoder for this case
if isinstance(_enc, (SplineEncoder, CircularEncoder)):
self._required_transformers[_enc_case] = _enc
else:
# This encoder has been disabled
if _enc is None:
continue
if _enc not in ["circular", "spline"]:
raise ValueError(f"Unsupported option {_enc} for {_enc_name}")
if _enc == "circular":
self._required_transformers[_enc_case] = CircularEncoder(
period=_DEFAULT_ENCODING_PERIODS[_enc_case]
)
elif _enc == "spline":
self._required_transformers[_enc_case] = SplineEncoder(
period=_DEFAULT_ENCODING_PERIODS[_enc_case]
)

for _case, t in self._required_transformers.items():
_feat = _get_dt_feature(column, _case)
_feat_name = sbd.name(_feat) + "_" + _case
_feat = sbd.rename(_feat, _feat_name)
t.fit(_feat)

return self.transform(column)

def transform(self, column):
Expand All @@ -316,7 +381,17 @@ def transform(self, column):
extracted = _get_dt_feature(column, feature).rename(f"{name}_{feature}")
extracted = sbd.to_float32(extracted)
all_extracted.append(extracted)
return sbd.make_dataframe_like(column, all_extracted)

_new_features = []
for _case, t in self._required_transformers.items():
_feat = _get_dt_feature(column, _case)
_new_features.append(t.transform(_feat))

X_out = sbd.make_dataframe_like(column, all_extracted)

X_out = sbd.concat_horizontal(X_out, *_new_features)

return X_out

def _check_params(self):
allowed = _TIME_LEVELS + [None]
Expand All @@ -325,10 +400,118 @@ def _check_params(self):
f"'resolution' options are {allowed}, got {self.resolution!r}."
)

if isinstance(self.add_periodic, list):
if not all([_ in ["day", "year"] for _ in self.add_periodic]):
raise ValueError(f"Unsupported value found in {self.add_periodic}.")

if isinstance(self.add_periodic, str):
if self.add_periodic not in ["day", "year"]:
raise ValueError(
f"Unsupported value {self.add_periodic} for add_periodic"
)
self.add_periodic = [self.add_periodic]

if self.add_periodic is None:
self.add_periodic = []

def _more_tags(self):
return {"preserves_dtype": []}

def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.transformer_tags = TransformerTags(preserves_dtype=[])
return tags


class SplineEncoder(SingleColumnTransformer):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From a design perspective, this class seems to me really like a super thin wrapper on the scikit-learn SplineTransformer.

Would it not be possible to get rid of it, and use the SplineTransformer? I would like to minimize almost-redundant functionnality

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I just fold the logic of both encoders back in the main DatetimeEncoder then? CircularEncoder is just wrapping a call to np.sin/np.cos

I was thinking that it could be useful to have periodic encoders for non-datetime features, but maybe I'm wrong

def __init__(self, period=24, n_splines=None, degree=3):
self.period = period
self.n_splines = n_splines
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where you had year period = 4 before I think maybe what you wanted is period of 366 (or 12 if we use the month) and number of splines = 4

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, correct

self.degree = degree

def fit_transform(self, X, y=None):
del y

self.transformer_ = self._periodic_spline_transformer(
period=self.period, n_splines=self.n_splines, degree=self.degree
)

X_out = self.transformer_.fit_transform(sbd.to_numpy(X).reshape(-1, 1))

self.is_fitted = True
self.n_components_ = X_out.shape[1]

name = sbd.name(X)
self.all_outputs_ = [
f"{name}_spline_{idx}" for idx in range(self.n_components_)
]

return self._post_process(X, X_out)

def transform(self, X):
X_out = self.transformer_.transform(sbd.to_numpy(X).reshape(-1, 1))

return self._post_process(X, X_out)

def _post_process(self, X, result):
result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T)))
result = sbd.copy_index(X, result)

return result

# This comes from https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html#periodic-spline-features
def _periodic_spline_transformer(self, period, n_splines=None, degree=3):
if n_splines is None:
n_splines = period
n_knots = n_splines + 1 # periodic and include_bias is True
return SplineTransformer(
degree=degree,
n_knots=n_knots,
knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
extrapolation="periodic",
include_bias=True,
)


class CircularEncoder(SingleColumnTransformer):
def __init__(self, period=24):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we shouldn't have a default for the period. eg 24 only makes sense for hours

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, I'll remove it

self.period = period

def fit_transform(self, X, y):
del y

self._sin_transformer = FunctionTransformer(
lambda x: np.sin(x / self.period * 2 * np.pi)
)
self._cos_transformer = FunctionTransformer(
lambda x: np.cos(x / self.period * 2 * np.pi)
)

_new_features = [
self._sin_transformer.fit_transform(X),
self._cos_transformer.fit_transform(X),
]

self.is_fitted = True
self.n_components_ = 2

name = sbd.name(X)
self.all_outputs_ = [
f"{name}_circular_{idx}" for idx in range(self.n_components_)
]

return self._post_process(X, _new_features)

def transform(self, X):
_new_features = [
self._sin_transformer.transform(X),
self._cos_transformer.transform(X),
]

return self._post_process(X, _new_features)

def _post_process(self, X, result):
result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result)))
result = sbd.copy_index(X, result)

return result
Loading