RFC, WIP: demo: use dataframe api in datetimeencoder #786
@@ -1,4 +1,6 @@
-from typing import Literal
+from __future__ import annotations
+
+from typing import Literal, TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -7,6 +9,8 @@
 from sklearn.utils.validation import check_is_fitted
 
 from skrub._utils import check_input
+if TYPE_CHECKING:
+    from dataframe_api import Column
 
 WORD_TO_ALIAS: dict[str, str] = {
     "year": "Y",
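The `TYPE_CHECKING` guard above makes the spec package a type-check-time-only dependency. A minimal sketch of the pattern (the `first_year` helper is illustrative, not part of the PR):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved by static type checkers only; the `dataframe_api` spec
    # package does not need to be installed at runtime.
    from dataframe_api import Column


def first_year(date_series: Column) -> int:
    # With `from __future__ import annotations`, annotations are not
    # evaluated at runtime, so the guarded import is never executed.
    return int(date_series.year().min())
```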
@@ -130,37 +134,34 @@ def _validate_keywords(self):
             )
 
     @staticmethod
-    def _extract_from_date(date_series: pd.Series, feature: str):
+    def _extract_from_date(date_series: Column, feature: str):
         if feature == "year":
-            return pd.DatetimeIndex(date_series).year.to_numpy()
+            return date_series.year()
         elif feature == "month":
-            return pd.DatetimeIndex(date_series).month.to_numpy()
+            return date_series.month()
         elif feature == "day":
-            return pd.DatetimeIndex(date_series).day.to_numpy()
+            return date_series.day()
         elif feature == "hour":
-            return pd.DatetimeIndex(date_series).hour.to_numpy()
+            return date_series.hour()
         elif feature == "minute":
-            return pd.DatetimeIndex(date_series).minute.to_numpy()
+            return date_series.minute()
         elif feature == "second":
-            return pd.DatetimeIndex(date_series).second.to_numpy()
+            return date_series.second()
         elif feature == "microsecond":
-            return pd.DatetimeIndex(date_series).microsecond.to_numpy()
+            return date_series.microsecond()
         elif feature == "nanosecond":
-            return pd.DatetimeIndex(date_series).nanosecond.to_numpy()
+            if hasattr(date_series, 'nanosecond'):
+                return date_series.nanosecond()
+            else:
+                raise AttributeError(
+                    "`nanosecond` is not part of the DataFrame API and so support is not guaranteed across all libraries. "
+                    f"In particular, it is not supported for {date_series.__class__.__name__}"
+                )
Comment on lines +153 to +159

technically, […]

Could you move the […]

What do you mean by "into the dataframe API"? Into the standard? Or the […] To move it into the […] I think keeping this code there is pragmatic for the time being (to avoid introducing a regression w.r.t. what is already done in the current code-base).

I've added […] I'll try to get nanoseconds added to the spec though, would make things cleaner.

Thank you for the clarification @MarcoGorelli
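The guard discussed in this thread can be exercised in isolation. A hedged sketch (assumes recent pandas with `dataframe-api-compat` installed, so the pandas-backed Column exposes the extension; the sample data is made up):

```python
import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2023-01-01 00:00:00.000000001"])})
col = df.__dataframe_consortium_standard__().collect().col("ts")

# `nanosecond` is an extension beyond the DataFrame API spec, so probe
# for it instead of assuming every backend implements it.
if hasattr(col, "nanosecond"):
    nanos = col.nanosecond()
else:
    nanos = None  # e.g. a backend that only implements the spec
```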
         elif feature == "dayofweek":
-            return pd.DatetimeIndex(date_series).dayofweek.to_numpy()
+            return date_series.iso_weekday() - 1
         elif feature == "total_time":
-            tz = pd.DatetimeIndex(date_series).tz
             # Compute the time in seconds from the epoch time UTC
-            if tz is None:
-                return (
-                    pd.to_datetime(date_series) - pd.Timestamp("1970-01-01")
-                ) // pd.Timedelta("1s")
-            else:
-                return (
-                    pd.DatetimeIndex(date_series).tz_convert("utc")
-                    - pd.Timestamp("1970-01-01", tz="utc")
-                ) // pd.Timedelta("1s")
+            return date_series.unix_timestamp()  # type: ignore
 
     def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
         """Fit the instance to ``X``.
@@ -181,23 +182,25 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
             Fitted DatetimeEncoder instance (self).
         """
         self._validate_keywords()
-        if isinstance(X, pd.DataFrame):
-            self.col_names_ = X.columns.to_list()
-        else:
-            self.col_names_ = None
-            X = check_input(X)
+        if not hasattr(X, "__dataframe_consortium_standard__"):
+            X = check_input(X)
+            X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
Comment on lines +185 to +187

this requires some discussion too: currently, you're allowing numpy arrays as inputs, but you're doing operations which numpy doesn't support (e.g. […]

That's a good point. We're refactoring the DatetimeEncoder in #784, and we don't plan to have efficient support of Polars in this PR since we're converting the input to numpy right away. So, to answer your question, you won't need to call a dataframe constructor. However, we will revisit this so that Polars and Pandas can be used without numpy conversions.

I am wondering: do you expect many users to use skrub with non-dataframe inputs? If not, maybe it's still time to stop accepting numpy arrays as input and only accept dataframes. That might make the code simpler and easier to maintain.
+        X = X.__dataframe_consortium_standard__().collect()
+        n_columns = len(X.column_names)
+        self.col_names_ = X.column_names
         # Features to extract for each column, after removing constant features
         self.features_per_column_ = {}
-        for i in range(X.shape[1]):
+        for i in range(n_columns):
             self.features_per_column_[i] = []
         # Check which columns are constant
-        for i in range(X.shape[1]):
+        for i in range(n_columns):
+            column = X.col(X.column_names[i])
             if self.extract_until is None:
-                if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0:
+                if float(self._extract_from_date(column, "total_time").std()) > 0:
in the Dataframe API, column reductions (e.g. […]

That's very enlightening. I guess this will encourage us to use tricks like branchless code when possible to keep the laziness.

thanks - correction here, this can be simplified, because if we write

if self._extract_from_date(column, "total_time").std():

then Python will implicitly call

if self._extract_from_date(column, "total_time").std().__bool__():

which will materialise the scalar to Python anyway. Yes, in general, if there's a way to avoid working with Python scalars, that's better. But my suspicion is that, in general, for […]
                     self.features_per_column_[i].append("total_time")
             else:
                 for feature in TIME_LEVELS:
-                    if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0:
+                    if float(self._extract_from_date(column, feature).std()) > 0:
                         if TIME_LEVELS.index(feature) <= TIME_LEVELS.index(
                             self.extract_until
                         ):
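The point about reductions and laziness in the thread above can be made concrete. A hedged sketch (assumes recent pandas with `dataframe-api-compat` installed; data is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2023-01-01", "2023-06-01"])})
col = df.__dataframe_consortium_standard__().collect().col("ts")

std_scalar = col.year().std()  # a standard-API scalar; may be lazy on some backends

# Explicit materialisation, as the PR does:
if float(std_scalar) > 0:
    print("column is not constant")

# Subtler but equivalent: `if` calls (std_scalar > 0).__bool__(),
# which materialises the scalar to Python anyway.
if std_scalar > 0:
    print("column is not constant")
```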
@@ -213,11 +216,11 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
             # Add day of the week feature if needed
             if (
                 self.add_day_of_the_week
-                and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0
+                and float(self._extract_from_date(column, "dayofweek").std()) > 0
             ):
                 self.features_per_column_[i].append("dayofweek")
 
-        self.n_features_in_ = X.shape[1]
+        self.n_features_in_ = n_columns
         self.n_features_out_ = len(
             np.concatenate(list(self.features_per_column_.values()))
         )
@@ -240,26 +243,36 @@ def transform(self, X: ArrayLike, y=None) -> NDArray:
         ndarray, shape (``n_samples``, ``n_features_out_``)
             Transformed input.
         """
+        if not hasattr(X, "__dataframe_consortium_standard__"):
+            X = check_input(X)
+            X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
+        X = X.__dataframe_consortium_standard__()
+        n_columns = len(X.column_names)
         check_is_fitted(
             self,
             attributes=["n_features_in_", "n_features_out_", "features_per_column_"],
         )
-        X = check_input(X)
-        if X.shape[1] != self.n_features_in_:
+        # X = check_input(X)
+        if n_columns != self.n_features_in_:
             raise ValueError(
-                f"The number of features in the input data ({X.shape[1]}) "
+                f"The number of features in the input data ({n_columns}) "
                 "does not match the number of features "
                 f"seen during fit ({self.n_features_in_}). "
             )
         # Create a new array with the extracted features,
         # choosing only features that weren't constant during fit
-        X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64)
-        idx = 0
-        for i in range(X.shape[1]):
+        # X_ = np.empty((X.shape()[0], self.n_features_out_), dtype=np.float64)
+        features_to_select = []
+        for i in range(n_columns):
+            column = X.col(X.column_names[i])
             for j, feature in enumerate(self.features_per_column_[i]):
-                X_[:, idx + j] = self._extract_from_date(X[:, i], feature)
-            idx += len(self.features_per_column_[i])
-        return X_
+                features_to_select.append(
+                    self._extract_from_date(column, feature).rename(f"{feature}_{i}")
+                )
+        X = X.assign(*features_to_select).select(
+            *(feature.name for feature in features_to_select)
+        )
+        return X.collect().to_array("float64")
 
     def get_feature_names_out(self, input_features=None) -> list[str]:
         """Return clean feature names.
this is extremely light, it's just a bunch of pure-Python files which wrap around the pandas and Polars APIs: https://pypi.org/project/dataframe-api-compat/0.1.23/#files
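For reference, a minimal end-to-end sketch of the standard-API surface this PR leans on: enter via the dunder, build renamed feature columns, then materialise a numpy array. It assumes recent pandas with `dataframe-api-compat` installed; the column name and data are illustrative.

```python
import pandas as pd

df = pd.DataFrame(
    {"ts": pd.to_datetime(["2023-01-01 01:02:03", "2023-06-01 10:20:30"])}
)

std_df = df.__dataframe_consortium_standard__()  # same dunder as in fit()/transform()
col = std_df.col("ts")

# Build the derived columns lazily, then select them in order,
# mirroring transform()'s assign/select/collect/to_array pipeline.
features = [
    col.year().rename("year_0"),
    col.month().rename("month_0"),
    (col.iso_weekday() - 1).rename("dayofweek_0"),
]
out = std_df.assign(*features).select(*(f.name for f in features))
print(out.collect().to_array("float64"))
```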