diff --git a/setup.cfg b/setup.cfg
index 4abe5b61e..79c4ec411 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -28,7 +28,8 @@ install_requires =
     scikit-learn>=1.2.1
     numpy>=1.23.5
     scipy>=1.9.3
-    pandas>=1.5.3
+    pandas>=2.1.0
+    dataframe-api-compat>=0.1.28
     packaging>=23.1
 python_requires = >=3.10
 
diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
index dccb39301..9e2c77b50 100644
--- a/skrub/_datetime_encoder.py
+++ b/skrub/_datetime_encoder.py
@@ -1,4 +1,6 @@
-from typing import Literal
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
@@ -7,6 +9,8 @@ from sklearn.utils.validation import check_is_fitted
 
 from skrub._utils import check_input
 
+if TYPE_CHECKING:
+    from dataframe_api import Column
 
 WORD_TO_ALIAS: dict[str, str] = {
     "year": "Y",
@@ -130,37 +134,34 @@ def _validate_keywords(self):
             )
 
     @staticmethod
-    def _extract_from_date(date_series: pd.Series, feature: str):
+    def _extract_from_date(date_series: Column, feature: str):
         if feature == "year":
-            return pd.DatetimeIndex(date_series).year.to_numpy()
+            return date_series.year()
         elif feature == "month":
-            return pd.DatetimeIndex(date_series).month.to_numpy()
+            return date_series.month()
         elif feature == "day":
-            return pd.DatetimeIndex(date_series).day.to_numpy()
+            return date_series.day()
         elif feature == "hour":
-            return pd.DatetimeIndex(date_series).hour.to_numpy()
+            return date_series.hour()
         elif feature == "minute":
-            return pd.DatetimeIndex(date_series).minute.to_numpy()
+            return date_series.minute()
         elif feature == "second":
-            return pd.DatetimeIndex(date_series).second.to_numpy()
+            return date_series.second()
         elif feature == "microsecond":
-            return pd.DatetimeIndex(date_series).microsecond.to_numpy()
+            return date_series.microsecond()
         elif feature == "nanosecond":
-            return pd.DatetimeIndex(date_series).nanosecond.to_numpy()
+            if hasattr(date_series, "nanosecond"):
+                return date_series.nanosecond()
+            else:
+                raise AttributeError(
+                    "`nanosecond` is not part of the DataFrame API and so support is not guaranteed across all libraries. "
+                    f"In particular, it is not supported for {date_series.__class__.__name__}."
+                )
         elif feature == "dayofweek":
-            return pd.DatetimeIndex(date_series).dayofweek.to_numpy()
+            return date_series.iso_weekday() - 1
         elif feature == "total_time":
-            tz = pd.DatetimeIndex(date_series).tz
             # Compute the time in seconds from the epoch time UTC
-            if tz is None:
-                return (
-                    pd.to_datetime(date_series) - pd.Timestamp("1970-01-01")
-                ) // pd.Timedelta("1s")
-            else:
-                return (
-                    pd.DatetimeIndex(date_series).tz_convert("utc")
-                    - pd.Timestamp("1970-01-01", tz="utc")
-                ) // pd.Timedelta("1s")
+            return date_series.unix_timestamp()  # type: ignore
 
     def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
         """Fit the instance to ``X``.
@@ -181,23 +182,26 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
             Fitted DatetimeEncoder instance (self).
""" self._validate_keywords() - if isinstance(X, pd.DataFrame): - self.col_names_ = X.columns.to_list() - else: - self.col_names_ = None - X = check_input(X) + if not hasattr(X, "__dataframe_consortium_standard__"): + X = check_input(X) + X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + X = X.__dataframe_consortium_standard__(api_version='2023.11-beta') + X = X.persist() + n_colums = len(X.column_names) + self.col_names_ = X.column_names # Features to extract for each column, after removing constant features self.features_per_column_ = {} - for i in range(X.shape[1]): + for i in range(n_colums): self.features_per_column_[i] = [] # Check which columns are constant - for i in range(X.shape[1]): + for i in range(n_colums): + column = X.col(X.column_names[i]) if self.extract_until is None: - if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0: + if self._extract_from_date(column, "total_time").std() > 0: self.features_per_column_[i].append("total_time") else: for feature in TIME_LEVELS: - if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0: + if self._extract_from_date(column, feature).std() > 0: if TIME_LEVELS.index(feature) <= TIME_LEVELS.index( self.extract_until ): @@ -213,11 +217,11 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": # Add day of the week feature if needed if ( self.add_day_of_the_week - and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0 + and float(self._extract_from_date(column, "dayofweek").std()) > 0 ): self.features_per_column_[i].append("dayofweek") - self.n_features_in_ = X.shape[1] + self.n_features_in_ = n_colums self.n_features_out_ = len( np.concatenate(list(self.features_per_column_.values())) ) @@ -240,26 +244,35 @@ def transform(self, X: ArrayLike, y=None) -> NDArray: ndarray, shape (``n_samples``, ``n_features_out_``) Transformed input. """ + if not hasattr(X, "__dataframe_consortium_standard__"): + X = check_input(X) + X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + X = X.__dataframe_consortium_standard__(api_version='2023.11-beta') + n_columns = len(X.column_names) check_is_fitted( self, attributes=["n_features_in_", "n_features_out_", "features_per_column_"], ) - X = check_input(X) - if X.shape[1] != self.n_features_in_: + if n_columns != self.n_features_in_: raise ValueError( - f"The number of features in the input data ({X.shape[1]}) " + f"The number of features in the input data ({n_columns}) " "does not match the number of features " f"seen during fit ({self.n_features_in_}). " ) # Create a new array with the extracted features, # choosing only features that weren't constant during fit - X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) + features_to_select = [] idx = 0 - for i in range(X.shape[1]): + for i in range(n_columns): + column = X.col(X.column_names[i]) for j, feature in enumerate(self.features_per_column_[i]): - X_[:, idx + j] = self._extract_from_date(X[:, i], feature) + features_to_select.append( + self._extract_from_date(column, feature).rename(f"{feature}_{i}") + ) idx += len(self.features_per_column_[i]) - return X_ + X = X.assign(*features_to_select).select(*(feature.name for feature in features_to_select)) + X = X.persist() + return X.to_array("float64") def get_feature_names_out(self, input_features=None) -> list[str]: """Return clean feature names.