diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 1b1690e26e..8a655bfa95 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -1,5 +1,7 @@ # Unreleased +River's mini-batch methods now support pandas v2. In particular, River conforms to pandas' new sparse API. + ## anomaly - Added `anomaly.LocalOutlierFactor`, which is an online version of the LOF algorithm for anomaly detection that matches the scikit-learn implementation. diff --git a/river/compose/product.py b/river/compose/product.py index 20291508d2..4ffa82a3be 100644 --- a/river/compose/product.py +++ b/river/compose/product.py @@ -88,27 +88,40 @@ def transform_one(self, x): def transform_many(self, X): outputs = [t.transform_many(X) for t in self.transformers.values()] - def get_fill_value(a): - if isinstance(a, pd.arrays.SparseArray): - return a.fill_value - return a.sparse.fill_value - def multiply(a, b): # Fast-track for sparse[uint8] * sparse[uint8] if a.dtype == pd.SparseDtype("uint8") and b.dtype == pd.SparseDtype("uint8"): return a & b - + # Fast-track for sparse[uint8] * numeric + if a.dtype == pd.SparseDtype("uint8"): + c = np.zeros_like(b) + true_mask = a.eq(1) + c[true_mask] = b[true_mask] + return pd.Series( + c, + index=b.index, + dtype=pd.SparseDtype(b.dtype, fill_value=0), + ) + # Fast-track for numeric * sparse[uint8] + if b.dtype == pd.SparseDtype("uint8"): + return multiply(b, a) # Fast-track for sparse * sparse - if pd.api.types.is_sparse(a) and pd.api.types.is_sparse(b): - return pd.arrays.SparseArray( - a * b, fill_value=get_fill_value(a) * get_fill_value(b) + if isinstance(a.dtype, pd.SparseDtype) and isinstance(b.dtype, pd.SparseDtype): + return pd.Series( + a * b, + index=a.index, + dtype=pd.SparseDtype( + b.dtype, fill_value=a.sparse.fill_value * b.sparse.fill_value + ), ) # Fast-track for sparse * numeric - if pd.api.types.is_sparse(a): - return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(a)) + if isinstance(a.dtype, pd.SparseDtype): + return pd.Series( + a * b, dtype=pd.SparseDtype(fill_value=a.sparse.fill_value, dtype=b.dtype) + ) # Fast-track for numeric * sparse - if pd.api.types.is_sparse(b): - return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(b)) + if isinstance(b.dtype, pd.SparseDtype): + return multiply(b, a) # Default return np.multiply(a, b) diff --git a/river/naive_bayes/bernoulli.py b/river/naive_bayes/bernoulli.py index d2cdc149ea..7ddba2acb0 100644 --- a/river/naive_bayes/bernoulli.py +++ b/river/naive_bayes/bernoulli.py @@ -288,4 +288,5 @@ def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame: X @ (flp - neg_p).T + (np.log(self.p_class_many()) + neg_p.sum(axis=1).T).values, index=index, columns=self.class_counts.keys(), + dtype=float, )