Support pandas v2 (#1321)

online-ml · Sep 12, 2023 · 0539046
1 parent 7245613
commit 0539046
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 13 deletions.
diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
@@ -1,5 +1,7 @@
 # Unreleased
 
+River's mini-batch methods now support pandas v2. In particular, River conforms to pandas' new sparse API.
+
 ## anomaly
 
 - Added `anomaly.LocalOutlierFactor`, which is an online version of the LOF algorithm for anomaly detection that matches the scikit-learn implementation.
diff --git a/river/compose/product.py b/river/compose/product.py
@@ -88,27 +88,40 @@ def transform_one(self, x):
     def transform_many(self, X):
         outputs = [t.transform_many(X) for t in self.transformers.values()]
 
-        def get_fill_value(a):
-            if isinstance(a, pd.arrays.SparseArray):
-                return a.fill_value
-            return a.sparse.fill_value
-
         def multiply(a, b):
             # Fast-track for sparse[uint8] * sparse[uint8]
             if a.dtype == pd.SparseDtype("uint8") and b.dtype == pd.SparseDtype("uint8"):
                 return a & b
-
+            # Fast-track for sparse[uint8] * numeric
+            if a.dtype == pd.SparseDtype("uint8"):
+                c = np.zeros_like(b)
+                true_mask = a.eq(1)
+                c[true_mask] = b[true_mask]
+                return pd.Series(
+                    c,
+                    index=b.index,
+                    dtype=pd.SparseDtype(b.dtype, fill_value=0),
+                )
+            # Fast-track for numeric * sparse[uint8]
+            if b.dtype == pd.SparseDtype("uint8"):
+                return multiply(b, a)
             # Fast-track for sparse * sparse
-            if pd.api.types.is_sparse(a) and pd.api.types.is_sparse(b):
-                return pd.arrays.SparseArray(
-                    a * b, fill_value=get_fill_value(a) * get_fill_value(b)
+            if isinstance(a.dtype, pd.SparseDtype) and isinstance(b.dtype, pd.SparseDtype):
+                return pd.Series(
+                    a * b,
+                    index=a.index,
+                    dtype=pd.SparseDtype(
+                        b.dtype, fill_value=a.sparse.fill_value * b.sparse.fill_value
+                    ),
                 )
             # Fast-track for sparse * numeric
-            if pd.api.types.is_sparse(a):
-                return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(a))
+            if isinstance(a.dtype, pd.SparseDtype):
+                return pd.Series(
+                    a * b, dtype=pd.SparseDtype(fill_value=a.sparse.fill_value, dtype=b.dtype)
+                )
             # Fast-track for numeric * sparse
-            if pd.api.types.is_sparse(b):
-                return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(b))
+            if isinstance(b.dtype, pd.SparseDtype):
+                return multiply(b, a)
             # Default
             return np.multiply(a, b)
 

diff --git a/river/naive_bayes/bernoulli.py b/river/naive_bayes/bernoulli.py
@@ -288,4 +288,5 @@ def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame:
             X @ (flp - neg_p).T + (np.log(self.p_class_many()) + neg_p.sum(axis=1).T).values,
             index=index,
             columns=self.class_counts.keys(),
+            dtype=float,
         )