Skip to content

Commit

Permalink
Support pandas v2 (#1321)
Browse files (browse the repository at this point in the history)
  • Loading branch information
MaxHalford authored Sep 12, 2023
1 parent 7245613 commit 0539046
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Unreleased

River's mini-batch methods now support pandas v2. In particular, River conforms to pandas' new sparse API.

## anomaly

- Added `anomaly.LocalOutlierFactor`, which is an online version of the LOF algorithm for anomaly detection that matches the scikit-learn implementation.
39 changes: 26 additions & 13 deletions river/compose/product.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,27 +88,40 @@ def transform_one(self, x):
def transform_many(self, X):
outputs = [t.transform_many(X) for t in self.transformers.values()]

def get_fill_value(a):
if isinstance(a, pd.arrays.SparseArray):
return a.fill_value
return a.sparse.fill_value

def multiply(a, b):
# Fast-track for sparse[uint8] * sparse[uint8]
if a.dtype == pd.SparseDtype("uint8") and b.dtype == pd.SparseDtype("uint8"):
return a & b

# Fast-track for sparse[uint8] * numeric
if a.dtype == pd.SparseDtype("uint8"):
c = np.zeros_like(b)
true_mask = a.eq(1)
c[true_mask] = b[true_mask]
return pd.Series(
c,
index=b.index,
dtype=pd.SparseDtype(b.dtype, fill_value=0),
)
# Fast-track for numeric * sparse[uint8]
if b.dtype == pd.SparseDtype("uint8"):
return multiply(b, a)
# Fast-track for sparse * sparse
if pd.api.types.is_sparse(a) and pd.api.types.is_sparse(b):
return pd.arrays.SparseArray(
a * b, fill_value=get_fill_value(a) * get_fill_value(b)
if isinstance(a.dtype, pd.SparseDtype) and isinstance(b.dtype, pd.SparseDtype):
return pd.Series(
a * b,
index=a.index,
dtype=pd.SparseDtype(
b.dtype, fill_value=a.sparse.fill_value * b.sparse.fill_value
),
)
# Fast-track for sparse * numeric
if pd.api.types.is_sparse(a):
return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(a))
if isinstance(a.dtype, pd.SparseDtype):
return pd.Series(
a * b, dtype=pd.SparseDtype(fill_value=a.sparse.fill_value, dtype=b.dtype)
)
# Fast-track for numeric * sparse
if pd.api.types.is_sparse(b):
return pd.arrays.SparseArray(a * b, fill_value=get_fill_value(b))
if isinstance(b.dtype, pd.SparseDtype):
return multiply(b, a)
# Default
return np.multiply(a, b)

Expand Down
1 change: 1 addition & 0 deletions river/naive_bayes/bernoulli.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,4 +288,5 @@ def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame:
X @ (flp - neg_p).T + (np.log(self.p_class_many()) + neg_p.sum(axis=1).T).values,
index=index,
columns=self.class_counts.keys(),
dtype=float,
)

0 comments on commit 0539046

Please sign in to comment.