Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CHORE: Deprecate use_inf_as_na option #689

Merged
2 commits merged on Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions python/xorbits/_mars/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,9 +342,6 @@ def validate(x):
 default_options.register_option("serialize_method", "pickle")

 # dataframe-related options
-default_options.register_option(
-    "dataframe.mode.use_inf_as_na", False, validator=is_bool
-)
 default_options.register_option(
     "dataframe.use_arrow_dtype", None, validator=any_validator(is_null, is_bool)
 )
Expand Down
25 changes: 8 additions & 17 deletions python/xorbits/_mars/dataframe/groupby/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
 from ...core.operand import OperandStage
 from ...serialization.serializables import (
     AnyField,
-    BoolField,
     DictField,
     Int32Field,
     Int64Field,
Expand Down Expand Up @@ -170,7 +169,6 @@ class DataFrameGroupByAgg(DataFrameOperand, DataFrameOperandMixin):
     groupby_params = DictField("groupby_params")

     method = StringField("method")
-    use_inf_as_na = BoolField("use_inf_as_na")

     # for chunk
     combine_size = Int32Field("combine_size")
Expand Down Expand Up @@ -1286,18 +1284,14 @@ def _execute_agg(cls, ctx, op: "DataFrameGroupByAgg"):
     @redirect_custom_log
     @enter_current_session
     def execute(cls, ctx, op: "DataFrameGroupByAgg"):
-        try:
-            pd.set_option("mode.use_inf_as_na", op.use_inf_as_na)
-            if op.stage == OperandStage.map:
-                cls._execute_map(ctx, op)
-            elif op.stage == OperandStage.combine:
-                cls._execute_combine(ctx, op)
-            elif op.stage == OperandStage.agg:
-                cls._execute_agg(ctx, op)
-            else: # pragma: no cover
-                raise ValueError("Aggregation operand not executable")
-        finally:
-            pd.reset_option("mode.use_inf_as_na")
+        if op.stage == OperandStage.map:
+            cls._execute_map(ctx, op)
+        elif op.stage == OperandStage.combine:
+            cls._execute_combine(ctx, op)
+        elif op.stage == OperandStage.agg:
+            cls._execute_agg(ctx, op)
+        else: # pragma: no cover
+            raise ValueError("Aggregation operand not executable")


 def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
Expand Down Expand Up @@ -1355,8 +1349,6 @@ def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
             func, *args, _call_agg=True, index=index_value, **kwargs
         )

-    use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na)
-
     agg_op = DataFrameGroupByAgg(
         raw_func=func,
         raw_func_kw=kwargs,
Expand All @@ -1365,6 +1357,5 @@ def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
         groupby_params=groupby.op.groupby_params,
         combine_size=combine_size or options.combine_size,
         chunk_store_limit=options.chunk_store_limit,
-        use_inf_as_na=use_inf_as_na,
     )
     return agg_op(groupby)
42 changes: 9 additions & 33 deletions python/xorbits/_mars/dataframe/missing/checkna.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
 from ... import dataframe as md
 from ... import opcodes
 from ... import tensor as mt
-from ...config import options
 from ...core import OutputType
 from ...serialization.serializables import BoolField
 from ..operands import (
Expand All @@ -39,14 +38,10 @@ class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin):
     _op_type_ = opcodes.CHECK_NA

     _positive = BoolField("positive")
-    _use_inf_as_na = BoolField("use_inf_as_na")

-    def __init__(
-        self, positive=None, use_inf_as_na=None, sparse=None, output_types=None, **kw
-    ):
+    def __init__(self, positive=None, sparse=None, output_types=None, **kw):
         super().__init__(
             _positive=positive,
-            _use_inf_as_na=use_inf_as_na,
             _output_types=output_types,
             sparse=sparse,
             **kw,
Expand All @@ -56,10 +51,6 @@ def __init__(
     def positive(self) -> bool:
         return self._positive

-    @property
-    def use_inf_as_na(self) -> bool:
-        return self._use_inf_as_na
-
     def __call__(self, df):
         if isinstance(df, DATAFRAME_TYPE):
             self.output_types = [OutputType.dataframe]
Expand Down Expand Up @@ -107,15 +98,10 @@ def tile(cls, op: "DataFrameCheckNA"):
     @classmethod
     def execute(cls, ctx, op: "DataFrameCheckNA"):
         in_data = ctx[op.inputs[0].key]
-        old_use_inf_as_na = pd.get_option("mode.use_inf_as_na")
-        try:
-            pd.set_option("mode.use_inf_as_na", op.use_inf_as_na)
-            if op.positive:
-                ctx[op.outputs[0].key] = in_data.isna()
-            else:
-                ctx[op.outputs[0].key] = in_data.notna()
-        finally:
-            pd.set_option("mode.use_inf_as_na", old_use_inf_as_na)
+        if op.positive:
+            ctx[op.outputs[0].key] = in_data.isna()
+        else:
+            ctx[op.outputs[0].key] = in_data.notna()


 def _from_pandas(obj: Any):
Expand Down Expand Up @@ -200,14 +186,9 @@ def isna(obj):
         raise NotImplementedError("isna is not defined for MultiIndex")
     elif isinstance(obj, ENTITY_TYPE):
         if isinstance(obj, TENSOR_TYPE):
-            if options.dataframe.mode.use_inf_as_na:
-                return ~mt.isfinite(obj)
-            else:
-                return mt.isnan(obj)
+            return mt.isnan(obj)
         else:
-            op = DataFrameCheckNA(
-                positive=True, use_inf_as_na=options.dataframe.mode.use_inf_as_na
-            )
+            op = DataFrameCheckNA(positive=True)
             return op(obj)
     else:
         return _from_pandas(pd.isna(obj))
Expand Down Expand Up @@ -279,14 +260,9 @@ def notna(obj):
         raise NotImplementedError("isna is not defined for MultiIndex")
     elif isinstance(obj, ENTITY_TYPE):
         if isinstance(obj, TENSOR_TYPE):
-            if options.dataframe.mode.use_inf_as_na:
-                return mt.isfinite(obj)
-            else:
-                return ~mt.isnan(obj)
+            return ~mt.isnan(obj)
         else:
-            op = DataFrameCheckNA(
-                positive=False, use_inf_as_na=options.dataframe.mode.use_inf_as_na
-            )
+            op = DataFrameCheckNA(positive=False)
             return op(obj)
     else:
         return _from_pandas(pd.notna(obj))
Expand Down
70 changes: 24 additions & 46 deletions python/xorbits/_mars/dataframe/missing/dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
 import pandas as pd

 from ... import opcodes
-from ...config import options
 from ...core import OutputType, recursive_tile
 from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField
 from ...utils import no_default, pd_release_version
Expand All @@ -37,7 +36,6 @@ class DataFrameDropNA(DataFrameOperand, DataFrameOperandMixin):
     _how = StringField("how")
     _thresh = Int32Field("thresh")
     _subset = AnyField("subset")
-    _use_inf_as_na = BoolField("use_inf_as_na")

     # when True, dropna will be called on the input,
     # otherwise non-nan counts will be used
Expand All @@ -51,7 +49,6 @@ def __init__(
         how=None,
         thresh=None,
         subset=None,
-        use_inf_as_na=None,
         drop_directly=None,
         subset_size=None,
         sparse=None,
Expand All @@ -63,7 +60,6 @@ def __init__(
             _how=how,
             _thresh=thresh,
             _subset=subset,
-            _use_inf_as_na=use_inf_as_na,
             _drop_directly=drop_directly,
             _subset_size=subset_size,
             _output_types=output_types,
Expand All @@ -87,10 +83,6 @@ def thresh(self) -> int:
     def subset(self) -> list:
         return self._subset

-    @property
-    def use_inf_as_na(self) -> bool:
-        return self._use_inf_as_na
-
     @property
     def drop_directly(self) -> bool:
         return self._drop_directly
Expand Down Expand Up @@ -150,9 +142,7 @@ def tile(cls, op: "DataFrameDropNA"):
         subset_df = in_df
         if op.subset:
             subset_df = in_df[op.subset]
-        count_series = yield from recursive_tile(
-            subset_df.agg("count", axis=1, _use_inf_as_na=op.use_inf_as_na)
-        )
+        count_series = yield from recursive_tile(subset_df.agg("count", axis=1))

         nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series(
             in_df, count_series, axis=0
Expand Down Expand Up @@ -185,35 +175,30 @@ def tile(cls, op: "DataFrameDropNA"):

     @classmethod
     def execute(cls, ctx, op: "DataFrameDropNA"):
-        try:
-            pd.set_option("mode.use_inf_as_na", op.use_inf_as_na)
-
-            in_data = ctx[op.inputs[0].key]
-            if op.drop_directly:
-                if isinstance(in_data, pd.DataFrame):
-                    result = in_data.dropna(
-                        axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset
-                    )
-                elif isinstance(in_data, pd.Series):
-                    result = in_data.dropna(axis=op.axis, how=op.how)
-                else:
-                    result = in_data.dropna(how=op.how)
-                ctx[op.outputs[0].key] = result
-                return
-
-            in_counts = ctx[op.inputs[1].key]
-            if op.how == "all":
-                in_counts = in_counts[in_counts > 0]
+        in_data = ctx[op.inputs[0].key]
+        if op.drop_directly:
+            if isinstance(in_data, pd.DataFrame):
+                result = in_data.dropna(
+                    axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset
+                )
+            elif isinstance(in_data, pd.Series):
+                result = in_data.dropna(axis=op.axis, how=op.how)
             else:
-                if op.thresh is None or op.thresh is no_default:
-                    thresh = op.subset_size
-                else: # pragma: no cover
-                    thresh = op.thresh
-                in_counts = in_counts[in_counts >= thresh]
+                result = in_data.dropna(how=op.how)
+            ctx[op.outputs[0].key] = result
+            return

-            ctx[op.outputs[0].key] = in_data.reindex(in_counts.index)
-        finally:
-            pd.reset_option("mode.use_inf_as_na")
+        in_counts = ctx[op.inputs[1].key]
+        if op.how == "all":
+            in_counts = in_counts[in_counts > 0]
+        else:
+            if op.thresh is None or op.thresh is no_default:
+                thresh = op.subset_size
+            else: # pragma: no cover
+                thresh = op.thresh
+            in_counts = in_counts[in_counts >= thresh]
+
+        ctx[op.outputs[0].key] = in_data.reindex(in_counts.index)


 def df_dropna(
Expand Down Expand Up @@ -328,14 +313,12 @@ def df_dropna(
     if thresh is no_default and how is no_default:
         how = "any"

-    use_inf_as_na = options.dataframe.mode.use_inf_as_na
     op = DataFrameDropNA(
         axis=axis,
         how=how,
         thresh=thresh,
         subset=subset,
         output_types=[OutputType.dataframe],
-        use_inf_as_na=use_inf_as_na,
     )
     out_df = op(df)
     if inplace:
Expand Down Expand Up @@ -417,12 +400,10 @@ def series_dropna(series, axis=0, inplace=False, how=None):
     dtype: object
     """
     axis = validate_axis(axis, series)
-    use_inf_as_na = options.dataframe.mode.use_inf_as_na
     op = DataFrameDropNA(
         axis=axis,
         how=how,
         output_types=[OutputType.series],
-        use_inf_as_na=use_inf_as_na,
     )
     out_series = op(series)
     if inplace:
Expand All @@ -445,8 +426,5 @@ def index_dropna(index, how="any"):
     -------
     Index
     """
-    use_inf_as_na = options.dataframe.mode.use_inf_as_na
-    op = DataFrameDropNA(
-        axis=0, how=how, output_types=[OutputType.index], use_inf_as_na=use_inf_as_na
-    )
+    op = DataFrameDropNA(axis=0, how=how, output_types=[OutputType.index])
     return op(index)
Loading
Loading