Skip to content

Commit

Permalink
FIX-#4314: Allow passing a series of dtypes to astype. (#4318)
Browse files Browse the repository at this point in the history
Co-authored-by: Yaroslav Igoshev <Poolliver868@mail.ru>
Signed-off-by: mvashishtha <mahesh@ponder.io>
  • Loading branch information
mvashishtha and YarShev authored Mar 18, 2022
1 parent e5d15f9 commit 5440f08
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 6 deletions.
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.14.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Key Features and Updates
* FIX-#4308: Add proper error handling in df.set_index (#4309)
* FIX-#4056: Allow an empty parse_date list in `read_csv_glob` (#4074)
* FIX-#4312: Fix constructing categorical frame with duplicate column names (#4313).
* FIX-#4314: Allow passing a series of dtypes to astype (#4318)
* Performance enhancements
* FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140)
* FIX-#4183: Stop shallow copies from creating global shared state. (#4184)
Expand Down
22 changes: 16 additions & 6 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from .utils import is_full_grab_slice
from modin.utils import try_cast_to_pandas, _inherit_docstrings
from modin.error_message import ErrorMessage
import modin.pandas as pd
from modin.pandas.utils import is_scalar
from modin.config import IsExperimental

Expand Down Expand Up @@ -487,8 +488,6 @@ def _default_to_pandas(self, op, *args, **kwargs):
return Series(result)
# inplace
elif result is None:
import modin.pandas as pd

return self._create_or_update_from_compiler(
getattr(pd, type(pandas_obj).__name__)(pandas_obj)._query_compiler,
inplace=True,
Expand Down Expand Up @@ -903,20 +902,31 @@ def asof(self, where, subset=None):
return result

def astype(self, dtype, copy=True, errors="raise"):
    """
    Cast the columns of this object to the requested dtype(s).

    Parameters
    ----------
    dtype : scalar dtype, dict, or Series
        A single dtype applied to every column, or a mapping (a dict, or
        a Modin/pandas Series whose index holds the column names) from
        column name to the dtype for that column.
    copy : bool, default: True
        ``not copy`` is passed as the second argument of
        ``_create_or_update_from_compiler`` (presumably its in-place
        flag - confirm against that method's signature).
    errors : str, default: "raise"
        Mapping keys that are not column names raise ``KeyError`` only
        when this is ``"raise"``; otherwise the mapping is forwarded to
        the query compiler unchanged.

    Returns
    -------
    The result of ``_create_or_update_from_compiler`` applied to the
    query compiler produced by the cast.

    Raises
    ------
    ValueError
        If `dtype` is a Series whose index contains duplicates.
    KeyError
        If `dtype` is a mapping with a key that is not a column name and
        ``errors == "raise"``.
    """
    # A Series of dtypes is normalized to a plain dict so that the rest of
    # the method only has to handle two shapes: a mapping or a scalar.
    if isinstance(dtype, (pd.Series, pandas.Series)):
        if not dtype.index.is_unique:
            raise ValueError(
                "The new Series of types must have a unique index, i.e. "
                + "it must be one-to-one mapping from column names to "
                + "their new dtypes."
            )
        dtype = dict(dtype.items())

    if isinstance(dtype, dict):
        # Every key of the mapping has to name an existing column.
        if errors == "raise" and not set(dtype).issubset(
            self._query_compiler.columns
        ):
            raise KeyError(
                "Only a column name can be used for the key in "
                + "a dtype mappings argument."
            )
        col_dtypes = dtype
    else:
        # Anything else is assumed to be a scalar dtype for all columns.
        col_dtypes = {column: dtype for column in self._query_compiler.columns}

    new_query_compiler = self._query_compiler.astype(col_dtypes)
    return self._create_or_update_from_compiler(new_query_compiler, not copy)
Expand Down
32 changes: 32 additions & 0 deletions modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,38 @@ def test_astype():
with pytest.raises(KeyError):
modin_df.astype({"not_exists": np.uint8})

# The dtypes series must have a unique index.
eval_general(
modin_df,
expected_df,
lambda df: df.astype(
pd.Series([str, str], index=["col1", "col1"])
if isinstance(df, pd.DataFrame)
else pandas.Series([str, str], index=["col1", "col1"])
),
)


@pytest.mark.parametrize("dtypes_are_dict", [True, False])
def test_astype_dict_or_series_multiple_column_partitions(dtypes_are_dict):
    # Test astype with a dtypes mapping (dict or series) that is complex in
    # that:
    #   - It applies to columns spanning multiple column partitions
    #   - Within a partition frame df:
    #       - dtypes.index is not a subset of df.columns
    #       - df.columns is not a subset of dtypes.index

    modin_df, pandas_df = create_test_dfs(test_data["int_data"])
    # The empty series gets an explicit object dtype: a bare
    # ``pandas.Series()`` emits a deprecation warning on pandas 1.x and its
    # default dtype differs between pandas versions.
    new_dtypes = {} if dtypes_are_dict else pandas.Series(dtype=object)
    # Leave every third column (i % 3 == 0) untouched so the mapping covers
    # only part of the columns.
    for i, column in enumerate(pandas_df.columns):
        if i % 3 == 1:
            new_dtypes[column] = "string"
        elif i % 3 == 2:
            new_dtypes[column] = float
    eval_general(modin_df, pandas_df, lambda df: df.astype(new_dtypes))


def test_astype_category():
modin_df = pd.DataFrame(
Expand Down
12 changes: 12 additions & 0 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,8 @@ def test_asof_large(where):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_astype(data):
modin_series, pandas_series = create_test_series(data)
series_name = "test_series"
modin_series.name = pandas_series.name = series_name
try:
pandas_result = pandas_series.astype(str)
except Exception as e:
Expand All @@ -1040,6 +1042,16 @@ def test_astype(data):
else:
df_equals(modin_series.astype(np.float64), pandas_result)

df_equals(
modin_series.astype({series_name: str}),
pandas_series.astype({series_name: str}),
)

eval_general(modin_series, pandas_series, lambda df: df.astype({"wrong_name": str}))

# TODO(https://github.com/modin-project/modin/issues/4317): Test passing a
# dict to astype() for a series with no name.


def test_astype_categorical():
modin_df = pd.Series(["A", "A", "B", "B", "A"])
Expand Down

0 comments on commit 5440f08

Please sign in to comment.