FIX-#4314: Allow passing a series of dtypes to astype. (#4318)

Co-authored-by: Yaroslav Igoshev <Poolliver868@mail.ru> Signed-off-by: mvashishtha <mahesh@ponder.io>
modin-project · Mar 18, 2022 · 5440f08 · 5440f08
1 parent e5d15f9
commit 5440f08
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 6 deletions.
diff --git a/docs/release_notes/release_notes-0.14.0.rst b/docs/release_notes/release_notes-0.14.0.rst
@@ -25,6 +25,7 @@ Key Features and Updates
   * FIX-#4308: Add proper error handling in df.set_index (#4309)
   * FIX-#4056: Allow an empty parse_date list in `read_csv_glob` (#4074)
   * FIX-#4312: Fix constructing categorical frame with duplicate column names (#4313).  
+  * FIX-#4314: Allow passing a series of dtypes to astype (#4318)  
 * Performance enhancements
   * FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140)
   * FIX-#4183: Stop shallow copies from creating global shared state. (#4184)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -51,6 +51,7 @@
 from .utils import is_full_grab_slice
 from modin.utils import try_cast_to_pandas, _inherit_docstrings
 from modin.error_message import ErrorMessage
+import modin.pandas as pd
 from modin.pandas.utils import is_scalar
 from modin.config import IsExperimental
 
@@ -487,8 +488,6 @@ def _default_to_pandas(self, op, *args, **kwargs):
             return Series(result)
         # inplace
         elif result is None:
-            import modin.pandas as pd
-
             return self._create_or_update_from_compiler(
                 getattr(pd, type(pandas_obj).__name__)(pandas_obj)._query_compiler,
                 inplace=True,
@@ -903,20 +902,31 @@ def asof(self, where, subset=None):
         return result
 
     def astype(self, dtype, copy=True, errors="raise"):
-        col_dtypes = {}
+        # dtype can be a series, a dict, or a scalar. If it's a series or a
+        # scalar, convert it to a dict before passing it to the query compiler.
+        if isinstance(dtype, (pd.Series, pandas.Series)):
+            if not dtype.index.is_unique:
+                raise ValueError(
+                    "The new Series of types must have a unique index, i.e. "
+                    + "it must be a one-to-one mapping from column names to "
+                    + "their new dtypes."
+                )
+            dtype = dict(dtype.items())
+        # If we got a series or dict originally, dtype is a dict now. Its keys
+        # must be column names.
         if isinstance(dtype, dict):
             if (
                 not set(dtype.keys()).issubset(set(self._query_compiler.columns))
                 and errors == "raise"
             ):
                 raise KeyError(
-                    "Only a column name can be used for the key in"
+                    "Only a column name can be used for the key in "
                     + "a dtype mappings argument."
                 )
             col_dtypes = dtype
         else:
-            for column in self._query_compiler.columns:
-                col_dtypes[column] = dtype
+            # Assume that the dtype is a scalar.
+            col_dtypes = {column: dtype for column in self._query_compiler.columns}
 
         new_query_compiler = self._query_compiler.astype(col_dtypes)
         return self._create_or_update_from_compiler(new_query_compiler, not copy)

diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -483,6 +483,38 @@ def test_astype():
     with pytest.raises(KeyError):
         modin_df.astype({"not_exists": np.uint8})
 
+    # The dtypes series must have a unique index.
+    eval_general(
+        modin_df,
+        expected_df,
+        lambda df: df.astype(
+            pd.Series([str, str], index=["col1", "col1"])
+            if isinstance(df, pd.DataFrame)
+            else pandas.Series([str, str], index=["col1", "col1"])
+        ),
+    )
+
+
+@pytest.mark.parametrize("dtypes_are_dict", [True, False])
+def test_astype_dict_or_series_multiple_column_partitions(dtypes_are_dict):
+    # Test astype with a dtypes dict or series that is complex in that:
+    # - It applies to columns spanning multiple column partitions
+    # - Within a partition frame df:
+    #   - dtypes.index is not a subset of df.columns
+    #   - df.columns is not a subset of dtypes.index
+
+    modin_df, pandas_df = create_test_dfs(test_data["int_data"])
+    if dtypes_are_dict:
+        new_dtypes = {}
+    else:
+        new_dtypes = pandas.Series(dtype=object)
+    for i, column in enumerate(pandas_df.columns):
+        if i % 3 == 1:
+            new_dtypes[column] = "string"
+        elif i % 3 == 2:
+            new_dtypes[column] = float
+    eval_general(modin_df, pandas_df, lambda df: df.astype(new_dtypes))
+
 
 def test_astype_category():
     modin_df = pd.DataFrame(

diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -1016,6 +1016,8 @@ def test_asof_large(where):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_astype(data):
     modin_series, pandas_series = create_test_series(data)
+    series_name = "test_series"
+    modin_series.name = pandas_series.name = series_name
     try:
         pandas_result = pandas_series.astype(str)
     except Exception as e:
@@ -1040,6 +1042,16 @@ def test_astype(data):
     else:
         df_equals(modin_series.astype(np.float64), pandas_result)
 
+    df_equals(
+        modin_series.astype({series_name: str}),
+        pandas_series.astype({series_name: str}),
+    )
+
+    eval_general(modin_series, pandas_series, lambda df: df.astype({"wrong_name": str}))
+
+    # TODO(https://github.com/modin-project/modin/issues/4317): Test passing a
+    # dict to astype() for a series with no name.
+
 
 def test_astype_categorical():
     modin_df = pd.Series(["A", "A", "B", "B", "A"])