From b92fce18618d80ab9f1ed314a2584e577fbbea9b Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 20 Jul 2020 23:23:19 -0700 Subject: [PATCH 001/120] Move logic of `sort_values` into the query compiler (#1754) * Move logic of `sort_values` into the query compiler Signed-off-by: Devin Petersohn * Remove dead code Signed-off-by: Devin Petersohn * Add back `"kind"` parameter Signed-off-by: Devin Petersohn * Added a test for bug #1743 This test runs inplace sort of dataframe that has non-numerical index Signed-off-by: Gregory Shimansky * Apply suggestions from code review Co-authored-by: anmyachev <45976948+anmyachev@users.noreply.github.com> Co-authored-by: Gregory Shimansky Co-authored-by: anmyachev <45976948+anmyachev@users.noreply.github.com> --- modin/backends/pandas/query_compiler.py | 77 +++++++++++++++++++++++++ modin/pandas/base.py | 55 ++++-------------- modin/pandas/test/test_dataframe.py | 21 ++++++- 3 files changed, 106 insertions(+), 47 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index b314dc9f288..14f12ff1a2c 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1874,3 +1874,80 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item): item_to_distribute=broadcasted_items, ) return self.__constructor__(new_modin_frame) + + def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): + """Reorder the rows based on the lexicographic order of the given columns. 
+ + Parameters + ---------- + columns : scalar or list of scalar + The column or columns to sort by + ascending : bool + Sort in ascending order (True) or descending order (False) + + Returns + ------- + PandasQueryCompiler + A new query compiler that contains result of the sort + """ + na_position = kwargs.get("na_position", "last") + kind = kwargs.get("kind", "quicksort") + if not is_list_like(columns): + columns = [columns] + # Currently, sort_values will just reindex based on the sorted values. + # TODO create a more efficient way to sort + ErrorMessage.default_to_pandas("sort_values") + broadcast_value_dict = { + col: self.getitem_column_array([col]).to_pandas().squeeze(axis=1) + for col in columns + } + # Index may contain duplicates + broadcast_values1 = pandas.DataFrame(broadcast_value_dict, index=self.index) + # Index without duplicates + broadcast_values2 = pandas.DataFrame(broadcast_value_dict) + broadcast_values2 = broadcast_values2.reset_index(drop=True) + # Index may contain duplicates + new_index1 = broadcast_values1.sort_values( + by=columns, axis=0, ascending=ascending, kind=kind, na_position=na_position, + ).index + # Index without duplicates + new_index2 = broadcast_values2.sort_values( + by=columns, axis=0, ascending=ascending, kind=kind, na_position=na_position, + ).index + + result = self.reset_index(drop=True).reindex(0, new_index2) + result.index = new_index1 + return result + + def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): + """Reorder the columns based on the lexicographic order of the given rows. 
+ + Parameters + ---------- + rows : scalar or list of scalar + The row or rows to sort by + ascending : bool + Sort in ascending order (True) or descending order (False) + + Returns + ------- + PandasQueryCompiler + A new query compiler that contains result of the sort + """ + na_position = kwargs.get("na_position", "last") + kind = kwargs.get("kind", "quicksort") + if not is_list_like(rows): + rows = [rows] + ErrorMessage.default_to_pandas("sort_values") + broadcast_value_list = [ + self.getitem_row_array([row]).to_pandas() for row in rows + ] + index_builder = list(zip(broadcast_value_list, rows)) + broadcast_values = pandas.concat( + [row for row, idx in index_builder], copy=False + ) + broadcast_values.columns = self.columns + new_columns = broadcast_values.sort_values( + by=rows, axis=1, ascending=ascending, kind=kind, na_position=na_position, + ).columns + return self.reindex(1, new_columns) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 7102c6b3180..9b5f79066eb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2791,59 +2791,24 @@ def sort_values( A sorted DataFrame. """ axis = self._get_axis_number(axis) - if not is_list_like(by): - by = [by] - # Currently, sort_values will just reindex based on the sorted values. 
- # TODO create a more efficient way to sort - ErrorMessage.default_to_pandas("sort_values") + inplace = validate_bool_kwarg(inplace, "inplace") if axis == 0: - broadcast_value_dict = {col: self[col]._to_pandas() for col in by} - # Index may contain duplicates - broadcast_values1 = pandas.DataFrame(broadcast_value_dict, index=self.index) - # Index without duplicates - broadcast_values2 = pandas.DataFrame(broadcast_value_dict) - broadcast_values2 = broadcast_values2.reset_index(drop=True) - # Index may contain duplicates - new_index1 = broadcast_values1.sort_values( - by=by, - axis=axis, + result = self._query_compiler.sort_rows_by_column_values( + by, ascending=ascending, kind=kind, na_position=na_position, - ).index - # Index without duplicates - new_index2 = broadcast_values2.sort_values( - by=by, - axis=axis, - ascending=ascending, - kind=kind, - na_position=na_position, - ).index - if inplace: - self.reindex(index=new_index2, copy=False) - self.index = new_index1 - else: - result = self.reset_index(drop=True) - result = result.reindex(index=new_index2, copy=True) - result.index = new_index1 - return result - else: - broadcast_value_list = [ - self[row :: len(self.index)]._to_pandas() for row in by - ] - index_builder = list(zip(broadcast_value_list, by)) - broadcast_values = pandas.concat( - [row for row, idx in index_builder], copy=False + ignore_index=ignore_index, ) - broadcast_values.columns = self.columns - new_columns = broadcast_values.sort_values( - by=by, - axis=axis, + else: + result = self._query_compiler.sort_columns_by_row_values( + by, ascending=ascending, kind=kind, na_position=na_position, - ).columns - return self.reindex(columns=new_columns, copy=not inplace) + ignore_index=ignore_index, + ) + return self._create_or_update_from_compiler(result, inplace) def std( self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 
9966aaf9795..19dab8c791f 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -6020,10 +6020,27 @@ def test_sort_values_with_duplicates(self): pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) key = modin_df.columns[0] - modin_result = modin_df.sort_values(key, inplace=False,) - pandas_result = pandas_df.sort_values(key, inplace=False,) + modin_result = modin_df.sort_values(key, inplace=False) + pandas_result = pandas_df.sort_values(key, inplace=False) df_equals(modin_result, pandas_result) + modin_df.sort_values(key, inplace=True) + pandas_df.sort_values(key, inplace=True) + df_equals(modin_df, pandas_df) + + def test_sort_values_with_string_index(self): + modin_df = pd.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) + pandas_df = pandas.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) + + key = modin_df.columns[0] + modin_result = modin_df.sort_values(key, inplace=False) + pandas_result = pandas_df.sort_values(key, inplace=False) + df_equals(modin_result, pandas_result) + + modin_df.sort_values(key, inplace=True) + pandas_df.sort_values(key, inplace=True) + df_equals(modin_df, pandas_df) + def test_where(self): frame_data = random_state.randn(100, 10) pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij")) From c1aba384394c1ee9185edb993fa69ee2e204c2cc Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 21 Jul 2020 01:03:51 -0700 Subject: [PATCH 002/120] Improve performance of slice indexing (#1753) * Improves performance of slice indexing * Resolves #1752 Signed-off-by: Devin Petersohn * Fix issue Signed-off-by: Devin Petersohn * Fix Series cases Signed-off-by: Devin Petersohn --- modin/engines/base/frame/data.py | 149 +++++++++++++++--- .../dask/pandas_on_dask/frame/partition.py | 14 +- .../pandas_on_python/frame/partition.py | 14 +- .../ray/pandas_on_ray/frame/partition.py | 27 ++-- modin/pandas/indexing.py | 53 ++++++- 5 files changed, 192 insertions(+), 65 
deletions(-) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index a5ba3b45fdb..520e9d28bd8 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -344,6 +344,14 @@ def mask( BasePandasFrame A new BasePandasFrame from the mask provided. """ + if isinstance(row_numeric_idx, slice) and ( + row_numeric_idx == slice(None) or row_numeric_idx == slice(0, None) + ): + row_numeric_idx = None + if isinstance(col_numeric_idx, slice) and ( + col_numeric_idx == slice(None) or col_numeric_idx == slice(0, None) + ): + col_numeric_idx = None if ( row_indices is None and row_numeric_idx is None @@ -354,11 +362,20 @@ def mask( if row_indices is not None: row_numeric_idx = self.index.get_indexer_for(row_indices) if row_numeric_idx is not None: - row_partitions_list = self._get_dict_of_block_index(1, row_numeric_idx) - new_row_lengths = [ - len(indices) for _, indices in row_partitions_list.items() - ] - new_index = self.index[sorted(row_numeric_idx)] + row_partitions_list = self._get_dict_of_block_index(0, row_numeric_idx) + if isinstance(row_numeric_idx, slice): + # Row lengths for slice are calculated as the length of the slice + # on the partition. Often this will be the same length as the current + # length, but sometimes it is different, thus the extra calculation. 
+ new_row_lengths = [ + len(range(*idx.indices(self._row_lengths[p]))) + for p, idx in row_partitions_list.items() + ] + # Use the slice to calculate the new row index + new_index = self.index[row_numeric_idx] + else: + new_row_lengths = [len(idx) for _, idx in row_partitions_list.items()] + new_index = self.index[sorted(row_numeric_idx)] else: row_partitions_list = { i: slice(None) for i in range(len(self._row_lengths)) @@ -369,15 +386,37 @@ def mask( if col_indices is not None: col_numeric_idx = self.columns.get_indexer_for(col_indices) if col_numeric_idx is not None: - col_partitions_list = self._get_dict_of_block_index(0, col_numeric_idx) - new_col_widths = [ - len(indices) for _, indices in col_partitions_list.items() - ] - new_columns = self.columns[sorted(col_numeric_idx)] - if self._dtypes is not None: - new_dtypes = self.dtypes[sorted(col_numeric_idx)] + col_partitions_list = self._get_dict_of_block_index(1, col_numeric_idx) + if isinstance(col_numeric_idx, slice): + # Column widths for slice are calculated as the length of the slice + # on the partition. Often this will be the same length as the current + # length, but sometimes it is different, thus the extra calculation. 
+ new_col_widths = [ + len(range(*idx.indices(self._column_widths[p]))) + for p, idx in col_partitions_list.items() + ] + # Use the slice to calculate the new columns + new_columns = self.columns[col_numeric_idx] + assert sum(new_col_widths) == len( + new_columns + ), "{} != {}.\n{}\n{}\n{}".format( + sum(new_col_widths), + len(new_columns), + col_numeric_idx, + self._column_widths, + col_partitions_list, + ) + if self._dtypes is not None: + new_dtypes = self.dtypes[col_numeric_idx] + else: + new_dtypes = None else: - new_dtypes = None + new_col_widths = [len(idx) for _, idx in col_partitions_list.items()] + new_columns = self.columns[sorted(col_numeric_idx)] + if self._dtypes is not None: + new_dtypes = self.dtypes[sorted(col_numeric_idx)] + else: + new_dtypes = None else: col_partitions_list = { i: slice(None) for i in range(len(self._column_widths)) @@ -415,10 +454,12 @@ def mask( # common case to keep it fast. if ( row_numeric_idx is None + or isinstance(row_numeric_idx, slice) or len(row_numeric_idx) == 1 or np.all(row_numeric_idx[1:] >= row_numeric_idx[:-1]) ) and ( col_numeric_idx is None + or isinstance(col_numeric_idx, slice) or len(col_numeric_idx) == 1 or np.all(col_numeric_idx[1:] >= col_numeric_idx[:-1]) ): @@ -627,9 +668,9 @@ def _get_dict_of_block_index(self, axis, indices): Parameters ---------- - axis : (0 - columns, 1 - rows) + axis : (0 - rows, 1 - columns) The axis along which to get the indices - indices : list of int + indices : list of int, slice A list of global indices to convert. Returns @@ -638,11 +679,70 @@ def _get_dict_of_block_index(self, axis, indices): A mapping from partition to list of internal indices to extract from that partition. 
""" - indices = np.sort(indices) - if not axis: - bins = np.array(self._column_widths) - else: + # Fasttrack slices + if isinstance(indices, slice): + if indices == slice(None) or indices == slice(0, None): + return OrderedDict( + zip( + range(len(self.axes[axis])), + [slice(None)] * len(self.axes[axis]), + ) + ) + if indices.start is None or indices.start == 0: + last_part, last_idx = list( + self._get_dict_of_block_index(axis, [indices.stop]).items() + )[0] + dict_of_slices = OrderedDict( + zip(range(last_part), [slice(None)] * last_part) + ) + dict_of_slices.update({last_part: slice(last_idx[0])}) + return dict_of_slices + elif indices.stop is None or indices.stop >= len(self.axes[axis]): + first_part, first_idx = list( + self._get_dict_of_block_index(axis, [indices.start]).items() + )[0] + dict_of_slices = OrderedDict({first_part: slice(first_idx[0], None)}) + num_partitions = np.size(self._partitions, axis=axis) + part_list = range(first_part + 1, num_partitions) + dict_of_slices.update( + OrderedDict(zip(part_list, [slice(None)] * len(part_list))) + ) + return dict_of_slices + else: + first_part, first_idx = list( + self._get_dict_of_block_index(axis, [indices.start]).items() + )[0] + last_part, last_idx = list( + self._get_dict_of_block_index(axis, [indices.stop]).items() + )[0] + if first_part == last_part: + return OrderedDict({first_part: slice(first_idx[0], last_idx[0])}) + else: + if last_part - first_part == 1: + return OrderedDict( + { + first_part: slice(first_idx[0], None), + last_part: slice(None, last_idx[0]), + } + ) + else: + dict_of_slices = OrderedDict( + {first_part: slice(first_idx[0], None)} + ) + part_list = range(first_part + 1, last_part) + dict_of_slices.update( + OrderedDict(zip(part_list, [slice(None)] * len(part_list))) + ) + dict_of_slices.update({last_part: slice(None, last_idx[0])}) + return dict_of_slices + # Sort and convert negative indices to positive + indices = np.sort( + [i if i >= 0 else max(0, len(self.axes[axis]) + i) 
for i in indices] + ) + if axis == 0: bins = np.array(self._row_lengths) + else: + bins = np.array(self._column_widths) # INT_MAX to make sure we don't try to compute on partitions that don't exist. cumulative = np.append(bins[:-1].cumsum(), np.iinfo(bins.dtype).max) @@ -1029,7 +1129,9 @@ def _apply_full_axis_select_indices( old_index = self.index if axis else self.columns if apply_indices is not None: numeric_indices = old_index.get_indexer_for(apply_indices) - dict_indices = self._get_dict_of_block_index(axis, numeric_indices) + # Get the indices for the axis being applied to (it is the opposite of axis + # being applied over) + dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) new_partitions = self._frame_mgr_cls.apply_func_to_select_indices_along_full_axis( axis, self._partitions, func, dict_indices, keep_remaining=keep_remaining ) @@ -1084,7 +1186,8 @@ def _apply_select_indices( # Convert indices to numeric indices old_index = self.index if axis else self.columns numeric_indices = old_index.get_indexer_for(apply_indices) - dict_indices = self._get_dict_of_block_index(axis, numeric_indices) + # Get indices being applied to (opposite of indices being applied over) + dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) new_partitions = self._frame_mgr_cls.apply_func_to_select_indices( axis, self._partitions, @@ -1113,8 +1216,8 @@ def _apply_select_indices( assert row_indices is not None and col_indices is not None assert keep_remaining assert item_to_distribute is not None - row_partitions_list = self._get_dict_of_block_index(1, row_indices).items() - col_partitions_list = self._get_dict_of_block_index(0, col_indices).items() + row_partitions_list = self._get_dict_of_block_index(0, row_indices).items() + col_partitions_list = self._get_dict_of_block_index(1, col_indices).items() new_partitions = self._frame_mgr_cls.apply_func_to_indices_both_axis( self._partitions, func, diff --git 
a/modin/engines/dask/pandas_on_dask/frame/partition.py b/modin/engines/dask/pandas_on_dask/frame/partition.py index 67457172661..a702173abc0 100644 --- a/modin/engines/dask/pandas_on_dask/frame/partition.py +++ b/modin/engines/dask/pandas_on_dask/frame/partition.py @@ -98,16 +98,10 @@ def mask(self, row_indices, col_indices): new_obj = self.add_to_apply_calls( lambda df: pandas.DataFrame(df.iloc[row_indices, col_indices]) ) - new_obj._length_cache = ( - len(row_indices) - if not isinstance(row_indices, slice) - else self._length_cache - ) - new_obj._width_cache = ( - len(col_indices) - if not isinstance(col_indices, slice) - else self._width_cache - ) + if not isinstance(row_indices, slice): + new_obj._length_cache = len(row_indices) + if not isinstance(col_indices, slice): + new_obj._width_cache = len(col_indices) return new_obj def __copy__(self): diff --git a/modin/engines/python/pandas_on_python/frame/partition.py b/modin/engines/python/pandas_on_python/frame/partition.py index b24b8701ec1..d2cfc8f238a 100644 --- a/modin/engines/python/pandas_on_python/frame/partition.py +++ b/modin/engines/python/pandas_on_python/frame/partition.py @@ -91,16 +91,10 @@ def mask(self, row_indices=None, col_indices=None): new_obj = self.add_to_apply_calls( lambda df: pandas.DataFrame(df.iloc[row_indices, col_indices]) ) - new_obj._length_cache = ( - len(row_indices) - if not isinstance(row_indices, slice) - else self._length_cache - ) - new_obj._width_cache = ( - len(col_indices) - if not isinstance(col_indices, slice) - else self._width_cache - ) + if not isinstance(row_indices, slice): + new_obj._length_cache = len(row_indices) + if not isinstance(col_indices, slice): + new_obj._width_cache = len(col_indices) return new_obj def to_pandas(self): diff --git a/modin/engines/ray/pandas_on_ray/frame/partition.py b/modin/engines/ray/pandas_on_ray/frame/partition.py index 00c5ac34727..9cfb5f5af3e 100644 --- a/modin/engines/ray/pandas_on_ray/frame/partition.py +++ 
b/modin/engines/ray/pandas_on_ray/frame/partition.py @@ -103,30 +103,29 @@ def to_numpy(self): def mask(self, row_indices, col_indices): if ( - isinstance(row_indices, slice) + (isinstance(row_indices, slice) and row_indices == slice(None)) or ( - self._length_cache is not None + not isinstance(row_indices, slice) + and self._length_cache is not None and len(row_indices) == self._length_cache ) ) and ( - isinstance(col_indices, slice) - or (self._width_cache is not None and len(col_indices) == self._width_cache) + (isinstance(col_indices, slice) and col_indices == slice(None)) + or ( + not isinstance(col_indices, slice) + and self._width_cache is not None + and len(col_indices) == self._width_cache + ) ): return self.__copy__() new_obj = self.add_to_apply_calls( lambda df: pandas.DataFrame(df.iloc[row_indices, col_indices]) ) - new_obj._length_cache = ( - len(row_indices) - if not isinstance(row_indices, slice) - else self._length_cache - ) - new_obj._width_cache = ( - len(col_indices) - if not isinstance(col_indices, slice) - else self._width_cache - ) + if not isinstance(row_indices, slice): + new_obj._length_cache = len(row_indices) + if not isinstance(col_indices, slice): + new_obj._width_cache = len(col_indices) return new_obj @classmethod diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index f853ade33a7..68d60db0006 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -154,15 +154,38 @@ def __setitem__(self, row_lookup, col_lookup, item): item: The new item needs to be set. It can be any shape that's broadcast-able to the product of the lookup tables. """ - if len(row_lookup) == len(self.qc.index) and len(col_lookup) == 1: + # Convert slices to indices for the purposes of application. 
+ # TODO (devin-petersohn): Apply to slice without conversion to list + if isinstance(row_lookup, slice): + row_lookup = range(len(self.qc.index))[row_lookup] + if isinstance(col_lookup, slice): + col_lookup = range(len(self.qc.columns))[col_lookup] + # This is True when we dealing with assignment of a full column. This case + # should be handled in a fastpath with `df[col] = item`. + if ( + len(row_lookup) == len(self.qc.index) + and len(col_lookup) == 1 + and hasattr(self.df, "columns") + ): self.df[self.df.columns[col_lookup][0]] = item + # This is True when we are assigning to a full row. We want to reuse the setitem + # mechanism to operate along only one axis for performance reasons. elif len(col_lookup) == len(self.qc.columns) and len(row_lookup) == 1: if hasattr(item, "_query_compiler"): item = item._query_compiler new_qc = self.qc.setitem(1, self.qc.index[row_lookup[0]], item) self.df._create_or_update_from_compiler(new_qc, inplace=True) + # Assignment to both axes. else: - to_shape = (len(row_lookup), len(col_lookup)) + if isinstance(row_lookup, slice): + new_row_len = len(self.df.index[row_lookup]) + else: + new_row_len = len(row_lookup) + if isinstance(col_lookup, slice): + new_col_len = len(self.df.columns[col_lookup]) + else: + new_col_len = len(col_lookup) + to_shape = new_row_len, new_col_len item = self._broadcast_item(row_lookup, col_lookup, item, to_shape) self._write_items(row_lookup, col_lookup, item) @@ -358,12 +381,26 @@ def __setitem__(self, key, item): super(_iLocIndexer, self).__setitem__(row_lookup, col_lookup, item) def _compute_lookup(self, row_loc, col_loc): - row_lookup = ( - pandas.RangeIndex(len(self.qc.index)).to_series().iloc[row_loc].index - ) - col_lookup = ( - pandas.RangeIndex(len(self.qc.columns)).to_series().iloc[col_loc].index - ) + if ( + not isinstance(row_loc, slice) + or isinstance(row_loc, slice) + and row_loc.step is not None + ): + row_lookup = ( + pandas.RangeIndex(len(self.qc.index)).to_series().iloc[row_loc].index 
+ ) + else: + row_lookup = row_loc + if ( + not isinstance(col_loc, slice) + or isinstance(col_loc, slice) + and col_loc.step is not None + ): + col_lookup = ( + pandas.RangeIndex(len(self.qc.columns)).to_series().iloc[col_loc].index + ) + else: + col_lookup = col_loc return row_lookup, col_lookup def _check_dtypes(self, locator): From 51c3803a24e87c76768d64def6838ee2eca6f09f Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 22 Jul 2020 18:13:40 +0300 Subject: [PATCH 003/120] [FIX] Fix #1683 - losing index names in pd.concat (#1684) Signed-off-by: Dmitry Chigarev --- modin/pandas/concat.py | 47 ++++++++++++++++++++++- modin/pandas/test/test_concat.py | 66 +++++++++----------------------- modin/pandas/test/utils.py | 59 ++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 48 deletions(-) diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py index 0b0a98f4b92..38694799663 100644 --- a/modin/pandas/concat.py +++ b/modin/pandas/concat.py @@ -12,10 +12,13 @@ # governing permissions and limitations under the License. 
import pandas +import numpy as np from typing import Hashable, Iterable, Mapping, Optional, Union from pandas._typing import FrameOrSeriesUnion +from pandas.core.dtypes.common import is_list_like +from modin.backends.base.query_compiler import BaseQueryCompiler from .dataframe import DataFrame from .series import Series @@ -108,8 +111,18 @@ def concat( new_idx_labels = { k: v.index if axis == 0 else v.columns for k, v in zip(keys, objs) } - tuples = [(k, o) for k, obj in new_idx_labels.items() for o in obj] + tuples = [ + (k, *o) if isinstance(o, tuple) else (k, o) + for k, obj in new_idx_labels.items() + for o in obj + ] new_idx = pandas.MultiIndex.from_tuples(tuples) + if names is not None: + new_idx.names = names + else: + old_name = _determine_name(objs, axis) + if old_name is not None: + new_idx.names = [None] + old_name else: new_idx = None new_query_compiler = objs[0].concat( @@ -132,3 +145,35 @@ def concat( else: result_df.columns = new_idx return result_df + + +def _determine_name(objs: Iterable[BaseQueryCompiler], axis: Union[int, str]): + """ + Determine names of index after concatenation along passed axis + + Parameters + ---------- + objs : iterable of QueryCompilers + objects to concatenate + + axis : int or str + the axis to concatenate along + + Returns + ------- + `list` with single element - computed index name, `None` if it could not + be determined + """ + axis = pandas.DataFrame()._get_axis_number(axis) + + def get_names(obj): + return obj.columns.names if axis else obj.index.names + + names = np.array([get_names(obj) for obj in objs]) + + # saving old name, only if index names of all objs are the same + if np.all(names == names[0]): + # we must do this check to avoid this calls `list(str_like_name)` + return list(names[0]) if is_list_like(names[0]) else [names[0]] + else: + return None diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 057bf1a283e..5e67427110a 100644 --- a/modin/pandas/test/test_concat.py 
+++ b/modin/pandas/test/test_concat.py @@ -17,57 +17,11 @@ import modin.pandas as pd from modin.pandas.utils import from_pandas -from .utils import df_equals +from .utils import df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs pd.DEFAULT_NPARTITIONS = 4 -def generate_dfs(): - df = pandas.DataFrame( - { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 10, 11], - "col4": [12, 13, 14, 15], - "col5": [0, 0, 0, 0], - } - ) - - df2 = pandas.DataFrame( - { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 10, 11], - "col6": [12, 13, 14, 15], - "col7": [0, 0, 0, 0], - } - ) - return df, df2 - - -def generate_none_dfs(): - df = pandas.DataFrame( - { - "col1": [0, 1, 2, 3], - "col2": [4, 5, None, 7], - "col3": [8, 9, 10, 11], - "col4": [12, 13, 14, 15], - "col5": [None, None, None, None], - } - ) - - df2 = pandas.DataFrame( - { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 10, 11], - "col6": [12, 13, 14, 15], - "col7": [0, 0, 0, 0], - } - ) - return df, df2 - - def test_df_concat(): df, df2 = generate_dfs() @@ -207,3 +161,21 @@ def test_concat_with_empty_frame(): pd.concat([modin_empty_df, modin_row]), pandas.concat([pandas_empty_df, pandas_row]), ) + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("names", [False, True]) +def test_concat_multiindex(axis, names): + pd_df1, pd_df2 = generate_multiindex_dfs(axis=axis) + md_df1, md_df2 = map(from_pandas, [pd_df1, pd_df2]) + + keys = ["first", "second"] + if names: + names = [str(i) for i in np.arange(pd_df1.axes[axis].nlevels + 1)] + else: + names = None + + df_equals( + pd.concat([md_df1, md_df2], keys=keys, axis=axis, names=names), + pandas.concat([pd_df1, pd_df2], keys=keys, axis=axis, names=names), + ) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 26ede7e902b..3ae06c5ff69 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -592,3 +592,62 @@ def execute_callable(fn, md_kwargs={}, 
pd_kwargs={}): def create_test_dfs(*args, **kwargs): return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs) + + +def generate_dfs(): + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) + + df2 = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col6": [12, 13, 14, 15], + "col7": [0, 0, 0, 0], + } + ) + return df, df2 + + +def generate_multiindex_dfs(axis=1): + def generate_multiindex(index): + return pandas.MultiIndex.from_tuples( + [("a", x) for x in index.values], names=["name1", "name2"] + ) + + df1, df2 = generate_dfs() + df1.axes[axis], df2.axes[axis] = map( + generate_multiindex, [df1.axes[axis], df2.axes[axis]] + ) + return df1, df2 + + +def generate_none_dfs(): + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, None, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [None, None, None, None], + } + ) + + df2 = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col6": [12, 13, 14, 15], + "col7": [0, 0, 0, 0], + } + ) + return df, df2 From 5c1dff663c9044052c5279647b28f18ce494d2fb Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 22 Jul 2020 09:17:26 -0700 Subject: [PATCH 004/120] TEST-#1759: Add commitlint check on pull requests (#1760) Signed-off-by: Devin Petersohn --- .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/workflows/ci.yml | 24 +++++++++++++++++++++--- commitlint.config.js | 10 ++++++++++ docs/contributing.rst | 24 ++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 commitlint.config.js diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e7be47467ea..879bd24a622 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -8,6 +8,7 @@ if you have questions about contributing. 
+- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/contributing.html) - [ ] passes `flake8 modin` - [ ] passes `black --check modin` - [ ] signed commit with `git commit -s` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 808c3d8e902..9da547594df 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,24 @@ name: ci on: pull_request jobs: + lint-commit: + name: lint (commit) + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-node@v1 + with: + node-version: "10.x" + - run: npm install --save-dev @commitlint/{config-conventional,cli} commitlint-plugin-jira-rules commitlint-config-jira + - name: Add dependencies for commitlint action + run: echo "::set-env name=NODE_PATH::$GITHUB_WORKSPACE/node_modules" + - run: git remote add upstream https://github.com/modin-project/modin.git + - run: git fetch upstream + - run: npx commitlint --from upstream/master --to HEAD --verbose lint-black: name: lint (black) runs-on: ubuntu-latest @@ -68,7 +86,7 @@ jobs: - run: pip install -r requirements.txt - run: python -m pytest modin/test/test_publisher.py modin/data_management/test/test_dispatcher.py test-all: - needs: [lint-flake8, lint-black, test-api, test-headers] + needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: ubuntu-latest strategy: matrix: @@ -108,7 +126,7 @@ jobs: if: matrix.part == 3 - run: bash <(curl -s https://codecov.io/bash) test-windows: - needs: [lint-flake8, lint-black, test-api, test-headers] + needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: windows-latest strategy: matrix: @@ -144,7 +162,7 @@ jobs: - run: choco install codecov - run: codecov -f .\coverage.xml -t ${{secrets.CODECOV_TOKEN}} test-pyarrow: - needs: [lint-flake8, lint-black, test-api, test-headers] + needs: [lint-commit, lint-flake8, lint-black, 
test-api, test-headers] runs-on: ubuntu-latest strategy: matrix: diff --git a/commitlint.config.js b/commitlint.config.js new file mode 100644 index 00000000000..c022602cab7 --- /dev/null +++ b/commitlint.config.js @@ -0,0 +1,10 @@ +module.exports = { + plugins: ['commitlint-plugin-jira-rules'], + extends: ['jira'], + rules: { + "header-max-length": [2, "always", 50], + "signed-off-by": [2, "always", "Signed-off-by"], + "jira-task-id-max-length": [0, "always", 10], + "jira-task-id-project-key": [2, "always", ["FEAT", "DOCS", "FIX", "REFACTOR", "TEST"]], + } +} diff --git a/docs/contributing.rst b/docs/contributing.rst index c157bc4be88..ff7f73a0ab2 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -80,6 +80,30 @@ commits and push them to GitHub. If you've pushed your changes to GitHub already you'll need to force push your branch after this with ``git push -f``. +Commit Message formatting +------------------------- +To ensure that all commit messages in the master branch follow a specific format, we +enforce that all commit messages must follow the following format: + +.. code-block:: bash + + FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations + +The ``FEAT`` component represents the type of commit. This component of the commit +message can be one of the following: + +* FEAT: A new feature that is added +* DOCS: Documentation improvements or updates +* FIX: A bugfix contribution +* REFACTOR: Moving or removing code without change in functionality +* TEST: Test updates or improvements + +The ``#9999`` component of the commit message should be the issue number in the Modin +GitHub issue tracker: https://github.com/modin-project/modin/issues. This is important +because it links commits to their issues. + +The commit message should follow a colon (:) and be descriptive and succinct. 
+ Development Dependencies ------------------------ From 37b7d2c4c5b3ac04408f362b3989de30dd3df1a2 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Wed, 22 Jul 2020 20:52:18 +0300 Subject: [PATCH 005/120] REFACTOR-#1741: Use low-level api for kurt function implementation with defined level parameter (#1719) Signed-off-by: Alexander Myskov --- modin/pandas/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 9b5f79066eb..cd2cbed248a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1473,13 +1473,16 @@ def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): axis = self._get_axis_number(axis) if level is not None: func_kwargs = { - "axis": axis, "skipna": skipna, "level": level, "numeric_only": numeric_only, } - return self.apply("kurt", **func_kwargs) + return self.__constructor__( + query_compiler=self._query_compiler._apply_text_func_elementwise( + "kurt", axis, **func_kwargs + ) + ) if numeric_only: self._validate_dtypes(numeric_only=True) From bc9055a4fde43e7e48f88e7b8b7a96d5d1dfb185 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Thu, 23 Jul 2020 23:05:57 +0300 Subject: [PATCH 006/120] [FIX] Fix of inconsistent indices #1726 #1731 #1732 #1734 (#1727) --- modin/backends/pandas/query_compiler.py | 27 ++-- .../functions/binary_function.py | 1 + .../functions/groupby_function.py | 14 +- modin/engines/base/frame/data.py | 123 ++++++++++++++++-- modin/engines/ray/generic/io.py | 2 +- modin/pandas/groupby.py | 2 +- modin/pandas/test/test_general.py | 25 ++++ modin/pandas/test/test_io.py | 1 + modin/pandas/test/test_series.py | 3 + 9 files changed, 164 insertions(+), 34 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 14f12ff1a2c..57ce0f06109 100644 --- a/modin/backends/pandas/query_compiler.py +++ 
b/modin/backends/pandas/query_compiler.py @@ -468,13 +468,13 @@ def transpose(self, *args, **kwargs): # MapReduce operations - def _is_monotonic(self, type=None): + def _is_monotonic(self, func_type=None): funcs = { "increasing": lambda df: df.is_monotonic_increasing, "decreasing": lambda df: df.is_monotonic_decreasing, } - monotonic_fn = funcs.get(type, funcs["increasing"]) + monotonic_fn = funcs.get(func_type, funcs["increasing"]) def is_monotonic_map(df): df = df.squeeze(axis=1) @@ -494,10 +494,12 @@ def is_monotonic_reduce(df): edge_case = monotonic_fn(pandas.Series(edges_list)) return [common_case and edge_case] - return MapReduceFunction.register(is_monotonic_map, is_monotonic_reduce)(self) + return MapReduceFunction.register( + is_monotonic_map, is_monotonic_reduce, axis=0 + )(self) def is_monotonic_decreasing(self): - return self._is_monotonic(type="decreasing") + return self._is_monotonic(func_type="decreasing") is_monotonic = _is_monotonic @@ -626,7 +628,6 @@ def sort_index_for_equal_values(result, ascending): idxmin = ReductionFunction.register(pandas.DataFrame.idxmin) median = ReductionFunction.register(pandas.DataFrame.median) nunique = ReductionFunction.register(pandas.DataFrame.nunique) - nlargest = ReductionFunction.register(pandas.DataFrame.nlargest) skew = ReductionFunction.register(pandas.DataFrame.skew) kurt = ReductionFunction.register(pandas.DataFrame.kurt) std = ReductionFunction.register(pandas.DataFrame.std) @@ -1162,13 +1163,17 @@ def map_func(df, other=other, squeeze_self=squeeze_self): ) return self.__constructor__(new_modin_frame) - def nsmallest(self, n, columns=None, keep="first"): + def nsort(self, n, columns=None, keep="first", sort_type="nsmallest"): def map_func(df, n=n, keep=keep, columns=columns): if columns is None: return pandas.DataFrame( - pandas.Series.nsmallest(df.squeeze(axis=1), n=n, keep=keep) + getattr(pandas.Series, sort_type)( + df.squeeze(axis=1), n=n, keep=keep + ) ) - return pandas.DataFrame.nsmallest(df, n=n, 
columns=columns, keep=keep) + return getattr(pandas.DataFrame, sort_type)( + df, n=n, columns=columns, keep=keep + ) if columns is None: new_columns = ["__reduced__"] @@ -1180,6 +1185,12 @@ def map_func(df, n=n, keep=keep, columns=columns): ) return self.__constructor__(new_modin_frame) + def nsmallest(self, *args, **kwargs): + return self.nsort(sort_type="nsmallest", *args, **kwargs) + + def nlargest(self, *args, **kwargs): + return self.nsort(sort_type="nlargest", *args, **kwargs) + def eval(self, expr, **kwargs): """Returns a new QueryCompiler with expr evaluated on columns. diff --git a/modin/data_management/functions/binary_function.py b/modin/data_management/functions/binary_function.py index 5933b5b4283..b8edf8f608a 100644 --- a/modin/data_management/functions/binary_function.py +++ b/modin/data_management/functions/binary_function.py @@ -39,6 +39,7 @@ def caller(query_compiler, other, *args, **kwargs): axis, lambda l, r: func(l, r.squeeze(), *args, **kwargs), other._modin_frame, + preserve_labels=call_kwds.get("preserve_labels", False), ) ) else: diff --git a/modin/data_management/functions/groupby_function.py b/modin/data_management/functions/groupby_function.py index a1801e17105..6d2373d83bf 100644 --- a/modin/data_management/functions/groupby_function.py +++ b/modin/data_management/functions/groupby_function.py @@ -115,19 +115,9 @@ def compute_reduce(df): except ValueError: return compute_reduce(df.copy()) - if axis == 0: - new_columns = qc.columns - new_index = None - else: - new_index = query_compiler.index - new_columns = None + # TODO: try to precompute `new_index` and `new_columns` new_modin_frame = qc._modin_frame.groupby_reduce( - axis, - by._modin_frame, - _map, - _reduce, - new_columns=new_columns, - new_index=new_index, + axis, by._modin_frame, _map, _reduce ) return query_compiler.__constructor__(new_modin_frame) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 520e9d28bd8..507f6540872 100644 --- 
a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -16,6 +16,7 @@ import pandas from pandas.core.indexes.api import ensure_index from pandas.core.dtypes.common import is_numeric_dtype +from typing import Union from modin.backends.pandas.query_compiler import PandasQueryCompiler from modin.error_message import ErrorMessage @@ -40,18 +41,22 @@ def __init__( row_lengths=None, column_widths=None, dtypes=None, + validate_axes: Union[bool, str] = False, ): """Initialize a dataframe. - Args: - partitions: A 2D NumPy array of partitions. Must contain partition objects. - index: The index object for the dataframe. Converts to a pandas.Index. - columns: The columns object for the dataframe. Converts to a pandas.Index. - row_lengths: (optional) The lengths of each partition in the rows. The + Parameters + ---------- + partitions : A 2D NumPy array of partitions. Must contain partition objects. + index : The index object for the dataframe. Converts to a pandas.Index. + columns : The columns object for the dataframe. Converts to a pandas.Index. + row_lengths : (optional) The lengths of each partition in the rows. The "height" of each of the block partitions. Is computed if not provided. - column_widths: (optional) The width of each partition in the columns. The + column_widths : (optional) The width of each partition in the columns. The "width" of each of the block partitions. Is computed if not provided. - dtypes: (optional) The data types for the dataframe. + dtypes : (optional) The data types for the dataframe. + validate_axes : (optional) Whether or not to validate that the internal + indices of partitions are equal to the passed `index` and `columns`.
""" self._partitions = partitions self._index_cache = ensure_index(index) @@ -74,6 +79,8 @@ def __init__( self._column_widths_cache = column_widths self._dtypes = dtypes self._filter_empties() + if validate_axes is not False: + self._validate_internal_indices(mode=validate_axes) @property def _row_lengths(self): @@ -204,6 +211,21 @@ def _set_columns(self, new_columns): self._dtypes.index = new_columns self._apply_index_objs(axis=1) + def _set_axis(self, axis, new_axis): + """Replaces the current labels at the specified axis with the new one + + Parameters + ---------- + axis : int, + Axis to set labels along + new_axis : Index, + The replacement labels + """ + if axis: + self._set_columns(new_axis) + else: + self._set_index(new_axis) + columns = property(_get_columns, _set_columns) index = property(_get_index, _set_index) @@ -234,6 +256,59 @@ def _filter_empties(self): self._column_widths_cache = [w for w in self._column_widths if w > 0] self._row_lengths_cache = [r for r in self._row_lengths if r > 0] + def _validate_axis_equality(self, axis: int): + """ + Validates internal and external indices of modin_frame at the specified axis. + + Parameters + ---------- + axis : int, + Axis to validate indices along + """ + internal_axis = self._frame_mgr_cls.get_indices( + axis, self._partitions, lambda df: df.axes[axis] + ) + is_equals = self.axes[axis].equals(internal_axis) + if not is_equals: + self._set_axis(axis, self.axes[axis]) + + def _validate_internal_indices(self, mode=None, **kwargs): + """ + Validates and optionally updates internal and external indices + of modin_frame in specified mode. There is 3 modes supported: + 1. "reduced" - validates and updates indices on that axes + where external indices is ["__reduced__"] + 2. "all" - validates indices at all axes, optionally updates + internal indices if `update` parameter specified in kwargs + 3. "custom" - validation follows arguments specified in kwargs. 
+ + Parameters + ---------- + mode : str or bool, default None + validate_index : bool, (optional, could be specified via `mode`) + validate_columns : bool, (optional, could be specified via `mode`) + """ + + if isinstance(mode, bool): + mode = "all" + + reduced_sample = pandas.Index(["__reduced__"]) + args_dict = { + "custom": kwargs, + "reduced": { + "validate_index": self.index.equals(reduced_sample), + "validate_columns": self.columns.equals(reduced_sample), + }, + "all": {"validate_index": True, "validate_columns": True}, + } + + args = args_dict.get(mode, args_dict["custom"]) + + if args.get("validate_index", True): + self._validate_axis_equality(axis=0) + if args.get("validate_columns", True): + self._validate_axis_equality(axis=1) + def _apply_index_objs(self, axis=None): """Lazily applies the index object (Index or Columns) to the partitions. @@ -872,7 +947,13 @@ def _compute_map_reduce_metadata(self, axis, new_parts): else: new_dtypes = self._dtypes return self.__constructor__( - new_parts, index, columns, new_lengths, new_widths, new_dtypes + new_parts, + index, + columns, + new_lengths, + new_widths, + new_dtypes, + validate_axes="reduced", ) def _fold_reduce(self, axis, func): @@ -936,7 +1017,9 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): 0, reduce_parts, lambda df: df.index ) new_columns = ["__reduced__"] - return self.__constructor__(reduce_parts, new_index, new_columns) + return self.__constructor__( + reduce_parts, new_index, new_columns, validate_axes="reduced" + ) def _map(self, func, dtypes=None, validate_index=False): """Perform a function that maps across the entire dataset. 
@@ -1095,7 +1178,13 @@ def _apply_full_axis( [np.dtype(dtypes)] * len(new_columns), index=new_columns ) return self.__constructor__( - new_partitions, new_index, new_columns, None, None, dtypes + new_partitions, + new_index, + new_columns, + None, + None, + dtypes, + validate_axes="reduced", ) def _apply_full_axis_select_indices( @@ -1247,7 +1336,6 @@ def broadcast_apply(self, axis, func, other, preserve_labels=True, dtypes=None): Returns: A new Modin DataFrame """ - assert preserve_labels, "`preserve_labels=False` Not Yet Implemented" # Only sort the indices if they do not match left_parts, right_parts, joined_index = self._copartition( axis, other, "left", sort=not self.axes[axis].equals(other.axes[axis]) @@ -1261,6 +1349,11 @@ def broadcast_apply(self, axis, func, other, preserve_labels=True, dtypes=None): dtypes = self._dtypes new_index = self.index new_columns = self.columns + if not preserve_labels: + if axis == 1: + new_columns = joined_index + else: + new_index = joined_index return self.__constructor__( new_frame, new_index, new_columns, None, None, dtypes=dtypes ) @@ -1476,7 +1569,13 @@ def to_pandas(self): df = pandas.DataFrame(columns=self.columns) else: df = pandas.DataFrame(columns=self.columns, index=self.index) - df.index.name = self.index.name + else: + ErrorMessage.catch_bugs_and_request_email( + not df.index.equals(self.index) or not df.columns.equals(self.columns), + "Internal and external indices do not match.", + ) + df.index = self.index + df.columns = self.columns return df def to_numpy(self): diff --git a/modin/engines/ray/generic/io.py b/modin/engines/ray/generic/io.py index f5723a4b633..e364903f1e0 100644 --- a/modin/engines/ray/generic/io.py +++ b/modin/engines/ray/generic/io.py @@ -41,6 +41,6 @@ def func(df): df.to_sql(**kwargs) return pandas.DataFrame() - result = qc._modin_frame._fold_reduce(1, func) + result = qc._modin_frame._apply_full_axis(1, func, new_index=[], new_columns=[]) # blocking operation result.to_pandas() diff --git 
a/modin/pandas/groupby.py b/modin/pandas/groupby.py index d87c79e2d5f..cc9ba04d280 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -295,7 +295,7 @@ def __getitem__(self, key): and all(c in self._columns for c in self._by.columns) and self._drop ): - key = [key] + list(self._by.columns) + key = list(self._by.columns) + [key] else: key = [key] if isinstance(key, list) and (make_dataframe or not self._as_index): diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index deda3512af2..95006d9d61e 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -412,3 +412,28 @@ def test_to_numeric(data, errors, downcast): modin_result = pd.to_numeric(modin_series, errors=errors, downcast=downcast) pandas_result = pandas.to_numeric(pandas_series, errors=errors, downcast=downcast) df_equals(modin_result, pandas_result) + + +def test_to_pandas_indices(): + data = test_data_values[0] + + md_df = pd.DataFrame(data) + index = pandas.MultiIndex.from_tuples( + [(i, i * 2) for i in np.arange(len(md_df) + 1)], names=["A", "B"] + ).drop(0) + columns = pandas.MultiIndex.from_tuples( + [(i, i * 2) for i in np.arange(len(md_df.columns) + 1)], names=["A", "B"] + ).drop(0) + + md_df.index = index + md_df.columns = columns + + pd_df = md_df._to_pandas() + + for axis in [0, 1]: + assert md_df.axes[axis].equals( + pd_df.axes[axis] + ), f"Indices at axis {axis} are different!" + assert md_df.axes[axis].equal_levels( + pd_df.axes[axis] + ), f"Levels of indices at axis {axis} are different!" 
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index e63ff86c9af..0331f5013b3 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -559,6 +559,7 @@ def test_from_clipboard(): df_equals(modin_df, pandas_df) +@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details") def test_from_excel(): setup_excel_file(SMALL_ROW_SIZE) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 6eb3169f559..53d2764273f 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1362,6 +1362,9 @@ def test_dtype(data): df_equals(modin_series.dtype, pandas_series.dtypes) +@pytest.mark.xfail( + reason="Datetime properties is broken for now, see #1729 for details" +) def test_dt(): data = pd.date_range("2016-12-31", "2017-01-08", freq="D", tz="Europe/Berlin") modin_series = pd.Series(data) From 731816d305f821ad9998913e06ed9d16e01f158d Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 23 Jul 2020 23:07:10 +0300 Subject: [PATCH 007/120] FEAT-#1781: implement ClusterError.__str__ (#1782) Co-authored-by: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modin/experimental/cloud/base.py b/modin/experimental/cloud/base.py index e73d4b4bf9b..f78ddf30c7d 100644 --- a/modin/experimental/cloud/base.py +++ b/modin/experimental/cloud/base.py @@ -26,6 +26,11 @@ def __init__(self, *args, cause: BaseException = None, traceback: str = None, ** self.traceback = traceback super().__init__(*args, **kw) + def __str__(self): + if self.cause: + return f"cause: {self.cause}\n{super().__str__()}" + return super().__str__() + class CannotSpawnCluster(ClusterError): """ From 77a29948aef0e0dce1033574555dccb86b1b04f8 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Thu, 23 Jul 2020 23:19:22 +0300 Subject: [PATCH
008/120] FEAT-#1194 #1283 #1138: Add `Series.rolling`, `DataFrame.rolling` functionality to enable rolling window operations Signed-off-by: Alexey Prutskov --- .github/workflows/ci.yml | 4 + .github/workflows/push.yml | 4 + docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/backends/pandas/query_compiler.py | 130 +++++++++ modin/pandas/base.py | 277 +++++++++++++++++++- modin/pandas/test/test_dataframe.py | 5 - modin/pandas/test/test_rolling.py | 187 +++++++++++++ modin/pandas/test/test_series.py | 7 - 9 files changed, 601 insertions(+), 17 deletions(-) create mode 100644 modin/pandas/test/test_rolling.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9da547594df..cdc38768e72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,6 +112,8 @@ jobs: if: matrix.part != 3 - run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 + - run: python -m pytest modin/pandas/test/test_rolling.py + if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_groupby.py @@ -149,6 +151,8 @@ jobs: if: matrix.part != 3 - run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 + - run: python -m pytest modin/pandas/test/test_rolling.py + if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_groupby.py diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 93f199cd914..8c8f38e2f57 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -27,6 +27,8 @@ jobs: if: matrix.part != 3 - run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 + - run: python -m pytest modin/pandas/test/test_rolling.py + if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - run: python -m 
pytest modin/pandas/test/test_groupby.py @@ -63,6 +65,8 @@ jobs: if: matrix.part != 3 - run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 + - run: python -m pytest modin/pandas/test/test_rolling.py + if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - run: python -m pytest modin/pandas/test/test_groupby.py diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 88edff08374..3c8897857a5 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -313,7 +313,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``rmul`` | `rmul`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``rolling`` | `rolling`_ | D | | +| ``rolling`` | `rolling`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``round`` | `round`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index 83a6fd3924d..962aef16f87 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -348,7 +348,7 @@ the related section on `Defaulting to pandas`_. 
+-----------------------------+---------------------------------+ | ``rmul`` | Y | +-----------------------------+---------------------------------+ -| ``rolling`` | D | +| ``rolling`` | Y | +-----------------------------+---------------------------------+ | ``round`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 57ce0f06109..11d67405704 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -836,6 +836,136 @@ def resample_var(self, resample_args, ddof, *args, **kwargs): def resample_quantile(self, resample_args, q, **kwargs): return self._resample_func(resample_args, "quantile", q=q, **kwargs) + window_mean = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).mean(*args, **kwargs) + ) + ) + window_sum = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).sum(*args, **kwargs) + ) + ) + window_var = FoldFunction.register( + lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs) + ) + ) + window_std = FoldFunction.register( + lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs) + ) + ) + rolling_count = FoldFunction.register( + lambda df, rolling_args: pandas.DataFrame(df.rolling(*rolling_args).count()) + ) + rolling_sum = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).sum(*args, **kwargs) + ) + ) + rolling_mean = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).mean(*args, **kwargs) + ) + ) + rolling_median = FoldFunction.register( + lambda df, rolling_args, **kwargs: pandas.DataFrame( + 
df.rolling(*rolling_args).median(**kwargs) + ) + ) + rolling_var = FoldFunction.register( + lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs) + ) + ) + rolling_std = FoldFunction.register( + lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs) + ) + ) + rolling_min = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).min(*args, **kwargs) + ) + ) + rolling_max = FoldFunction.register( + lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).max(*args, **kwargs) + ) + ) + rolling_skew = FoldFunction.register( + lambda df, rolling_args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).skew(**kwargs) + ) + ) + rolling_kurt = FoldFunction.register( + lambda df, rolling_args, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).kurt(**kwargs) + ) + ) + rolling_apply = FoldFunction.register( + lambda df, rolling_args, func, raw, engine, engine_kwargs, args, kwargs: pandas.DataFrame( + df.rolling(*rolling_args).apply( + func=func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + ) + ) + rolling_quantile = FoldFunction.register( + lambda df, rolling_args, quantile, interpolation, **kwargs: pandas.DataFrame( + df.rolling(*rolling_args).quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) + ) + ) + + def rolling_corr(self, rolling_args, other, pairwise, *args, **kwargs): + if len(self.columns) > 1: + return self.default_to_pandas( + lambda df: pandas.DataFrame.rolling(df, *rolling_args).corr( + other=other, pairwise=pairwise, *args, **kwargs + ) + ) + else: + return FoldFunction.register( + lambda df: pandas.DataFrame( + df.rolling(*rolling_args).corr( + other=other, pairwise=pairwise, *args, **kwargs + ) + ) + )(self) + + def rolling_cov(self, rolling_args, 
other, pairwise, ddof, **kwargs): + if len(self.columns) > 1: + return self.default_to_pandas( + lambda df: pandas.DataFrame.rolling(df, *rolling_args).cov( + other=other, pairwise=pairwise, ddof=ddof, **kwargs + ) + ) + else: + return FoldFunction.register( + lambda df: pandas.DataFrame( + df.rolling(*rolling_args).cov( + other=other, pairwise=pairwise, ddof=ddof, **kwargs + ) + ) + )(self) + + def rolling_aggregate(self, rolling_args, func, *args, **kwargs): + new_modin_frame = self._modin_frame._apply_full_axis( + 0, + lambda df: pandas.DataFrame( + df.rolling(*rolling_args).aggregate(func=func, *args, **kwargs) + ), + new_index=self.index, + ) + return self.__constructor__(new_modin_frame) + # Map partitions operations # These operations are operations that apply a function to every partition. abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy") diff --git a/modin/pandas/base.py b/modin/pandas/base.py index cd2cbed248a..bcb6e5987de 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2336,9 +2336,62 @@ def rolling( axis=0, closed=None, ): - return self._default_to_pandas( - "rolling", - window, + """ + Provide rolling window calculations. + + Parameters + ---------- + window : int, offset, or BaseIndexer subclass + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. + If a BaseIndexer subclass is passed, calculates the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely `min_periods`, `center`, and + `closed` will be passed to `get_window_bounds`. + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). 
For a window that is specified by an offset, + `min_periods` will default to 1. Otherwise, `min_periods` will default + to the size of the window. + center : bool, default False + Set the labels at the center of the window. + win_type : str, default None + Provide a window type. If ``None``, all points are evenly weighted. + See the notes below for further information. + on : str, optional + For a DataFrame, a datetime-like column or MultiIndex level on which + to calculate the rolling window, rather than the DataFrame's index. + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + axis : int or str, default 0 + closed : str, default None + Make the interval closed on the 'right', 'left', 'both' or + 'neither' endpoints. + For offset-based windows, it defaults to 'right'. + For fixed windows, defaults to 'both'. Remaining cases not implemented + for fixed windows. + Returns + ------- + a Window or Rolling sub-classed for the particular operation + """ + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + return Rolling( + self, + window=window, min_periods=min_periods, center=center, win_type=win_type, @@ -3858,3 +3911,221 @@ def quantile(self, q=0.5, **kwargs): self.resample_args, q, **kwargs ) ) + + +class Window(object): + def __init__( + self, + dataframe, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): + self._dataframe = dataframe + self._query_compiler = dataframe._query_compiler + self.window_args = [ + window, + min_periods, + center, + win_type, + on, + axis, + closed, + ] + + def mean(self, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.window_mean( + self.window_args, *args, **kwargs + ) + ) + + def sum(self, *args, **kwargs): + return 
self._dataframe.__constructor__( + query_compiler=self._query_compiler.window_sum( + self.window_args, *args, **kwargs + ) + ) + + def var(self, ddof=1, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.window_var( + self.window_args, ddof, *args, **kwargs + ) + ) + + def std(self, ddof=1, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.window_std( + self.window_args, ddof, *args, **kwargs + ) + ) + + +class Rolling(object): + def __init__( + self, + dataframe, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): + self._dataframe = dataframe + self._query_compiler = dataframe._query_compiler + self.rolling_args = [ + window, + min_periods, + center, + win_type, + on, + axis, + closed, + ] + + def count(self): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_count(self.rolling_args) + ) + + def sum(self, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_sum( + self.rolling_args, *args, **kwargs + ) + ) + + def mean(self, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_mean( + self.rolling_args, *args, **kwargs + ) + ) + + def median(self, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_median( + self.rolling_args, **kwargs + ) + ) + + def var(self, ddof=1, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_var( + self.rolling_args, ddof, *args, **kwargs + ) + ) + + def std(self, ddof=1, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_std( + self.rolling_args, ddof, *args, **kwargs + ) + ) + + def min(self, *args, **kwargs): + return self._dataframe.__constructor__( + 
query_compiler=self._query_compiler.rolling_min( + self.rolling_args, *args, **kwargs + ) + ) + + def max(self, *args, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_max( + self.rolling_args, *args, **kwargs + ) + ) + + def corr(self, other=None, pairwise=None, *args, **kwargs): + from .dataframe import DataFrame + from .series import Series + + if isinstance(other, DataFrame): + other = other._query_compiler.to_pandas() + elif isinstance(other, Series): + other = other._query_compiler.to_pandas().squeeze() + + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_corr( + self.rolling_args, other, pairwise, *args, **kwargs + ) + ) + + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + from .dataframe import DataFrame + from .series import Series + + if isinstance(other, DataFrame): + other = other._query_compiler.to_pandas() + elif isinstance(other, Series): + other = other._query_compiler.to_pandas().squeeze() + + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_cov( + self.rolling_args, other, pairwise, ddof, **kwargs + ) + ) + + def skew(self, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_skew( + self.rolling_args, **kwargs + ) + ) + + def kurt(self, **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_kurt( + self.rolling_args, **kwargs + ) + ) + + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_apply( + self.rolling_args, func, raw, engine, engine_kwargs, args, kwargs, + ) + ) + + def aggregate( + self, func, *args, **kwargs, + ): + from .dataframe import DataFrame + + dataframe = DataFrame( + query_compiler=self._query_compiler.rolling_aggregate( + self.rolling_args, func, *args, 
**kwargs, + ) + ) + if isinstance(self._dataframe, DataFrame): + return dataframe + elif is_list_like(func): + dataframe.columns = dataframe.columns.droplevel() + return dataframe + else: + return dataframe.squeeze() + + agg = aggregate + + def quantile(self, quantile, interpolation="linear", **kwargs): + return self._dataframe.__constructor__( + query_compiler=self._query_compiler.rolling_quantile( + self.rolling_args, quantile, interpolation, **kwargs + ) + ) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 19dab8c791f..887fa40a5bb 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2549,11 +2549,6 @@ def test_resample(self, rule, axis, closed, label, on, level): pandas_resampler.aggregate(["sum", "mean", "max"]), ) - def test_rolling(self): - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - with pytest.warns(UserWarning): - df.rolling(2, win_type="triang") - def test_sem(self): data = test_data_values[0] with pytest.warns(UserWarning): diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py new file mode 100644 index 00000000000..66193903faf --- /dev/null +++ b/modin/pandas/test/test_rolling.py @@ -0,0 +1,187 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import numpy as np +import pandas +import modin.pandas as pd + +from .utils import df_equals, test_data_values, test_data_keys + +pd.DEFAULT_NPARTITIONS = 4 + + +def create_test_series(vals): + if isinstance(vals, dict): + modin_series = pd.Series(vals[next(iter(vals.keys()))]) + pandas_series = pandas.Series(vals[next(iter(vals.keys()))]) + else: + modin_series = pd.Series(vals) + pandas_series = pandas.Series(vals) + return modin_series, pandas_series + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("window", [5, 100]) +@pytest.mark.parametrize("min_periods", [None, 5]) +@pytest.mark.parametrize("win_type", [None, "triang"]) +def test_dataframe(data, window, min_periods, win_type): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + pandas_rolled = pandas_df.rolling( + window=window, min_periods=min_periods, win_type=win_type, center=True, + ) + modin_rolled = modin_df.rolling( + window=window, min_periods=min_periods, win_type=win_type, center=True, + ) + # Testing of Window class + if win_type is not None: + df_equals(modin_rolled.mean(), pandas_rolled.mean()) + df_equals(modin_rolled.sum(), pandas_rolled.sum()) + df_equals(modin_rolled.var(ddof=0), pandas_rolled.var(ddof=0)) + df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0)) + # Testing of Rolling class + else: + df_equals(modin_rolled.count(), pandas_rolled.count()) + df_equals(modin_rolled.sum(), pandas_rolled.sum()) + df_equals(modin_rolled.mean(), pandas_rolled.mean()) + df_equals(modin_rolled.median(), pandas_rolled.median()) + df_equals(modin_rolled.var(ddof=0), pandas_rolled.var(ddof=0)) + df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0)) + df_equals(modin_rolled.min(), pandas_rolled.min()) + df_equals(modin_rolled.max(), pandas_rolled.max()) + df_equals(modin_rolled.skew(), 
pandas_rolled.skew()) + df_equals(modin_rolled.kurt(), pandas_rolled.kurt()) + df_equals(modin_rolled.apply(np.sum), pandas_rolled.apply(np.sum)) + df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) + df_equals( + modin_rolled.aggregate([np.sum, np.mean]), + pandas_rolled.aggregate([np.sum, np.mean]), + ) + df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) + + +@pytest.mark.parametrize("axis", [0, "columns"]) +@pytest.mark.parametrize("on", [None, "DateCol"]) +@pytest.mark.parametrize("closed", ["both", "right"]) +@pytest.mark.parametrize("window", [3, "3s"]) +def test_dataframe_dt_index(axis, on, closed, window): + index = pandas.date_range("31/12/2000", periods=12, freq="T") + data = {"A": range(12), "B": range(12)} + pandas_df = pandas.DataFrame(data, index=index) + modin_df = pd.DataFrame(data, index=index) + if on is not None and axis == 0 and isinstance(window, str): + pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") + modin_df[on] = pd.date_range("22/06/1941", periods=12, freq="T") + else: + on = None + if axis == "columns": + pandas_df = pandas_df.T + modin_df = modin_df.T + pandas_rolled = pandas_df.rolling(window=window, on=on, axis=axis, closed=closed) + modin_rolled = modin_df.rolling(window=window, on=on, axis=axis, closed=closed) + if isinstance(window, int): + # These functions are very slow for data from test_rolling + df_equals( + modin_rolled.corr(modin_df, True), pandas_rolled.corr(pandas_df, True) + ) + df_equals( + modin_rolled.corr(modin_df, False), pandas_rolled.corr(pandas_df, False) + ) + df_equals(modin_rolled.cov(modin_df, True), pandas_rolled.cov(pandas_df, True)) + df_equals( + modin_rolled.cov(modin_df, False), pandas_rolled.cov(pandas_df, False) + ) + if axis == 0: + df_equals( + modin_rolled.cov(modin_df[modin_df.columns[0]], True), + pandas_rolled.cov(pandas_df[pandas_df.columns[0]], True), + ) + df_equals( + modin_rolled.corr(modin_df[modin_df.columns[0]], True), + 
pandas_rolled.corr(pandas_df[pandas_df.columns[0]], True), + ) + else: + df_equals(modin_rolled.count(), pandas_rolled.count()) + df_equals(modin_rolled.skew(), pandas_rolled.skew()) + df_equals( + modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True), + ) + df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) + df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("window", [5, 100]) +@pytest.mark.parametrize("min_periods", [None, 5]) +@pytest.mark.parametrize("win_type", [None, "triang"]) +def test_series(data, window, min_periods, win_type): + modin_series, pandas_series = create_test_series(data) + + pandas_rolled = pandas_series.rolling( + window=window, min_periods=min_periods, win_type=win_type, center=True, + ) + modin_rolled = modin_series.rolling( + window=window, min_periods=min_periods, win_type=win_type, center=True, + ) + # Testing of Window class + if win_type is not None: + df_equals(modin_rolled.mean(), pandas_rolled.mean()) + df_equals(modin_rolled.sum(), pandas_rolled.sum()) + df_equals(modin_rolled.var(ddof=0), pandas_rolled.var(ddof=0)) + df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0)) + # Testing of Rolling class + else: + df_equals(modin_rolled.count(), pandas_rolled.count()) + df_equals(modin_rolled.sum(), pandas_rolled.sum()) + df_equals(modin_rolled.mean(), pandas_rolled.mean()) + df_equals(modin_rolled.median(), pandas_rolled.median()) + df_equals(modin_rolled.var(ddof=0), pandas_rolled.var(ddof=0)) + df_equals(modin_rolled.std(ddof=0), pandas_rolled.std(ddof=0)) + df_equals(modin_rolled.min(), pandas_rolled.min()) + df_equals(modin_rolled.max(), pandas_rolled.max()) + df_equals( + modin_rolled.corr(modin_series), pandas_rolled.corr(pandas_series), + ) + df_equals( + modin_rolled.cov(modin_series, True), pandas_rolled.cov(pandas_series, True) + ) + df_equals( + 
modin_rolled.cov(modin_series, False), + pandas_rolled.cov(pandas_series, False), + ) + df_equals(modin_rolled.skew(), pandas_rolled.skew()) + df_equals(modin_rolled.kurt(), pandas_rolled.kurt()) + df_equals(modin_rolled.apply(np.sum), pandas_rolled.apply(np.sum)) + df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) + df_equals( + modin_rolled.agg([np.sum, np.mean]), pandas_rolled.agg([np.sum, np.mean]), + ) + df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) + + +@pytest.mark.parametrize("closed", ["both", "right"]) +def test_series_dt_index(closed): + index = pandas.date_range("1/1/2000", periods=12, freq="T") + pandas_series = pandas.Series(range(12), index=index) + modin_series = pd.Series(range(12), index=index) + + pandas_rolled = pandas_series.rolling("3s", closed=closed) + modin_rolled = modin_series.rolling("3s", closed=closed) + df_equals(modin_rolled.count(), pandas_rolled.count()) + df_equals(modin_rolled.skew(), pandas_rolled.skew()) + df_equals( + modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True) + ) + df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) + df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 53d2764273f..768fad5d614 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2451,13 +2451,6 @@ def test_rmul(data): inter_df_math_helper(modin_series, pandas_series, "rmul") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rolling(data): - modin_series, _ = create_test_series(data) # noqa: F841 - with pytest.warns(UserWarning): - modin_series.rolling(10) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_round(data): modin_series, pandas_series = create_test_series(data) From 8658f093eaf05273df7bd801ee38afc117a8c5fb Mon Sep 17 00:00:00 2001 From: YarShev 
Date: Thu, 23 Jul 2020 23:42:09 +0300 Subject: [PATCH 009/120] REFACTOR-#1763: Move logic of `merge` (#1764) into the query compiler Signed-off-by: Igoshev, Yaroslav --- modin/backends/pandas/query_compiler.py | 72 ++++++++++++++++++++++--- modin/data_management/utils.py | 4 +- modin/pandas/base.py | 30 ++++------- modin/pandas/dataframe.py | 51 +----------------- modin/pandas/test/test_dataframe.py | 9 ++++ 5 files changed, 86 insertions(+), 80 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 11d67405704..1e9f170d36d 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -388,16 +388,55 @@ def merge(self, right, **kwargs): ----- See pd.merge or pd.DataFrame.merge for more info on kwargs. """ - right = right.to_pandas() + how = kwargs.get("how", "inner") + on = kwargs.get("on", None) + left_on = kwargs.get("left_on", None) + right_on = kwargs.get("right_on", None) + left_index = kwargs.get("left_index", False) + right_index = kwargs.get("right_index", False) + sort = kwargs.get("sort", False) - sort = kwargs.get("sort") - kwargs["sort"] = not sort if sort else sort + if how in ["left", "inner"] and left_index is False and right_index is False: + right = right.to_pandas() - def map_func(left, right=right, kwargs=kwargs): - return pandas.merge(left, right, **kwargs) + kwargs["sort"] = False - new_modin_frame = self._modin_frame._apply_full_axis(1, map_func) - return self.__constructor__(new_modin_frame) + def map_func(left, right=right, kwargs=kwargs): + return pandas.merge(left, right, **kwargs) + + new_self = self.__constructor__( + self._modin_frame._apply_full_axis(1, map_func) + ) + is_reset_index = True + if left_on and right_on: + left_on = left_on if is_list_like(left_on) else [left_on] + right_on = right_on if is_list_like(right_on) else [right_on] + is_reset_index = ( + False + if any(o in new_self.index.names for o in left_on) + and any(o in 
right.index.names for o in right_on) + else True + ) + if sort: + new_self = ( + new_self.sort_rows_by_column_values(left_on.append(right_on)) + if is_reset_index + else new_self.sort_index(axis=0, level=left_on.append(right_on)) + ) + if on: + on = on if is_list_like(on) else [on] + is_reset_index = not any( + o in new_self.index.names and o in right.index.names for o in on + ) + if sort: + new_self = ( + new_self.sort_rows_by_column_values(on) + if is_reset_index + else new_self.sort_index(axis=0, level=on) + ) + return new_self.reset_index(drop=True) if is_reset_index else new_self + else: + return self.default_to_pandas(pandas.DataFrame.merge, right, **kwargs) # END Inter-Data operations @@ -1504,6 +1543,21 @@ def sort_index(self, **kwargs): QueryCompiler containing the data sorted by columns or indices. """ axis = kwargs.pop("axis", 0) + level = kwargs.pop("level", None) + sort_remaining = kwargs.pop("sort_remaining", True) + kwargs["inplace"] = False + + if level is not None or ( + (axis == 0 and isinstance(self.index, pandas.MultiIndex)) + or (axis == 1 and isinstance(self.columns, pandas.MultiIndex)) + ): + return self.default_to_pandas( + pandas.DataFrame.sort_index, + level=level, + sort_remaining=sort_remaining, + **kwargs + ) + # sort_index can have ascending be None and behaves as if it is False. # sort_values cannot have ascending be None. 
Thus, the following logic is to # convert the ascending argument to one that works with sort_values @@ -1519,7 +1573,9 @@ def sort_index(self, **kwargs): new_columns = self.columns new_modin_frame = self._modin_frame._apply_full_axis( axis, - lambda df: df.sort_index(axis=axis, **kwargs), + lambda df: df.sort_index( + axis=axis, level=level, sort_remaining=sort_remaining, **kwargs + ), new_index, new_columns, dtypes="copy" if axis == 0 else None, diff --git a/modin/data_management/utils.py b/modin/data_management/utils.py index 6d5aee59664..0d0a4aafa18 100644 --- a/modin/data_management/utils.py +++ b/modin/data_management/utils.py @@ -106,9 +106,9 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None) def length_fn_pandas(df): assert isinstance(df, pandas.DataFrame) - return len(df) if len(df.columns) > 0 else 0 + return len(df) if len(df) > 0 else 0 def width_fn_pandas(df): assert isinstance(df, pandas.DataFrame) - return len(df.columns) if len(df) > 0 else 0 + return len(df.columns) if len(df.columns) > 0 else 0 diff --git a/modin/pandas/base.py b/modin/pandas/base.py index bcb6e5987de..871305f9c7e 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2800,28 +2800,18 @@ def sort_index( A sorted DataFrame """ axis = self._get_axis_number(axis) - if level is not None or ( - (axis == 0 and isinstance(self.index, pandas.MultiIndex)) - or (axis == 1 and isinstance(self.columns, pandas.MultiIndex)) - ): - new_query_compiler = self._default_to_pandas( - "sort_index", - axis=axis, - level=level, - ascending=ascending, - inplace=False, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - )._query_compiler - return self._create_or_update_from_compiler(new_query_compiler, inplace) + inplace = validate_bool_kwarg(inplace, "inplace") new_query_compiler = self._query_compiler.sort_index( - axis=axis, ascending=ascending, kind=kind, na_position=na_position + axis=axis, + level=level, + ascending=ascending, + 
inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, ) - if inplace: - self._update_inplace(new_query_compiler=new_query_compiler) - else: - return self.__constructor__(query_compiler=new_query_compiler) + return self._create_or_update_from_compiler(new_query_compiler, inplace) def sort_values( self, diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b2e1f360146..dad2511a5c1 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1491,57 +1491,8 @@ def merge( right, how=how, lsuffix=suffixes[0], rsuffix=suffixes[1], sort=sort ) - if how in ["left", "inner"] and left_index is False and right_index is False: - result = self.__constructor__( - query_compiler=self._query_compiler.merge( - right._query_compiler, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - sort=sort, - suffixes=suffixes, - copy=copy, - indicator=indicator, - validate=validate, - ) - ) - - is_reset_index = True - if left_on and right_on: - left_on = left_on if is_list_like(left_on) else [left_on] - right_on = right_on if is_list_like(right_on) else [right_on] - is_reset_index = ( - False - if any(o in self.index.names for o in left_on) - and any(o in right.index.names for o in right_on) - else True - ) - if sort: - result = ( - result.sort_values(left_on.append(right_on)) - if is_reset_index - else result.sort_index(axis=0, level=left_on.append(right_on)) - ) - if on: - on = on if is_list_like(on) else [on] - is_reset_index = not any( - o in self.index.names and o in right.index.names for o in on - ) - if sort: - result = ( - result.sort_values(on) - if is_reset_index - else result.sort_index(axis=0, level=on) - ) - - return result.reset_index(drop=True) if is_reset_index else result - return self.__constructor__( - query_compiler=self._query_compiler.default_to_pandas( - pandas.DataFrame.merge, + 
query_compiler=self._query_compiler.merge( right._query_compiler, how=how, on=on, diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 887fa40a5bb..3a0e0d2f515 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5777,6 +5777,15 @@ def test_merge(self, test_data, test_data2): ) df_equals(modin_result, pandas_result) + # Test for issue #1771 + modin_df = pd.DataFrame({"name": np.arange(40)}) + modin_df2 = pd.DataFrame({"name": [39], "position": [0]}) + pandas_df = pandas.DataFrame({"name": np.arange(40)}) + pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]}) + modin_result = modin_df.merge(modin_df2, on="name", how="inner") + pandas_result = pandas_df.merge(pandas_df2, on="name", how="inner") + df_equals(modin_result, pandas_result) + frame_data = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], From 49f462c09a281bb0f316f57ba5835e7d6c4eb726 Mon Sep 17 00:00:00 2001 From: YarShev Date: Fri, 24 Jul 2020 02:17:32 +0300 Subject: [PATCH 010/120] TEST-#1785: Change size of commit message header (#1786) Signed-off-by: Igoshev, Yaroslav --- commitlint.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commitlint.config.js b/commitlint.config.js index c022602cab7..6c95efaef68 100644 --- a/commitlint.config.js +++ b/commitlint.config.js @@ -2,7 +2,7 @@ module.exports = { plugins: ['commitlint-plugin-jira-rules'], extends: ['jira'], rules: { - "header-max-length": [2, "always", 50], + "header-max-length": [2, "always", 70], "signed-off-by": [2, "always", "Signed-off-by"], "jira-task-id-max-length": [0, "always", 10], "jira-task-id-project-key": [2, "always", ["FEAT", "DOCS", "FIX", "REFACTOR", "TEST"]], From 86e1a4e6560edf1d65c2776a86fb2d7f20b456ca Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 01:07:54 -0700 Subject: [PATCH 011/120] FIX-#1775: Fix support for callable in loc/iloc (#1776) Signed-off-by: Devin Petersohn --- 
modin/pandas/indexing.py | 4 ++++ modin/pandas/test/test_dataframe.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 68d60db0006..f076e3cddc6 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -236,6 +236,8 @@ class _LocIndexer(_LocationIndexerBase): """An indexer for modin_df.loc[] functionality""" def __getitem__(self, key): + if callable(key): + return self.__getitem__(key(self.df)) row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key) if isinstance(row_loc, slice) and row_loc == slice(None): # If we're only slicing columns, handle the case with `__getitem__` @@ -361,6 +363,8 @@ class _iLocIndexer(_LocationIndexerBase): """An indexer for modin_df.iloc[] functionality""" def __getitem__(self, key): + if callable(key): + return self.__getitem__(key(self.df)) row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key) self._check_dtypes(row_loc) self._check_dtypes(col_loc) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 3a0e0d2f515..6daed7af609 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -4389,6 +4389,12 @@ def test_iloc(self, request, data): modin_df.iloc[:, 0] = modin_df.iloc[:, 1] pandas_df.iloc[:, 0] = pandas_df.iloc[:, 1] df_equals(modin_df, pandas_df) + + # From issue #1775 + df_equals( + modin_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], + pandas_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], + ) else: with pytest.raises(IndexError): modin_df.iloc[0, 1] @@ -4484,6 +4490,12 @@ def test_loc(self, request, data): pandas_df_copy.loc[[1, 2]] = 42 df_equals(modin_df_copy, pandas_df_copy) + # From issue #1775 + df_equals( + modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + ) + # From issue #1374 with pytest.raises(KeyError): 
modin_df.loc["NO_EXIST"] From 21b515be60574f11238df516465f242037b0c5fc Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 06:57:30 -0700 Subject: [PATCH 012/120] FEAT-#1598: Update iterator implemetion to `iloc` (#1599) Co-authored-by: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Signed-off-by: Devin Petersohn --- modin/pandas/dataframe.py | 29 ++++++++---------------- modin/pandas/iterator.py | 23 ++++++++++--------- modin/pandas/series.py | 11 +++------- modin/pandas/test/test_dataframe.py | 34 ++++++++++++++++++++--------- 4 files changed, 49 insertions(+), 48 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index dad2511a5c1..29c9daa9614 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1180,14 +1180,11 @@ def iterrows(self): Returns: A generator that iterates over the rows of the frame. """ - index_iter = iter(self.index) - def iterrow_builder(df): - df.columns = self.columns - df.index = [next(index_iter)] - return df.iterrows() + def iterrow_builder(s): + return s.name, s - partition_iterator = PartitionIterator(self._query_compiler, 0, iterrow_builder) + partition_iterator = PartitionIterator(self, 0, iterrow_builder) for v in partition_iterator: yield v @@ -1202,14 +1199,11 @@ def items(self): Returns: A generator that iterates over the columns of the frame. """ - col_iter = iter(self.columns) - def items_builder(df): - df.columns = [next(col_iter)] - df.index = self.index - return df.items() + def items_builder(s): + return s.name, s - partition_iterator = PartitionIterator(self._query_compiler, 1, items_builder) + partition_iterator = PartitionIterator(self, 1, items_builder) for v in partition_iterator: yield v @@ -1240,16 +1234,11 @@ def itertuples(self, index=True, name="Pandas"): Returns: A tuple representing row data. See args for varying tuples. 
""" - index_iter = iter(self.index) - def itertuples_builder(df): - df.columns = self.columns - df.index = [next(index_iter)] - return df.itertuples(index=index, name=name) + def itertuples_builder(s): + return next(s._to_pandas().to_frame().T.itertuples(index=index, name=name)) - partition_iterator = PartitionIterator( - self._query_compiler, 0, itertuples_builder - ) + partition_iterator = PartitionIterator(self, 0, itertuples_builder) for v in partition_iterator: yield v diff --git a/modin/pandas/iterator.py b/modin/pandas/iterator.py index e525f5d33c6..21bd6b7643c 100644 --- a/modin/pandas/iterator.py +++ b/modin/pandas/iterator.py @@ -15,21 +15,27 @@ class PartitionIterator(Iterator): - def __init__(self, query_compiler, axis, func): + def __init__(self, df, axis, func): """PartitionIterator class to define a generator on partitioned data Args: - query_compiler: Data manager for the dataframe + df: The dataframe to iterate over axis: axis to iterate over func: The function to get inner iterables from each partition """ - self.query_compiler = query_compiler + self.df = df self.axis = axis self.index_iter = ( - iter(self.query_compiler.columns) + zip( + iter(slice(None) for _ in range(len(self.df.columns))), + range(len(self.df.columns)), + ) if axis - else iter(range(len(self.query_compiler.index))) + else zip( + range(len(self.df.index)), + iter(slice(None) for _ in range(len(self.df.index))), + ) ) self.func = func @@ -41,8 +47,5 @@ def __next__(self): def next(self): key = next(self.index_iter) - if self.axis: - df = self.query_compiler.getitem_column_array([key]).to_pandas() - else: - df = self.query_compiler.getitem_row_array([key]).to_pandas() - return next(self.func(df)) + df = self.df.iloc[key] + return self.func(df) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 6ebea5e07a3..5c2ef11d840 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -934,15 +934,10 @@ def item(self): return self[0] def items(self): - index_iter 
= iter(self.index) + def item_builder(s): + return s.name, s.squeeze() - def item_builder(df): - s = df.iloc[:, 0] - s.index = [next(index_iter)] - s.name = self.name - return s.items() - - partition_iterator = PartitionIterator(self._query_compiler, 0, item_builder) + partition_iterator = PartitionIterator(self.to_frame(), 0, item_builder) for v in partition_iterator: yield v diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 6daed7af609..a18f3557cdd 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5381,8 +5381,10 @@ def test_iterrows(self, data): df_equals(pandas_series, modin_series) assert pandas_index == modin_index + @pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) + @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_itertuples(self, data): + def test_itertuples(self, name, index, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) @@ -5392,16 +5394,28 @@ def test_itertuples(self, data): for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): np.testing.assert_equal(modin_row, pandas_row) - # test all combinations of custom params - indices = [True, False] - names = [None, "NotPandas", "Pandas"] + modin_it_custom = modin_df.itertuples(index=index, name=name) + pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) - for index in indices: - for name in names: - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) - for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): - np.testing.assert_equal(modin_row, pandas_row) + mi_index_modin = pd.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(modin_df.columns))] + ) + 
mi_index_pandas = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(pandas_df.columns))] + ) + modin_df.columns = mi_index_modin + pandas_df.columns = mi_index_pandas + modin_it_default = modin_df.itertuples() + pandas_it_default = pandas_df.itertuples() + for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): + np.testing.assert_equal(modin_row, pandas_row) + + modin_it_custom = modin_df.itertuples(index=index, name=name) + pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___iter__(self, data): From 8472cf19f78e7c7266928d4c052ca7edd7901a34 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 08:52:14 -0700 Subject: [PATCH 013/120] FIX-#1556: Fix support for nested assignment with `loc`/`iloc` (#1788) Signed-off-by: Devin Petersohn --- modin/pandas/series.py | 15 +++++++++------ modin/pandas/test/test_dataframe.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 5c2ef11d840..fae48fc14cb 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -132,6 +132,15 @@ def _validate_dtypes_min_max(self, axis, numeric_only): def _validate_dtypes(self, numeric_only=False): pass + def _update_inplace(self, new_query_compiler): + super(Series, self)._update_inplace(new_query_compiler=new_query_compiler) + # Propagate changes back to parent so that column in dataframe had the same contents + if self._parent is not None: + if self._parent_axis == 0: + self._parent.loc[self.name] = self + else: + self._parent[self.name] = self + def _create_or_update_from_compiler(self, new_query_compiler, inplace=False): """Returns or updates a DataFrame given new query_compiler""" assert ( @@ -329,12 +338,6 @@ def 
__setitem__(self, key, value): self._create_or_update_from_compiler( self._query_compiler.setitem(1, key, value), inplace=True ) - # Propagate changes back to parent so that column in dataframe had the same contents - if self._parent is not None: - if self._parent_axis == 0: - self._parent.loc[self.name] = self - else: - self._parent[self.name] = self def __sub__(self, right): return self.sub(right) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index a18f3557cdd..6484c908732 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -4593,6 +4593,21 @@ def test_loc_assignment(self): pandas_df.loc["row3"]["col2"] = 32 df_equals(modin_df, pandas_df) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_loc_nested_assignment(self, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + + modin_df[key1].loc[0] = 500 + pandas_df[key1].loc[0] = 500 + df_equals(modin_df, pandas_df) + + modin_df[key2].loc[0] = None + pandas_df[key2].loc[0] = None + df_equals(modin_df, pandas_df) + def test_iloc_assignment(self): modin_df = pd.DataFrame( index=["row1", "row2", "row3"], columns=["col1", "col2"] @@ -4614,6 +4629,21 @@ def test_iloc_assignment(self): pandas_df.iloc[2]["col2"] = 32 df_equals(modin_df, pandas_df) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_iloc_nested_assignment(self, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + + modin_df[key1].iloc[0] = 500 + pandas_df[key1].iloc[0] = 500 + df_equals(modin_df, pandas_df) + + modin_df[key2].iloc[0] = None + pandas_df[key2].iloc[0] = None + df_equals(modin_df, pandas_df) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_pop(self, request, data): modin_df = pd.DataFrame(data) From 
cf695b380f61cebbf171592b08bf4f2a9c0c476f Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Fri, 24 Jul 2020 20:50:25 +0300 Subject: [PATCH 014/120] FEAT-#1205: melt implementation (#1689) Signed-off-by: Dmitry Chigarev --- docs/supported_apis/dataframe_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 + modin/backends/pandas/query_compiler.py | 74 +++++++++++ modin/engines/base/frame/data.py | 124 ++++++++++++++++++ modin/engines/base/frame/partition_manager.py | 106 +++++++++++++-- modin/error_message.py | 9 ++ modin/pandas/dataframe.py | 37 +++++- modin/pandas/test/test_dataframe.py | 20 ++- 8 files changed, 354 insertions(+), 22 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 3c8897857a5..7ff26076737 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -227,7 +227,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``median`` | `median`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``melt`` | `melt`_ | D | | +| ``melt`` | `melt`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``memory_usage`` | `memory_usage`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 04e49f6c817..77860aca235 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -724,6 +724,10 @@ def sort_index(self, **kwargs): """ pass + @abc.abstractmethod + def melt(self, 
*args, **kwargs): + pass + # END Abstract map across rows/columns # Map across rows/columns diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 1e9f170d36d..c86ea870d54 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1582,6 +1582,80 @@ def sort_index(self, **kwargs): ) return self.__constructor__(new_modin_frame) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ): + ErrorMessage.missmatch_with_pandas( + operation="melt", message="Order of rows could be different from pandas" + ) + + if var_name is None: + var_name = "variable" + + def _convert_to_list(x): + if is_list_like(x): + x = [*x] + elif x is not None: + x = [x] + else: + x = [] + return x + + id_vars, value_vars = map(_convert_to_list, [id_vars, value_vars]) + + if len(value_vars) == 0: + value_vars = self.columns.drop(id_vars) + + if len(id_vars) != 0: + to_broadcast = self.getitem_column_array(id_vars)._modin_frame + else: + to_broadcast = None + + def applyier(df, internal_indices, other=[], internal_other_indices=[]): + if len(other): + other = pandas.concat(other, axis=1) + columns_to_add = other.columns.difference(df.columns) + df = pandas.concat([df, other[columns_to_add]], axis=1) + return df.melt( + id_vars=id_vars, + value_vars=df.columns[internal_indices], + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) + + # we have no able to calculate correct indices here, so making it `dummy_index` + inconsistent_frame = self._modin_frame.broadcast_apply_select_indices( + axis=0, + apply_indices=value_vars, + func=applyier, + other=to_broadcast, + new_index=["dummy_index"] * len(id_vars), + new_columns=["dummy_index"] * len(id_vars), + ) + # after applying `melt` for selected indices we will get partitions like this: + # id_vars vars value | id_vars vars value + # 0 foo col3 1 | 0 foo col5 a so stacking it into + # 1 
fiz col3 2 | 1 fiz col5 b `new_parts` to get + # 2 bar col3 3 | 2 bar col5 c correct answer + # 3 zoo col3 4 | 3 zoo col5 d + new_parts = np.array( + [np.array([x]) for x in np.concatenate(inconsistent_frame._partitions.T)] + ) + new_index = pandas.RangeIndex(len(self.index) * len(value_vars)) + new_modin_frame = self._modin_frame.__constructor__( + new_parts, index=new_index, columns=id_vars + [var_name, value_name], + ) + result = self.__constructor__(new_modin_frame) + # this assigment needs to propagate correct indices into partitions + result.index = new_index + return result + # END Map across rows/columns # __getitem__ methods diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 507f6540872..04cdd8b6b74 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -1358,6 +1358,130 @@ def broadcast_apply(self, axis, func, other, preserve_labels=True, dtypes=None): new_frame, new_index, new_columns, None, None, dtypes=dtypes ) + def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): + """ + Computes indices to broadcast `self` with considering of `indices` + + Parameters + ---------- + axis : int, + axis to broadcast along + indices : dict, + Dict of indices and internal indices of partitions where `self` must + be broadcasted + broadcast_all : bool, + Whether broadcast the whole axis of `self` frame or just a subset of it + + Returns + ------- + Dictianary with indices of partitions to broadcast + """ + if broadcast_all: + + def get_len(part): + return part.width() if not axis else part.length() + + parts = self._partitions if not axis else self._partitions.T + return { + key: { + i: np.arange(get_len(parts[0][i])) for i in np.arange(len(parts[0])) + } + for key in indices.keys() + } + passed_len = 0 + result_dict = {} + for part_num, internal in indices.items(): + result_dict[part_num] = self._get_dict_of_block_index( + axis ^ 1, np.arange(passed_len, passed_len + len(internal)) + ) + 
passed_len += len(internal) + return result_dict + + def broadcast_apply_select_indices( + self, + axis, + func, + other, + apply_indices=None, + numeric_indices=None, + keep_remaining=False, + broadcast_all=True, + new_index=None, + new_columns=None, + ): + """ + Applyies `func` to select indices at specified axis and broadcasts + partitions of `other` frame. + + Parameters + ---------- + axis : int, + Axis to apply function along + func : callable, + Function to apply + other : BasePandasFrame, + Partitions of which should be broadcasted + apply_indices : list, + List of labels to apply (if `numeric_indices` are not specified) + numeric_indices : list, + Numeric indices to apply (if `apply_indices` are not specified) + keep_remaining : Whether or not to drop the data that is not computed over. + broadcast_all : Whether broadcast the whole axis of right frame to every + partition or just a subset of it. + new_index : Index, (optional) + The index of the result. We may know this in advance, + and if not provided it must be computed + new_columns : Index, (optional) + The columns of the result. We may know this in advance, + and if not provided it must be computed. + + Returns + ------- + BasePandasFrame + """ + assert ( + apply_indices is not None or numeric_indices is not None + ), "Indices to apply must be specified!" 
+ + if other is None: + if apply_indices is None: + apply_indices = self.axes[axis][numeric_indices] + return self._apply_select_indices( + axis=axis, + func=func, + apply_indices=apply_indices, + keep_remaining=keep_remaining, + new_index=new_index, + new_columns=new_columns, + ) + + if numeric_indices is None: + old_index = self.index if axis else self.columns + numeric_indices = old_index.get_indexer_for(apply_indices) + + dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) + broadcasted_dict = other._prepare_frame_to_broadcast( + axis, dict_indices, broadcast_all=broadcast_all + ) + new_partitions = self._frame_mgr_cls.broadcast_apply_select_indices( + axis, + func, + self._partitions, + other._partitions, + dict_indices, + broadcasted_dict, + keep_remaining, + ) + if new_index is None: + new_index = self._frame_mgr_cls.get_indices( + 0, new_partitions, lambda df: df.index + ) + if new_columns is None: + new_columns = self._frame_mgr_cls.get_indices( + 1, new_partitions, lambda df: df.columns + ) + return self.__constructor__(new_partitions, new_index, new_columns) + def _copartition(self, axis, other, how, sort, force_repartition=False): """ Copartition two dataframes. 
diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 0592e5e9ec8..451f0573a1f 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -96,6 +96,76 @@ def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): ) return cls.map_axis_partitions(axis, new_partitions, reduce_func) + @classmethod + def broadcast_apply_select_indices( + cls, + axis, + apply_func, + left, + right, + left_indices, + right_indices, + keep_remaining=False, + ): + """ + Broadcast the right partitions to left and apply a function to selected indices + + Note: Your internal function must take this kwargs: + [`internal_indices`, `other`, `internal_other_indices`] to work correctly + + Parameters + ---------- + axis : The axis to apply and broadcast over. + apply_func : The function to apply. + left : The left partitions. + right : The right partitions. + left_indices : indices to apply function. + right_indices : dictianary of indices of right partitions that + you want to bring at specified left partition, for example that dict + {key: {key1: [0, 1], key2: [5]}} means, that in left[key] you want to + broadcast [right[key1], right[key2]] partitions and internal indices + for `right` must be [[0, 1], [5]] + keep_remaining : Whether or not to keep the other partitions. + Some operations may want to drop the remaining partitions and + keep only the results. + + Returns + ------- + A new `np.array` of partition objects. 
+ """ + if not axis: + partitions_for_apply = left.T + right = right.T + else: + partitions_for_apply = left + right = right + + [obj.drain_call_queue() for row in right for obj in row] + + def get_partitions(index): + must_grab = right_indices[index] + partitions_list = np.array([right[i] for i in must_grab.keys()]) + indices_list = list(must_grab.values()) + return {"other": partitions_list, "internal_other_indices": indices_list} + + new_partitions = np.array( + [ + partitions_for_apply[i] + if i not in left_indices + else cls._apply_func_to_list_of_partitions_broadcast( + apply_func, + partitions_for_apply[i], + internal_indices=left_indices[i], + **get_partitions(i), + ) + for i in range(len(partitions_for_apply)) + if i in left_indices or keep_remaining + ] + ) + if not axis: + new_partitions = new_partitions.T + return new_partitions + @classmethod def broadcast_apply(cls, axis, apply_func, left, right): """Broadcast the right partitions to left and apply a function. @@ -371,6 +441,16 @@ def _compute_num_partitions(cls): return DEFAULT_NPARTITIONS + @classmethod + def _apply_func_to_list_of_partitions_broadcast( + cls, func, partitions, other, **kwargs + ): + preprocessed_func = cls.preprocess_func(func) + return [ + obj.apply(preprocessed_func, other=[o.get() for o in broadcasted], **kwargs) + for obj, broadcasted in zip(partitions, other.T) + ] + @classmethod def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): """Applies a function to a list of remote partitions. 
@@ -439,7 +519,7 @@ def apply_func_to_select_indices( if i_idx >= 0 }, ) - for o_idx, list_to_apply in indices + for o_idx, list_to_apply in indices.items() ] ) else: @@ -470,7 +550,7 @@ def apply_func_to_select_indices( partitions_for_apply[idx], internal_indices=list_to_apply, ) - for idx, list_to_apply in indices + for idx, list_to_apply in indices.items() ] ) else: @@ -526,11 +606,19 @@ def apply_func_to_select_indices_along_full_axis( # Since we might be keeping the remaining blocks that are not modified, # we have to also keep the block_partitions object in the correct # direction (transpose for columns). + if not keep_remaining: + selected_partitions = partitions.T if not axis else partitions + selected_partitions = np.array([selected_partitions[i] for i in indices]) + selected_partitions = ( + selected_partitions.T if not axis else selected_partitions + ) + else: + selected_partitions = partitions if not axis: - partitions_for_apply = cls.column_partitions(partitions) + partitions_for_apply = cls.column_partitions(selected_partitions) partitions_for_remaining = partitions.T else: - partitions_for_apply = cls.row_partitions(partitions) + partitions_for_apply = cls.row_partitions(selected_partitions) partitions_for_remaining = partitions # We may have a command to perform different functions on different # columns at the same time. 
We attempt to handle this as efficiently as @@ -540,11 +628,11 @@ def apply_func_to_select_indices_along_full_axis( if not keep_remaining: result = np.array( [ - partitions_for_apply[i].apply( + part.apply( preprocessed_func, func_dict={idx: dict_func[idx] for idx in indices[i]}, ) - for i in indices + for i, part in zip(indices, partitions_for_apply) ] ) else: @@ -565,10 +653,8 @@ def apply_func_to_select_indices_along_full_axis( # See notes in `apply_func_to_select_indices` result = np.array( [ - partitions_for_apply[i].apply( - preprocessed_func, internal_indices=indices[i] - ) - for i in indices + part.apply(preprocessed_func, internal_indices=indices[i]) + for i, part in zip(indices, partitions_for_apply) ] ) else: diff --git a/modin/error_message.py b/modin/error_message.py index 68e6b8dae0b..f4a1f0f04d9 100644 --- a/modin/error_message.py +++ b/modin/error_message.py @@ -17,6 +17,7 @@ class ErrorMessage(object): # Only print the request implementation one time. This only applies to Warnings. printed_request_implementation = False + warned_operations = set() @classmethod def not_implemented(cls, message=""): @@ -59,3 +60,11 @@ def non_verified_udf(cls): "User-defined function verification is still under development in Modin. " "The function provided is not verified." ) + + @classmethod + def missmatch_with_pandas(cls, operation, message): + if operation not in cls.warned_operations: + warnings.warn( + f"`{operation}` implementation has mismatches with pandas:\n{message}." 
+ ) + cls.warned_operations.add(operation) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 29c9daa9614..40c45a0d856 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1352,13 +1352,36 @@ def melt( value_name="value", col_level=None, ): - return self._default_to_pandas( - pandas.DataFrame.melt, - id_vars=id_vars, - value_vars=value_vars, - var_name=var_name, - value_name=value_name, - col_level=col_level, + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + + Returns + ------- + DataFrame + Unpivoted DataFrame. 
+ """ + return self.__constructor__( + query_compiler=self._query_compiler.melt( + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) ) def memory_usage(self, index=True, deep=False): diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 6484c908732..b1798eb553f 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2374,10 +2374,22 @@ def test_mask(self): except ValueError: pass - def test_melt(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).melt() + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + @pytest.mark.parametrize( + "id_vars", [lambda df: df.columns[0], lambda df: df.columns[:4], None] + ) + @pytest.mark.parametrize( + "value_vars", [lambda df: df.columns[-1], lambda df: df.columns[-4:], None] + ) + def test_melt(self, data, id_vars, value_vars): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.melt(*args, **kwargs) + .sort_values(["variable", "value"]) + .reset_index(drop=True), + id_vars=id_vars, + value_vars=value_vars, + ) def test_pct_change(self): data = test_data_values[0] From 07112233bca8f997dcefe9cac620617944fef003 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 12:58:51 -0700 Subject: [PATCH 015/120] FIX-#1610: Fix support for `loc` with MultiIndex parameter (#1789) Signed-off-by: Devin Petersohn --- modin/pandas/indexing.py | 10 ++++++++-- modin/pandas/test/test_dataframe.py | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index f076e3cddc6..0b3c3d872e0 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -339,7 +339,10 @@ def _compute_lookup(self, row_loc, col_loc): self.qc.index.to_series().loc[row_loc] ) elif isinstance(self.qc.index, pandas.MultiIndex): - row_lookup = 
self.qc.index.get_locs(row_loc) + if isinstance(row_loc, pandas.MultiIndex): + row_lookup = self.qc.index.get_indexer_for(row_loc) + else: + row_lookup = self.qc.index.get_locs(row_loc) elif is_boolean_array(row_loc): # If passed in a list of booleans, we return the index of the true values row_lookup = [i for i, row_val in enumerate(row_loc) if row_val] @@ -350,7 +353,10 @@ def _compute_lookup(self, row_loc, col_loc): self.qc.columns.to_series().loc[col_loc] ) elif isinstance(self.qc.columns, pandas.MultiIndex): - col_lookup = self.qc.columns.get_locs(col_loc) + if isinstance(col_loc, pandas.MultiIndex): + col_lookup = self.qc.columns.get_indexer_for(col_loc) + else: + col_lookup = self.qc.columns.get_locs(col_loc) elif is_boolean_array(col_loc): # If passed in a list of booleans, we return the index of the true values col_lookup = [i for i, col_val in enumerate(col_loc) if col_val] diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index b1798eb553f..7ef6cfe8a6f 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -4584,6 +4584,10 @@ def test_loc_multi_index(self): transposed_pandas.loc[transposed_pandas.index[:-2], :], ) + # From issue #1610 + df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index]) + df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]]) + def test_loc_assignment(self): modin_df = pd.DataFrame( index=["row1", "row2", "row3"], columns=["col1", "col2"] From 3ef84f7f8f94c2c43c3872e0f8729fdf258affa6 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 15:49:34 -0700 Subject: [PATCH 016/120] FIX-#1700: Fix metadata for concat and mask when `axis=1` (#1797) Signed-off-by: Devin Petersohn --- modin/backends/pandas/query_compiler.py | 11 +++++++---- modin/engines/base/frame/data.py | 2 +- modin/pandas/test/test_concat.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git 
a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index c86ea870d54..62bffcc3a98 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -252,11 +252,14 @@ def concat(self, axis, other, **kwargs): ignore_index = kwargs.get("ignore_index", False) other_modin_frame = [o._modin_frame for o in other] new_modin_frame = self._modin_frame._concat(axis, other_modin_frame, join, sort) + result = self.__constructor__(new_modin_frame) if ignore_index: - new_modin_frame.index = pandas.RangeIndex( - len(self.index) + sum(len(o.index) for o in other) - ) - return self.__constructor__(new_modin_frame) + if axis == 0: + return result.reset_index(drop=True) + else: + result.columns = pandas.RangeIndex(len(result.columns)) + return result + return result # END Append/Concat/Join diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 04cdd8b6b74..b839bda5184 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -489,7 +489,7 @@ def mask( new_col_widths = [len(idx) for _, idx in col_partitions_list.items()] new_columns = self.columns[sorted(col_numeric_idx)] if self._dtypes is not None: - new_dtypes = self.dtypes[sorted(col_numeric_idx)] + new_dtypes = self.dtypes.iloc[sorted(col_numeric_idx)] else: new_dtypes = None else: diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 5e67427110a..941f45b83fe 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -83,6 +83,17 @@ def test_concat_on_column(): pandas.concat([df, df2], axis="columns"), ) + modin_result = pd.concat( + [pd.Series(np.ones(10)), pd.Series(np.ones(10))], axis=1, ignore_index=True + ) + pandas_result = pandas.concat( + [pandas.Series(np.ones(10)), pandas.Series(np.ones(10))], + axis=1, + ignore_index=True, + ) + df_equals(modin_result, pandas_result) + assert modin_result.dtypes.equals(pandas_result.dtypes) + 
def test_invalid_axis_errors(): df, df2 = generate_dfs() From 1604c5f4f2ffeb30f7af826a140d49741c8658ea Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 15:56:24 -0700 Subject: [PATCH 017/120] FIX-#1774: Fix unlimited column printing for smaller dataframes (#1799) Signed-off-by: Devin Petersohn --- modin/pandas/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 40c45a0d856..9c88e0d2d88 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -145,6 +145,7 @@ def __repr__(self): "display.expand_frame_repr" ): width, _ = console.get_console_size() + width = min(width, len(self.columns)) col_counter = 0 i = 0 while col_counter < width: From a8fc4f3f815e5d4fa1da88ffed839c484c58c8d3 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 15:59:56 -0700 Subject: [PATCH 018/120] FIX-#1705: Fix visual bug with repr on smaller dataframes (#1798) Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 58 ++++++++++++++++------------- modin/pandas/test/test_dataframe.py | 12 ++++++ 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 871305f9c7e..29b6589b689 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -64,36 +64,42 @@ def _build_repr_df(self, num_rows, num_cols): index=self.index, columns=self.columns if hasattr(self, "columns") else None, ) - # Add one here so that pandas automatically adds the dots - # It turns out to be faster to extract 2 extra rows and columns than to - # build the dots ourselves. 
- num_rows_for_head = num_rows // 2 + 1 - num_rows_for_tail = ( - num_rows_for_head - if len(self.index) > num_rows - else len(self.index) - num_rows_for_head - if len(self.index) - num_rows_for_head >= 0 - else None - ) - row_indexer = list(range(len(self.index))[:num_rows_for_head]) + ( - list(range(len(self.index))[-num_rows_for_tail:]) - if num_rows_for_tail is not None - else [] - ) - if hasattr(self, "columns"): - num_cols_for_front = num_cols // 2 + 1 - num_cols_for_back = ( - num_cols_for_front - if len(self.columns) > num_cols - else len(self.columns) - num_cols_for_front - if len(self.columns) - num_cols_for_front >= 0 + if len(self.index) <= num_rows: + row_indexer = slice(None) + else: + # Add one here so that pandas automatically adds the dots + # It turns out to be faster to extract 2 extra rows and columns than to + # build the dots ourselves. + num_rows_for_head = num_rows // 2 + 1 + num_rows_for_tail = ( + num_rows_for_head + if len(self.index) > num_rows + else len(self.index) - num_rows_for_head + if len(self.index) - num_rows_for_head >= 0 else None ) - col_indexer = list(range(len(self.columns))[:num_cols_for_front]) + ( - list(range(len(self.columns))[-num_cols_for_back:]) - if num_cols_for_back is not None + row_indexer = list(range(len(self.index))[:num_rows_for_head]) + ( + list(range(len(self.index))[-num_rows_for_tail:]) + if num_rows_for_tail is not None else [] ) + if hasattr(self, "columns"): + if len(self.columns) <= num_cols: + col_indexer = slice(None) + else: + num_cols_for_front = num_cols // 2 + 1 + num_cols_for_back = ( + num_cols_for_front + if len(self.columns) > num_cols + else len(self.columns) - num_cols_for_front + if len(self.columns) - num_cols_for_front >= 0 + else None + ) + col_indexer = list(range(len(self.columns))[:num_cols_for_front]) + ( + list(range(len(self.columns))[-num_cols_for_back:]) + if num_cols_for_back is not None + else [] + ) indexer = row_indexer, col_indexer else: indexer = row_indexer diff --git 
a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 7ef6cfe8a6f..f99b31edc6c 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5597,6 +5597,18 @@ def test___repr__(self): assert repr(pandas_df) == repr(modin_df) + # From Issue #1705 + string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf" +"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6 +"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 +"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" + pandas_df = pandas.read_csv(io.StringIO(string_data)) + modin_df = pd.read_csv(io.StringIO(string_data)) + assert repr(pandas_df) == repr(modin_df) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_reset_index_with_multi_index(self, data): modin_df = pd.DataFrame(data) From 184d1c53a2e1a5a7eb1e279b72fd90a1d05f7bd8 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 16:01:10 -0700 Subject: [PATCH 019/120] FIX-#1467: Fix support for cummax and cummin across int and float (#1800) Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 8 ++++---- modin/pandas/test/test_dataframe.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 29b6589b689..5522410792a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -766,8 +766,8 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): The 
cumulative maximum of the DataFrame. """ axis = self._get_axis_number(axis) if axis is not None else 0 - if axis: - self._validate_dtypes() + if axis == 1: + self._validate_dtypes(numeric_only=True) return self.__constructor__( query_compiler=self._query_compiler.cummax( axis=axis, skipna=skipna, **kwargs @@ -785,8 +785,8 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): The cumulative minimum of the DataFrame. """ axis = self._get_axis_number(axis) if axis is not None else 0 - if axis: - self._validate_dtypes() + if axis == 1: + self._validate_dtypes(numeric_only=True) return self.__constructor__( query_compiler=self._query_compiler.cummin( axis=axis, skipna=skipna, **kwargs diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index f99b31edc6c..226428055f9 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -3564,6 +3564,13 @@ def test_cummax(self, request, data, axis, skipna): modin_result = modin_df.T.cummax(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + def test_cummax_int_and_float(self, axis): + data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -3591,6 +3598,13 @@ def test_cummin(self, request, data, axis, skipna): modin_result = modin_df.T.cummin(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + def test_cummin_int_and_float(self, axis): + data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + 
df_equals(modin_df.cummin(axis=axis), pandas_df.cummin(axis=axis)) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( From a47c9286d15cccaac044f7aa802b1ab120c2af6c Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 16:04:31 -0700 Subject: [PATCH 020/120] FIX-#1631: Fix support for dictionary in `pd.concat` (#1795) Signed-off-by: Devin Petersohn --- modin/pandas/concat.py | 50 ++++++++++++++++++++------------ modin/pandas/test/test_concat.py | 11 +++++++ stress_tests/kaggle/kaggle12.py | 2 +- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py index 38694799663..f7988328960 100644 --- a/modin/pandas/concat.py +++ b/modin/pandas/concat.py @@ -44,18 +44,21 @@ def concat( '"{name}"'.format(name=type(objs).__name__) ) axis = pandas.DataFrame()._get_axis_number(axis) - objs = list(objs) - if len(objs) == 0: + if isinstance(objs, dict): + list_of_objs = list(objs.values()) + else: + list_of_objs = list(objs) + if len(list_of_objs) == 0: raise ValueError("No objects to concatenate") - objs = [obj for obj in objs if obj is not None] + list_of_objs = [obj for obj in list_of_objs if obj is not None] - if len(objs) == 0: + if len(list_of_objs) == 0: raise ValueError("All objects passed were None") try: type_check = next( obj - for obj in objs + for obj in list_of_objs if not isinstance(obj, (pandas.Series, Series, pandas.DataFrame, DataFrame)) ) except StopIteration: @@ -63,17 +66,17 @@ def concat( if type_check is not None: raise ValueError( 'cannot concatenate object of type "{0}"; only ' - "pandas.Series, pandas.DataFrame, " + "modin.pandas.Series " "and modin.pandas.DataFrame objs are " "valid", type(type_check), ) - all_series = all(isinstance(obj, Series) for obj in objs) + all_series = all(isinstance(obj, Series) for obj in list_of_objs) if all_series and axis == 0: return Series( - 
query_compiler=objs[0]._query_compiler.concat( + query_compiler=list_of_objs[0]._query_compiler.concat( axis, - [o._query_compiler for o in objs[1:]], + [o._query_compiler for o in list_of_objs[1:]], join=join, join_axes=None, ignore_index=ignore_index, @@ -85,8 +88,6 @@ def concat( sort=sort, ) ) - if isinstance(objs, dict): - raise NotImplementedError("Obj as dicts not implemented.") if join not in ["inner", "outer"]: raise ValueError( "Only can inner (intersect) or outer (union) join the other axis" @@ -94,22 +95,29 @@ def concat( # We have the weird Series and axis check because, when concatenating a # dataframe to a series on axis=0, pandas ignores the name of the series, # and this check aims to mirror that (possibly buggy) functionality - objs = [ + list_of_objs = [ obj if isinstance(obj, DataFrame) else DataFrame(obj.rename()) if isinstance(obj, (pandas.Series, Series)) and axis == 0 else DataFrame(obj) - for obj in objs + for obj in list_of_objs + ] + list_of_objs = [ + obj._query_compiler + for obj in list_of_objs + if len(obj.index) or len(obj.columns) ] - objs = [obj._query_compiler for obj in objs if len(obj.index) or len(obj.columns)] if keys is not None: if all_series: new_idx = keys else: - objs = [objs[i] for i in range(min(len(objs), len(keys)))] + list_of_objs = [ + list_of_objs[i] for i in range(min(len(list_of_objs), len(keys))) + ] new_idx_labels = { - k: v.index if axis == 0 else v.columns for k, v in zip(keys, objs) + k: v.index if axis == 0 else v.columns + for k, v in zip(keys, list_of_objs) } tuples = [ (k, *o) if isinstance(o, tuple) else (k, o) @@ -120,14 +128,18 @@ def concat( if names is not None: new_idx.names = names else: - old_name = _determine_name(objs, axis) + old_name = _determine_name(list_of_objs, axis) if old_name is not None: new_idx.names = [None] + old_name + elif isinstance(objs, dict): + new_idx = pandas.concat( + {k: pandas.Series(index=obj.axes[axis]) for k, obj in objs.items()} + ).index else: new_idx = None - 
new_query_compiler = objs[0].concat( + new_query_compiler = list_of_objs[0].concat( axis, - objs[1:], + list_of_objs[1:], join=join, join_axes=None, ignore_index=ignore_index, diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 941f45b83fe..9d975b0bd29 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -190,3 +190,14 @@ def test_concat_multiindex(axis, names): pd.concat([md_df1, md_df2], keys=keys, axis=axis, names=names), pandas.concat([pd_df1, pd_df2], keys=keys, axis=axis, names=names), ) + + +@pytest.mark.parametrize("axis", [0, 1]) +def test_concat_dictionary(axis): + pandas_df, pandas_df2 = generate_dfs() + modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2) + + df_equals( + pd.concat({"A": modin_df, "B": modin_df2}, axis=axis), + pandas.concat({"A": pandas_df, "B": pandas_df2}, axis=axis), + ) diff --git a/stress_tests/kaggle/kaggle12.py b/stress_tests/kaggle/kaggle12.py index 250cf3b192f..69b09953582 100755 --- a/stress_tests/kaggle/kaggle12.py +++ b/stress_tests/kaggle/kaggle12.py @@ -52,7 +52,7 @@ def detect_outliers(df, n, features): train.loc[Outliers_to_drop] # Show the outliers rows train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True) train_len = len(train) -dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True) +dataset = pd.concat(list_of_objs=[train, test], axis=0).reset_index(drop=True) dataset = dataset.fillna(np.nan) dataset.isnull().sum() train.info() From d8a99727658f251148742d87ca50c6881aff5b81 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 24 Jul 2020 16:05:36 -0700 Subject: [PATCH 021/120] TEST-#1119: Re-enable skipped Windows tests for `DataFrame.prod` (#1801) Signed-off-by: Devin Petersohn --- modin/pandas/test/test_dataframe.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 226428055f9..1ca1ea48d92 100644 --- 
a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -15,7 +15,6 @@ import numpy as np import pandas import pandas.util.testing as tm -import os import matplotlib import modin.pandas as pd from modin.pandas.utils import to_pandas @@ -3386,10 +3385,6 @@ def test_min(self, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) - @pytest.mark.skipif( - os.name == "nt", - reason="Windows has a memory issue for large numbers on this test", - ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -3441,10 +3436,6 @@ def test_prod(self, request, data, axis, skipna, numeric_only, min_count): ) df_equals(modin_result, pandas_result) - @pytest.mark.skipif( - os.name == "nt", - reason="Windows has a memory issue for large numbers on this test", - ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( From aaa79ed38645dd5d0f84c59e9cf74f67605a91ee Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sat, 25 Jul 2020 17:13:17 -0700 Subject: [PATCH 022/120] Revert "TEST-#1119: Re-enable skipped Windows tests for `DataFrame.prod`" (#1813) This reverts commit d8a99727658f251148742d87ca50c6881aff5b81. 
--- modin/pandas/test/test_dataframe.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 1ca1ea48d92..226428055f9 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -15,6 +15,7 @@ import numpy as np import pandas import pandas.util.testing as tm +import os import matplotlib import modin.pandas as pd from modin.pandas.utils import to_pandas @@ -3385,6 +3386,10 @@ def test_min(self, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + @pytest.mark.skipif( + os.name == "nt", + reason="Windows has a memory issue for large numbers on this test", + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -3436,6 +3441,10 @@ def test_prod(self, request, data, axis, skipna, numeric_only, min_count): ) df_equals(modin_result, pandas_result) + @pytest.mark.skipif( + os.name == "nt", + reason="Windows has a memory issue for large numbers on this test", + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( From dcff8da6d51af18463aef25d3b84a63c59cc784e Mon Sep 17 00:00:00 2001 From: Yutaro Ikeda Date: Sun, 26 Jul 2020 09:15:37 +0900 Subject: [PATCH 023/120] DOCS-#1809 missing links in the architecture page (#1810) --- docs/architecture.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/architecture.rst b/docs/architecture.rst index 1fd727eb62a..f0a4e42c668 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -202,8 +202,8 @@ documentation page on Contributing_. .. _Ray: https://github.com/ray-project/ray .. _code: https://github.com/modin-project/modin/blob/master/modin/engines/base/frame/data.py .. _Contributing: contributing.html -.. 
_Pandas on Ray: UsingPandasonRay/optimizations.html -.. _Pandas on Dask: UsingPandasonDask/optimizations.html +.. _Pandas on Ray: UsingPandasonRay/index.html +.. _Pandas on Dask: UsingPandasonDask/index.html .. _Dask Futures: https://docs.dask.org/en/latest/futures.html .. _issue: https://github.com/modin-project/modin/issues .. _Discourse: https://discuss.modin.org From 53f38a4518996455b3cddaf31fd8fbf833aeee13 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Sun, 26 Jul 2020 21:13:25 +0300 Subject: [PATCH 024/120] TEST-#1779: Limit object store to 1GB during CI tests (#1744) Signed-off-by: Vasilij Litvinov --- .github/workflows/ci.yml | 2 ++ .github/workflows/push.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cdc38768e72..db2ad58ebfe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -95,6 +95,7 @@ jobs: part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} + MODIN_MEMORY: 1000000000 name: test (${{matrix.engine}}, part ${{matrix.part}}, python ${{matrix.python-version}}) steps: - uses: actions/checkout@v1 @@ -137,6 +138,7 @@ jobs: part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} + MODIN_MEMORY: 1000000000 name: test-windows steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8c8f38e2f57..ef84f588d89 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -10,6 +10,7 @@ jobs: part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} + MODIN_MEMORY: 1000000000 name: test (${{matrix.engine}}, part ${{matrix.part}}, python 
${{matrix.python-version}}) steps: - uses: actions/checkout@v1 @@ -51,6 +52,7 @@ jobs: part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} + MODIN_MEMORY: 1000000000 name: test-windows steps: - uses: actions/checkout@v1 From a08c0c8dadba13a4f5f3bf48df0aab39a21a9506 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Mon, 27 Jul 2020 05:36:30 +0300 Subject: [PATCH 025/120] FIX-#1803: Change the order of execution engine change callbacks (#1805) Signed-off-by: Vasilij Litvinov --- modin/__init__.py | 8 ++-- modin/pandas/__init__.py | 98 ++++++++++++++++++++-------------------- 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/modin/__init__.py b/modin/__init__.py index 1ea9e11340f..664ddadeba0 100644 --- a/modin/__init__.py +++ b/modin/__init__.py @@ -85,11 +85,11 @@ class Publisher(object): def __init__(self, name, value): self.name = name self.__value = value.title() - self.__subs = set() - self.__once = collections.defaultdict(set) + self.__subs = [] + self.__once = collections.defaultdict(list) def subscribe(self, callback): - self.__subs.add(callback) + self.__subs.append(callback) callback(self) def once(self, onvalue, callback): @@ -97,7 +97,7 @@ def once(self, onvalue, callback): if onvalue == self.__value: callback(self) else: - self.__once[onvalue].add(callback) + self.__once[onvalue].append(callback) def get(self): return self.__value diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index eaa8455bebc..ce4ce73f20a 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -87,6 +87,55 @@ import os import multiprocessing +from .. 
import execution_engine, Publisher + +DEFAULT_NPARTITIONS = 4 +num_cpus = 1 + + +_is_first_update = {} +dask_client = None + + +def _update_engine(publisher: Publisher): + global DEFAULT_NPARTITIONS, dask_client, num_cpus + + if publisher.get() == "Ray": + import ray + from modin.engines.ray.utils import initialize_ray + + if _is_first_update.get("Ray", True): + initialize_ray() + num_cpus = ray.cluster_resources()["CPU"] + elif publisher.get() == "Dask": # pragma: no cover + from distributed.client import get_client + + if threading.current_thread().name == "MainThread" and _is_first_update.get( + "Dask", True + ): + import warnings + + warnings.warn("The Dask Engine for Modin is experimental.") + + try: + dask_client = get_client() + except ValueError: + from distributed import Client + + num_cpus = ( + os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() + ) + dask_client = Client(n_workers=int(num_cpus)) + + elif publisher.get() != "Python": + raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) + + _is_first_update[publisher.get()] = False + DEFAULT_NPARTITIONS = max(4, int(num_cpus)) + + +execution_engine.subscribe(_update_engine) + from .. import __version__ from .concat import concat from .dataframe import DataFrame @@ -133,58 +182,9 @@ value_counts, ) from .plotting import Plotting as plotting -from .. 
import execution_engine, Publisher # Set this so that Pandas doesn't try to multithread by itself os.environ["OMP_NUM_THREADS"] = "1" -num_cpus = 1 - - -DEFAULT_NPARTITIONS = 4 -_is_first_update = {} -dask_client = None - - -def _update_engine(publisher: Publisher): - global DEFAULT_NPARTITIONS, dask_client - - num_cpus = DEFAULT_NPARTITIONS - if publisher.get() == "Ray": - import ray - from modin.engines.ray.utils import initialize_ray - - if _is_first_update.get("Ray", True): - initialize_ray() - num_cpus = ray.cluster_resources()["CPU"] - elif publisher.get() == "Dask": # pragma: no cover - from distributed.client import get_client - - if threading.current_thread().name == "MainThread" and _is_first_update.get( - "Dask", True - ): - import warnings - - warnings.warn("The Dask Engine for Modin is experimental.") - - try: - dask_client = get_client() - except ValueError: - from distributed import Client - - num_cpus = ( - os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() - ) - dask_client = Client(n_workers=int(num_cpus)) - - elif publisher.get() != "Python": - raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) - - _is_first_update[publisher.get()] = False - DEFAULT_NPARTITIONS = max(4, int(num_cpus)) - - -execution_engine.subscribe(_update_engine) - __all__ = [ "DataFrame", "Series", From cac75c0d963ca24968f7b11cdb99b2f1bfa48bd3 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Mon, 27 Jul 2020 05:37:24 +0300 Subject: [PATCH 026/120] FIX-#1807: Fix passing zone in rayscale, add ability to override image (#1808) Signed-off-by: Vasilij Litvinov --- modin/experimental/cloud/cluster.py | 18 +++++++++++++++--- modin/experimental/cloud/rayscale.py | 7 ++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/modin/experimental/cloud/cluster.py b/modin/experimental/cloud/cluster.py index dc902897d25..a2cb0633388 100644 --- a/modin/experimental/cloud/cluster.py +++ 
b/modin/experimental/cloud/cluster.py @@ -34,6 +34,7 @@ class Provider: __KNOWN = {AWS: [_RegionZone(region="us-west-1", zone="us-west-1a")]} __DEFAULT_HEAD = {AWS: "m5.large"} __DEFAULT_WORKER = {AWS: "m5.large"} + __DEFAULT_IMAGE = {AWS: "ami-0f56279347d2fa43e"} def __init__( self, @@ -41,12 +42,14 @@ def __init__( credentials_file: str = None, region: str = None, zone: str = None, + image: str = None, ): """ Class that holds all information about particular connection to cluster provider, namely * provider name (must be one of known ones) * path to file with credentials (file format is provider-specific); omit to use global provider-default credentials * region and zone where cluster is to be spawned (optional, would be deduced if omitted) + * image to use (optional, would use default for provider if omitted) """ if name not in self.__KNOWN: @@ -77,6 +80,7 @@ def __init__( self.credentials_file = ( os.path.abspath(credentials_file) if credentials_file is not None else None ) + self.image = image or self.__DEFAULT_IMAGE[name] @property def default_head_type(self): @@ -197,6 +201,7 @@ def create( credentials: str = None, region: str = None, zone: str = None, + image: str = None, project_name: str = None, cluster_name: str = "modin-cluster", workers: int = 4, @@ -222,6 +227,9 @@ def create( If omitted a default for given provider will be taken. zone : str, optional Availability zone (part of region) where to spawn the cluster. + If omitted a default for given provider and region will be taken. + image: str, optional + Image to use for spawning head and worker nodes. If omitted a default for given provider will be taken. project_name : str, optional Project name to assign to the cluster in cloud, for easier manual tracking. 
@@ -247,12 +255,16 @@ def create( """ if not isinstance(provider, Provider): provider = Provider( - name=provider, credentials_file=credentials, region=region, zone=zone + name=provider, + credentials_file=credentials, + region=region, + zone=zone, + image=image, ) else: - if any(p is not None for p in (credentials, region, zone)): + if any(p is not None for p in (credentials, region, zone, image)): warnings.warn( - "Ignoring credentials, region and zone parameters because provider is specified as Provider descriptor, not as name", + "Ignoring credentials, region, zone and image parameters because provider is specified as Provider descriptor, not as name", UserWarning, ) if __spawner__ == "rayscale": diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index a4f8e37ff0c..ee7b5445d64 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -59,6 +59,7 @@ class RayCluster(BaseCluster): os.path.abspath(os.path.dirname(__file__)), "ray-autoscaler.yml" ) __instance_key = {Provider.AWS: "InstanceType"} + __image_key = {Provider.AWS: "ImageId"} __credentials_env = {Provider.AWS: "AWS_SHARED_CREDENTIALS_FILE"} def __init__(self, *a, **kw): @@ -109,7 +110,7 @@ def __make_config(self): if self.provider.region: config["provider"]["region"] = self.provider.region if self.provider.zone: - config["provider"]["zone"] = self.provider.zone + config["provider"]["availability_zone"] = self.provider.zone # connection details config["auth"]["ssh_user"] = "ubuntu" @@ -120,10 +121,14 @@ def __make_config(self): # instance types try: instance_key = self.__instance_key[self.provider.name] + image_key = self.__image_key[self.provider.name] except KeyError: raise ValueError(f"Unsupported provider: {self.provider.name}") + config["head_node"][instance_key] = self.head_node_type + config["head_node"][image_key] = self.provider.image config["worker_nodes"][instance_key] = self.worker_node_type + 
config["worker_nodes"][image_key] = self.provider.image return _bootstrap_config(config) From 4213f2ef11e6fc1562495ca9e5ad4417a8c10c66 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Tue, 28 Jul 2020 01:15:31 +0300 Subject: [PATCH 027/120] FIX-#1457: Series.reset_index considering 'name' fix (#1820) Signed-off-by: Dmitry Chigarev --- modin/pandas/series.py | 5 +---- modin/pandas/test/test_series.py | 25 +++++++++++-------------- modin/pandas/test/utils.py | 12 ++++++++---- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index fae48fc14cb..23b8350b948 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1235,7 +1235,6 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): else: result = self.copy() result.index = new_idx - result.name = name or self.name return result elif not drop and inplace: raise TypeError( @@ -1247,9 +1246,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): obj.name = name from .dataframe import DataFrame - return DataFrame(self.copy()).reset_index( - level=level, drop=drop, inplace=inplace - ) + return DataFrame(obj).reset_index(level=level, drop=drop, inplace=inplace) def rdivmod(self, other, level=None, fill_value=None, axis=0): return self._default_to_pandas( diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 768fad5d614..9682be5dae8 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2409,20 +2409,17 @@ def test_resample(closed, label, level): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("drop", [True, False], ids=["True", "False"]) -def test_reset_index(data, drop): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.reset_index(drop=drop), pandas_series.reset_index(drop=drop)) - - modin_series_cp = 
modin_series.copy() - pandas_series_cp = pandas_series.copy() - try: - pandas_result = pandas_series_cp.reset_index(drop=drop, inplace=True) - except Exception as e: - with pytest.raises(type(e)): - modin_series_cp.reset_index(drop=drop, inplace=True) - else: - modin_result = modin_series_cp.reset_index(drop=drop, inplace=True) - df_equals(pandas_result, modin_result) +@pytest.mark.parametrize("name", [None, "Custom name"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_reset_index(data, drop, name, inplace): + eval_general( + *create_test_series(data), + lambda df, *args, **kwargs: df.reset_index(*args, **kwargs), + drop=drop, + name=name, + inplace=inplace, + __inplace__=inplace, + ) @pytest.mark.skip(reason="Using pandas Series.") diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 3ae06c5ff69..bd7dcd0596b 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -557,10 +557,12 @@ def check_df_columns_have_nans(df, cols): ) -def eval_general(modin_df, pandas_df, operation, comparator=df_equals, **kwargs): +def eval_general( + modin_df, pandas_df, operation, comparator=df_equals, __inplace__=False, **kwargs +): md_kwargs, pd_kwargs = {}, {} - def execute_callable(fn, md_kwargs={}, pd_kwargs={}): + def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}): try: pd_result = fn(pandas_df, **pd_kwargs) except Exception as e: @@ -569,7 +571,7 @@ def execute_callable(fn, md_kwargs={}, pd_kwargs={}): repr(fn(modin_df, **md_kwargs)) else: md_result = fn(modin_df, **md_kwargs) - return md_result, pd_result + return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df) for key, value in kwargs.items(): if callable(value): @@ -585,7 +587,9 @@ def execute_callable(fn, md_kwargs={}, pd_kwargs={}): md_kwargs[key] = md_value pd_kwargs[key] = pd_value - values = execute_callable(operation, md_kwargs=md_kwargs, pd_kwargs=pd_kwargs) + values = execute_callable( + operation, md_kwargs=md_kwargs, 
pd_kwargs=pd_kwargs, inplace=__inplace__ + ) if values is not None: comparator(*values) From a95ddf9e7a5f2578b7f741edb52222f4667a26d7 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Tue, 28 Jul 2020 03:29:52 +0300 Subject: [PATCH 028/120] FIX-#1459: 'to_pandas' of nested objects added (#1828) Signed-off-by: Dmitry Chigarev --- modin/pandas/base.py | 9 ++++---- modin/pandas/groupby.py | 23 +++++++++++++-------- modin/pandas/test/test_groupby.py | 34 +++++++++++++++++++++++++++++-- modin/pandas/utils.py | 34 +++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 15 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 5522410792a..5fde31165be 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -32,6 +32,7 @@ import pickle as pkl from modin.error_message import ErrorMessage +from modin.pandas.utils import try_cast_to_pandas # Similar to pandas, sentinel value to use as kwarg in place of None when None has # special meaning and needs to be distinguished from a user explicitly passing None. 
@@ -241,11 +242,9 @@ def _default_to_pandas(self, op, *args, **kwargs): empty_self_str, ) ) - args = (a._to_pandas() if hasattr(a, "_to_pandas") else a for a in args) - kwargs = { - k: v._to_pandas() if hasattr(v, "_to_pandas") else v - for k, v in kwargs.items() - } + + args = try_cast_to_pandas(args) + kwargs = try_cast_to_pandas(kwargs) pandas_obj = self._to_pandas() if callable(op): result = op(pandas_obj, *args, **kwargs) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index cc9ba04d280..a18c9e30589 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -346,7 +346,9 @@ def aggregate(self, func=None, *args, **kwargs): if func is None or is_list_like(func): return self._default_to_pandas( - lambda df: df.aggregate(func, *args, **kwargs) + lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), + *args, + **kwargs, ) if isinstance(func, str): @@ -355,7 +357,10 @@ def aggregate(self, func=None, *args, **kwargs): return agg_func(*args, **kwargs) return self._apply_agg_function( - lambda df: df.aggregate(func, *args, **kwargs), drop=self._as_index + lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), + drop=self._as_index, + *args, + **kwargs, ) agg = aggregate @@ -622,7 +627,7 @@ def _wrap_aggregation( return result.squeeze() return result - def _apply_agg_function(self, f, drop=True, **kwargs): + def _apply_agg_function(self, f, drop=True, *args, **kwargs): """Perform aggregation and combine stages based on a given function. 
Args: @@ -634,7 +639,7 @@ def _apply_agg_function(self, f, drop=True, **kwargs): assert callable(f), "'{0}' object is not callable".format(type(f)) if self._is_multi_by: - return self._default_to_pandas(f, **kwargs) + return self._default_to_pandas(f, *args, **kwargs) if isinstance(self._by, type(self._query_compiler)): by = self._by.to_pandas().squeeze() @@ -658,7 +663,7 @@ def _apply_agg_function(self, f, drop=True, **kwargs): return result.squeeze() return result - def _default_to_pandas(self, f, **kwargs): + def _default_to_pandas(self, f, *args, **kwargs): """Defailts the execution of this function to pandas. Args: @@ -677,10 +682,12 @@ def _default_to_pandas(self, f, **kwargs): else: by = self._by - def groupby_on_multiple_columns(df): - return f(df.groupby(by=by, axis=self._axis, **self._kwargs), **kwargs) + def groupby_on_multiple_columns(df, *args, **kwargs): + return f( + df.groupby(by=by, axis=self._axis, **self._kwargs), *args, **kwargs + ) - return self._df._default_to_pandas(groupby_on_multiple_columns) + return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs) class SeriesGroupBy(DataFrameGroupBy): diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index fd8bf474db2..5dbe30baa19 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -44,13 +44,17 @@ def modin_groupby_equals_pandas(modin_groupby, pandas_groupby): df_equals(g1[1], g2[1]) -def eval_aggregation(md_df, pd_df, operation, by=None, *args, **kwargs): +def eval_aggregation(md_df, pd_df, operation=None, by=None, *args, **kwargs): if by is None: by = md_df.columns[0] + if operation is None: + operation = {} return eval_general( md_df, pd_df, - operation=lambda df: df.groupby(by=by).agg(operation), + operation=lambda df, *args, **kwargs: df.groupby(by=by).agg( + operation, *args, **kwargs + ), *args, **kwargs, ) @@ -1104,3 +1108,29 @@ def test_agg_exceptions(operation): data = {**data1, **data2} 
eval_aggregation(*create_test_dfs(data), operation=operation) + + +@pytest.mark.parametrize( + "kwargs", + [ + { + "Max": ("cnt", np.max), + "Sum": ("cnt", np.sum), + "Num": ("c", pd.Series.nunique), + "Num1": ("c", pandas.Series.nunique), + }, + { + "func": { + "Max": ("cnt", np.max), + "Sum": ("cnt", np.sum), + "Num": ("c", pd.Series.nunique), + "Num1": ("c", pandas.Series.nunique), + } + }, + ], +) +def test_to_pandas_convertion(kwargs): + data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} + by = ["a", "b"] + + eval_aggregation(*create_test_dfs(data), by=by, **kwargs) diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index cc3b39eadd4..8ed2e60d5cd 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import pandas + def from_non_pandas(df, index, columns, dtype): from modin.data_management.dispatcher import EngineDispatcher @@ -82,3 +84,35 @@ def decorator(cls): return cls return decorator + + +def try_cast_to_pandas(obj): + """ + Converts obj and all nested objects from modin to pandas if it is possible, + otherwise returns obj + + Parameters + ---------- + obj : object, + object to convert from modin to pandas + + Returns + ------- + Converted object + """ + if hasattr(obj, "_to_pandas"): + return obj._to_pandas() + if isinstance(obj, (list, tuple)): + return type(obj)([try_cast_to_pandas(o) for o in obj]) + if isinstance(obj, dict): + return {k: try_cast_to_pandas(v) for k, v in obj.items()} + if callable(obj): + module_hierarchy = getattr(obj, "__module__", "").split(".") + fn_name = getattr(obj, "__name__", None) + if fn_name and module_hierarchy[0] == "modin": + return ( + getattr(pandas.DataFrame, fn_name, obj) + if module_hierarchy[-1] == "dataframe" + else getattr(pandas.Series, fn_name, obj) + ) + return obj From f43a70ad5386af42dfc88fc5989120ec92818c29 Mon Sep 17 00:00:00 2001 
From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 28 Jul 2020 04:24:44 +0300 Subject: [PATCH 029/120] FIX-1708: Don't sort indexes in Series functions with level parameter (#1830) Signed-off-by: Anatoly Myachev --- modin/pandas/base.py | 2 +- modin/pandas/test/test_series.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 5fde31165be..ea348eb04c3 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -125,7 +125,7 @@ def _handle_level_agg(self, axis, level, op, **kwargs): level: The level of the axis to apply the operation on op: String representation of the operation to be performed on the level """ - return getattr(self.groupby(level=level, axis=axis), op)(**kwargs) + return getattr(self.groupby(level=level, axis=axis, sort=False), op)(**kwargs) def _validate_other( self, diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 9682be5dae8..dbfdf459d34 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -28,6 +28,7 @@ df_equals, arg_keys, name_contains, + test_data, test_data_values, test_data_keys, test_string_data_values, @@ -1802,6 +1803,24 @@ def test_last(): df_equals(modin_series.last("20D"), pandas_series.last("20D")) +def test_index_order(): + # see #1708 for details + s_modin, s_pandas = create_test_series(test_data["dense_nan_data"]) + rows_number = len(s_modin.index) + level_0 = np.random.choice([x for x in range(10)], rows_number) + level_1 = np.random.choice([x for x in range(10)], rows_number) + index = pandas.MultiIndex.from_arrays([level_0, level_1]) + + s_modin.index = index + s_pandas.index = index + + for func in ["all", "any", "mad"]: + df_equals( + getattr(s_modin, func)(level=0).index, + getattr(s_pandas, func)(level=0).index, + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_last_valid_index(data): modin_series, 
pandas_series = create_test_series(data) From a2c68d624cb763985b23963b51838579567204e0 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Tue, 28 Jul 2020 10:25:21 +0300 Subject: [PATCH 030/120] FEAT-#1701: Enable running Modin via remote Ray on spawned cluster (#1818) For details see https://github.com/modin-project/modin/pull/1818 Co-authored-by: Devin Petersohn Signed-off-by: Vasilij Litvinov --- examples/cluster/mortgage-runner.py | 53 +++ modin/data_management/factories.py | 41 +++ modin/engines/ray/utils.py | 43 ++- modin/experimental/cloud/connection.py | 5 +- modin/experimental/cloud/meta_magic.py | 148 +++++++++ modin/experimental/cloud/rayscale.py | 3 + modin/experimental/cloud/rpyc_proxy.py | 440 +++++++++++++++++++++++++ modin/pandas/__init__.py | 32 +- modin/pandas/base.py | 10 +- modin/pandas/dataframe.py | 10 +- modin/pandas/groupby.py | 8 + modin/pandas/series.py | 7 + modin/pandas/test/test_api.py | 1 + 13 files changed, 785 insertions(+), 16 deletions(-) create mode 100644 examples/cluster/mortgage-runner.py create mode 100644 modin/experimental/cloud/meta_magic.py create mode 100644 modin/experimental/cloud/rpyc_proxy.py diff --git a/examples/cluster/mortgage-runner.py b/examples/cluster/mortgage-runner.py new file mode 100644 index 00000000000..1c5c8122329 --- /dev/null +++ b/examples/cluster/mortgage-runner.py @@ -0,0 +1,53 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +# pip install git+https://github.com/intel-go/ibis.git@develop + +# NOTE: expects https://github.com/intel-go/omniscripts/ checked out and in PYTHONPATH + +# the following import turns on experimental mode in Modin, +# including enabling running things in remote cloud +import modin.experimental.pandas as pd # noqa: F401 +from modin.experimental.cloud import create_cluster, get_connection + +from mortgage import run_benchmark +from mortgage.mortgage_pandas import etl_pandas + +test_cluster = create_cluster( + "aws", + "aws_credentials", + cluster_name="rayscale-test", + region="eu-north-1", + zone="eu-north-1b", + image="ami-00e1e82d7d4ca80d3", +) +with test_cluster: + conn = get_connection() + np = conn.modules["numpy"] + etl_pandas.__globals__["np"] = np + + parameters = { + "data_file": "https://modin-datasets.s3.amazonaws.com/mortgage", + # "data_file": "s3://modin-datasets/mortgage", + "dfiles_num": 1, + "no_ml": True, + "validation": False, + "no_ibis": True, + "no_pandas": False, + "pandas_mode": "Modin_on_ray", + "ray_tmpdir": "/tmp", + "ray_memory": 1024 * 1024 * 1024, + } + + run_benchmark(parameters) diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index c7f5bf5eb75..e6933ff2a23 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -210,3 +210,44 @@ def prepare(cls): from modin.experimental.engines.pyarrow_on_ray.io import PyarrowOnRayIO cls.io_cls = PyarrowOnRayIO + + +class ExperimentalPandasOnCloudrayFactory(ExperimentalBaseFactory): + @classmethod + def prepare(cls): + # 
query_compiler import is needed so remote PandasQueryCompiler + # has an imported local counterpart; + # if there isn't such counterpart rpyc generates some bogus + # class type which raises TypeError() + # upon checking its isinstance() or issubclass() + import modin.backends.pandas.query_compiler # noqa: F401 + from modin.experimental.cloud import get_connection + + class WrappedIO: + def __init__(self, conn): + self.__conn = conn + self.__io_cls = conn.modules[ + "modin.engines.ray.pandas_on_ray.io" + ].PandasOnRayIO + self.__reads = { + name for name in BaseIO.__dict__ if name.startswith("read_") + } + self.__wrappers = {} + + def __getattr__(self, name): + if name in self.__reads: + try: + wrap = self.__wrappers[name] + except KeyError: + + def wrap(*a, _original=getattr(self.__io_cls, name), **kw): + a = tuple(self.__conn.deliver(x) for x in a) + kw = {k: self.__conn.deliver(v) for k, v in kw.items()} + return _original(*a, **kw) + + self.__wrappers[name] = wrap + else: + wrap = getattr(self.__io_cls, name) + return wrap + + cls.io_cls = WrappedIO(get_connection()) diff --git a/modin/engines/ray/utils.py b/modin/engines/ray/utils.py index d0b5f2f0d95..51f96d6f298 100644 --- a/modin/engines/ray/utils.py +++ b/modin/engines/ray/utils.py @@ -61,18 +61,43 @@ def _import_pandas(*args): import pandas # noqa F401 -def initialize_ray(): - """Initializes ray based on environment variables and internal defaults.""" +def initialize_ray( + override_is_cluster=False, + override_redis_address: str = None, + override_redis_password: str = None, +): + """ + Initializes ray based on parameters, environment variables and internal defaults. + + Parameters + ---------- + override_is_cluster: bool, optional + Whether to override the detection of Moding being run in a cluster + and always assume this runs on cluster head node. + This also overrides Ray worker detection and always runs the function, + not only from main thread. 
+ If not specified, $MODIN_RAY_CLUSTER env variable is used. + override_redis_address: str, optional + What Redis address to connect to when running in Ray cluster. + If not specified, $MODIN_REDIS_ADDRESS is used. + override_redis_password: str, optional + What password to use when connecting to Redis. + If not specified, a new random one is generated. + """ import ray - if threading.current_thread().name == "MainThread": + if threading.current_thread().name == "MainThread" or override_is_cluster: import secrets - plasma_directory = None - num_cpus = os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() - cluster = os.environ.get("MODIN_RAY_CLUSTER", "").title() - redis_address = os.environ.get("MODIN_REDIS_ADDRESS", None) - redis_password = secrets.token_hex(16) + cluster = ( + "True" + if override_is_cluster + else os.environ.get("MODIN_RAY_CLUSTER", "").title() + ) + redis_address = override_redis_address or os.environ.get( + "MODIN_REDIS_ADDRESS", None + ) + redis_password = override_redis_password or secrets.token_hex(16) if cluster == "True": # We only start ray in a cluster setting for the head node. 
@@ -84,7 +109,9 @@ def initialize_ray(): logging_level=100, ) elif cluster == "": + num_cpus = os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() object_store_memory = os.environ.get("MODIN_MEMORY", None) + plasma_directory = None if os.environ.get("MODIN_OUT_OF_CORE", "False").title() == "True": from tempfile import gettempdir diff --git a/modin/experimental/cloud/connection.py b/modin/experimental/cloud/connection.py index 7be5d6cf8c0..1e35ee972d3 100644 --- a/modin/experimental/cloud/connection.py +++ b/modin/experimental/cloud/connection.py @@ -103,14 +103,15 @@ def get(cls): def __try_connect(self): import rpyc + from .rpyc_proxy import WrappingService try: self.__connection = rpyc.connect( "127.0.0.1", self.rpyc_port, - rpyc.ClassicService, - config={"sync_request_timeout": RPYC_REQUEST_TIMEOUT}, + WrappingService, keepalive=True, + config={"sync_request_timeout": RPYC_REQUEST_TIMEOUT}, ) except (ConnectionRefusedError, EOFError): if self.proc.poll() is not None: diff --git a/modin/experimental/cloud/meta_magic.py b/modin/experimental/cloud/meta_magic.py new file mode 100644 index 00000000000..3410122fae5 --- /dev/null +++ b/modin/experimental/cloud/meta_magic.py @@ -0,0 +1,148 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. + +import sys +import inspect +import types + +from modin import execution_engine + +_LOCAL_ATTRS = frozenset(("__new__", "__dict__", "__wrapper_remote__")) + + +class RemoteMeta(type): + """ + Metaclass that relays getting non-existing attributes from + a proxying object *CLASS* to a remote end transparently. + + Attributes existing on a proxying object are retrieved locally. + """ + + @property + def __signature__(self): + """ + Override detection performed by inspect.signature(). + Defining custom __new__() throws off inspect.signature(ClassType) + as it returns a signature of __new__(), even if said __new__() is defined + in a parent class. + """ + # Note that we create an artificial bound method here, as otherwise + # self.__init__ is an ordinary function, and inspect.signature() shows + # "self" argument while it should hide it for our purposes. + # So we make a method bound to class type (it would normally be bound to instance) + # and pass that to .signature() + return inspect.signature(types.MethodType(self.__init__, self)) + + def __getattribute__(self, name): + if name in _LOCAL_ATTRS: + # never proxy special attributes, always get them from the class type + res = object.__getattribute__(self, name) + else: + try: + # Go for proxying class-level attributes first; + # make sure to check for attribute in self.__dict__ to get the class-level + # attribute from the class itself, not from some of its parent classes. 
+ # Also note we use object.__getattribute__() to skip any potential + # class-level __getattr__ + res = object.__getattribute__(self, "__dict__")[name] + except KeyError: + try: + res = object.__getattribute__(self, name) + except AttributeError: + frame = sys._getframe() + try: + is_inspect = frame.f_back.f_code.co_filename == inspect.__file__ + except AttributeError: + is_inspect = False + finally: + del frame + if is_inspect: + # be always-local for inspect.* functions + res = super().__getattribute__(name) + else: + try: + remote = object.__getattribute__( + object.__getattribute__(self, "__real_cls__"), + "__wrapper_remote__", + ) + except AttributeError: + # running in local mode, fall back + res = super().__getattribute__(name) + else: + res = getattr(remote, name) + try: + # note that any attribute might be in fact a data descriptor, + # account for that + getter = res.__get__ + except AttributeError: + return res + return getter(None, self) + + +_KNOWN_DUALS = {} + + +def make_wrapped_class(local_cls: type, rpyc_wrapper_name: str): + """ + Replaces given local class in its module with a descendant class + which has __new__ overridden (a dual-nature class). + This new class is instantiated differently depending on + whether this is done in remote context or local. + + In local context we effectively get the same behaviour, but in remote + context the created class is actually of separate type which + proxies most requests to a remote end. + + Parameters + ---------- + local_cls: class + The class to replace with a dual-nature class + rpyc_wrapper_name: str + The function *name* to make a proxy class type. + Note that this is specifically taken as string to not import + "rpyc_proxy" module in top-level, as it requires RPyC to be + installed, and not all users of Modin (even in experimental mode) + need remote context. 
+ """ + namespace = { + "__real_cls__": None, + "__new__": None, + "__module__": local_cls.__module__, + } + result = RemoteMeta(local_cls.__name__, (local_cls,), namespace) + + def make_new(__class__): + """ + Define a __new__() with a __class__ that is closure-bound, needed for super() to work + """ + + def __new__(cls, *a, **kw): + if cls is result and cls.__real_cls__ is not result: + return cls.__real_cls__(*a, **kw) + return super().__new__(cls) + + __class__.__new__ = __new__ + + make_new(result) + setattr(sys.modules[local_cls.__module__], local_cls.__name__, result) + _KNOWN_DUALS[local_cls] = result + + def update_class(_): + if execution_engine.get() == "Cloudray": + from . import rpyc_proxy + + result.__real_cls__ = getattr(rpyc_proxy, rpyc_wrapper_name)(result) + else: + result.__real_cls__ = result + + execution_engine.subscribe(update_class) diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index ee7b5445d64..9c990185204 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -55,6 +55,9 @@ def join(self): class RayCluster(BaseCluster): + target_engine = "Cloudray" + target_partition = "Pandas" + __base_config = os.path.join( os.path.abspath(os.path.dirname(__file__)), "ray-autoscaler.yml" ) diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py new file mode 100644 index 00000000000..3cc6321fa9a --- /dev/null +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -0,0 +1,440 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import types + +import rpyc +from rpyc.lib.compat import pickle +from rpyc.core import netref + +from . import get_connection +from .meta_magic import _LOCAL_ATTRS, RemoteMeta, _KNOWN_DUALS + + +class WrappingConnection(rpyc.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self._remote_pickle_loads = None + + def deliver(self, local_obj): + """ + More caching version of rpyc.classic.deliver() + """ + try: + local_obj = object.__getattribute__(local_obj, "__remote_end__") + except AttributeError: + pass + if isinstance(local_obj, netref.BaseNetref) and local_obj.____conn__ is self: + return local_obj + return self._remote_pickle_loads(bytes(pickle.dumps(local_obj))) + + def _netref_factory(self, id_pack): + result = super()._netref_factory(id_pack) + # try getting __real_cls__ from result.__class__ BUT make sure to + # NOT get it from some parent class for result.__class__, otherwise + # multiple wrappings happen + + # we cannot use 'result.__class__' as this could cause a lookup of + # '__class__' on remote end + try: + local_cls = object.__getattribute__(result, "__class__") + except AttributeError: + return result + + try: + # first of all, check if remote object has a known "wrapping" class + # example: _DataFrame has DataFrame dual-nature wrapper + local_cls = _KNOWN_DUALS[local_cls] + except KeyError: + pass + try: + # Try to get local_cls.__real_cls__ but look it up within + # local_cls.__dict__ to not grab it from any parent class. 
+ # Also get the __dict__ by using low-level __getattribute__ + # to override any potential __getattr__ callbacks on the class. + wrapping_cls = object.__getattribute__(local_cls, "__dict__")[ + "__real_cls__" + ] + except (AttributeError, KeyError): + return result + return wrapping_cls.from_remote_end(result) + + def _box(self, obj): + while True: + try: + obj = object.__getattribute__(obj, "__remote_end__") + except AttributeError: + break + return super()._box(obj) + + def _init_deliver(self): + self._remote_pickle_loads = self.modules["rpyc.lib.compat"].pickle.loads + + +class WrappingService(rpyc.ClassicService): + _protocol = WrappingConnection + + def on_connect(self, conn): + super().on_connect(conn) + conn._init_deliver() + + +def _in_empty_class(): + class Empty: + pass + + return frozenset(Empty.__dict__.keys()) + + +_EMPTY_CLASS_ATTRS = _in_empty_class() + +_PROXY_LOCAL_ATTRS = frozenset(["__name__", "__remote_end__"]) +_NO_OVERRIDE = ( + _LOCAL_ATTRS + | _PROXY_LOCAL_ATTRS + | rpyc.core.netref.DELETED_ATTRS + | frozenset(["__getattribute__"]) + | _EMPTY_CLASS_ATTRS +) + + +def make_proxy_cls( + remote_cls: netref.BaseNetref, + origin_cls: type, + override: type, + cls_name: str = None, +): + """ + Makes a new class type which inherits from origin_cls (for isinstance() and issubclass()), + takes methods from override as-is and proxies all requests for other members to remote_cls. + Note that origin_cls and remote_cls are assumed to be the same class types, but one is local + and other is obtained from RPyC. + + Effectively implements subclassing, but without subclassing. This is needed because it is + impossible to subclass a remote-obtained class, something in the very internals of RPyC bugs out. 
+ + Parameters + ---------- + remote_cls: netref.BaseNetref + Type obtained from RPyC connection, expected to mirror origin_cls + origin_cls: type + The class to prepare a proxying wrapping for + override: type + The mixin providing methods and attributes to overlay on top of remote values and methods. + cls_name: str, optional + The name to give to the resulting class. + + Returns + ------- + type + New wrapper that takes attributes from override and relays requests to all other + attributes to remote_cls + """ + + class ProxyMeta(RemoteMeta): + """ + This metaclass deals with printing a telling repr() to assist in debugging, + and to actually implement the "subclass without subclassing" thing by + directly adding references to attributes of "override" and by making proxy methods + for other functions of origin_cls. Class-level attributes being proxied is managed + by RemoteMeta parent. + + Do note that we cannot do the same for certain special members like __getitem__ + because CPython for optimization doesn't do a lookup of "type(obj).__getitem__(foo)" when + "obj[foo]" is called, but it effectively does "type(obj).__dict__['__getitem__'](foo)" + (but even without checking for __dict__), so all present methods must be declared + beforehand. + """ + + def __repr__(self): + return f"" + + def __prepare__(*args, **kw): + """ + Cooks the __dict__ of the type being constructed. Takes attributes from override as is + and adds proxying wrappers for other attributes of origin_cls. + This "manual inheritance" is needed for RemoteMeta.__getattribute__ which first looks into + type(obj).__dict__ (EXCLUDING parent classes) and then goes to proxy type. 
+ """ + namespace = type.__prepare__(*args, **kw) + + # try computing overridden differently to allow subclassing one override from another + no_override = set(_NO_OVERRIDE) + for base in override.__mro__: + if base == object: + continue + for attr_name, attr_value in base.__dict__.items(): + if ( + attr_name not in namespace + and attr_name not in no_override + and getattr(object, attr_name, None) != attr_value + ): + namespace[ + attr_name + ] = attr_value # force-inherit an attribute manually + no_override.add(attr_name) + + for base in origin_cls.__mro__: + if base == object: + continue + # try unwrapping a dual-nature class first + while True: + try: + sub_base = object.__getattribute__(base, "__real_cls__") + except AttributeError: + break + if sub_base is base: + break + base = sub_base + for name, entry in base.__dict__.items(): + if ( + name not in namespace + and name not in no_override + and isinstance(entry, types.FunctionType) + ): + + def method(_self, *_args, __method_name__=name, **_kw): + return getattr(_self.__remote_end__, __method_name__)( + *_args, **_kw + ) + + method.__name__ = name + namespace[name] = method + return namespace + + class Wrapper(override, origin_cls, metaclass=ProxyMeta): + """ + Subclass origin_cls replacing attributes with what is defined in override while + relaying requests for all other attributes to remote_cls. 
+ """ + + __name__ = cls_name or origin_cls.__name__ + __wrapper_remote__ = remote_cls + + def __new__(cls, *a, **kw): + return override.__new__(cls) + + def __init__(self, *a, __remote_end__=None, **kw): + if __remote_end__ is None: + __remote_end__ = remote_cls(*a, **kw) + while True: + # unwrap the object if it's a wrapper + try: + __remote_end__ = object.__getattribute__( + __remote_end__, "__remote_end__" + ) + except AttributeError: + break + object.__setattr__(self, "__remote_end__", __remote_end__) + + @classmethod + def from_remote_end(cls, remote_inst): + return cls(__remote_end__=remote_inst) + + def __getattribute__(self, name): + """ + Implement "default" resolution order to override whatever __getattribute__ + a parent being wrapped may have defined, but only look up on own __dict__ + without looking into ancestors' ones, because we copy them in __prepare__. + + Effectively, any attributes not currently known to Wrapper (i.e. not defined here + or in override class) will be retrieved from the remote end. 
+ + Algorithm (mimicking default Python behaviour): + 1) check if type(self).__dict__[name] exists and is a get/set data descriptor + 2) check if self.__dict__[name] exists + 3) check if type(self).__dict__[name] is a non-data descriptor + 4) check if type(self).__dict__[name] exists + 5) pass through to remote end + """ + dct = object.__getattribute__(self, "__dict__") + if name == "__dict__": + return dct + cls_dct = object.__getattribute__(type(self), "__dict__") + try: + cls_attr, has_cls_attr = cls_dct[name], True + except KeyError: + has_cls_attr = False + else: + oget = None + try: + oget = object.__getattribute__(cls_attr, "__get__") + object.__getattribute__(cls_attr, "__set__") + except AttributeError: + pass # not a get/set data descriptor, go next + else: + return oget(self, type(self)) + # type(self).name is not a get/set data descriptor + try: + return dct[name] + except KeyError: + # instance doesn't have an attribute + if has_cls_attr: + # type(self) has this attribute, but it's not a get/set descriptor + if oget: + # this attribute is a get data descriptor + return oget(self, type(self)) + return cls_attr # not a data descriptor whatsoever + + # this instance/class does not have this attribute, pass it through to remote end + return getattr(dct["__remote_end__"], name) + + if override.__setattr__ == object.__setattr__: + # no custom attribute setting, define our own relaying to remote end + def __setattr__(self, name, value): + if name not in _PROXY_LOCAL_ATTRS: + setattr(self.__remote_end__, name, value) + else: + object.__setattr__(self, name, value) + + if override.__delattr__ == object.__delattr__: + # no custom __delattr__, define our own + def __delattr__(self, name): + if name not in _PROXY_LOCAL_ATTRS: + delattr(self.__remote_end__, name) + + return Wrapper + + +def _deliveringWrapper( + origin_cls: type, methods=(), mixin: type = None, target_name: str = None +): + """ + Prepare a proxying wrapper for origin_cls which overrides methods 
specified in + "methods" with "delivering" versions of methods. + A "delivering" method is a method which delivers its arguments to a remote end + before calling the remote method, effectively calling it with arguments passed + by value, not by reference. + This is mostly a workaround for RPyC bug when it translates a non-callable + type to a remote type which has __call__() method (which would raise TypeError + when called because local class is not callable). + + Note: this could lead to some weird side-effects if any arguments passed + in are very funny, but this should never happen in a real data science life. + + Parameters + ---------- + origin_cls: type + Local class to make a "delivering wrapper" for. + methods: sequence of method names, optional + List of methods to override making "delivering wrappers" for. + mixin: type, optional + Parent mixin class to subclass (to inherit already prepared wrappers). + If not specified, a new mixin is created. + target_name: str, optional + Name to give to prepared wrapper class. + If not specified, take the name of local class being wrapped. 
+ + Returns + ------- + type + The "delivering wrapper" mixin, to be used in conjunction with make_proxy_cls() + """ + conn = get_connection() + remote_cls = getattr(conn.modules[origin_cls.__module__], origin_cls.__name__) + + if mixin is None: + + class DeliveringMixin: + pass + + mixin = DeliveringMixin + + for method in methods: + + def wrapper(self, *args, __remote_conn__=conn, __method_name__=method, **kw): + args = tuple(__remote_conn__.deliver(x) for x in args) + kw = {k: __remote_conn__.deliver(v) for k, v in kw.items()} + return getattr(self.__remote_end__, __method_name__)(*args, **kw) + + wrapper.__name__ = method + setattr(mixin, method, wrapper) + return make_proxy_cls( + remote_cls, origin_cls, mixin, target_name or origin_cls.__name__ + ) + + +def _prepare_loc_mixin(): + """ + Prepare a mixin that overrides .loc and .iloc properties with versions + which return a special "delivering" instances of indexers. + """ + from modin.pandas.indexing import _LocIndexer, _iLocIndexer + + DeliveringLocIndexer = _deliveringWrapper( + _LocIndexer, ["__getitem__", "__setitem__"] + ) + DeliveringILocIndexer = _deliveringWrapper( + _iLocIndexer, ["__getitem__", "__setitem__"] + ) + + class DeliveringMixin: + @property + def loc(self): + return DeliveringLocIndexer(self.__remote_end__) + + @property + def iloc(self): + return DeliveringILocIndexer(self.__remote_end__) + + return DeliveringMixin + + +def make_dataframe_wrapper(DataFrame): + """ + Prepares a "delivering wrapper" proxy class for DataFrame. + It makes DF.loc, DF.groupby() and other methods listed below deliver their + arguments to remote end by value. + """ + DeliveringDataFrame = _deliveringWrapper( + DataFrame, + ["groupby", "agg", "aggregate", "__getitem__", "astype", "drop", "merge"], + _prepare_loc_mixin(), + "DataFrame", + ) + return DeliveringDataFrame + + +def make_base_dataset_wrapper(BasePandasDataset): + """ + Prepares a "delivering wrapper" proxy class for BasePandasDataset. 
+ Look for details in make_dataframe_wrapper() and _deliveringWrapper(). + """ + DeliveringBasePandasDataset = _deliveringWrapper( + BasePandasDataset, + ["agg", "aggregate"], + _prepare_loc_mixin(), + "BasePandasDataset", + ) + return DeliveringBasePandasDataset + + +def make_dataframe_groupby_wrapper(DataFrameGroupBy): + """ + Prepares a "delivering wrapper" proxy class for DataFrameGroupBy. + Look for details in make_dataframe_wrapper() and _deliveringWrapper(). + """ + DeliveringDataFrameGroupBy = _deliveringWrapper( + DataFrameGroupBy, ["agg", "aggregate", "apply"], target_name="DataFrameGroupBy", + ) + return DeliveringDataFrameGroupBy + + +def make_series_wrapper(Series): + """ + Prepares a "delivering wrapper" proxy class for Series. + Note that for now _no_ methods that really deliver their arguments by value + are overridden here, so what it mostly does is it produces a wrapper class + inherited from normal Series but wrapping all access to remote end transparently. + """ + return _deliveringWrapper(Series, target_name="Series") diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index ce4ce73f20a..7e0f0a32ae1 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -89,6 +89,8 @@ from .. 
import execution_engine, Publisher +# Set this so that Pandas doesn't try to multithread by itself +os.environ["OMP_NUM_THREADS"] = "1" DEFAULT_NPARTITIONS = 4 num_cpus = 1 @@ -127,6 +129,34 @@ def _update_engine(publisher: Publisher): ) dask_client = Client(n_workers=int(num_cpus)) + elif publisher.get() == "Cloudray": + from modin.experimental.cloud import get_connection + import rpyc + + conn: rpyc.ClassicService = get_connection() + remote_ray = conn.modules["ray"] + if _is_first_update.get("Cloudray", True): + + @conn.teleport + def init_remote_ray(): + from ray import ray_constants + import modin + from modin.engines.ray.utils import initialize_ray + + modin.set_backends("Ray") + initialize_ray( + override_is_cluster=True, + override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}", + override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD, + ) + + init_remote_ray() + # import EngineDispatcher here to initialize IO class + # so it doesn't skew read_csv() timings later on + import modin.data_management.dispatcher # noqa: F401 + + num_cpus = remote_ray.cluster_resources()["CPU"] + elif publisher.get() != "Python": raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) @@ -183,8 +213,6 @@ def _update_engine(publisher: Publisher): ) from .plotting import Plotting as plotting -# Set this so that Pandas doesn't try to multithread by itself -os.environ["OMP_NUM_THREADS"] = "1" __all__ = [ "DataFrame", "Series", diff --git a/modin/pandas/base.py b/modin/pandas/base.py index ea348eb04c3..10d9b01e050 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import os import numpy as np from numpy import nan import pandas @@ -3583,7 +3584,7 @@ def __getattribute__(self, item): # We default to pandas on empty DataFrames. 
This avoids a large amount of # pain in underlying implementation and returns a result immediately rather # than dealing with the edge cases that empty DataFrames have. - if self.empty and is_callable: + if is_callable and self.empty: def default_handler(*args, **kwargs): return self._default_to_pandas(item, *args, **kwargs) @@ -3592,6 +3593,12 @@ def default_handler(*args, **kwargs): return object.__getattribute__(self, item) +if os.environ.get("MODIN_EXPERIMENTAL", "").title() == "True": + from modin.experimental.cloud.meta_magic import make_wrapped_class + + make_wrapped_class(BasePandasDataset, "make_base_dataset_wrapper") + + class Resampler(object): def __init__( self, @@ -3621,7 +3628,6 @@ def __init__( on, level, ] - self.__groups = self.__get_groups(*self.resample_args) def __get_groups( diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 9c88e0d2d88..98b4dcbb0ed 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -28,6 +28,7 @@ import functools import numpy as np import sys +import os from typing import Tuple, Union import warnings @@ -36,6 +37,7 @@ from .iterator import PartitionIterator from .series import Series from .base import BasePandasDataset +from .groupby import DataFrameGroupBy @_inherit_docstrings( @@ -458,8 +460,6 @@ def groupby( elif mismatch: raise KeyError(next(x for x in by if x not in self)) - from .groupby import DataFrameGroupBy - return DataFrameGroupBy( self, by, @@ -2795,3 +2795,9 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False): def _to_pandas(self): return self._query_compiler.to_pandas() + + +if os.environ.get("MODIN_EXPERIMENTAL", "").title() == "True": + from modin.experimental.cloud.meta_magic import make_wrapped_class + + make_wrapped_class(DataFrame, "make_dataframe_wrapper") diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index a18c9e30589..52f5972d06a 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -11,12 
+11,14 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import os import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like import pandas.core.common as com from modin.error_message import ErrorMessage + from .utils import _inherit_docstrings from .series import Series @@ -690,6 +692,12 @@ def groupby_on_multiple_columns(df, *args, **kwargs): return self._df._default_to_pandas(groupby_on_multiple_columns, *args, **kwargs) +if os.environ.get("MODIN_EXPERIMENTAL", "").title() == "True": + from modin.experimental.cloud.meta_magic import make_wrapped_class + + make_wrapped_class(DataFrameGroupBy, "make_dataframe_groupby_wrapper") + + class SeriesGroupBy(DataFrameGroupBy): @property def ndim(self): diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 23b8350b948..51495b36422 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
+import os import numpy as np import pandas from pandas.core.common import apply_if_callable, is_bool_indexer @@ -1700,6 +1701,12 @@ def _to_pandas(self): return series +if os.environ.get("MODIN_EXPERIMENTAL", "").title() == "True": + from modin.experimental.cloud.meta_magic import make_wrapped_class + + make_wrapped_class(Series, "make_series_wrapper") + + class DatetimeProperties(object): def __init__(self, series): self._series = series diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index 5f328b2d902..269c086e133 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -51,6 +51,7 @@ def test_top_level_api_equality(): "base", "utils", "dataframe", + "groupby", "threading", "general", "datetimes", From cc8d2d1283d8311297aaf1fb2e168aa3b512bae4 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Tue, 28 Jul 2020 12:37:21 +0300 Subject: [PATCH 031/120] FIX-#1729: Fix result of `Series.dt.components/freq/tz` (#1730) Signed-off-by: Alexey Prutskov --- modin/backends/pandas/query_compiler.py | 12 +++++++++--- modin/engines/base/frame/data.py | 17 ++++++++++++++--- modin/pandas/test/test_series.py | 11 ++++------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 62bffcc3a98..f940ddf23d6 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1135,8 +1135,12 @@ def unique(self): dt_is_leap_year = MapFunction.register(_dt_prop_map("is_leap_year")) dt_daysinmonth = MapFunction.register(_dt_prop_map("daysinmonth")) dt_days_in_month = MapFunction.register(_dt_prop_map("days_in_month")) - dt_tz = MapFunction.register(_dt_prop_map("tz")) - dt_freq = MapFunction.register(_dt_prop_map("freq")) + dt_tz = MapReduceFunction.register( + _dt_prop_map("tz"), lambda df: pandas.DataFrame(df.iloc[0]), axis=0 + ) + dt_freq = MapReduceFunction.register( + _dt_prop_map("freq"), lambda df: 
pandas.DataFrame(df.iloc[0]), axis=0 + ) dt_to_period = MapFunction.register(_dt_func_map("to_period")) dt_to_pydatetime = MapFunction.register(_dt_func_map("to_pydatetime")) dt_tz_localize = MapFunction.register(_dt_func_map("tz_localize")) @@ -1154,7 +1158,9 @@ def unique(self): dt_days = MapFunction.register(_dt_prop_map("days")) dt_microseconds = MapFunction.register(_dt_prop_map("microseconds")) dt_nanoseconds = MapFunction.register(_dt_prop_map("nanoseconds")) - dt_components = MapFunction.register(_dt_prop_map("components")) + dt_components = MapFunction.register( + _dt_prop_map("components"), validate_columns=True + ) dt_qyear = MapFunction.register(_dt_prop_map("qyear")) dt_start_time = MapFunction.register(_dt_prop_map("start_time")) dt_end_time = MapFunction.register(_dt_prop_map("end_time")) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index b839bda5184..462b50d1ce7 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -1021,7 +1021,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): reduce_parts, new_index, new_columns, validate_axes="reduced" ) - def _map(self, func, dtypes=None, validate_index=False): + def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): """Perform a function that maps across the entire dataset. 
Pamareters @@ -1055,12 +1055,23 @@ def _map(self, func, dtypes=None, validate_index=False): new_row_lengths = None else: new_row_lengths = self._row_lengths + + if validate_columns: + new_columns = self._frame_mgr_cls.get_indices( + 1, new_partitions, lambda df: df.columns + ) + else: + new_columns = self.columns + if len(new_columns) != len(self.columns): + new_column_widths = None + else: + new_column_widths = self._column_widths return self.__constructor__( new_partitions, new_index, - self.columns, + new_columns, new_row_lengths, - self._column_widths, + new_column_widths, dtypes=dtypes, ) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index dbfdf459d34..7f32d36a5e7 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1363,11 +1363,8 @@ def test_dtype(data): df_equals(modin_series.dtype, pandas_series.dtypes) -@pytest.mark.xfail( - reason="Datetime properties is broken for now, see #1729 for details" -) def test_dt(): - data = pd.date_range("2016-12-31", "2017-01-08", freq="D", tz="Europe/Berlin") + data = pd.date_range("2016-12-31", periods=128, freq="D", tz="Europe/Berlin") modin_series = pd.Series(data) pandas_series = pandas.Series(data) @@ -1422,8 +1419,8 @@ def test_dt(): df_equals(modin_series.dt.month_name(), pandas_series.dt.month_name()) df_equals(modin_series.dt.day_name(), pandas_series.dt.day_name()) - modin_series = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) - pandas_series = pandas.Series(pandas.to_timedelta(np.arange(5), unit="d")) + modin_series = pd.Series(pd.to_timedelta(np.arange(128), unit="d")) + pandas_series = pandas.Series(pandas.to_timedelta(np.arange(128), unit="d")) assert_array_equal( modin_series.dt.to_pytimedelta(), pandas_series.dt.to_pytimedelta() @@ -1435,7 +1432,7 @@ def test_dt(): df_equals(modin_series.dt.nanoseconds, pandas_series.dt.nanoseconds) df_equals(modin_series.dt.components, pandas_series.dt.components) - data_per = 
pd.date_range("1/1/2012", periods=5, freq="M") + data_per = pd.date_range("1/1/2012", periods=128, freq="M") pandas_series = pandas.Series(data_per, index=data_per).dt.to_period() modin_series = pd.Series(data_per, index=data_per).dt.to_period() From 36517b7684e10cf9f24a0fb46d7d8be7e81a061a Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 28 Jul 2020 17:52:31 +0300 Subject: [PATCH 032/120] DOCS-#1835: add runner of taxi benchmark as example (#1836) Signed-off-by: Anatoly Myachev --- examples/cluster/taxi-runner.py | 52 +++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 examples/cluster/taxi-runner.py diff --git a/examples/cluster/taxi-runner.py b/examples/cluster/taxi-runner.py new file mode 100644 index 00000000000..a4e8128b24b --- /dev/null +++ b/examples/cluster/taxi-runner.py @@ -0,0 +1,52 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ + +# pip install git+https://github.com/intel-go/ibis.git@develop + +# NOTE: expects https://github.com/intel-go/omniscripts/ checked out and in PYTHONPATH + +# the following import turns on experimental mode in Modin, +# including enabling running things in remote cloud +import modin.experimental.pandas as pd # noqa: F401 +from modin.experimental.cloud import create_cluster, get_connection + +from taxi import run_benchmark as run_benchmark +from taxi.taxibench_pandas_ibis import etl_pandas + +test_cluster = create_cluster( + "aws", + "aws_credentials", + cluster_name="rayscale-test", + region="eu-north-1", + zone="eu-north-1b", + image="ami-00e1e82d7d4ca80d3", +) +with test_cluster: + conn = get_connection() + np = conn.modules["numpy"] + etl_pandas.__globals__["np"] = np + + parameters = { + "data_file": "https://modin-datasets.s3.amazonaws.com/trips_data.csv", + # "data_file": "s3://modin-datasets/trips_data.csv", + "dfiles_num": 1, + "validation": False, + "no_ibis": True, + "no_pandas": False, + "pandas_mode": "Modin_on_ray", + "ray_tmpdir": "/tmp", + "ray_memory": 1024 * 1024 * 1024, + } + + run_benchmark(parameters) From 02977df1cb1979864033aabcdd0119d78355878b Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 28 Jul 2020 21:19:06 +0300 Subject: [PATCH 033/120] DOCS-#1816: Add notes about using MODIN_SOCKS_PROXY variable (#1817) Co-authored-by: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/cluster.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modin/experimental/cloud/cluster.py b/modin/experimental/cloud/cluster.py index a2cb0633388..bb1c3237d9c 100644 --- a/modin/experimental/cloud/cluster.py +++ b/modin/experimental/cloud/cluster.py @@ -252,6 +252,14 @@ def create( The object that knows how to destroy the cluster and how to activate it as remote context. 
Note that by default spawning and destroying of the cluster happens in the background, as it's usually a rather lengthy process. + + Notes + ----- + Cluster computation actually can work when proxies are required to access the cloud. + You should set normal "http_proxy"/"https_proxy" variables for HTTP/HTTPS proxies and + set "MODIN_SOCKS_PROXY" variable for SOCKS proxy before calling the function. + + Using SOCKS proxy requires Ray newer than 0.8.6, which might need to be installed manually. """ if not isinstance(provider, Provider): provider = Provider( From 0a03e7a29724b1f7c1ac1ea9e1dcada168097e98 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Tue, 28 Jul 2020 23:25:50 +0300 Subject: [PATCH 034/120] FEAT-#1821: Speed up RPyC connection (#1833) Signed-off-by: Vasilij Litvinov --- modin/data_management/factories.py | 3 +- modin/experimental/cloud/connection.py | 9 +- modin/experimental/cloud/rpyc_proxy.py | 223 ++++++++++++++++++++++--- 3 files changed, 209 insertions(+), 26 deletions(-) diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index e6933ff2a23..a9b234e73a0 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -241,8 +241,7 @@ def __getattr__(self, name): except KeyError: def wrap(*a, _original=getattr(self.__io_cls, name), **kw): - a = tuple(self.__conn.deliver(x) for x in a) - kw = {k: self.__conn.deliver(v) for k, v in kw.items()} + a, kw = self.__conn.deliver(a, kw) return _original(*a, **kw) self.__wrappers[name] = wrap diff --git a/modin/experimental/cloud/connection.py b/modin/experimental/cloud/connection.py index 1e35ee972d3..ec2a2ccf6e2 100644 --- a/modin/experimental/cloud/connection.py +++ b/modin/experimental/cloud/connection.py @@ -106,11 +106,12 @@ def __try_connect(self): from .rpyc_proxy import WrappingService try: - self.__connection = rpyc.connect( - "127.0.0.1", - self.rpyc_port, + stream = 
rpyc.SocketStream.connect( + host="127.0.0.1", port=self.rpyc_port, nodelay=True, keepalive=True + ) + self.__connection = rpyc.connect_stream( + stream, WrappingService, - keepalive=True, config={"sync_request_timeout": RPYC_REQUEST_TIMEOUT}, ) except (ConnectionRefusedError, EOFError): diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index 3cc6321fa9a..ceb605ff2c5 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -12,34 +12,177 @@ # governing permissions and limitations under the License. import types +import collections import rpyc from rpyc.lib.compat import pickle -from rpyc.core import netref +from rpyc.lib import get_methods + +from rpyc.core import netref, AsyncResult, consts from . import get_connection from .meta_magic import _LOCAL_ATTRS, RemoteMeta, _KNOWN_DUALS +def _batch_loads(items): + return tuple(pickle.loads(item) for item in items) + + +def _tuplize(arg): + """turns any sequence or iterator into a flat tuple""" + return tuple(arg) + + class WrappingConnection(rpyc.Connection): def __init__(self, *a, **kw): super().__init__(*a, **kw) - self._remote_pickle_loads = None + self._remote_batch_loads = None + self._remote_cls_cache = {} + self._static_cache = collections.defaultdict(dict) + self._remote_dumps = None + self._remote_tuplize = None + + def __wrap(self, local_obj): + while True: + # unwrap magic wrappers first; keep unwrapping in case it's a wrapper-in-a-wrapper + # this shouldn't usually happen, so this is mostly a safety net + try: + local_obj = object.__getattribute__(local_obj, "__remote_end__") + except AttributeError: + break + # do not pickle netrefs of our current connection, but do pickle those of other; + # example of this: an object made in _other_ remote context being pased to ours + if isinstance(local_obj, netref.BaseNetref) and local_obj.____conn__ is self: + return None + return bytes(pickle.dumps(local_obj)) + + def deliver(self, 
args, kw): - def deliver(self, local_obj): """ - More caching version of rpyc.classic.deliver() + More efficient, batched version of rpyc.classic.deliver() """ - try: - local_obj = object.__getattribute__(local_obj, "__remote_end__") - except AttributeError: - pass - if isinstance(local_obj, netref.BaseNetref) and local_obj.____conn__ is self: - return local_obj - return self._remote_pickle_loads(bytes(pickle.dumps(local_obj))) + pickled_args = [self.__wrap(arg) for arg in args] + pickled_kw = [(k, self.__wrap(v)) for (k, v) in kw.items()] + + pickled = [i for i in pickled_args if i is not None] + [ + v for (k, v) in pickled_kw if v is not None + ] + remote = iter(self._remote_batch_loads(tuple(pickled))) + + delivered_args = [] + for local_arg, pickled_arg in zip(args, pickled_args): + delivered_args.append( + next(remote) if pickled_arg is not None else local_arg + ) + delivered_kw = {} + for k, pickled_v in pickled_kw: + delivered_kw[k] = next(remote) if pickled_v is not None else kw[k] + + return tuple(delivered_args), delivered_kw + + def obtain(self, remote): + while True: + try: + remote = object.__getattribute__(remote, "__remote_end__") + except AttributeError: + break + return pickle.loads(self._remote_dumps(remote)) + + def obtain_tuple(self, remote): + while True: + try: + remote = object.__getattribute__(remote, "__remote_end__") + except AttributeError: + break + return self._remote_tuplize(remote) + + def sync_request(self, handler, *args): + """ + Intercept outgoing synchronous requests from RPyC to add caching or + fulfilling them locally if possible to improve performance. + We should try to make as few remote calls as possible, because each + call adds up to latency. 
+ """ + if handler == consts.HANDLE_INSPECT: + # always inspect classes from modin, pandas and numpy locally, + # do not go to network for those + id_name = str(args[0][0]) + if id_name.split(".", 1)[0] in ("modin", "pandas", "numpy"): + try: + modobj = __import__(id_name) + for subname in id_name.split(".")[1:]: + modobj = getattr(modobj, subname) + except (ImportError, AttributeError): + pass + else: + return get_methods(netref.LOCAL_ATTRS, modobj) + modname, clsname = id_name.rsplit(".", 1) + try: + modobj = __import__(modname) + for subname in modname.split(".")[1:]: + modobj = getattr(modobj, subname) + clsobj = getattr(modobj, clsname) + except (ImportError, AttributeError): + pass + else: + return get_methods(netref.LOCAL_ATTRS, clsobj) + elif handler in (consts.HANDLE_GETATTR, consts.HANDLE_STR, consts.HANDLE_HASH): + if handler == consts.HANDLE_GETATTR: + obj, attr = args + key = (attr, handler) + else: + obj = args[0] + key = handler + + if str(obj.____id_pack__[0]) in {"numpy", "numpy.dtype"}: + # always assume numpy attributes and numpy.dtype attributes are always the same; + # note that we're using RPyC id_pack as cache key, and it includes the name, + # class id and instance id, so this cache is unique to each instance of, say, + # numpy.dtype(), hence numpy.int16 and numpy.float64 got different caches. + cache = self._static_cache[obj.____id_pack__] + try: + result = cache[key] + except KeyError: + result = cache[key] = super().sync_request(handler, *args) + if handler == consts.HANDLE_GETATTR: + # save an entry in our cache telling that we get this attribute cached + self._static_cache[result.____id_pack__]["__getattr__"] = True + return result + + return super().sync_request(handler, *args) + + def async_request(self, handler, *args, **kw): + """ + Override async request handling to intercept outgoing deletion requests because we cache + certain things, and if we allow deletion of cached things our cache becomes stale. 
+ We can clean the cache upon deletion, but it would increase cache misses a lot. + + Also note that memory is not leaked forever, RPyC frees all of it upon disconnect. + """ + if handler == consts.HANDLE_DEL: + obj, _ = args + if obj.____id_pack__ in self._static_cache: + # object is cached by us, so ignore the request or remote end dies and cache is suddenly stale; + # we shouldn't remove item from cache as it would reduce performance + res = AsyncResult(self) + res._is_ready = True # simulate finished async request + return res + return super().async_request(handler, *args, **kw) def _netref_factory(self, id_pack): - result = super()._netref_factory(id_pack) + id_name, cls_id, inst_id = id_pack + id_name = str(id_name) + first = id_name.split(".", 1)[0] + if first in ("modin", "numpy", "pandas") and inst_id: + try: + cached_cls = self._remote_cls_cache[(id_name, cls_id)] + except KeyError: + result = super()._netref_factory(id_pack) + self._remote_cls_cache[(id_name, cls_id)] = type(result) + else: + result = cached_cls(self, id_pack) + else: + result = super()._netref_factory(id_pack) # try getting __real_cls__ from result.__class__ BUT make sure to # NOT get it from some parent class for result.__class__, otherwise # multiple wrappings happen @@ -78,7 +221,13 @@ def _box(self, obj): return super()._box(obj) def _init_deliver(self): - self._remote_pickle_loads = self.modules["rpyc.lib.compat"].pickle.loads + self._remote_batch_loads = self.modules[ + "modin.experimental.cloud.rpyc_proxy" + ]._batch_loads + self._remote_dumps = self.modules["rpyc.lib.compat"].pickle.dumps + self._remote_tuplize = self.modules[ + "modin.experimental.cloud.rpyc_proxy" + ]._tuplize class WrappingService(rpyc.ClassicService): @@ -167,6 +316,7 @@ def __prepare__(*args, **kw): type(obj).__dict__ (EXCLUDING parent classes) and then goes to proxy type. 
""" namespace = type.__prepare__(*args, **kw) + namespace["__remote_methods__"] = {} # try computing overridden differently to allow subclassing one override from another no_override = set(_NO_OVERRIDE) @@ -204,9 +354,15 @@ def __prepare__(*args, **kw): ): def method(_self, *_args, __method_name__=name, **_kw): - return getattr(_self.__remote_end__, __method_name__)( - *_args, **_kw - ) + try: + remote = _self.__remote_methods__[__method_name__] + except KeyError: + # use remote_cls.__getattr__ to force RPyC return us + # a proxy for remote method call instead of its local wrapper + _self.__remote_methods__[ + __method_name__ + ] = remote = remote_cls.__getattr__(__method_name__) + return remote(_self.__remote_end__, *_args, **_kw) method.__name__ = name namespace[name] = method @@ -353,9 +509,16 @@ class DeliveringMixin: for method in methods: def wrapper(self, *args, __remote_conn__=conn, __method_name__=method, **kw): - args = tuple(__remote_conn__.deliver(x) for x in args) - kw = {k: __remote_conn__.deliver(v) for k, v in kw.items()} - return getattr(self.__remote_end__, __method_name__)(*args, **kw) + args, kw = __remote_conn__.deliver(args, kw) + cache = object.__getattribute__(self, "__remote_methods__") + try: + remote = cache[__method_name__] + except KeyError: + # see comments in ProxyMeta.__prepare__ on using remote_cls.__getattr__ + cache[__method_name__] = remote = remote_cls.__getattr__( + __method_name__ + ) + return remote(self.__remote_end__, *args, **kw) wrapper.__name__ = method setattr(mixin, method, wrapper) @@ -396,10 +559,30 @@ def make_dataframe_wrapper(DataFrame): It makes DF.loc, DF.groupby() and other methods listed below deliver their arguments to remote end by value. 
""" + + from modin.pandas.series import Series + + conn = get_connection() + + class ObtainingItems: + def items(self): + return conn.obtain_tuple(self.__remote_end__.items()) + + def iteritems(self): + return conn.obtain_tuple(self.__remote_end__.iteritems()) + + ObtainingItems = _deliveringWrapper(Series, mixin=ObtainingItems) + + class DataFrameOverrides(_prepare_loc_mixin()): + @property + def dtypes(self): + remote_dtypes = self.__remote_end__.dtypes + return ObtainingItems(__remote_end__=remote_dtypes) + DeliveringDataFrame = _deliveringWrapper( DataFrame, ["groupby", "agg", "aggregate", "__getitem__", "astype", "drop", "merge"], - _prepare_loc_mixin(), + DataFrameOverrides, "DataFrame", ) return DeliveringDataFrame From e25342beab2d5d345c50d06bfe0f4f11651f58fd Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Wed, 29 Jul 2020 00:22:00 +0300 Subject: [PATCH 035/120] Implement numpy autowrapping in remote context (#1834) Signed-off-by: Vasilij Litvinov --- examples/cluster/mortgage-runner.py | 6 +- examples/cluster/taxi-runner.py | 7 +- modin/data_management/factories.py | 3 + modin/experimental/pandas/__init__.py | 4 + modin/experimental/pandas/numpy_wrap.py | 165 ++++++++++++++++++++++++ 5 files changed, 174 insertions(+), 11 deletions(-) create mode 100644 modin/experimental/pandas/numpy_wrap.py diff --git a/examples/cluster/mortgage-runner.py b/examples/cluster/mortgage-runner.py index 1c5c8122329..3feb98f41a7 100644 --- a/examples/cluster/mortgage-runner.py +++ b/examples/cluster/mortgage-runner.py @@ -19,10 +19,9 @@ # the following import turns on experimental mode in Modin, # including enabling running things in remote cloud import modin.experimental.pandas as pd # noqa: F401 -from modin.experimental.cloud import create_cluster, get_connection +from modin.experimental.cloud import create_cluster from mortgage import run_benchmark -from mortgage.mortgage_pandas import etl_pandas test_cluster = 
create_cluster( "aws", @@ -33,9 +32,6 @@ image="ami-00e1e82d7d4ca80d3", ) with test_cluster: - conn = get_connection() - np = conn.modules["numpy"] - etl_pandas.__globals__["np"] = np parameters = { "data_file": "https://modin-datasets.s3.amazonaws.com/mortgage", diff --git a/examples/cluster/taxi-runner.py b/examples/cluster/taxi-runner.py index a4e8128b24b..6d4ed1d366e 100644 --- a/examples/cluster/taxi-runner.py +++ b/examples/cluster/taxi-runner.py @@ -19,10 +19,9 @@ # the following import turns on experimental mode in Modin, # including enabling running things in remote cloud import modin.experimental.pandas as pd # noqa: F401 -from modin.experimental.cloud import create_cluster, get_connection +from modin.experimental.cloud import create_cluster from taxi import run_benchmark as run_benchmark -from taxi.taxibench_pandas_ibis import etl_pandas test_cluster = create_cluster( "aws", @@ -33,10 +32,6 @@ image="ami-00e1e82d7d4ca80d3", ) with test_cluster: - conn = get_connection() - np = conn.modules["numpy"] - etl_pandas.__globals__["np"] = np - parameters = { "data_file": "https://modin-datasets.s3.amazonaws.com/trips_data.csv", # "data_file": "s3://modin-datasets/trips_data.csv", diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index a9b234e73a0..eafbc40d7fb 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -223,6 +223,9 @@ def prepare(cls): import modin.backends.pandas.query_compiler # noqa: F401 from modin.experimental.cloud import get_connection + # import a numpy overrider if it wasn't already imported + import modin.experimental.pandas.numpy_wrap # noqa: F401 + class WrappedIO: def __init__(self, conn): self.__conn = conn diff --git a/modin/experimental/pandas/__init__.py b/modin/experimental/pandas/__init__.py index 7a2d6444af9..ef624f132e2 100644 --- a/modin/experimental/pandas/__init__.py +++ b/modin/experimental/pandas/__init__.py @@ -14,6 +14,10 @@ import os 
os.environ["MODIN_EXPERIMENTAL"] = "True" + +# import numpy_wrap as early as possible to intercept all "import numpy" statements +# in the user code +from .numpy_wrap import _CAUGHT_NUMPY # noqa F401 from modin.pandas import * # noqa F401, F403 from .io_exp import read_sql # noqa F401 import warnings diff --git a/modin/experimental/pandas/numpy_wrap.py b/modin/experimental/pandas/numpy_wrap.py new file mode 100644 index 00000000000..d7bd78af8bc --- /dev/null +++ b/modin/experimental/pandas/numpy_wrap.py @@ -0,0 +1,165 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import sys + +""" +This is a module that hides real numpy from future "import numpy" statements +and replaces it with a wrapping module that serves attributes from either +local or "remote" numpy depending on active execution context. 
+""" +_CAUGHT_NUMPY = "numpy" not in sys.modules +try: + import numpy as real_numpy +except ImportError: + pass +else: + import types + import copyreg + from modin import execution_engine + import modin + import pandas + import os + + _EXCLUDE_MODULES = [modin, pandas] + try: + import rpyc + except ImportError: + pass + else: + _EXCLUDE_MODULES.append(rpyc) + _EXCLUDE_PATHS = tuple( + os.path.dirname(mod.__file__) + os.sep for mod in _EXCLUDE_MODULES + ) + + class InterceptedNumpy(types.ModuleType): + """ + This class is intended to replace the "numpy" module as seen by outer world, + getting attributes from either local numpy or remote one when remote context + is activated. + It also registers helpers for pickling local numpy objects in remote context + and vice versa. + """ + + __own_attrs__ = set(["__own_attrs__"]) + + __spec__ = real_numpy.__spec__ + __current_numpy = real_numpy + __prev_numpy = real_numpy + __has_to_warn = not _CAUGHT_NUMPY + __reducers = {} + + def __init__(self): + self.__own_attrs__ = set(type(self).__dict__.keys()) + execution_engine.subscribe(self.__update_engine) + + def __swap_numpy(self, other_numpy=None): + self.__current_numpy, self.__prev_numpy = ( + other_numpy or self.__prev_numpy, + self.__current_numpy, + ) + if self.__current_numpy is not real_numpy and self.__has_to_warn: + import warnings + + warnings.warn( + "Was not able to intercept all numpy imports. " + "To intercept all of these please do 'import modin.experimental.pandas' as early as possible" + ) + self.__has_to_warn = False + + def __update_engine(self, _): + if execution_engine.get() == "Cloudray": + from modin.experimental.cloud import get_connection + + self.__swap_numpy(get_connection().modules["numpy"]) + else: + self.__swap_numpy() + + def __make_reducer(self, name): + """ + Prepare a "reducer" routine - the one Pickle calls to serialize an instance of a class. 
+ Note that we need this to allow pickling a local numpy object in "remote numpy" context, + because without a custom reduce callback pickle complains that what it reduced has a + different "numpy" class than original. + """ + try: + reducer = self.__reducers[name] + except KeyError: + + def reducer( + obj, + real_obj=getattr(real_numpy, name), + real_obj_reducer=getattr(real_numpy, name).__reduce__, + ): + # See details on __reduce__ protocol in Python docs: + # https://docs.python.org/3.6/library/pickle.html#object.__reduce__ + reduced = real_obj_reducer(obj) + if not isinstance(reduced, tuple): + return reduced + assert isinstance( + reduced[0], + (type, types.FunctionType, types.BuiltinFunctionType), + ), "Do not know how to support this reconstructor" + + modobj = self.__current_numpy + for submod in reduced[0].__module__.split(".")[1:]: + modobj = getattr(modobj, submod) + reconstruct = getattr(modobj, reduced[0].__name__) + # TODO: see if replacing all "real numpy" things in reduced[1:] is needed + return (reconstruct,) + reduced[1:] + + self.__reducers[name] = reducer + return reducer + + def __get_numpy(self): + frame = sys._getframe() + try: + # get the path to module where caller of caller is defined; + # this function is expected to be called from one of + # __getattr__, __setattr__ or __delattr__, so this + # "caller_file" should point to the file that wants a + # numpy attribute; we want to always give local numpy + # to modin, numpy and rpyc as it's all internal for us + caller_file = frame.f_back.f_back.f_code.co_filename + except AttributeError: + return self.__current_numpy + finally: + del frame + if any(caller_file.startswith(mod_path) for mod_path in _EXCLUDE_PATHS): + return real_numpy + return self.__current_numpy + + def __getattr__(self, name): + # note that __getattr__ is not symmetric to __setattr__, as it is + # only called when an attribute is not found by usual lookups + obj = getattr(self.__get_numpy(), name) + if isinstance(obj, 
type): + # register a special callback for pickling + copyreg.pickle(obj, self.__make_reducer(name)) + return obj + + def __setattr__(self, name, value): + # set our own attributes on the self instance, but pass through + # setting other attributes to numpy being wrapped + if name in self.__own_attrs__: + super().__setattr__(name, value) + else: + setattr(self.__get_numpy(), name, value) + + def __delattr__(self, name): + # do not allow to delete our own attributes, pass through + # deletion of others to numpy being wrapped + if name not in self.__own_attrs__: + delattr(self.__get_numpy(), name) + + sys.modules["numpy"] = InterceptedNumpy() From 538bd9c03de728f45362e02fa1b21d60d3b84853 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 29 Jul 2020 07:27:13 +0300 Subject: [PATCH 036/120] FIX-#1426: Groupby on categories fixed (#1802) Signed-off-by: Dmitry Chigarev --- modin/backends/pandas/query_compiler.py | 5 ++ .../functions/groupby_function.py | 24 +++++++++- modin/pandas/groupby.py | 10 ++-- modin/pandas/test/test_groupby.py | 48 ++++++++++++++----- modin/pandas/test/utils.py | 27 +++++++++-- 5 files changed, 95 insertions(+), 19 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index f940ddf23d6..181af801306 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2029,6 +2029,11 @@ def _callable_func(self, func, axis, *args, **kwargs): ) def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): + # since we're going to modify `groupby_args` dict in a `groupby_agg_builder`, + # we want to copy it to not propagate these changes into source dict, in case + # of unsuccessful end of function + groupby_args = groupby_args.copy() + as_index = groupby_args.get("as_index", True) def groupby_agg_builder(df): diff --git a/modin/data_management/functions/groupby_function.py 
b/modin/data_management/functions/groupby_function.py index 6d2373d83bf..ddc10fdaf7f 100644 --- a/modin/data_management/functions/groupby_function.py +++ b/modin/data_management/functions/groupby_function.py @@ -40,7 +40,13 @@ def caller( ) else: qc = query_compiler + # since we're going to modify `groupby_args` dict in a `compute_map`, + # we want to copy it to not propagate these changes into source dict, in case + # of unsuccessful end of function + groupby_args = groupby_args.copy() + as_index = groupby_args.get("as_index", True) + observed = groupby_args.get("observed", False) def _map(df, other): def compute_map(df, other): @@ -48,6 +54,7 @@ def compute_map(df, other): # It is used to make sure that between phases we are constructing the # right index and placing columns in the correct order. groupby_args["as_index"] = True + groupby_args["observed"] = True other = other.squeeze(axis=axis ^ 1) if isinstance(other, pandas.DataFrame): df = pandas.concat( @@ -57,6 +64,20 @@ def compute_map(df, other): result = map_func( df.groupby(by=other, axis=axis, **groupby_args), **map_args ) + # if `other` has category dtype, then pandas will drop that + # column after groupby, inserting it back to correctly process + # reduce phase + if ( + drop + and not as_index + and isinstance(other, pandas.Series) + and isinstance(other.dtype, pandas.CategoricalDtype) + and result.index.name is not None + and result.index.name not in result.columns + ): + result.insert( + loc=0, column=result.index.name, value=result.index + ) # The _modin_groupby_ prefix indicates that this is the first partition, # and since we may need to insert the grouping data in the reduce phase if ( @@ -82,6 +103,7 @@ def compute_reduce(df): df = df.reset_index(drop=False) # See note above about setting `as_index` groupby_args["as_index"] = as_index + groupby_args["observed"] = observed if other_len > 1: by_part = list(df.columns[0:other_len]) else: @@ -98,7 +120,7 @@ def compute_reduce(df): if 
isinstance(by_part, str) and by_part in result.columns: if "_modin_groupby_" in by_part and drop: col_name = by_part[len("_modin_groupby_") :] - new_result = result.drop(columns=col_name) + new_result = result.drop(columns=col_name, errors="ignore") new_result.columns = [ col_name if "_modin_groupby_" in c else c for c in new_result.columns diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 52f5972d06a..383f23ab99b 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -128,7 +128,7 @@ def _index_grouped(self): } else: if isinstance(self._by, type(self._query_compiler)): - by = self._by.to_pandas().squeeze() + by = self._by.to_pandas().squeeze().values else: by = self._by if self._axis == 0: @@ -433,7 +433,7 @@ def size(self): series_result = Series(query_compiler=result._query_compiler) # Pandas does not name size() output series_result.name = None - return series_result + return series_result.fillna(0) else: return DataFrameGroupBy( self._df.T, @@ -529,12 +529,16 @@ def fillna(self, **kwargs): return result def count(self, **kwargs): - return self._wrap_aggregation( + result = self._wrap_aggregation( type(self._query_compiler).groupby_count, lambda df, **kwargs: df.count(**kwargs), numeric_only=False, **kwargs, ) + # pandas do it in case of Series + if isinstance(result, Series): + result = result.fillna(0) + return result def pipe(self, func, *args, **kwargs): return com.pipe(self, func, *args, **kwargs) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 5dbe30baa19..060906df206 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -21,18 +21,28 @@ check_df_columns_have_nans, create_test_dfs, eval_general, + df_categories_equals, ) pd.DEFAULT_NPARTITIONS = 4 def modin_df_almost_equals_pandas(modin_df, pandas_df): - difference = to_pandas(modin_df) - pandas_df + df_categories_equals(modin_df._to_pandas(), pandas_df) + + modin_df = to_pandas(modin_df) + + if 
hasattr(modin_df, "select_dtypes"): + modin_df = modin_df.select_dtypes(exclude=["category"]) + if hasattr(pandas_df, "select_dtypes"): + pandas_df = pandas_df.select_dtypes(exclude=["category"]) + + difference = modin_df - pandas_df diff_max = difference.max() if isinstance(diff_max, pandas.Series): diff_max = diff_max.max() assert ( - to_pandas(modin_df).equals(pandas_df) + modin_df.equals(pandas_df) or diff_max < 0.0001 or (all(modin_df.isna().all()) and all(pandas_df.isna().all())) ) @@ -234,7 +244,8 @@ def test_mixed_dtypes_groupby(as_index): ], ) @pytest.mark.parametrize("as_index", [True, False]) -def test_simple_row_groupby(by, as_index): +@pytest.mark.parametrize("col1_category", [True, False]) +def test_simple_row_groupby(by, as_index, col1_category): pandas_df = pandas.DataFrame( { "col1": [0, 1, 2, 3], @@ -245,6 +256,9 @@ def test_simple_row_groupby(by, as_index): } ) + if col1_category: + pandas_df = pandas_df.astype({"col1": "category"}) + modin_df = from_pandas(pandas_df) n = 1 modin_groupby = modin_df.groupby(by=by, as_index=as_index) @@ -267,10 +281,10 @@ def test_simple_row_groupby(by, as_index): eval_ndim(modin_groupby, pandas_groupby) if not check_df_columns_have_nans(modin_df, by): # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs - eval_cumsum(modin_groupby, pandas_groupby) - eval_cummax(modin_groupby, pandas_groupby) - eval_cummin(modin_groupby, pandas_groupby) - eval_cumprod(modin_groupby, pandas_groupby) + eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum(axis=0)) + eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax(axis=0)) + eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin(axis=0)) + eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod(axis=0)) eval_general( modin_groupby, @@ -312,7 +326,7 @@ def test_simple_row_groupby(by, as_index): modin_df_almost_equals_pandas, is_default=True, ) - eval_rank(modin_groupby, 
pandas_groupby) + eval_general(modin_groupby, pandas_groupby, lambda df: df.rank()) eval_max(modin_groupby, pandas_groupby) eval_len(modin_groupby, pandas_groupby) eval_sum(modin_groupby, pandas_groupby) @@ -332,7 +346,12 @@ def test_simple_row_groupby(by, as_index): # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093. transform_functions = [lambda df: df + 4, lambda df: -df - 10] for func in transform_functions: - eval_transform(modin_groupby, pandas_groupby, func) + eval_general( + modin_groupby, + pandas_groupby, + lambda df: df.transform(func), + check_exception_type=None, + ) pipe_functions = [lambda dfgb: dfgb.sum()] for func in pipe_functions: @@ -347,7 +366,9 @@ def test_simple_row_groupby(by, as_index): ) eval_fillna(modin_groupby, pandas_groupby) eval_count(modin_groupby, pandas_groupby) - eval_size(modin_groupby, pandas_groupby) + eval_general( + modin_groupby, pandas_groupby, lambda df: df.size(), check_exception_type=None + ) eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True) eval_quantile(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True) @@ -471,7 +492,8 @@ def test_single_group_row_groupby(): eval_groups(modin_groupby, pandas_groupby) -def test_large_row_groupby(): +@pytest.mark.parametrize("is_by_category", [True, False]) +def test_large_row_groupby(is_by_category): pandas_df = pandas.DataFrame( np.random.randint(0, 8, size=(100, 4)), columns=list("ABCD") ) @@ -479,6 +501,10 @@ def test_large_row_groupby(): modin_df = from_pandas(pandas_df) by = [str(i) for i in pandas_df["A"].tolist()] + + if is_by_category: + by = pandas.Categorical(by) + n = 4 modin_groupby = modin_df.groupby(by=by) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index bd7dcd0596b..ddd1eac497f 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -381,8 +381,17 @@ def 
categories_equals(left, right): def df_categories_equals(df1, df2): - categories_columns = df1.select_dtypes(include="category").columns + if not hasattr(df1, "select_dtypes"): + if isinstance(df1, pandas.CategoricalDtype): + return categories_equals(df1, df2) + elif isinstance(getattr(df1, "dtype"), pandas.CategoricalDtype) and isinstance( + getattr(df1, "dtype"), pandas.CategoricalDtype + ): + return categories_equals(df1.dtype, df2.dtype) + else: + return True + categories_columns = df1.select_dtypes(include="category").columns for column in categories_columns: is_category_ordered = df1[column].dtype.ordered assert_categorical_equal( @@ -558,17 +567,27 @@ def check_df_columns_have_nans(df, cols): def eval_general( - modin_df, pandas_df, operation, comparator=df_equals, __inplace__=False, **kwargs + modin_df, + pandas_df, + operation, + comparator=df_equals, + __inplace__=False, + check_exception_type=True, + **kwargs, ): md_kwargs, pd_kwargs = {}, {} def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}): try: pd_result = fn(pandas_df, **pd_kwargs) - except Exception as e: - with pytest.raises(type(e)): + except Exception as pd_e: + if check_exception_type is None: + return None + with pytest.raises(Exception) as md_e: # repr to force materialization repr(fn(modin_df, **md_kwargs)) + if check_exception_type: + assert isinstance(md_e.value, type(pd_e)) else: md_result = fn(modin_df, **md_kwargs) return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df) From c139515de6fe4d7d3208d033328b42fe45798756 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 29 Jul 2020 10:14:02 +0300 Subject: [PATCH 037/120] FIX-#1464: product/sum incorrect behavior of 'min_count' fixed (#1827) Signed-off-by: Dmitry Chigarev --- modin/pandas/dataframe.py | 18 ++- modin/pandas/test/test_dataframe.py | 172 ++++++++++------------------ modin/pandas/test/test_series.py | 101 ++++++++++++---- 
modin/pandas/test/utils.py | 11 ++ 4 files changed, 161 insertions(+), 141 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 98b4dcbb0ed..68a9f05ea3a 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1738,8 +1738,13 @@ def prod( **kwargs, ): axis = self._get_axis_number(axis) - new_index = self.columns if axis else self.index - if min_count > len(new_index): + axis_to_apply = self.columns if axis else self.index + if ( + skipna is not False + and numeric_only is None + and min_count > len(axis_to_apply) + ): + new_index = self.columns if not axis else self.index return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) @@ -2063,8 +2068,13 @@ def sum( **kwargs, ): axis = self._get_axis_number(axis) - new_index = self.columns if axis else self.index - if min_count > len(new_index): + axis_to_apply = self.columns if axis else self.index + if ( + skipna is not False + and numeric_only is None + and min_count > len(axis_to_apply) + ): + new_index = self.columns if not axis else self.index return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 226428055f9..a886151dc28 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -56,6 +56,8 @@ int_arg_values, eval_general, create_test_dfs, + test_data_small_values, + test_data_small_keys, ) pd.DEFAULT_NPARTITIONS = 4 @@ -3390,7 +3392,11 @@ def test_min(self, data, axis, skipna, numeric_only): os.name == "nt", reason="Windows has a memory issue for large numbers on this test", ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + @pytest.mark.parametrize( + "data", + test_data_values + test_data_small_values, + ids=test_data_keys + test_data_small_keys, + ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( "skipna", 
bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) @@ -3401,84 +3407,48 @@ def test_min(self, data, axis, skipna, numeric_only): @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) - def test_prod(self, request, data, axis, skipna, numeric_only, min_count): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.prod( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - except Exception: - with pytest.raises(TypeError): - modin_df.prod( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - else: - modin_result = modin_df.prod( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.prod( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.prod( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - else: - modin_result = modin_df.T.prod( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.skipif( - os.name == "nt", - reason="Windows has a memory issue for large numbers on this test", - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) + "operation", + [ + "prod", + pytest.param( + "product", + marks=pytest.mark.skipif( + pandas.DataFrame.product == pandas.DataFrame.prod + and pd.DataFrame.product == pd.DataFrame.prod, + reason="That operation was already 
tested.", + ), + ), + ], ) + def test_prod( + self, + request, + data, + axis, + skipna, + numeric_only, + min_count, + is_transposed, + operation, + ): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: getattr( + df.T if is_transposed else df, operation + )(*args, **kwargs), + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + @pytest.mark.parametrize( - "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) + "data", + test_data_values + test_data_small_values, + ids=test_data_keys + test_data_small_keys, ) - def test_product(self, request, data, axis, skipna, numeric_only, min_count): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.product( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - except Exception: - with pytest.raises(TypeError): - modin_df.product( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - else: - modin_result = modin_df.product( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) @@ -3489,44 +3459,20 @@ def test_product(self, request, data, axis, skipna, numeric_only, min_count): @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) - def test_sum(self, request, data, axis, skipna, numeric_only, min_count): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.sum( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - except Exception: - with pytest.raises(TypeError): - modin_df.sum( - axis=axis, - skipna=skipna, - 
numeric_only=numeric_only, - min_count=min_count, - ) - else: - modin_result = modin_df.sum( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - df_equals(modin_result, pandas_result) - try: - pandas_result = pandas_df.T.sum( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.sum( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - else: - modin_result = modin_df.T.sum( - axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count - ) - df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("is_transposed", [False, True]) + def test_sum( + self, request, data, axis, skipna, numeric_only, min_count, is_transposed + ): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: (df.T if is_transposed else df).sum( + *args, **kwargs + ), + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sum_single_column(self, data): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 7f32d36a5e7..b39de9174c5 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -55,6 +55,8 @@ encoding_types, categories_equals, eval_general, + test_data_small_values, + test_data_small_keys, ) pd.DEFAULT_NPARTITIONS = 4 @@ -2102,19 +2104,53 @@ def test_pow(data): inter_df_math_helper(modin_series, pandas_series, "pow") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_prod(data): - modin_series, pandas_series = create_test_series(data) - # Wrap in Series to test almost_equal because of overflow - df_equals(pd.Series([modin_series.prod()]), pandas.Series([pandas_series.prod()])) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_product(data): - modin_series, 
pandas_series = create_test_series(data) - # Wrap in Series to test almost_equal because of overflow - df_equals( - pd.Series([modin_series.product()]), pandas.Series([pandas_series.product()]) +@pytest.mark.parametrize( + "data", + test_data_values + test_data_small_values, + ids=test_data_keys + test_data_small_keys, +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "numeric_only", + [ + None, + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="numeric_only not implemented for pandas.Series" + ), + ), + ], +) +@pytest.mark.parametrize( + "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) +) +@pytest.mark.parametrize( + "operation", + [ + "prod", + pytest.param( + "product", + marks=pytest.mark.skipif( + pandas.Series.product == pandas.Series.prod + and pd.Series.product == pd.Series.prod, + reason="That operation was already tested.", + ), + ), + ], +) +def test_prod(data, axis, skipna, numeric_only, min_count, operation): + eval_general( + *create_test_series(data), + lambda df, *args, **kwargs: type(df)([getattr(df, operation)(*args, **kwargs)]), + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, ) @@ -2698,23 +2734,40 @@ def test_subtract(data): inter_df_math_helper(modin_series, pandas_series, "subtract") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "data", + test_data_values + test_data_small_values, + ids=test_data_keys + test_data_small_keys, +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) +@pytest.mark.parametrize( + "numeric_only", + [ + None, + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="numeric_only not implemented for pandas.Series" + ), + ), + ], +) 
@pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) -def test_sum(data, skipna, min_count): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.sum(skipna=skipna, min_count=min_count) - except Exception: - with pytest.raises(TypeError): - modin_series.sum(skipna=skipna, min_count=min_count) - else: - modin_result = modin_series.sum(skipna=skipna, min_count=min_count) - df_equals(modin_result, pandas_result) +def test_sum(data, axis, skipna, numeric_only, min_count): + eval_general( + *create_test_series(data), + lambda df, *args, **kwargs: df.sum(*args, **kwargs), + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index ddd1eac497f..27415c3d61a 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -159,6 +159,17 @@ }, } +test_data_small = { + "small": { + "col0": [1, 2, 3, 4], + "col1": [8.0, 9.4, 10.1, 11.3], + "col2": [4, 5, 6, 7], + } +} + +test_data_small_values = list(test_data_small.values()) +test_data_small_keys = list(test_data_small.keys()) + test_data_with_duplicates_values = list(test_data_with_duplicates.values()) test_data_with_duplicates_keys = list(test_data_with_duplicates.keys()) From ac91203b801c64627c40027365272ff8670ca1b7 Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 29 Jul 2020 12:51:14 +0300 Subject: [PATCH 038/120] FIX-#1849: Fix AttributeError in ClusterError class (#1850) Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modin/experimental/cloud/base.py b/modin/experimental/cloud/base.py index f78ddf30c7d..8439fecb5b8 100644 --- a/modin/experimental/cloud/base.py +++ b/modin/experimental/cloud/base.py @@ -27,9 +27,9 @@ 
def __init__(self, *args, cause: BaseException = None, traceback: str = None, ** super().__init__(*args, **kw) def __str__(self): - if self.clause: - return f"clause: {self.cause}\n{super()}" - return str(super()) + if self.cause: + return f"cause: {self.cause}\n{super().__str__()}" + return super().__str__() class CannotSpawnCluster(ClusterError): From 9ed8e06960cb610680763039a44ae211ea82931c Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Wed, 29 Jul 2020 13:00:52 +0300 Subject: [PATCH 039/120] FEAT-#1831: Add optional RPyC tracing and a simple trace parser (#1843) For details see #1843 Signed-off-by: Vasilij Litvinov --- modin/experimental/cloud/rpyc_proxy.py | 9 +- modin/experimental/cloud/tracing/__init__.py | 12 + .../cloud/tracing/parse_rpyc_trace.py | 229 ++++++++++++++++++ .../cloud/tracing/tracing_connection.py | 109 +++++++++ 4 files changed, 358 insertions(+), 1 deletion(-) create mode 100644 modin/experimental/cloud/tracing/__init__.py create mode 100644 modin/experimental/cloud/tracing/parse_rpyc_trace.py create mode 100644 modin/experimental/cloud/tracing/tracing_connection.py diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index ceb605ff2c5..f926d57400b 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -13,6 +13,7 @@ import types import collections +import os import rpyc from rpyc.lib.compat import pickle @@ -33,6 +34,9 @@ def _tuplize(arg): return tuple(arg) +_TRACE_RPYC = os.environ.get("MODIN_TRACE_RPYC", "").title() == "True" + + class WrappingConnection(rpyc.Connection): def __init__(self, *a, **kw): super().__init__(*a, **kw) @@ -231,7 +235,10 @@ def _init_deliver(self): class WrappingService(rpyc.ClassicService): - _protocol = WrappingConnection + if _TRACE_RPYC: + from .tracing.tracing_connection import TracingWrappingConnection as _protocol + else: + _protocol = WrappingConnection def 
on_connect(self, conn): super().on_connect(conn) diff --git a/modin/experimental/cloud/tracing/__init__.py b/modin/experimental/cloud/tracing/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/cloud/tracing/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/cloud/tracing/parse_rpyc_trace.py b/modin/experimental/cloud/tracing/parse_rpyc_trace.py new file mode 100644 index 00000000000..92d172a3140 --- /dev/null +++ b/modin/experimental/cloud/tracing/parse_rpyc_trace.py @@ -0,0 +1,229 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +""" +Simple parser for rpyc traces produced by tracing_connection.py +""" + +from rpyc.core import consts +import re +import collections +import sys + + +def read_log(fname): + with open(fname, "rb") as inp: + data = inp.read() + data = data.decode("utf8", "xmlcharrefreplace") + # split the last logging chunk + data = data.rsplit("---[", 1)[1] + + items = [] + for line in data.splitlines(): + if ":args=" not in line: + continue + preargs, args = line.split(":args=", 1) + pieces = ("kind=" + preargs).split(":") + ["args=" + args] + item = dict(piece.split("=", 1) for piece in pieces) + item["timing"] = eval(item.get("timing", "0")) + items.append(item) + + return items + + +def get_syncs(items): + # dels are asynchronous, assume others are synchronous + dels = { + i["seq"] + for i in items + if i["kind"] == "send" and i.get("req", "") == "HANDLE_DEL" + } + return [i for i in items if i["seq"] not in dels] + + +items = read_log((sys.argv[1:] or ["rpyc-trace.log"])[0]) +syncs = get_syncs(items) + +print( # noqa: T001 + f"total time={sum(i['timing'] for i in syncs if i['kind'] == 'recv' and i['msg'] == 'MSG_REPLY')}" +) + +longs = [i for i in syncs if i["timing"] > 0.5] +print(f'longs ({len(longs)}) time={sum(i["timing"] for i in longs)}') # noqa: T001 + +s_sends = [i for i in syncs if i["kind"] == "send"] + +buckets = collections.defaultdict(list) + +for i in s_sends: + buckets[i.get("req", "")].append(i["args"]) + +print("-------------------") # noqa: T001 +for k, v in buckets.items(): + print(f"{k}={len(v)}") # noqa: T001 +print("-------------------") # noqa: 
T001 + +sends = { + i["seq"]: i for i in items if i["kind"] == "send" and i["msg"] == "MSG_REQUEST" +} +pairs, responses = [], {} +for i in items: + if i["kind"] == "recv" and i["msg"] == "MSG_REPLY": + try: + pairs.append((sends[i["seq"]], i)) + responses[i["seq"]] = i + except KeyError: + pass +getattrs = [p for p in pairs if p[0].get("req", "") == "HANDLE_GETATTR"] + + +def _unbox(package): # boxing + label, value = package + if label == consts.LABEL_VALUE: + return value + if label == consts.LABEL_TUPLE: + return tuple(_unbox(item) for item in value) + if label == consts.LABEL_LOCAL_REF: + id_pack = (str(value[0]), value[1], value[2]) # so value is a id_pack + return f"[local]{id_pack[0]}(cls={id_pack[1]}:inst={id_pack[2]})" + if label == consts.LABEL_REMOTE_REF: + id_pack = (str(value[0]), value[1], value[2]) # so value is a id_pack + return f"[remote]{id_pack[0]}(cls={id_pack[1]}:inst={id_pack[2]})" + raise ValueError("invalid label %r" % (label,)) + + +def from_getattr_send(i, s=True): + _, args = eval(i["args"]) + obj, attr = _unbox(args) + return f"{obj}::{attr}" if s else (obj, attr) + + +def from_getattr_recv(i, s=True): + if not i: + return "" + args = eval(i["args"]) + return _unbox(args) + + +def from_hash_send(i, s=True): + _, args = eval(i["args"]) + obj = _unbox(args)[0] + return obj + + +def _unwrap_obj(obj, remote): + try: + obj, attr = remote[obj.replace("[local]", "[remote]")] + except (KeyError, ValueError): + obj = "[>_<] " + obj + else: + obj = f"{obj.replace('[local]', '[remote]')}.{attr}" + return obj + + +def _stringify(obj): + if not isinstance(obj, str): + return str(obj) + if "[local]" in obj or "[remote]" in obj: + return obj + return repr(obj) + + +def _format_args(args, kw): + fargs = ", ".join(_stringify(x) for x in args) + fkw = ", ".join(f"{k}={_stringify(v)}" for (k, v) in kw) + if fargs and fkw: + fargs += ", " + return f"({fargs}{fkw})" + + +def from_callattr_send(i, s=True, remote=None): + _, args = eval(i["args"]) + obj, 
name, args, kw = _unbox(args) + if remote: + obj = _unwrap_obj(obj, remote) + return f"{obj}.{name}{_format_args(args, kw)}" if s else (obj, name, args, kw) + + +def from_call_send(i, s=True, remote=None): + _, args = eval(i["args"]) + obj, args, kw = _unbox(args) + if remote: + obj = _unwrap_obj(obj, remote) + if s: + res = f"{obj}{_format_args(args, kw)}" + return re.sub(r"\(cls=\d+:inst=", "(inst:", res) + return obj, args, kw + + +def _parse_msg(m, s=False, **kw): + if m["kind"] == "send": + if m.get("req") == "HANDLE_GETATTR": + return from_getattr_send(m, s, **kw) + if m.get("req") in ("HANDLE_HASH", "HANDLE_STR"): + return from_hash_send(m, s, **kw) + if m.get("req") == "HANDLE_CALLATTR": + return from_callattr_send(m, s, **kw) + if m.get("req") == "HANDLE_CALL": + return from_call_send(m, s, **kw) + return str(m) + return from_getattr_recv(m, s, **kw) + + +remote = {} +for gsend, grecv in pairs: + got, sent = _parse_msg(grecv, False), _parse_msg(gsend, False) + if isinstance(got, str): + remote[got] = sent + # remote[from_getattr_recv(grecv, False)] = from_getattr_send(gsend, False) + +print(f"total time getattrs={sum(x[1]['timing'] for x in getattrs)}") # noqa: T001 + +# import pdb; pdb.set_trace() + +print("\n\n----[ getattr ]----") # noqa: T001 +for gsend, grecv in getattrs: + print(f"{from_getattr_send(gsend)} --> {from_getattr_recv(grecv)}") # noqa: T001 + + +print("\n\n----[ hash ]----") # noqa: T001 +for i in syncs: + if i.get("req", "") == "HANDLE_HASH" and i["kind"] == "send": + print( # noqa: T001 + from_hash_send(i), "-->", from_getattr_recv(responses.get(i["seq"])) + ) + +print("\n\n----[ str ]----") # noqa: T001 +for i in syncs: + if i.get("req", "") == "HANDLE_STR" and i["kind"] == "send": + print( # noqa: T001 + from_hash_send(i), "-->", from_getattr_recv(responses.get(i["seq"])) + ) + +print("\n\n----[ callattr ]----") # noqa: T001 +for i in syncs: + if i.get("req", "") == "HANDLE_CALLATTR" and i["kind"] == "send": + print( # noqa: T001 + 
from_callattr_send(i, remote=remote), + "-->", + from_getattr_recv(responses.get(i["seq"])), + ) + +print("\n\n----[ call ]----") # noqa: T001 +for i in syncs: + if i.get("req", "") == "HANDLE_CALL" and i["kind"] == "send": + print( # noqa: T001 + from_call_send(i, remote=remote), + "-->", + from_getattr_recv(responses.get(i["seq"])), + ) diff --git a/modin/experimental/cloud/tracing/tracing_connection.py b/modin/experimental/cloud/tracing/tracing_connection.py new file mode 100644 index 00000000000..8a75e15a893 --- /dev/null +++ b/modin/experimental/cloud/tracing/tracing_connection.py @@ -0,0 +1,109 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import threading +import time +import collections + +from rpyc.core import brine, consts, netref + +from ..rpyc_proxy import WrappingConnection + +_msg_to_name = collections.defaultdict(dict) +for name in dir(consts): + if name.upper() == name: + category, _ = name.split("_", 1) + _msg_to_name[category][getattr(consts, name)] = name +_msg_to_name = dict(_msg_to_name) + + +class _Logger: + def __init__(self, conn, logname): + self.conn = conn + self.logname = logname + + def __enter__(self): + with self.conn.logLock: + self.conn.logfiles.add(self.logname) + with open(self.logname, "a") as out: + out.write(f"------------[new trace at {time.asctime()}]----------\n") + return self + + def __exit__(self, *a, **kw): + with self.conn.logLock: + self.conn.logfiles.remove(self.logname) + + +class TracingWrappingConnection(WrappingConnection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self.logLock = threading.RLock() + self.timings = {} + with open("rpyc-trace.log", "a") as out: + out.write(f"------------[new trace at {time.asctime()}]----------\n") + self.logfiles = set(["rpyc-trace.log"]) + + @classmethod + def __stringify(cls, args): + if isinstance(args, (tuple, list)): + return tuple(cls.__stringify(i) for i in args) + if isinstance(args, netref.BaseNetref): + return str(args.____id_pack__) + return args + + @classmethod + def __to_text(cls, args): + return str(cls.__stringify(args)) + + def _send(self, msg, seq, args): + str_args = self.__to_text(args).replace("\r", "").replace("\n", "\tNEWLINE\t") + if msg == consts.MSG_REQUEST: + handler, _ = args + str_handler = f":req={_msg_to_name['HANDLE'][handler]}" + else: + str_handler = "" + with self.logLock: + for logfile in self.logfiles: + with open(logfile, "a") as out: + out.write( + f"send:msg={_msg_to_name['MSG'][msg]}:seq={seq}{str_handler}:args={str_args}\n" + ) + self.timings[seq] = time.time() + return super()._send(msg, seq, args) + + def _dispatch(self, data): + """tracing only""" + got1 
= time.time() + try: + return super()._dispatch(data) + finally: + got2 = time.time() + msg, seq, args = brine.load(data) + sent = self.timings.pop(seq, got1) + if msg == consts.MSG_REQUEST: + handler, args = args + str_handler = f":req={_msg_to_name['HANDLE'][handler]}" + else: + str_handler = "" + str_args = ( + self.__to_text(args).replace("\r", "").replace("\n", "\tNEWLINE\t") + ) + with self.logLock: + for logfile in self.logfiles: + with open(logfile, "a") as out: + out.write( + f"recv:timing={got1 - sent}+{got2 - got1}:msg={_msg_to_name['MSG'][msg]}:seq={seq}{str_handler}:args={str_args}\n" + ) + + def _log_extra(self, logname): + return _Logger(self, logname) From f426705e8a00c0fd6700c3214a60ed16d251beb0 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 29 Jul 2020 13:03:18 -0400 Subject: [PATCH 040/120] FIX-#1770: Support for groupby() with original Series in by list. Signed-off-by: Itamar Turner-Trauring --- modin/pandas/dataframe.py | 16 +++++++++++++++- modin/pandas/test/test_groupby.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 68a9f05ea3a..8cb7e2fc03d 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -440,7 +440,21 @@ def groupby( by = by._query_compiler elif is_list_like(by): # fastpath for multi column groupby - if not isinstance(by, Series) and axis == 0 and all(o in self for o in by): + if ( + not isinstance(by, Series) + and axis == 0 + and all( + ( + (isinstance(o, str) and (o in self)) + or (isinstance(o, Series) and (o._parent is self)) + ) + for o in by + ) + ): + # We can just revert Series back to names because the parent is + # this dataframe: + by = [o.name if isinstance(o, Series) else o for o in by] + warnings.warn( "Multi-column groupby is a new feature. " "Please report any bugs/issues to bug_reports@modin.org." 
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 060906df206..32c9e81a68c 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1160,3 +1160,31 @@ def test_to_pandas_convertion(kwargs): by = ["a", "b"] eval_aggregation(*create_test_dfs(data), by=by, **kwargs) + + +@pytest.mark.parametrize( + # When True, do df[name], otherwise just use name + "columns", + [ + [(False, "a"), (False, "b"), (False, "c")], + [(False, "a"), (False, "b")], + [(True, "a"), (True, "b"), (True, "c")], + [(True, "a"), (True, "b")], + [(False, "a"), (False, "b"), (True, "c")], + [(False, "a"), (True, "c")], + ], +) +def test_mixed_columns(columns): + def get_columns(df): + return [df[name] if lookup else name for (lookup, name) in columns] + + data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]} + + df1 = pandas.DataFrame(data) + df1 = pandas.concat([df1]) + ref = df1.groupby(get_columns(df1)).size() + + df2 = pd.DataFrame(data) + df2 = pd.concat([df2]) + exp = df2.groupby(get_columns(df2)).size() + df_equals(ref, exp) From 79be7d6f4640ec0fed5e67595606840f781fbe32 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Wed, 29 Jul 2020 20:15:00 +0300 Subject: [PATCH 041/120] FIX-#1852: Add stub engines to known ones during test-internals (#1853) Signed-off-by: Vasilij Litvinov --- modin/data_management/test/test_dispatcher.py | 5 +++++ modin/pandas/__init__.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/modin/data_management/test/test_dispatcher.py b/modin/data_management/test/test_dispatcher.py index 8d55c3743c4..787b6661966 100644 --- a/modin/data_management/test/test_dispatcher.py +++ b/modin/data_management/test/test_dispatcher.py @@ -18,6 +18,8 @@ from modin.data_management.dispatcher import EngineDispatcher, FactoryNotFoundError from modin.data_management import factories +import modin.pandas as pd + class 
PandasOnTestFactory(factories.BaseFactory): """ @@ -63,6 +65,9 @@ def prepare(cls): factories.TestOnPythonFactory = TestOnPythonFactory factories.FooOnBarFactory = FooOnBarFactory +# register them as known "no init" engines for modin.pandas +pd._NOINIT_ENGINES |= {"Test", "Bar"} + def test_default_engine(): assert issubclass(EngineDispatcher.get_engine(), factories.BaseFactory) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 7e0f0a32ae1..9e4419989f2 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -97,6 +97,9 @@ _is_first_update = {} dask_client = None +_NOINIT_ENGINES = { + "Python" +} # engines that don't require initialization, useful for unit tests def _update_engine(publisher: Publisher): @@ -157,7 +160,7 @@ def init_remote_ray(): num_cpus = remote_ray.cluster_resources()["CPU"] - elif publisher.get() != "Python": + elif publisher.get() not in _NOINIT_ENGINES: raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) _is_first_update[publisher.get()] = False From 2251f71bc1f38364463b9ba138940dbc8d804467 Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 29 Jul 2020 21:32:22 +0300 Subject: [PATCH 042/120] DOCS-#1855: add runner of h2o benchmark as example (#1856) Signed-off-by: Anatoly Myachev --- examples/cluster/h2o-runner.py | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/cluster/h2o-runner.py diff --git a/examples/cluster/h2o-runner.py b/examples/cluster/h2o-runner.py new file mode 100644 index 00000000000..56bffcfe5e2 --- /dev/null +++ b/examples/cluster/h2o-runner.py @@ -0,0 +1,50 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +# pip install git+https://github.com/intel-go/ibis.git@develop + +# NOTE: expects https://github.com/intel-go/omniscripts/ checked out and in PYTHONPATH + +# the following import turns on experimental mode in Modin, +# including enabling running things in remote cloud +import modin.experimental.pandas as pd # noqa: F401 +from modin.experimental.cloud import create_cluster + +from h2o import run_benchmark + +test_cluster = create_cluster( + "aws", + "aws_credentials", + cluster_name="rayscale-test", + region="eu-north-1", + zone="eu-north-1b", + image="ami-00e1e82d7d4ca80d3", +) +with test_cluster: + parameters = { + "no_pandas": False, + "pandas_mode": "Modin_on_ray", + "ray_tmpdir": "/tmp", + "ray_memory": 1024 * 1024 * 1024, + "extended_functionality": False, + } + + # G1... - for groupby queries; J1... 
- for join queries; + # Additional required files inside h2o-data folder: + # - J1_1e6_1e0_0_0.csv + # - J1_1e6_1e3_0_0.csv + # - J1_1e6_1e6_0_0.csv + for data_file in ["G1_5e5_1e2_0_0.csv", "J1_1e6_NA_0_0.csv"]: + parameters["data_file"] = f"https://modin-datasets.s3.amazonaws.com/h2o/{data_file}" + run_benchmark(parameters) From dcd36740ed5db12f91cfc5f30190f406a5528450 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 29 Jul 2020 22:34:47 +0300 Subject: [PATCH 043/120] FIX-#1857: make 'sort_index' consider axis parameter (#1858) Signed-off-by: Dmitry Chigarev --- modin/backends/pandas/query_compiler.py | 1 + modin/pandas/test/test_dataframe.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 181af801306..4d585658893 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1562,6 +1562,7 @@ def sort_index(self, **kwargs): ): return self.default_to_pandas( pandas.DataFrame.sort_index, + axis=axis, level=level, sort_remaining=sort_remaining, **kwargs diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index a886151dc28..43786a5e2c9 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5970,11 +5970,19 @@ def test_sort_index(self, data, axis, ascending, na_position, sort_remaining): pandas_df.index = pandas.MultiIndex.from_tuples( [(i // 10, i // 5, i) for i in range(len(pandas_df))] ) + modin_df.columns = pd.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(modin_df.columns))] + ) + pandas_df.columns = pd.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(pandas_df.columns))] + ) with pytest.warns(UserWarning): df_equals(modin_df.sort_index(level=0), pandas_df.sort_index(level=0)) with pytest.warns(UserWarning): df_equals(modin_df.sort_index(axis=0), 
pandas_df.sort_index(axis=0)) + with pytest.warns(UserWarning): + df_equals(modin_df.sort_index(axis=1), pandas_df.sort_index(axis=1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) From 9a65e777fce940a2de3e9ace6f5119023d44a011 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Thu, 30 Jul 2020 01:05:55 +0300 Subject: [PATCH 044/120] FIX-#1154: properly process UDFs (#1845) Signed-off-by: Dmitry Chigarev --- modin/backends/pandas/query_compiler.py | 21 ++++++----- modin/engines/base/frame/data.py | 48 ++++++++++++++++++++----- modin/pandas/base.py | 8 +++-- modin/pandas/groupby.py | 3 +- modin/pandas/test/test_dataframe.py | 12 +++++++ modin/pandas/test/utils.py | 14 ++++++++ modin/pandas/utils.py | 11 ++++++ 7 files changed, 96 insertions(+), 21 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 4d585658893..ef9c0314230 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -22,6 +22,7 @@ from modin.backends.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage +from modin.pandas.utils import try_cast_to_pandas, wrap_udf_function from modin.data_management.functions import ( FoldFunction, MapFunction, @@ -1896,6 +1897,10 @@ def apply(self, func, axis, *args, **kwargs): Returns: A new PandasQueryCompiler. 
""" + # if any of args contain modin object, we should + # convert it to pandas + args = try_cast_to_pandas(args) + kwargs = try_cast_to_pandas(kwargs) if isinstance(func, str): return self._apply_text_func_elementwise(func, axis, *args, **kwargs) elif callable(func): @@ -1920,7 +1925,7 @@ def _apply_text_func_elementwise(self, func, axis, *args, **kwargs): assert isinstance(func, str) kwargs["axis"] = axis new_modin_frame = self._modin_frame._apply_full_axis( - axis, lambda df: getattr(df, func)(**kwargs) + axis, lambda df: df.apply(func, *args, **kwargs) ) return self.__constructor__(new_modin_frame) @@ -1942,6 +1947,7 @@ def dict_apply_builder(df, func_dict={}): # all objects are `DataFrame`s. return pandas.DataFrame(df.apply(func_dict, *args, **kwargs)) + func = {k: wrap_udf_function(v) if callable(v) else v for k, v in func.items()} return self.__constructor__( self._modin_frame._apply_full_axis_select_indices( axis, dict_apply_builder, func, keep_remaining=False @@ -1969,6 +1975,7 @@ def _list_like_func(self, func, axis, *args, **kwargs): if axis == 1 else self.columns ) + func = [wrap_udf_function(f) if callable(f) else f for f in func] new_modin_frame = self._modin_frame._apply_full_axis( axis, lambda df: pandas.DataFrame(df.apply(func, axis, *args, **kwargs)), @@ -1987,14 +1994,10 @@ def _callable_func(self, func, axis, *args, **kwargs): Returns: A new PandasQueryCompiler. 
""" - if isinstance(pandas.DataFrame().apply(func), pandas.Series): - new_modin_frame = self._modin_frame._fold_reduce( - axis, lambda df: df.apply(func, axis=axis, *args, **kwargs) - ) - else: - new_modin_frame = self._modin_frame._apply_full_axis( - axis, lambda df: df.apply(func, axis=axis, *args, **kwargs) - ) + func = wrap_udf_function(func) + new_modin_frame = self._modin_frame._apply_full_axis( + axis, lambda df: df.apply(func, axis=axis, *args, **kwargs) + ) return self.__constructor__(new_modin_frame) # END UDF diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 462b50d1ce7..d59fbba5dc8 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -211,7 +211,7 @@ def _set_columns(self, new_columns): self._dtypes.index = new_columns self._apply_index_objs(axis=1) - def _set_axis(self, axis, new_axis): + def _set_axis(self, axis, new_axis, cache_only=False): """Replaces the current labels at the specified axis with the new one Parameters @@ -220,11 +220,20 @@ def _set_axis(self, axis, new_axis): Axis to set labels along new_axis : Index, The replacement labels + cache_only : bool, + Whether to change only external indices, or propagate it + into partitions """ if axis: - self._set_columns(new_axis) + if not cache_only: + self._set_columns(new_axis) + else: + self._columns_cache = ensure_index(new_axis) else: - self._set_index(new_axis) + if not cache_only: + self._set_index(new_axis) + else: + self._index_cache = ensure_index(new_axis) columns = property(_get_columns, _set_columns) index = property(_get_index, _set_index) @@ -256,7 +265,7 @@ def _filter_empties(self): self._column_widths_cache = [w for w in self._column_widths if w > 0] self._row_lengths_cache = [r for r in self._row_lengths if r > 0] - def _validate_axis_equality(self, axis: int): + def _validate_axis_equality(self, axis: int, force: bool = False): """ Validates internal and external indices of modin_frame at the specified axis. 
@@ -264,22 +273,32 @@ def _validate_axis_equality(self, axis: int): ---------- axis : int, Axis to validate indices along + force : bool, + Whether to update external indices with internal if their lengths + do not match or raise an exception in that case. """ internal_axis = self._frame_mgr_cls.get_indices( axis, self._partitions, lambda df: df.axes[axis] ) is_equals = self.axes[axis].equals(internal_axis) + is_lenghts_matches = len(self.axes[axis]) == len(internal_axis) if not is_equals: - self._set_axis(axis, self.axes[axis]) + if force: + new_axis = self.axes[axis] if is_lenghts_matches else internal_axis + self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches) + else: + self._set_axis( + axis, self.axes[axis], + ) def _validate_internal_indices(self, mode=None, **kwargs): """ Validates and optionally updates internal and external indices of modin_frame in specified mode. There is 3 modes supported: - 1. "reduced" - validates and updates indices on that axes + 1. "reduced" - force validates on that axes where external indices is ["__reduced__"] - 2. "all" - validates indices at all axes, optionally updates - internal indices if `update` parameter specified in kwargs + 2. "all" - validates indices at all axes, optionally force + if `force` parameter specified in kwargs 3. "custom" - validation follows arguments specified in kwargs. Parameters @@ -287,10 +306,16 @@ def _validate_internal_indices(self, mode=None, **kwargs): mode : str or bool, default None validate_index : bool, (optional, could be specified via `mode`) validate_columns : bool, (optional, could be specified via `mode`) + force : bool (optional, could be specified via `mode`) + Whether to update external indices with internal if their lengths + do not match or raise an exception in that case. 
""" if isinstance(mode, bool): + is_force = mode mode = "all" + else: + is_force = kwargs.get("force", False) reduced_sample = pandas.Index(["__reduced__"]) args_dict = { @@ -298,8 +323,13 @@ def _validate_internal_indices(self, mode=None, **kwargs): "reduced": { "validate_index": self.index.equals(reduced_sample), "validate_columns": self.columns.equals(reduced_sample), + "force": True, + }, + "all": { + "validate_index": True, + "validate_columns": True, + "force": is_force, }, - "all": {"validate_index": True, "validate_columns": True}, } args = args_dict.get(mode, args_dict["custom"]) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 10d9b01e050..03dac9e0565 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -583,7 +583,9 @@ def apply( axis = self._get_axis_number(axis) ErrorMessage.non_verified_udf() if isinstance(func, str): - result = self._query_compiler.apply(func, axis=axis, *args, **kwds) + result = self._query_compiler.apply( + func, axis=axis, raw=raw, result_type=result_type, *args, **kwds, + ) if isinstance(result, BasePandasDataset): return result._query_compiler return result @@ -601,7 +603,9 @@ def apply( ) elif not callable(func) and not is_list_like(func): raise TypeError("{} object is not callable".format(type(func))) - query_compiler = self._query_compiler.apply(func, axis, args=args, **kwds) + query_compiler = self._query_compiler.apply( + func, axis, args=args, raw=raw, result_type=result_type, **kwds, + ) return query_compiler def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 383f23ab99b..c9f35f7a94f 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -19,7 +19,7 @@ from modin.error_message import ErrorMessage -from .utils import _inherit_docstrings +from .utils import _inherit_docstrings, wrap_udf_function from .series import Series @@ -644,6 +644,7 @@ def _apply_agg_function(self, f, drop=True, 
*args, **kwargs): """ assert callable(f), "'{0}' object is not callable".format(type(f)) + f = wrap_udf_function(f) if self._is_multi_by: return self._default_to_pandas(f, *args, **kwargs) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 43786a5e2c9..487e791c63a 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -58,6 +58,8 @@ create_test_dfs, test_data_small_values, test_data_small_keys, + udf_func_values, + udf_func_keys, ) pd.DEFAULT_NPARTITIONS = 4 @@ -1812,6 +1814,16 @@ def test_apply_numeric(self, request, data, func, axis): pandas_result = pandas_df.apply(lambda df: df.drop(key), axis=1) df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("func", udf_func_values, ids=udf_func_keys) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_apply_udf(self, data, func): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.apply(*args, **kwargs), + func=func, + other=lambda df: df, + ) + def test_eval_df_use_case(self): frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} df = pandas.DataFrame(frame_data) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 27415c3d61a..ebf0ce97828 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -259,6 +259,20 @@ numeric_agg_funcs = ["sum mean", "sum sum", "sum df sum"] +udf_func = { + "return self": lambda df: lambda x, *args, **kwargs: type(x)(x.values), + "change index": lambda df: lambda x, *args, **kwargs: pandas.Series( + x.values, index=np.arange(-1, len(x.index) - 1) + ), + "return none": lambda df: lambda x, *args, **kwargs: None, + "return empty": lambda df: lambda x, *args, **kwargs: pandas.Series(), + "access self": lambda df: lambda x, other, *args, **kwargs: pandas.Series( + x.values, index=other.index + ), +} +udf_func_keys = list(udf_func.keys()) +udf_func_values = list(udf_func.values()) + # Test q 
values for quantiles quantiles = { "0.25": 0.25, diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 8ed2e60d5cd..0696ef325c9 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -116,3 +116,14 @@ def try_cast_to_pandas(obj): else getattr(pandas.Series, fn_name, obj) ) return obj + + +def wrap_udf_function(func): + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + # if user accidently returns modin DataFrame or Series + # casting it back to pandas to properly process + return try_cast_to_pandas(result) + + wrapper.__name__ = func.__name__ + return wrapper From 03a919eab67492a5e41ecc712051beb37c75598b Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 29 Jul 2020 15:44:37 -0700 Subject: [PATCH 045/120] Bump version to 0.8.0 (#1864) Signed-off-by: Devin Petersohn --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bb3a37f5aaf..4e7864639c5 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - +

To use Modin, replace the pandas import:

@@ -40,15 +40,16 @@ scheduling computation!

-| pandas Object | Ray Engine Coverage | Dask Engine Coverage | +| pandas Object | Modin's Ray Engine Coverage | Modin's Dask Engine Coverage | |-------------------|:------------------------------------------------------------------------------------:|:---------------:| -| `pd.DataFrame` | | | -| `pd.Series` | | | +| `pd.DataFrame` | | | +| `pd.Series` | | | | `pd.read_csv` | ✅ | ✅ | | `pd.read_table` | ✅ | ✅ | | `pd.read_parquet` | ✅ | ✅ | | `pd.read_sql` | ✅ | ✅ | | `pd.read_feather` | ✅ | ✅ | +| `pd.read_excel` | ✅ | ✅ | | `pd.read_json` | [✳️](https://github.com/modin-project/modin/issues/554) | [✳️](https://github.com/modin-project/modin/issues/554) | | `pd.read_` | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | [✴️](https://modin.readthedocs.io/en/latest/supported_apis/io_supported.html) | From a244a564b4faea100555606069e90d998ab878fe Mon Sep 17 00:00:00 2001 From: anmyachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 30 Jul 2020 09:22:34 +0300 Subject: [PATCH 046/120] FEAT-#1861: Use cloudpickle library for experimental.cloud features (#1862) Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/ray-autoscaler.yml | 2 +- modin/experimental/cloud/rpyc_proxy.py | 2 +- requirements.txt | 1 + requirements/env_windows.yml | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml index da85bbcf5bc..f4ea45f7ca8 100644 --- a/modin/experimental/cloud/ray-autoscaler.yml +++ b/modin/experimental/cloud/ray-autoscaler.yml @@ -123,7 +123,7 @@ setup_commands: conda activate modin conda install python==3.7.6 - pip install modin "ray==0.8.6" + pip install modin "ray==0.8.6" cloudpickle echo 'export MODIN_RAY_CLUSTER=True' >> ~/.bashrc # Consider uncommenting these if you also want to run apt-get commands during setup diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index 
f926d57400b..00d9cc94350 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -16,7 +16,7 @@ import os import rpyc -from rpyc.lib.compat import pickle +import cloudpickle as pickle from rpyc.lib import get_methods from rpyc.core import netref, AsyncResult, consts diff --git a/requirements.txt b/requirements.txt index 28f1d4d3db6..8d1a99ed7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,4 @@ matplotlib<=3.2.2 sqlalchemy msgpack<1.0 pandas_gbq +cloudpickle diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index 2f955a9dbf6..0e30d07419d 100644 --- a/requirements/env_windows.yml +++ b/requirements/env_windows.yml @@ -37,3 +37,4 @@ dependencies: - matplotlib - sqlalchemy - msgpack<1.0 + - cloudpickle From 4d80bd8e163f6e628e2d9b4ce6d933b7aad65b48 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 29 Jul 2020 23:27:35 -0700 Subject: [PATCH 047/120] FIX-#911: Pin Dask Dependency for Python 3.8 compatiblity (#1846) Signed-off-by: Devin Petersohn --- requirements.txt | 4 ++-- requirements/env_windows.yml | 4 ++-- setup.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8d1a99ed7ed..59dbb076856 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ pandas==1.0.5 numpy pyarrow<0.17 -dask[complete]>=2.1.0,<=2.19.0 -distributed>=2.3.2,<=2.19.0 +dask[complete]>=2.12.0,<=2.19.0 +distributed>=2.12.0,<=2.19.0 ray==0.8.6 psutil==5.6.6 xarray diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index 0e30d07419d..bada08e47af 100644 --- a/requirements/env_windows.yml +++ b/requirements/env_windows.yml @@ -15,8 +15,8 @@ dependencies: - pandas==1.0.5 - numpy - pyarrow>=0.13.0 - - dask[complete]>=2.1.0 - - distributed>=2.3.2 + - dask[complete]>=2.12.0 + - distributed>=2.12.0 - psutil==5.6.6 - xarray - Jinja2 diff --git a/setup.py b/setup.py index cee41c69e60..59af02fa005 100644 --- a/setup.py +++ 
b/setup.py @@ -37,7 +37,7 @@ def is_pure(self): return False -dask_deps = ["dask>=2.1.0", "distributed>=2.3.2"] +dask_deps = ["dask>=2.12.0", "distributed>=2.12.0"] ray_deps = ["ray==0.8.6", "pyarrow<0.17"] all_deps = dask_deps + ray_deps From 0d1a74e2d389f427f85ee062357c83c7e31bc88e Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Fri, 31 Jul 2020 15:25:38 +0300 Subject: [PATCH 048/120] Fix access to special attributes in experimental mode (#1875) Signed-off-by: Vasilij Litvinov --- .gitignore | 1 + modin/experimental/cloud/meta_magic.py | 72 ++++++++++++++------------ modin/experimental/cloud/rpyc_proxy.py | 5 +- 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 8bfc868a89a..678945c2776 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,4 @@ cscope.out # Dask workspace dask-worker-space/ +node_modules diff --git a/modin/experimental/cloud/meta_magic.py b/modin/experimental/cloud/meta_magic.py index 3410122fae5..f4af52923ce 100644 --- a/modin/experimental/cloud/meta_magic.py +++ b/modin/experimental/cloud/meta_magic.py @@ -17,7 +17,18 @@ from modin import execution_engine -_LOCAL_ATTRS = frozenset(("__new__", "__dict__", "__wrapper_remote__")) +# the attributes that must be alwasy taken from a local part of dual-nature class, +# never going to remote end +_LOCAL_ATTRS = frozenset( + ( + "__new__", + "__dict__", + "__wrapper_remote__", + "__real_cls__", + "__mro__", + "__class__", + ) +) class RemoteMeta(type): @@ -46,47 +57,44 @@ def __signature__(self): def __getattribute__(self, name): if name in _LOCAL_ATTRS: # never proxy special attributes, always get them from the class type - res = object.__getattribute__(self, name) + return super().__getattribute__(name) else: try: # Go for proxying class-level attributes first; # make sure to check for attribute in self.__dict__ to get the class-level # attribute from the class itself, not from some of its parent classes. 
- # Also note we use object.__getattribute__() to skip any potential - # class-level __getattr__ - res = object.__getattribute__(self, "__dict__")[name] + res = super().__getattribute__("__dict__")[name] except KeyError: + # Class-level attribute not found in the class itself; it might be present + # in its parents, but we must first see if we should go to a remote + # end, because in "remote context" local attributes are only those which + # are explicitly allowed by being defined in the class itself. + frame = sys._getframe() try: - res = object.__getattribute__(self, name) + is_inspect = frame.f_back.f_code.co_filename == inspect.__file__ except AttributeError: - frame = sys._getframe() + is_inspect = False + finally: + del frame + if is_inspect: + # be always-local for inspect.* functions + return super().__getattribute__(name) + else: try: - is_inspect = frame.f_back.f_code.co_filename == inspect.__file__ + remote = self.__real_cls__.__wrapper_remote__ except AttributeError: - is_inspect = False - finally: - del frame - if is_inspect: - # be always-local for inspect.* functions - res = super().__getattribute__(name) - else: - try: - remote = object.__getattribute__( - object.__getattribute__(self, "__real_cls__"), - "__wrapper_remote__", - ) - except AttributeError: - # running in local mode, fall back - res = super().__getattribute__(name) - else: - res = getattr(remote, name) - try: - # note that any attribute might be in fact a data descriptor, - # account for that - getter = res.__get__ - except AttributeError: - return res - return getter(None, self) + # running in local mode, fall back + return super().__getattribute__(name) + return getattr(remote, name) + else: + try: + # note that any attribute might be in fact a data descriptor, + # account for that; we only need it for attributes we get from __dict__[], + # because other cases are handled by super().__getattribute__ for us + getter = res.__get__ + except AttributeError: + return res + return 
getter(None, self) _KNOWN_DUALS = {} diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index 00d9cc94350..17d627d64f0 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -384,9 +384,6 @@ class Wrapper(override, origin_cls, metaclass=ProxyMeta): __name__ = cls_name or origin_cls.__name__ __wrapper_remote__ = remote_cls - def __new__(cls, *a, **kw): - return override.__new__(cls) - def __init__(self, *a, __remote_end__=None, **kw): if __remote_end__ is None: __remote_end__ = remote_cls(*a, **kw) @@ -420,6 +417,8 @@ def __getattribute__(self, name): 4) check if type(self).__dict__[name] exists 5) pass through to remote end """ + if name == "__class__": + return object.__getattribute__(self, "__class__") dct = object.__getattribute__(self, "__dict__") if name == "__dict__": return dct From c43a58096580efcc37c669c13e8385291e4fdbe3 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 31 Jul 2020 10:58:45 -0400 Subject: [PATCH 049/120] FIX-#1647: Support repr() on empty Series. 
(#1859) Signed-off-by: Itamar Turner-Trauring --- modin/pandas/series.py | 8 ++++++-- modin/pandas/test/test_series.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 51495b36422..e2527583e7b 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -310,7 +310,7 @@ def __repr__(self): num_rows = pandas.get_option("max_rows") or 60 num_cols = pandas.get_option("max_columns") or 20 temp_df = self._build_repr_df(num_rows, num_cols) - if isinstance(temp_df, pandas.DataFrame): + if isinstance(temp_df, pandas.DataFrame) and not temp_df.empty: temp_df = temp_df.iloc[:, 0] temp_str = repr(temp_df) if self.name is not None: @@ -321,7 +321,11 @@ def __repr__(self): len_str = "Length: {}, ".format(len(self.index)) else: len_str = "" - dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1]) + dtype_str = "dtype: {}".format( + str(self.dtype) + ")" + if temp_df.empty + else temp_str.rsplit("dtype: ", 1)[-1] + ) if len(self) == 0: return "Series([], {}{}".format(name_str, dtype_str) return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index b39de9174c5..936e8a17c76 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -472,6 +472,11 @@ def test___pow__(data): inter_df_math_helper(modin_series, pandas_series, "__pow__") +def test___repr___empty(): + modin_series, pandas_series = pd.Series(), pandas.Series() + assert repr(modin_series) == repr(pandas_series) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___repr__(data): modin_series, pandas_series = create_test_series(data) @@ -1357,6 +1362,11 @@ def test_dropna_inplace(data): df_equals(modin_series, pandas_series) +def test_dtype_empty(): + modin_series, pandas_series = pd.Series(), pandas.Series() + assert modin_series.dtype == pandas_series.dtype + + 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dtype(data): modin_series, pandas_series = create_test_series(data) From 3ca282158939f28d14b34ba0610c9f2269d57bb0 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Fri, 31 Jul 2020 20:29:10 +0300 Subject: [PATCH 050/120] Fix recursion in experimental mode in some cases (#1874) Signed-off-by: Vasilij Litvinov --- modin/experimental/cloud/meta_magic.py | 23 +++++++++++++++-------- modin/pandas/base.py | 24 ++++++++++-------------- modin/pandas/dataframe.py | 8 ++++---- modin/pandas/series.py | 4 ++-- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/modin/experimental/cloud/meta_magic.py b/modin/experimental/cloud/meta_magic.py index f4af52923ce..6849063a132 100644 --- a/modin/experimental/cloud/meta_magic.py +++ b/modin/experimental/cloud/meta_magic.py @@ -102,10 +102,10 @@ def __getattribute__(self, name): def make_wrapped_class(local_cls: type, rpyc_wrapper_name: str): """ - Replaces given local class in its module with a descendant class - which has __new__ overridden (a dual-nature class). - This new class is instantiated differently depending o - whether this is done in remote context or local. + Replaces given local class in its module with a replacement class + which has __new__ defined (a dual-nature class). + This new class is instantiated differently depending on + whether this is done in remote or local context. In local context we effectively get the same behaviour, but in remote context the created class is actually of separate type which @@ -122,12 +122,19 @@ def make_wrapped_class(local_cls: type, rpyc_wrapper_name: str): installed, and not all users of Modin (even in experimental mode) need remote context. """ + # get a copy of local_cls attributes' dict but skip _very_ special attributes, + # because copying them to a different type leads to them not working. 
+ # Python should create new descriptors automatically for us instead. namespace = { - "__real_cls__": None, - "__new__": None, - "__module__": local_cls.__module__, + name: value + for name, value in local_cls.__dict__.items() + if not isinstance(value, types.GetSetDescriptorType) } - result = RemoteMeta(local_cls.__name__, (local_cls,), namespace) + namespace["__real_cls__"] = None + namespace["__new__"] = None + # define a new class the same way original was defined but with replaced + # metaclass and a few more attributes in namespace + result = RemoteMeta(local_cls.__name__, local_cls.__bases__, namespace) def make_new(__class__): """ diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 03dac9e0565..0ed902f91d3 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -227,7 +227,7 @@ def _binary_op(self, op, other, **kwargs): # Broadcast is an internally used argument kwargs.pop("broadcast", None) return self._default_to_pandas( - getattr(getattr(pandas, self.__name__), op), other, **kwargs + getattr(getattr(pandas, type(self).__name__), op), other, **kwargs ) other = self._validate_other(other, axis, numeric_or_object_only=True) new_query_compiler = getattr(self._query_compiler, op)(other, **kwargs) @@ -238,7 +238,7 @@ def _default_to_pandas(self, op, *args, **kwargs): empty_self_str = "" if not self.empty else " for empty DataFrame" ErrorMessage.default_to_pandas( "`{}.{}`{}".format( - self.__name__, + type(self).__name__, op if isinstance(op, str) else op.__name__, empty_self_str, ) @@ -254,7 +254,7 @@ def _default_to_pandas(self, op, *args, **kwargs): # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr` # will get the operation (`op`) from the pandas version of the class and run # it on the object after we have converted it to pandas. 
- result = getattr(getattr(pandas, self.__name__), op)( + result = getattr(getattr(pandas, type(self).__name__), op)( pandas_obj, *args, **kwargs ) else: @@ -307,7 +307,7 @@ def _default_to_pandas(self, op, *args, **kwargs): def _get_axis_number(self, axis): return ( - getattr(pandas, self.__name__)()._get_axis_number(axis) + getattr(pandas, type(self).__name__)()._get_axis_number(axis) if axis is not None else 0 ) @@ -455,7 +455,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if hasattr(self, "dtype"): raise NotImplementedError( "{}.{} does not implement numeric_only.".format( - self.__name__, "all" + type(self).__name__, "all" ) ) data_for_compute = self[self.columns[self.dtypes == np.bool]] @@ -512,7 +512,7 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if hasattr(self, "dtype"): raise NotImplementedError( "{}.{} does not implement numeric_only.".format( - self.__name__, "all" + type(self).__name__, "all" ) ) data_for_compute = self[self.columns[self.dtypes == np.bool]] @@ -2117,9 +2117,9 @@ def rename_axis( "copy": copy, "inplace": inplace, } - axes, kwargs = getattr(pandas, self.__name__)()._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel - ) + axes, kwargs = getattr( + pandas, type(self).__name__ + )()._construct_axes_from_arguments((), kwargs, sentinel=sentinel) if axis is not None: axis = self._get_axis_number(axis) else: @@ -3452,7 +3452,7 @@ def __getitem__(self, key): # see if we can slice the rows # This lets us reuse code in Pandas to error check indexer = convert_to_index_sliceable( - getattr(pandas, self.__name__)(index=self.index), key + getattr(pandas, type(self).__name__)(index=self.index), key ) if indexer is not None: return self._getitem_slice(indexer) @@ -3554,10 +3554,6 @@ def values(self): """ return self.to_numpy() - @property - def __name__(self): - return type(self).__name__ - def __getattribute__(self, item): default_behaviors = [ "__init__", diff --git 
a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 8cb7e2fc03d..62957565fc6 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -354,7 +354,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): # in pandas that verify that some results are created. This is a challenge for # empty DataFrames, but fortunately they only happen when the `func` type is # a list or a dictionary, which means that the return type won't change from - # type(self), so we catch that error and use `self.__name__` for the return + # type(self), so we catch that error and use `type(self).__name__` for the return # type. try: if axis == 0: @@ -362,12 +362,12 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): else: init_kwargs = {"columns": self.columns} return_type = type( - getattr(pandas, self.__name__)(**init_kwargs).apply( + getattr(pandas, type(self).__name__)(**init_kwargs).apply( func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds ) ).__name__ except Exception: - return_type = self.__name__ + return_type = type(self).__name__ if return_type not in ["DataFrame", "Series"]: return query_compiler.to_pandas().squeeze() else: @@ -845,7 +845,7 @@ def eval(self, expr, inplace=False, **kwargs): .astype(self.dtypes) .eval(expr, **kwargs) ).__name__ - if return_type == self.__name__: + if return_type == type(self).__name__: return self._create_or_update_from_compiler(new_query_compiler, inplace) else: if inplace: diff --git a/modin/pandas/series.py b/modin/pandas/series.py index e2527583e7b..3307a10d85f 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -492,7 +492,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # in pandas that verify that some results are created. 
This is a challenge for # empty DataFrames, but fortunately they only happen when the `func` type is # a list or a dictionary, which means that the return type won't change from - # type(self), so we catch that error and use `self.__name__` for the return + # type(self), so we catch that error and use `type(self).__name__` for the return # type. # Because a `Series` cannot be empty in pandas, we create a "dummy" `Series` to # do the error checking and determining the return type. @@ -503,7 +503,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): ) ).__name__ except Exception: - return_type = self.__name__ + return_type = type(self).__name__ if ( isinstance(func, str) or is_list_like(func) From 23c0a806564bf851103587cc6e5776a1b090d8e7 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Mon, 3 Aug 2020 12:23:05 +0300 Subject: [PATCH 051/120] TEST-#1876: Add tests running under experimental (#1877) Signed-off-by: Vasilij Litvinov --- .github/workflows/ci.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db2ad58ebfe..95d1413e8de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,6 +128,27 @@ jobs: - run: python -m pytest modin/experimental/pandas/test/test_io_exp.py if: matrix.part == 3 - run: bash <(curl -s https://codecov.io/bash) + test-experimental: + needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] + runs-on: ubuntu-latest + env: + MODIN_ENGINE: "python" + MODIN_EXPERIMENTAL: "True" + MODIN_MEMORY: 1000000000 + name: test experimental + steps: + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - uses: actions/setup-python@v1 + with: + python-version: "3.7.x" + architecture: "x64" + - run: pip install -r requirements.txt + - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrameMapMetadata + - run: python -m pytest modin/pandas/test/test_series.py + - 
run: python -m pytest modin/pandas/test/test_io.py + - run: bash <(curl -s https://codecov.io/bash) test-windows: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: windows-latest From e640ecd8cb785375720b8a817eb48a2b5f3a88ad Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 3 Aug 2020 19:42:31 +0300 Subject: [PATCH 052/120] FIX-#1867: establish CI (#1868) Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 299 +++++++++++++++++++------ .github/workflows/push.yml | 171 +++++++++++--- environment.yml | 31 +++ modin/pandas/test/test_dataframe.py | 6 + requirements.txt | 2 - requirements/df_test_requires.txt | 1 - requirements/env_windows.yml | 1 - requirements/windows_test_requires.txt | 1 - run-tests.sh | 48 ---- setup.cfg | 7 +- 10 files changed, 410 insertions(+), 157 deletions(-) create mode 100644 environment.yml delete mode 100755 run-tests.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 95d1413e8de..ee1f36e637c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,78 +19,166 @@ jobs: - run: git remote add upstream https://github.com/modin-project/modin.git - run: git fetch upstream - run: npx commitlint --from upstream/master --to HEAD --verbose + lint-black: name: lint (black) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v2 with: python-version: "3.7.x" architecture: "x64" - run: pip install black - run: black --check --diff modin/ + lint-flake8: name: lint (flake8) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v2 with: python-version: "3.7.x" architecture: "x64" - run: pip install flake8 flake8-print - run: flake8 --enable=T modin + + prepare-cache: + runs-on: ${{ 
matrix.os }} + strategy: + matrix: + os: ["ubuntu-latest", "windows-latest"] + python-version: ["3.6", "3.7", "3.8"] + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Cache pip if Ubuntu + if: startsWith(runner.os, 'Linux') + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - name: Cache pip if Windows + if: startsWith(runner.os, 'Windows') + uses: actions/cache@v1 + with: + path: ~\AppData\Local\pip\Cache + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + python-version: ${{matrix.python-version}} + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! + auto-update-conda: true + - shell: bash -l {0} + run: pip install ray==0.8.6 + - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + test-api: + needs: prepare-cache runs-on: ubuntu-latest name: test api steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: - python-version: "3.7.x" - architecture: "x64" + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.7 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + # `shell: bash -l {0}` - special way to activate modin environment + shell: bash -l {0} + run: | + conda info + conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_api.py + - name: Api tests + shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_api.py + test-headers: + needs: prepare-cache runs-on: ubuntu-latest name: test-headers steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: - python-version: "3.7.x" - architecture: "x64" - - run: pip install -r requirements.txt - - run: python -m pytest modin/test/test_headers.py + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.6 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - name: Headers tests + shell: bash -l {0} + run: python -m pytest modin/test/test_headers.py + test-internals: + needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: ubuntu-latest name: test-internals steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: - python-version: "3.6.x" - architecture: "x64" - - run: pip install -r requirements.txt - - run: python -m pytest modin/test/test_publisher.py modin/data_management/test/test_dispatcher.py + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.6 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - name: Internals tests + shell: bash -l {0} + run: python -m pytest modin/test/test_publisher.py modin/data_management/test/test_dispatcher.py + test-all: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: @@ -98,36 +186,59 @@ jobs: MODIN_MEMORY: 1000000000 name: test (${{matrix.engine}}, part ${{matrix.part}}, python ${{matrix.python-version}}) steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list - name: Install HDF5 if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - - run: pip install -r requirements.txt - - run: bash run-tests.sh ${{matrix.engine}} -k "TestDataFrame${{matrix.part}}" + - shell: bash -l {0} + run: pytest modin/pandas/test/ -k "TestDataFrame${{matrix.part}}" if: matrix.part != 3 - - run: python -m pytest modin/pandas/test/test_series.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_rolling.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_rolling.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_concat.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_groupby.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_groupby.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_reshape.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_reshape.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_general.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_general.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_io.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py if: matrix.part == 3 - - run: python -m pytest modin/experimental/pandas/test/test_io_exp.py + - shell: bash -l {0} + run: python -m pytest modin/experimental/pandas/test/test_io_exp.py if: matrix.part == 3 - - run: bash <(curl -s https://codecov.io/bash) + - shell: bash -l {0} + run: bash <(curl -s https://codecov.io/bash) + test-experimental: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: ubuntu-latest @@ -137,24 +248,41 @@ jobs: MODIN_MEMORY: 
1000000000 name: test experimental steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: - python-version: "3.7.x" - architecture: "x64" - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrameMapMetadata - - run: python -m pytest modin/pandas/test/test_series.py - - run: python -m pytest modin/pandas/test/test_io.py - - run: bash <(curl -s https://codecov.io/bash) + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.7 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! + - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrameMapMetadata + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_series.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py + - shell: bash -l {0} + run: bash <(curl -s https://codecov.io/bash) + test-windows: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: windows-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: @@ -162,50 +290,87 @@ jobs: MODIN_MEMORY: 1000000000 name: test-windows steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~\AppData\Local\pip\Cache + key: ${{ runner.os 
}}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! + auto-update-conda: true # this enable `use-only-tar-bz2` feature on Windows + - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} if: matrix.part != 3 - - run: python -m pytest modin/pandas/test/test_series.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_rolling.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_rolling.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_concat.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_groupby.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_groupby.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_reshape.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_reshape.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_general.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_general.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_io.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py if: matrix.part == 3 - - run: choco install codecov - - run: codecov -f 
.\coverage.xml -t ${{secrets.CODECOV_TOKEN}} + - shell: bash -l {0} + run: choco install codecov + - shell: bash -l {0} + run: codecov -f ./coverage.xml + test-pyarrow: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] env: MODIN_BACKEND: pyarrow MODIN_EXPERIMENTAL: "True" name: test (pyarrow, python ${{matrix.python-version}}) steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_io.py::test_from_csv + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py::test_from_csv diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index ef84f588d89..74a45317429 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -1,11 +1,48 @@ name: master on: push jobs: + prepare-cache: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ["ubuntu-latest", "windows-latest"] + python-version: ["3.6", "3.7", "3.8"] + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Cache pip if Ubuntu + if: startsWith(runner.os, 'Linux') + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - name: Cache pip if Windows + if: startsWith(runner.os, 'Windows') + uses: actions/cache@v1 + with: + path: ~\AppData\Local\pip\Cache + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + python-version: ${{matrix.python-version}} + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ auto-update-conda: true + - shell: bash -l {0} + run: pip install ray==0.8.6 + - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + test-all: + needs: prepare-cache runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: @@ -13,41 +50,65 @@ jobs: MODIN_MEMORY: 1000000000 name: test (${{matrix.engine}}, part ${{matrix.part}}, python ${{matrix.python-version}}) steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list - name: Install HDF5 if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - - run: pip install -r requirements.txt - - run: bash run-tests.sh ${{matrix.engine}} -k "TestDataFrame${{matrix.part}}" + - shell: bash -l {0} + run: pytest modin/pandas/test/ -k "TestDataFrame${{matrix.part}}" if: matrix.part != 3 - - run: python -m pytest modin/pandas/test/test_series.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_rolling.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_rolling.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_concat.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_groupby.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_groupby.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_reshape.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_reshape.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_general.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_general.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_io.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py if: matrix.part == 3 - - run: python -m pytest modin/experimental/pandas/test/test_io_exp.py + - shell: bash -l {0} + run: python -m pytest modin/experimental/pandas/test/test_io_exp.py if: matrix.part == 3 - - run: bash <(curl -s https://codecov.io/bash) + - shell: bash -l {0} + run: bash <(curl -s https://codecov.io/bash) + test-windows: + needs: prepare-cache runs-on: windows-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] 
engine: ["ray", "dask"] part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: @@ -55,49 +116,87 @@ jobs: MODIN_MEMORY: 1000000000 name: test-windows steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~\AppData\Local\pip\Cache + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! + auto-update-conda: true + - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} if: matrix.part != 3 - - run: python -m pytest modin/pandas/test/test_series.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_rolling.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_rolling.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_concat.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_concat.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_groupby.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_groupby.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_reshape.py + - shell: bash -l {0} + run: python -m pytest 
modin/pandas/test/test_reshape.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_general.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_general.py if: matrix.part == 3 - - run: python -m pytest modin/pandas/test/test_io.py + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py if: matrix.part == 3 - - run: choco install codecov - - run: codecov -f .\coverage.xml -t ${{secrets.CODECOV_TOKEN}} + - shell: bash -l {0} + run: choco install codecov + - shell: bash -l {0} + run: codecov -f ./coverage.xml + test-pyarrow: + needs: prepare-cache runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.6.x", "3.7.x"] + python-version: ["3.6", "3.7", "3.8"] env: MODIN_BACKEND: pyarrow MODIN_EXPERIMENTAL: "True" name: test (pyarrow, python ${{matrix.python-version}}) steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 - - uses: actions/setup-python@v1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 with: + activate-environment: modin + environment-file: environment.yml python-version: ${{matrix.python-version}} - architecture: "x64" + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - - run: pip install -r requirements.txt - - run: python -m pytest modin/pandas/test/test_io.py::test_from_csv + - shell: bash -l {0} + run: python -m pytest modin/pandas/test/test_io.py::test_from_csv diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000000..36b13bf631f --- /dev/null +++ b/environment.yml @@ -0,0 +1,31 @@ +name: modin +channels: + - conda-forge +dependencies: + - pandas==1.0.5 + - numpy + - pyarrow<0.17 + - dask[complete]>=2.1.0,<=2.19.0 + - distributed>=2.3.2,<=2.19.0 + - xarray + - Jinja2 + - pathlib + - scipy + - pip + - s3fs + - feather-format + - lxml + - openpyxl + - xlrd + - matplotlib<=3.2.2 + - sqlalchemy + - pandas-gbq + - pytables + - msgpack-python + - psutil + - pytest + - pytest-cov + - pytest-xdist + - coverage<5.0 + - pip: + - ray==0.8.6 diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 487e791c63a..a92b5d54a97 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2148,6 +2148,9 @@ def test_cov(self): pandas_result = pandas.DataFrame(data).cov() df_equals(modin_result, pandas_result) + @pytest.mark.skipif( + os.name == "nt", reason="AssertionError: numpy array are different", + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dot(self, data): modin_df = pd.DataFrame(data) @@ -2194,6 +2197,9 @@ def test_dot(self, data): pandas_result = pandas.DataFrame([1]).dot(pandas_df.T) df_equals(modin_result, pandas_result) + @pytest.mark.skipif( + os.name == "nt", reason="AssertionError: numpy array are different", + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_matmul(self, data): modin_df = pd.DataFrame(data) diff --git a/requirements.txt b/requirements.txt index 59dbb076856..118fb418e72 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -14,8 +14,6 @@ s3fs pytest coverage<5.0 pytest-cov -pytest-testmon -pytest-custom_exit_code pytest-xdist feather-format lxml diff --git a/requirements/df_test_requires.txt b/requirements/df_test_requires.txt index 4020508efde..15b759d8145 100644 --- a/requirements/df_test_requires.txt +++ b/requirements/df_test_requires.txt @@ -2,7 +2,6 @@ pandas==1.0.5 numpy psutil==5.6.6 pytest -pytest-testmon pytest-custom_exit_code pytest-xdist matplotlib diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index bada08e47af..af7a6963978 100644 --- a/requirements/env_windows.yml +++ b/requirements/env_windows.yml @@ -27,7 +27,6 @@ dependencies: - pytest - coverage<5.0 - pytest-cov - - pytest-testmon - pytest-custom_exit_code - pytest-xdist - feather-format diff --git a/requirements/windows_test_requires.txt b/requirements/windows_test_requires.txt index f96a8679eb2..89cedc905b9 100644 --- a/requirements/windows_test_requires.txt +++ b/requirements/windows_test_requires.txt @@ -13,7 +13,6 @@ s3fs pytest coverage<5.0 pytest-cov -pytest-testmon pytest-custom_exit_code pytest-xdist feather-format diff --git a/run-tests.sh b/run-tests.sh deleted file mode 100755 index d716dffbcee..00000000000 --- a/run-tests.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -arg=$1 -shift - -if [ "$arg" == "" ]; then - arg="all" -fi - -if [ "$arg" == "ray" ] || [ "$arg" == "all" ]; then - echo "Running Ray tests" - aws s3 cp --no-sign-request s3://modin-testing/testmondata_ray . - mv testmondata_ray .testmondata - MODIN_ENGINE=ray pytest --testmon-forceselect modin/pandas/test/ $@ - # This happens on sqlite error from testmon if too many files were changed. - if [ "$?" -eq 3 ]; then - rm .testmondata - MODIN_ENGINE=ray pytest --testmon-forceselect modin/pandas/test/ $@ - fi -fi -if [ "$arg" == "python" ] || [ "$arg" == "all" ]; then - echo "Running Python tests" - aws s3 cp --no-sign-request s3://modin-testing/testmondata_python . 
- mv testmondata_python .testmondata - MODIN_ENGINE=python pytest --testmon-forceselect modin/pandas/test/ $@ - # This happens on sqlite error from testmon if too many files were changed. - if [ "$?" -eq 3 ]; then - rm .testmondata - MODIN_ENGINE=python pytest --testmon-forceselect modin/pandas/test/ $@ - fi -fi -if [ "$arg" == "dask" ] || [ "$arg" == "all" ]; then - echo "Running Dask tests" - aws s3 cp --no-sign-request s3://modin-testing/testmondata_dask . - mv testmondata_dask .testmondata - MODIN_ENGINE=dask pytest --testmon-forceselect modin/pandas/test/ $@ - # This happens on sqlite error from testmon if too many files were changed. - if [ "$?" -eq 3 ]; then - rm .testmondata - MODIN_ENGINE=dask pytest --testmon-forceselect modin/pandas/test/ $@ - fi -fi -if [ "$arg" == "pyarrow" ] || [ "$arg" == "all" ]; then - echo "Running Pyarrow tests" - aws s3 cp --no-sign-request s3://modin-testing/testmondata_pyarrow . - mv testmondata_pyarrow .testmondata - MODIN_BACKEND=pyarrow MODIN_EXPERIMENTAL=1 pytest --testmon-forceselect modin/pandas/test/test_io.py::test_from_csv $@ -fi diff --git a/setup.cfg b/setup.cfg index 572abd0eff1..b3a01861403 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,7 @@ tag_prefix = parentdir_prefix = modin- [tool:pytest] -addopts = --disable-pytest-warnings --suppress-no-test-exit-code --cov-config=setup.cfg --cov=modin --cov-append +addopts = --disable-pytest-warnings --cov-config=setup.cfg --cov=modin --cov-append --cov-report=xml [flake8] max-line-length = 88 @@ -25,13 +25,18 @@ per-file-ignores = modin/_version.py:T001 [coverage:run] +source = + # modin sources + modin/* omit = # These are not covered by any test because it is an experimental API modin/sql/* # This is not used yet modin/pandas/index/* # Skip tests + modin/test/* modin/pandas/test/* + modin/data_management/test/* modin/experimental/pandas/test/* # Plotting is not tested modin/pandas/plotting.py From 102be4c9c294796abc657287ed7d1919085838cf Mon Sep 17 00:00:00 2001 
From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 4 Aug 2020 13:59:20 +0300 Subject: [PATCH 053/120] FIX-#1887: fix versions (#1888) Signed-off-by: Anatoly Myachev --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 36b13bf631f..2f0911a7f4c 100644 --- a/environment.yml +++ b/environment.yml @@ -5,8 +5,8 @@ dependencies: - pandas==1.0.5 - numpy - pyarrow<0.17 - - dask[complete]>=2.1.0,<=2.19.0 - - distributed>=2.3.2,<=2.19.0 + - dask[complete]>=2.12.0,<=2.19.0 + - distributed>=2.12.0,<=2.19.0 - xarray - Jinja2 - pathlib From e517a09b9348f30d883a2e5f02f92c010a852594 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Wed, 5 Aug 2020 01:34:12 +0300 Subject: [PATCH 054/120] FIX-#1674: Series.apply and DataFrame.apply (#1718) Signed-off-by: Alexander Myskov --- modin/pandas/base.py | 9 +++++---- modin/pandas/series.py | 28 ++++++++++++++++++-------- modin/pandas/test/test_dataframe.py | 31 ++++++++++++++++++++++++++++- modin/pandas/test/test_series.py | 22 ++++++++++++++++++++ 4 files changed, 77 insertions(+), 13 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 0ed902f91d3..1e4f25a2480 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -583,9 +583,11 @@ def apply( axis = self._get_axis_number(axis) ErrorMessage.non_verified_udf() if isinstance(func, str): - result = self._query_compiler.apply( - func, axis=axis, raw=raw, result_type=result_type, *args, **kwds, - ) + # if axis != 1 function can be bounded to the Series, which doesn't + # support axis parameter + if axis == 1: + kwds["axis"] = axis + result = self._string_function(func, *args, **kwds) if isinstance(result, BasePandasDataset): return result._query_compiler return result @@ -741,7 +743,6 @@ def count(self, axis=0, level=None, numeric_only=False): Returns: The count, in a Series (or DataFrame if level is specified). 
""" - axis = self._get_axis_number(axis) if axis is not None else 0 if numeric_only is not None and numeric_only: self._validate_dtypes(numeric_only=numeric_only) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 3307a10d85f..adac3e56220 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -494,22 +494,29 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # a list or a dictionary, which means that the return type won't change from # type(self), so we catch that error and use `type(self).__name__` for the return # type. - # Because a `Series` cannot be empty in pandas, we create a "dummy" `Series` to - # do the error checking and determining the return type. + # We create a "dummy" `Series` to do the error checking and determining + # the return type. try: return_type = type( - getattr(pandas.Series([""], index=self.index[:1]), apply_func)( + getattr(pandas.Series("", index=self.index[:1]), apply_func)( func, *args, **kwds ) ).__name__ except Exception: - return_type = type(self).__name__ + try: + return_type = type( + getattr(pandas.Series(0, index=self.index[:1]), apply_func)( + func, *args, **kwds + ) + ).__name__ + except Exception: + return_type = type(self).__name__ if ( isinstance(func, str) or is_list_like(func) or return_type not in ["DataFrame", "Series"] ): - query_compiler = super(Series, self).apply(func, *args, **kwds) + result = super(Series, self).apply(func, *args, **kwds) else: # handle ufuncs and lambdas if kwds or args and not isinstance(func, np.ufunc): @@ -522,12 +529,17 @@ def f(x): with np.errstate(all="ignore"): if isinstance(f, np.ufunc): return f(self) - query_compiler = self.map(f)._query_compiler + result = self.map(f)._query_compiler if return_type not in ["DataFrame", "Series"]: - return query_compiler.to_pandas().squeeze() + # sometimes result can be not a query_compiler, but scalar (for example + # for sum or count functions) + if isinstance(result, type(self._query_compiler)): + return 
result.to_pandas().squeeze() + else: + return result else: result = getattr(sys.modules[self.__module__], return_type)( - query_compiler=query_compiler + query_compiler=result ) if result.name == self.index[0]: result.name = None diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index a92b5d54a97..cb24e219078 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -1739,7 +1739,36 @@ def test_apply(self, request, data, func, axis): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("level", [None, -1, 0, 1]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("func", ["count", "sum", "mean", "all", "kurt"]) + @pytest.mark.parametrize( + "func", + [ + "kurt", + pytest.param( + "count", + marks=pytest.mark.xfail( + reason="count method handle level parameter incorrectly" + ), + ), + pytest.param( + "sum", + marks=pytest.mark.xfail( + reason="sum method handle level parameter incorrectly" + ), + ), + pytest.param( + "mean", + marks=pytest.mark.xfail( + reason="mean method handle level parameter incorrectly" + ), + ), + pytest.param( + "all", + marks=pytest.mark.xfail( + reason="all method handle level parameter incorrectly" + ), + ), + ], + ) def test_apply_text_func_with_level(self, level, data, func, axis): func_kwargs = {"level": level, "axis": axis} rows_number = len(next(iter(data.values()))) # length of the first data column diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 936e8a17c76..7842d127ee9 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -777,6 +777,28 @@ def test_apply_numeric(request, data, func): df_equals(modin_result, pandas_result) +@pytest.mark.parametrize("axis", [None, 0, 1]) +@pytest.mark.parametrize("level", [None, -1, 0, 1]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", 
["count", "all", "kurt", "array", "searchsorted"]) +def test_apply_text_func(level, data, func, axis): + func_kwargs = {} + if level: + func_kwargs.update({"level": level}) + if axis: + func_kwargs.update({"axis": axis}) + rows_number = len(next(iter(data.values()))) # length of the first data column + level_0 = np.random.choice([0, 1, 2], rows_number) + level_1 = np.random.choice([3, 4, 5], rows_number) + index = pd.MultiIndex.from_arrays([level_0, level_1]) + + modin_series, pandas_series = create_test_series(data) + modin_series.index = index + pandas_series.index = index + + eval_general(modin_series, pandas_series, lambda df: df.apply(func), **func_kwargs) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("skipna", [True, False]) def test_argmax(data, skipna): From 19fd1b4bcbd6d0ed8daa7be3e24c6abd65dd9858 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Wed, 5 Aug 2020 13:11:31 +0300 Subject: [PATCH 055/120] FIX-#1869: index sort for count(level=...) (#1870) * FIX-#1869: issue fix Signed-off-by: Alexander Myskov * FIX-#1869: remove comment Signed-off-by: Alexander Myskov * FIX-#1869: sort by _handle_level_agg Signed-off-by: Alexander Myskov --- modin/pandas/base.py | 6 +++--- modin/pandas/test/test_dataframe.py | 21 +++++++++++++++++++++ modin/pandas/test/test_series.py | 4 ++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 1e4f25a2480..19126f825e4 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -119,14 +119,14 @@ def _update_inplace(self, new_query_compiler): sib._query_compiler = new_query_compiler old_query_compiler.free() - def _handle_level_agg(self, axis, level, op, **kwargs): + def _handle_level_agg(self, axis, level, op, sort=False, **kwargs): """Helper method to perform error checking for aggregation functions with a level parameter. 
Args: axis: The axis to apply the operation on level: The level of the axis to apply the operation on op: String representation of the operation to be performed on the level """ - return getattr(self.groupby(level=level, axis=axis, sort=False), op)(**kwargs) + return getattr(self.groupby(level=level, axis=axis, sort=sort), op)(**kwargs) def _validate_other( self, @@ -752,7 +752,7 @@ def count(self, axis=0, level=None, numeric_only=False): # error thrown by pandas raise TypeError("Can only count levels on hierarchical columns.") - return self._handle_level_agg(axis, level, "count") + return self._handle_level_agg(axis=axis, level=level, op="count", sort=True) return self._reduce_dimension( self._query_compiler.count( diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index cb24e219078..66a2f376994 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -31,6 +31,7 @@ df_is_empty, arg_keys, name_contains, + test_data, test_data_values, test_data_keys, test_data_with_duplicates_values, @@ -5393,6 +5394,26 @@ def test___len__(self, data): assert len(modin_df) == len(pandas_df) + def test_index_order(self): + # see #1708 and #1869 for details + df_modin, df_pandas = ( + pd.DataFrame(test_data["dense_nan_data"]), + pandas.DataFrame(test_data["dense_nan_data"]), + ) + rows_number = len(df_modin.index) + level_0 = np.random.choice([x for x in range(10)], rows_number) + level_1 = np.random.choice([x for x in range(10)], rows_number) + index = pandas.MultiIndex.from_arrays([level_0, level_1]) + + df_modin.index = index + df_pandas.index = index + + for func in ["all", "any", "mad", "count"]: + df_equals( + getattr(df_modin, func)(level=0).index, + getattr(df_pandas, func)(level=0).index, + ) + class TestDataFrameIter: @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 7842d127ee9..c4cdddf235e 
100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1835,7 +1835,7 @@ def test_last(): def test_index_order(): - # see #1708 for details + # see #1708 and #1869 for details s_modin, s_pandas = create_test_series(test_data["dense_nan_data"]) rows_number = len(s_modin.index) level_0 = np.random.choice([x for x in range(10)], rows_number) @@ -1845,7 +1845,7 @@ def test_index_order(): s_modin.index = index s_pandas.index = index - for func in ["all", "any", "mad"]: + for func in ["all", "any", "mad", "count"]: df_equals( getattr(s_modin, func)(level=0).index, getattr(s_pandas, func)(level=0).index, From aa78a18afcb7facf2b90c15d115a30c992db120f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 5 Aug 2020 18:36:38 +0300 Subject: [PATCH 056/120] TEST-#1865: Add RPyC library in requirements (#1866) Signed-off-by: Anatoly Myachev --- environment.yml | 1 + requirements.txt | 1 + requirements/env_windows.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/environment.yml b/environment.yml index 2f0911a7f4c..4bc191caafa 100644 --- a/environment.yml +++ b/environment.yml @@ -29,3 +29,4 @@ dependencies: - coverage<5.0 - pip: - ray==0.8.6 + - rpyc diff --git a/requirements.txt b/requirements.txt index 118fb418e72..0e09fa3b4af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ sqlalchemy msgpack<1.0 pandas_gbq cloudpickle +rpyc diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index af7a6963978..2203119ee96 100644 --- a/requirements/env_windows.yml +++ b/requirements/env_windows.yml @@ -37,3 +37,4 @@ dependencies: - sqlalchemy - msgpack<1.0 - cloudpickle + - rpyc From 7173bae445efc6b7147e5bb45e7e35f9c7ffc9f6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 6 Aug 2020 17:57:27 +0300 Subject: [PATCH 057/120] FEAT-#1881: add scale-out feature dependencies (#1892) Signed-off-by: Anatoly Myachev 
--- MANIFEST.in | 1 + setup.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index bdfac6aa2e1..2121afd5956 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include versioneer.py include modin/_version.py +include modin/experimental/cloud/ray-autoscaler.yml diff --git a/setup.py b/setup.py index 59af02fa005..907f8f48753 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ except ImportError: HAS_WHEEL = False -with open("README.md", "r") as fh: +with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() if HAS_WHEEL: @@ -39,8 +39,9 @@ def is_pure(self): dask_deps = ["dask>=2.12.0", "distributed>=2.12.0"] ray_deps = ["ray==0.8.6", "pyarrow<0.17"] +remote_deps = ["rpyc", "cloudpickle", "boto3"] -all_deps = dask_deps + ray_deps +all_deps = dask_deps + ray_deps + remote_deps setup( name="modin", @@ -49,6 +50,7 @@ def is_pure(self): distclass=ModinDistribution, description="Modin: Make your pandas code run faster by changing one line of code.", packages=find_packages(), + include_package_data=True, license="Apache 2", url="https://github.com/modin-project/modin", long_description=long_description, @@ -58,6 +60,7 @@ def is_pure(self): # can be installed by pip install modin[dask] "dask": dask_deps, "ray": ray_deps, + "remote": remote_deps, "all": all_deps, }, python_requires=">=3.6.1", From c48b4ae87b88ef886962107d6dd7f33e9ac833c5 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 6 Aug 2020 15:46:25 -0400 Subject: [PATCH 058/120] FIX-#1497: Don't sort in concat() when sort=False (#1889) Signed-off-by: Itamar Turner-Trauring --- modin/engines/base/frame/data.py | 14 ++++++++++++-- modin/pandas/dataframe.py | 5 +++++ modin/pandas/test/test_concat.py | 15 +++++++++++++++ modin/pandas/test/test_dataframe.py | 8 ++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 
d59fbba5dc8..b358a85bd90 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -896,6 +896,9 @@ def _join_index_objects(self, axis, other_index, how, sort): """ Joins a pair of index objects (columns or rows) by a given strategy. + Unlike Index.join() in Pandas, if axis is 1, the sort is + False, and how is "outer", the result will _not_ be sorted. + Parameters ---------- axis : 0 or 1 @@ -912,14 +915,21 @@ def _join_index_objects(self, axis, other_index, how, sort): Index Joined indices. """ + + def merge_index(obj1, obj2): + if axis == 1 and how == "outer" and not sort: + return obj1.union(obj2, sort=False) + else: + return obj1.join(obj2, how=how, sort=sort) + if isinstance(other_index, list): joined_obj = self.columns if axis else self.index # TODO: revisit for performance for obj in other_index: - joined_obj = joined_obj.join(obj, how=how, sort=sort) + joined_obj = merge_index(joined_obj, obj) return joined_obj if axis: - return self.columns.join(other_index, how=how, sort=sort) + return merge_index(self.columns, other_index) else: return self.index.join(other_index, how=how, sort=sort) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 62957565fc6..97709de0e31 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -530,6 +530,11 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): Returns: A new DataFrame containing the concatenated values. """ + if sort is False: + warnings.warn( + "Due to https://github.com/pandas-dev/pandas/issues/35092, " + "Pandas ignores sort=False; Modin correctly does not sort." 
+ ) if isinstance(other, (Series, dict)): if isinstance(other, dict): other = Series(other) diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index 9d975b0bd29..f1caf30fc81 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -201,3 +201,18 @@ def test_concat_dictionary(axis): pd.concat({"A": modin_df, "B": modin_df2}, axis=axis), pandas.concat({"A": pandas_df, "B": pandas_df2}, axis=axis), ) + + +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("join", ["inner", "outer"]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_sort_order(sort, join, axis): + pandas_df = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"]) + pandas_df2 = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"]) + modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2) + pandas_concat = pandas.concat([pandas_df, pandas_df2], join=join, sort=sort) + modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort) + df_equals( + pandas_concat, modin_concat, + ) + assert list(pandas_concat.columns) == list(modin_concat.columns) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 66a2f376994..d8b63ffea58 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -746,6 +746,14 @@ def test_append(self, data): modin_df.append(list(modin_df.iloc[-1])) else: modin_result = modin_df.append(list(modin_df.iloc[-1])) + # Pandas has bug where sort=False is ignored + # (https://github.com/pandas-dev/pandas/issues/35092), but Modin + # now does the right thing, so for now manually sort to workaround + # this. Once the Pandas bug is fixed and Modin upgrades to that + # Pandas release, this sort will cause the test to fail, and the + # next two lines should be deleted. 
+ assert list(modin_result.columns) == list(modin_df.columns) + [0] + modin_result = modin_result[[0] + sorted(modin_df.columns)] df_equals(modin_result, pandas_result) verify_integrity_values = [True, False] From b1b2340d4ad42137365d125e948cc14e59163080 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Mon, 10 Aug 2020 20:31:01 +0300 Subject: [PATCH 059/120] FIX-#1904: CI fix (#1905) Signed-off-by: Alexander Myskov --- .github/workflows/ci.yml | 4 ++++ .github/workflows/push.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee1f36e637c..a8eb0166565 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -298,6 +298,10 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - name: conda clean + uses: goanpeca/setup-miniconda@v1.6.0 + shell: bash -l {0} + run: conda clean --packages - uses: goanpeca/setup-miniconda@v1.6.0 with: activate-environment: modin diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 74a45317429..179341b3249 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -124,6 +124,10 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - name: conda clean + uses: goanpeca/setup-miniconda@v1.6.0 + shell: bash -l {0} + run: conda clean --packages - uses: goanpeca/setup-miniconda@v1.6.0 with: activate-environment: modin From 10dbc72d6afffad60b4f605fe1bc0ffa3f192198 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 10 Aug 2020 11:32:14 -0700 Subject: [PATCH 060/120] Revert "FIX-#1904: CI fix" (#1907) --- .github/workflows/ci.yml | 4 ---- .github/workflows/push.yml | 4 ---- 2 files changed, 8 deletions(-) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index a8eb0166565..ee1f36e637c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -298,10 +298,6 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - name: conda clean - uses: goanpeca/setup-miniconda@v1.6.0 - shell: bash -l {0} - run: conda clean --packages - uses: goanpeca/setup-miniconda@v1.6.0 with: activate-environment: modin diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 179341b3249..74a45317429 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -124,10 +124,6 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - name: conda clean - uses: goanpeca/setup-miniconda@v1.6.0 - shell: bash -l {0} - run: conda clean --packages - uses: goanpeca/setup-miniconda@v1.6.0 with: activate-environment: modin From cc60f25a17da9282283044e9db417064f1c865cc Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Mon, 10 Aug 2020 20:33:05 -0400 Subject: [PATCH 061/120] FIX-#1854: groupby() with arbitrary series (#1886) Signed-off-by: Itamar Turner-Trauring --- .../functions/groupby_function.py | 5 +- modin/engines/base/frame/data.py | 1 + modin/pandas/dataframe.py | 15 ++- modin/pandas/groupby.py | 40 +++++-- modin/pandas/test/test_groupby.py | 106 +++++++++++++++++- modin/pandas/test/utils.py | 10 +- 6 files changed, 155 insertions(+), 22 deletions(-) diff --git a/modin/data_management/functions/groupby_function.py b/modin/data_management/functions/groupby_function.py index ddc10fdaf7f..e62b47d8b10 100644 --- a/modin/data_management/functions/groupby_function.py +++ b/modin/data_management/functions/groupby_function.py @@ -141,6 +141,9 @@ def compute_reduce(df): new_modin_frame = qc._modin_frame.groupby_reduce( axis, by._modin_frame, _map, _reduce 
) - return query_compiler.__constructor__(new_modin_frame) + result = query_compiler.__constructor__(new_modin_frame) + if result.index.name == "__reduced__": + result.index.name = None + return result return caller diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index b358a85bd90..466a8c03082 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -1751,6 +1751,7 @@ def to_pandas(self): ) df.index = self.index df.columns = self.columns + return df def to_numpy(self): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 97709de0e31..18075d4df54 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -464,16 +464,21 @@ def groupby( else: mismatch = len(by) != len(self.axes[axis]) if mismatch and all( - obj in self - or (hasattr(self.index, "names") and obj in self.index.names) + isinstance(obj, str) + and ( + obj in self + or (hasattr(self.index, "names") and obj in self.index.names) + ) for obj in by ): # In the future, we will need to add logic to handle this, but for now # we default to pandas in this case. 
pass - elif mismatch: - raise KeyError(next(x for x in by if x not in self)) - + elif mismatch and any( + isinstance(obj, str) and obj not in self.columns for obj in by + ): + names = [o.name if isinstance(o, Series) else o for o in by] + raise KeyError(next(x for x in names if x not in self)) return DataFrameGroupBy( self, by, diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index c9f35f7a94f..258abebcfda 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -19,7 +19,7 @@ from modin.error_message import ErrorMessage -from .utils import _inherit_docstrings, wrap_udf_function +from .utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas from .series import Series @@ -65,7 +65,11 @@ def __init__( ) or ( not isinstance(by, type(self._query_compiler)) and axis == 0 - and all(obj in self._query_compiler.columns for obj in self._by) + and all( + (isinstance(obj, str) and obj in self._query_compiler.columns) + or isinstance(obj, Series) + for obj in self._by + ) ) else: self._is_multi_by = False @@ -120,12 +124,14 @@ def _index_grouped(self): # aware. 
ErrorMessage.catch_bugs_and_request_email(self._axis == 1) ErrorMessage.default_to_pandas("Groupby with multiple columns") - self._index_grouped_cache = { - k: v.index - for k, v in self._df._query_compiler.getitem_column_array(by) - .to_pandas() - .groupby(by=by) - } + if isinstance(by, list) and all(isinstance(o, str) for o in by): + pandas_df = self._df._query_compiler.getitem_column_array( + by + ).to_pandas() + else: + by = try_cast_to_pandas(by) + pandas_df = self._df._to_pandas() + self._index_grouped_cache = pandas_df.groupby(by=by).groups else: if isinstance(self._by, type(self._query_compiler)): by = self._by.to_pandas().squeeze().values @@ -309,6 +315,15 @@ def __getitem__(self, key): drop=self._drop, **kwargs, ) + if ( + self._is_multi_by + and isinstance(self._by, list) + and not all(isinstance(o, str) for o in self._by) + ): + raise NotImplementedError( + "Column lookups on GroupBy with arbitrary Series in by" + " is not yet supported." + ) return SeriesGroupBy( self._df[key], self._by, @@ -412,6 +427,9 @@ def all(self, **kwargs): ) def size(self): + if is_list_like(self._by) and any(isinstance(o, Series) for o in self._by): + # We don't have good way to handle this right now, fall back to Pandas. + return self._default_to_pandas(lambda df: df.size()) if self._axis == 0: # Size always works in as_index=True mode so it is necessary to make a # copy of _kwargs and change as_index in it @@ -666,12 +684,14 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs): if self._idx_name is not None and self._as_index: new_manager.index.name = self._idx_name result = type(self._df)(query_compiler=new_manager) + if result.index.name == "__reduced__": + result.index.name = None if self._kwargs.get("squeeze", False): return result.squeeze() return result def _default_to_pandas(self, f, *args, **kwargs): - """Defailts the execution of this function to pandas. + """Defaults the execution of this function to pandas. Args: f: The function to apply to each group. 
@@ -689,6 +709,8 @@ def _default_to_pandas(self, f, *args, **kwargs): else: by = self._by + by = try_cast_to_pandas(by) + def groupby_on_multiple_columns(df, *args, **kwargs): return f( df.groupby(by=by, axis=self._axis, **self._kwargs), *args, **kwargs diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 32c9e81a68c..f0d79b9f6e8 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -15,7 +15,7 @@ import pandas import numpy as np import modin.pandas as pd -from modin.pandas.utils import from_pandas, to_pandas +from modin.pandas.utils import from_pandas, to_pandas, try_cast_to_pandas from .utils import ( df_equals, check_df_columns_have_nans, @@ -209,6 +209,16 @@ def test_mixed_dtypes_groupby(as_index): eval_groups(modin_groupby, pandas_groupby) +class GetColumn: + """Indicate to the test that it should do gc(df).""" + + def __init__(self, name): + self.name = name + + def __call__(self, df): + return df[self.name] + + @pytest.mark.parametrize( "by", [ @@ -241,6 +251,18 @@ def test_mixed_dtypes_groupby(as_index): ["col5", "col4"], ["col4", "col5"], ["col5", "col4", "col1"], + ["col1", pd.Series([1, 5, 7, 8])], + [pd.Series([1, 5, 7, 8])], + [ + pd.Series([1, 5, 7, 8]), + pd.Series([1, 5, 7, 8]), + pd.Series([1, 5, 7, 8]), + pd.Series([1, 5, 7, 8]), + pd.Series([1, 5, 7, 8]), + ], + ["col1", GetColumn("col5")], + [GetColumn("col1"), GetColumn("col5")], + [GetColumn("col1")], ], ) @pytest.mark.parametrize("as_index", [True, False]) @@ -261,8 +283,19 @@ def test_simple_row_groupby(by, as_index, col1_category): modin_df = from_pandas(pandas_df) n = 1 - modin_groupby = modin_df.groupby(by=by, as_index=as_index) - pandas_groupby = pandas_df.groupby(by=by, as_index=as_index) + + def maybe_get_columns(df, by): + if isinstance(by, list): + return [o(df) if isinstance(o, GetColumn) else o for o in by] + else: + return by + + modin_groupby = modin_df.groupby( + by=maybe_get_columns(modin_df, by), 
as_index=as_index + ) + + pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by)) + pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index) modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) @@ -295,7 +328,7 @@ def test_simple_row_groupby(by, as_index, col1_category): ) # Workaround for Pandas bug #34656. Recreate groupby object for Pandas - pandas_groupby = pandas_df.groupby(by=by, as_index=as_index) + pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index) apply_functions = [lambda df: df.sum(), min] for func in apply_functions: eval_apply(modin_groupby, pandas_groupby, func) @@ -372,7 +405,11 @@ def test_simple_row_groupby(by, as_index, col1_category): eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True) eval_quantile(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True) - eval___getattr__(modin_groupby, pandas_groupby, "col3") + if isinstance(by, list) and not any( + isinstance(o, (pd.Series, pandas.Series)) for o in by + ): + # Not yet supported for non-original-column-from-dataframe Series in by: + eval___getattr__(modin_groupby, pandas_groupby, "col3") eval_groups(modin_groupby, pandas_groupby) @@ -1188,3 +1225,62 @@ def get_columns(df): df2 = pd.concat([df2]) exp = df2.groupby(get_columns(df2)).size() df_equals(ref, exp) + + +@pytest.mark.parametrize( + # When True, use (df[name] + 1), otherwise just use name + "columns", + [ + [(True, "a"), (True, "b"), (True, "c")], + [(True, "a"), (True, "b")], + [(False, "a"), (False, "b"), (True, "c")], + [(False, "a"), (True, "c")], + ], +) +def test_mixed_columns_not_from_df(columns): + """ + Unlike the previous test, in this case the Series is not just a column from + the original DataFrame, so you can't use a fasttrack. 
+ """ + + def get_columns(df): + return [(df[name] + 1) if lookup else name for (lookup, name) in columns] + + data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]} + + df1 = pandas.DataFrame(data) + df1 = pandas.concat([df1]) + ref = df1.groupby(get_columns(df1)).size() + + df2 = pd.DataFrame(data) + df2 = pd.concat([df2]) + exp = df2.groupby(get_columns(df2)).size() + df_equals(ref, exp) + + +@pytest.mark.parametrize( + # When True, do df[obj], otherwise just use the obj + "columns", + [ + [(False, "a")], + [(False, "a"), (False, "b"), (False, "c")], + [(False, "a"), (False, "b")], + [(False, "b"), (False, "a")], + [(True, "a"), (True, "b"), (True, "c")], + [(True, "a"), (True, "b")], + [(False, "a"), (False, "b"), (True, "c")], + [(False, "a"), (True, "c")], + [(False, "a"), (False, pd.Series([5, 6, 7, 8]))], + ], +) +def test_unknown_groupby(columns): + def get_columns(df): + return [df[name] if lookup else name for (lookup, name) in columns] + + data = {"b": [11, 11, 22, 200], "c": [111, 111, 222, 7000]} + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + with pytest.raises(KeyError): + pandas_df.groupby(by=get_columns(pandas_df)) + with pytest.raises(KeyError): + modin_df.groupby(by=get_columns(modin_df)) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index ebf0ce97828..f9d045a9bf6 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -584,8 +584,14 @@ def check_df_columns_have_nans(df, cols): """ return ( pandas.api.types.is_list_like(cols) - and any(x in df.columns and df[x].hasnans for x in cols) - or not pandas.api.types.is_list_like(cols) + and ( + any(isinstance(x, str) and x in df.columns and df[x].hasnans for x in cols) + or any( + isinstance(x, pd.Series) and x._parent is df and x.hasnans for x in cols + ) + ) + ) or ( + not pandas.api.types.is_list_like(cols) and cols in df.columns and df[cols].hasnans ) From 1865f95562f0bf1220747139ea4d2957fdc4e42b Mon Sep 17 00:00:00 2001 
From: ienkovich Date: Tue, 18 Aug 2020 13:46:07 +0300 Subject: [PATCH 062/120] REFACTOR-#1917: move part of reset_index code to backend (#1920) Signed-off-by: ienkovich --- modin/backends/pandas/query_compiler.py | 4 ++++ modin/pandas/base.py | 12 +----------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index ef9c0314230..3981113a944 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -472,6 +472,10 @@ def reset_index(self, **kwargs): A new QueryCompiler with updated data and reset index. """ drop = kwargs.get("drop", False) + level = kwargs.get("level", None) + # TODO Implement level + if level is not None or isinstance(self.index, pandas.MultiIndex): + return self.default_to_pandas(pandas.DataFrame.reset_index, **kwargs) if not drop: new_column_name = ( self.index.name diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 19126f825e4..bd2f693246f 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2289,20 +2289,10 @@ def reset_index( A new DataFrame if inplace is False, None otherwise. """ inplace = validate_bool_kwarg(inplace, "inplace") - # TODO Implement level - if level is not None or isinstance(self.index, pandas.MultiIndex): - new_query_compiler = self._default_to_pandas( - "reset_index", - level=level, - drop=drop, - inplace=False, - col_level=col_level, - col_fill=col_fill, - )._query_compiler # Error checking for matching Pandas. Pandas does not allow you to # insert a dropped index into a DataFrame if these columns already # exist. 
- elif ( + if ( not drop and not isinstance(self.index, pandas.MultiIndex) and all(n in self.columns for n in ["level_0", "index"]) From b160a7190e79928562cbc047ea08429cf61c5006 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 18 Aug 2020 20:52:43 +0300 Subject: [PATCH 063/120] FEAT-#1925: add import from arrow table (#1913) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 22 ++++++++++ modin/backends/pandas/query_compiler.py | 4 ++ modin/data_management/dispatcher.py | 4 ++ modin/data_management/factories.py | 4 ++ modin/engines/base/frame/data.py | 41 ++++++++++++++++++- modin/engines/base/frame/partition_manager.py | 4 ++ modin/engines/base/io/io.py | 4 ++ modin/pandas/test/test_io.py | 8 +++- modin/pandas/utils.py | 19 +++++++++ 9 files changed, 108 insertions(+), 2 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 77860aca235..58678f32f52 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -124,6 +124,28 @@ def from_pandas(cls, df, data_cls): # END To/From Pandas + # From Arrow + @classmethod + @abc.abstractmethod + def from_arrow(cls, at, data_cls): + """Improve simple Arrow Table to an advanced and superior Modin DataFrame. + + Parameters + ---------- + at : Arrow Table + The Arrow Table to convert from. + data_cls : + Modin DataFrame object to convert to. + + Returns + ------- + BaseQueryCompiler + QueryCompiler containing data from the Arrow Table. 
+ """ + pass + + # END From Arrow + # To NumPy @abc.abstractmethod def to_numpy(self): diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 3981113a944..6ec7080d04c 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -192,6 +192,10 @@ def to_pandas(self): def from_pandas(cls, df, data_cls): return cls(data_cls.from_pandas(df)) + @classmethod + def from_arrow(cls, at, data_cls): + return cls(data_cls.from_arrow(at)) + index = property(_get_axis(0), _set_axis(0)) columns = property(_get_axis(1), _set_axis(1)) diff --git a/modin/data_management/dispatcher.py b/modin/data_management/dispatcher.py index 60a473ca757..9a7a71a86d1 100644 --- a/modin/data_management/dispatcher.py +++ b/modin/data_management/dispatcher.py @@ -98,6 +98,10 @@ def _update_engine(cls, _): def from_pandas(cls, df): return cls.__engine._from_pandas(df) + @classmethod + def from_arrow(cls, at): + return cls.__engine._from_arrow(at) + @classmethod def from_non_pandas(cls, *args, **kwargs): return cls.__engine._from_non_pandas(*args, **kwargs) diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index eafbc40d7fb..8509b902911 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -39,6 +39,10 @@ def prepare(cls): def _from_pandas(cls, df): return cls.io_cls.from_pandas(df) + @classmethod + def _from_arrow(cls, at): + return cls.io_cls.from_arrow(at) + @classmethod def _from_non_pandas(cls, *args, **kwargs): return cls.io_cls.from_non_pandas(*args, **kwargs) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 466a8c03082..347ade92166 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -14,7 +14,7 @@ from collections import OrderedDict import numpy as np import pandas -from pandas.core.indexes.api import ensure_index +from pandas.core.indexes.api import ensure_index, 
Index, RangeIndex from pandas.core.dtypes.common import is_numeric_dtype from typing import Union @@ -1732,6 +1732,45 @@ def from_pandas(cls, df): dtypes=new_dtypes, ) + @classmethod + def from_arrow(cls, at): + """Improve simple Arrow Table to an advanced and superior Modin DataFrame. + + Parameters + ---------- + at : Arrow Table + The Arrow Table to convert from. + + Returns + ------- + BasePandasFrame + A new dataframe. + """ + new_frame, new_lengths, new_widths = cls._frame_mgr_cls.from_arrow( + at, return_dims=True + ) + new_columns = Index.__new__(Index, data=at.column_names, dtype="O") + new_index = Index.__new__(RangeIndex, data=range(at.num_rows)) + new_dtypes = pandas.Series( + [cls._arrow_type_to_dtype(col.type) for col in at.columns], + index=at.column_names, + ) + return cls( + partitions=new_frame, + index=new_index, + columns=new_columns, + row_lengths=new_lengths, + column_widths=new_widths, + dtypes=new_dtypes, + ) + + @classmethod + def _arrow_type_to_dtype(cls, arrow_type): + res = arrow_type.to_pandas_dtype() + if not isinstance(res, (np.dtype, str)): + return np.dtype(res) + return res + def to_pandas(self): """Converts Modin DataFrame to Pandas DataFrame. diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 451f0573a1f..9208b3b3741 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -398,6 +398,10 @@ def from_pandas(cls, df, return_dims=False): ] return np.array(parts), row_lengths, col_widths + @classmethod + def from_arrow(cls, at, return_dims=False): + return cls.from_pandas(at.to_pandas(), return_dims=return_dims) + @classmethod def get_indices(cls, axis, partitions, index_func=None): """This gets the internal indices stored in the partitions. 
diff --git a/modin/engines/base/io/io.py b/modin/engines/base/io/io.py index 251e91269ca..11cbae36778 100644 --- a/modin/engines/base/io/io.py +++ b/modin/engines/base/io/io.py @@ -29,6 +29,10 @@ def from_non_pandas(cls, *args, **kwargs): def from_pandas(cls, df): return cls.query_compiler_cls.from_pandas(df, cls.frame_cls) + @classmethod + def from_arrow(cls, at): + return cls.query_compiler_cls.from_arrow(at, cls.frame_cls) + @classmethod def read_parquet(cls, path, engine, columns, **kwargs): """Load a parquet object from the file path, returning a Modin DataFrame. diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 0331f5013b3..a4e9fa55090 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -16,7 +16,7 @@ import pandas from pandas.errors import ParserWarning from collections import OrderedDict -from modin.pandas.utils import to_pandas +from modin.pandas.utils import to_pandas, from_arrow from pathlib import Path import pyarrow as pa import pyarrow.parquet as pq @@ -1643,3 +1643,9 @@ def test_cleanup(): os.remove(f) except PermissionError: pass + + +def test_from_arrow(): + pandas_df = create_test_pandas_dataframe() + modin_df = from_arrow(pa.Table.from_pandas(pandas_df)) + df_equals(modin_df, pandas_df) diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 0696ef325c9..d4c17fbeece 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -39,6 +39,25 @@ def from_pandas(df): return DataFrame(query_compiler=EngineDispatcher.from_pandas(df)) +def from_arrow(at): + """Converts an Arrow Table to a Modin DataFrame. + + Parameters + ---------- + at : Arrow Table + The Arrow Table to convert from. + + Returns + ------- + DataFrame + A new Modin DataFrame object. 
+ """ + from modin.data_management.dispatcher import EngineDispatcher + from .dataframe import DataFrame + + return DataFrame(query_compiler=EngineDispatcher.from_arrow(at)) + + def to_pandas(modin_obj): """Converts a Modin DataFrame/Series to a pandas DataFrame/Series. From 9edc7647fd82381ef7c3704d1b102aeddb1c4466 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 18 Aug 2020 20:56:58 +0300 Subject: [PATCH 064/120] REFACTOR-#1928: move columnarization to backend (#1914) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 20 ++++++++++++++++++++ modin/backends/pandas/query_compiler.py | 22 ++++++++++++++++++++++ modin/pandas/series.py | 10 ++-------- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 58678f32f52..afb46b10050 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -325,6 +325,26 @@ def transpose(self, *args, **kwargs): """ pass + @abc.abstractmethod + def columnarize(self): + """ + Transposes this QueryCompiler if it has a single row but multiple columns. + + This method should be called for QueryCompilers representing a Series object, + i.e. self.is_series_like() should be True. + + Returns + ------- + BaseQueryCompiler + Transposed new QueryCompiler or self. + """ + pass + + @abc.abstractmethod + def is_series_like(self): + """Return True if QueryCompiler has a single column or row""" + pass + # END Abstract Transpose # Abstract reindex/reset_index (may shuffle data) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 6ec7080d04c..87757cc0d0a 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -515,6 +515,28 @@ def transpose(self, *args, **kwargs): # Switch the index and columns and transpose the data within the blocks. 
return self.__constructor__(self._modin_frame.transpose()) + def columnarize(self): + """ + Transposes this QueryCompiler if it has a single row but multiple columns. + + This method should be called for QueryCompilers representing a Series object, + i.e. self.is_series_like() should be True. + + Returns + ------- + PandasQueryCompiler + Transposed new QueryCompiler or self. + """ + if len(self.columns) != 1 or ( + len(self.index) == 1 and self.index[0] == "__reduced__" + ): + return self.transpose() + return self + + def is_series_like(self): + """Return True if QueryCompiler has a single column or row""" + return len(self.columns) == 1 or len(self.index) == 1 + # END Transpose # MapReduce operations diff --git a/modin/pandas/series.py b/modin/pandas/series.py index adac3e56220..7065d77885e 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -92,11 +92,7 @@ def __init__( ) ) )._query_compiler - if len(query_compiler.columns) != 1 or ( - len(query_compiler.index) == 1 and query_compiler.index[0] == "__reduced__" - ): - query_compiler = query_compiler.transpose() - self._query_compiler = query_compiler + self._query_compiler = query_compiler.columnarize() if name is not None: self._query_compiler = self._query_compiler self.name = name @@ -148,9 +144,7 @@ def _create_or_update_from_compiler(self, new_query_compiler, inplace=False): isinstance(new_query_compiler, type(self._query_compiler)) or type(new_query_compiler) in self._query_compiler.__class__.__bases__ ), "Invalid Query Compiler object: {}".format(type(new_query_compiler)) - if not inplace and ( - len(new_query_compiler.columns) == 1 or len(new_query_compiler.index) == 1 - ): + if not inplace and new_query_compiler.is_series_like(): return Series(query_compiler=new_query_compiler) elif not inplace: # This can happen with things like `reset_index` where we can add columns. 
From 16cb248fb7875e57b322184da3ce9b8a3cb88cf7 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 18 Aug 2020 21:01:58 +0300 Subject: [PATCH 065/120] REFACTOR-#1929: avoid unnecessary index access in groupby (#1910) Signed-off-by: ienkovich --- modin/pandas/groupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 258abebcfda..4773d8f877a 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -49,7 +49,6 @@ def __init__( self._idx_name = idx_name self._df = df self._query_compiler = self._df._query_compiler - self._index = self._query_compiler.index self._columns = self._query_compiler.columns self._by = by self._drop = drop @@ -83,6 +82,10 @@ def __init__( } self._kwargs.update(kwargs) + @property + def _index(self): + return self._query_compiler.index + @property def _sort(self): return self._kwargs.get("sort") From a1fa46a02762f0502bdff68749422d3448b6504d Mon Sep 17 00:00:00 2001 From: ienkovich Date: Wed, 19 Aug 2020 10:08:13 +0300 Subject: [PATCH 066/120] FEAT-#1911: support cat methods (#1912) Signed-off-by: ienkovich --- modin/backends/pandas/query_compiler.py | 8 ++ modin/pandas/series.py | 75 ++++++++++++- modin/pandas/test/test_series.py | 140 ++++++++++++++++++++++++ modin/pandas/test/utils.py | 8 ++ 4 files changed, 230 insertions(+), 1 deletion(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 87757cc0d0a..a0e9db57d0a 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -179,6 +179,8 @@ def default_to_pandas(self, pandas_op, *args, **kwargs): result = pandas_op(self.to_pandas(), *args, **kwargs) if isinstance(result, pandas.Series): + if result.name is None: + result.name = "__reduced__" result = result.to_frame() if isinstance(result, pandas.DataFrame): return self.from_pandas(result, type(self._modin_frame)) @@ -2270,3 +2272,9 @@ def 
sort_columns_by_row_values(self, rows, ascending=True, **kwargs): by=rows, axis=1, ascending=ascending, kind=kind, na_position=na_position, ).columns return self.reindex(1, new_columns) + + # Cat operations + def cat_codes(self): + return self.default_to_pandas(lambda df: df[df.columns[0]].cat.codes) + + # END Cat operations diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 7065d77885e..7d86adcd3a7 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1621,7 +1621,7 @@ def axes(self): @property def cat(self): - return self._default_to_pandas(pandas.Series.cat) + return CategoryMethods(self) @property def dt(self): @@ -2234,3 +2234,76 @@ def _default_to_pandas(self, op, *args, **kwargs): return self._series._default_to_pandas( lambda series: op(series.str, *args, **kwargs) ) + + +class CategoryMethods(object): + def __init__(self, series): + self._series = series + self._query_compiler = series._query_compiler + + @property + def categories(self): + return self._series._default_to_pandas(pandas.Series.cat).categories + + @categories.setter + def categories(self, categories): + def set_categories(series, categories): + series.cat.categories = categories + + self._series._default_to_pandas(set_categories, categories=categories) + + @property + def ordered(self): + return self._series._default_to_pandas(pandas.Series.cat).ordered + + @property + def codes(self): + return Series(query_compiler=self._query_compiler.cat_codes()) + + def rename_categories(self, new_categories, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.rename_categories, new_categories, inplace=inplace + ) + + def reorder_categories(self, new_categories, ordered=None, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.reorder_categories, + new_categories, + ordered=ordered, + inplace=inplace, + ) + + def add_categories(self, new_categories, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.add_categories, 
new_categories, inplace=inplace + ) + + def remove_categories(self, removals, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.remove_categories, removals, inplace=inplace + ) + + def remove_unused_categories(self, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.remove_unused_categories, inplace=inplace + ) + + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): + return self._default_to_pandas( + pandas.Series.cat.set_categories, + new_categories, + ordered=ordered, + rename=rename, + inplace=inplace, + ) + + def as_ordered(self, inplace=False): + return self._default_to_pandas(pandas.Series.cat.as_ordered, inplace=inplace) + + def as_unordered(self, inplace=False): + return self._default_to_pandas(pandas.Series.cat.as_unordered, inplace=inplace) + + def _default_to_pandas(self, op, *args, **kwargs): + return self._series._default_to_pandas( + lambda series: op(series.cat, *args, **kwargs) + ) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index c4cdddf235e..5c4f21b2a9a 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -57,6 +57,8 @@ eval_general, test_data_small_values, test_data_small_keys, + test_data_categorical_values, + test_data_categorical_keys, ) pd.DEFAULT_NPARTITIONS = 4 @@ -4100,3 +4102,141 @@ def test_hasattr_sparse(data): else: modin_result = hasattr(modin_series, "sparse") assert modin_result == pandas_result + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +def test_cat_categories(data): + modin_series, pandas_series = create_test_series(data.copy()) + df_equals(modin_series.cat.categories, pandas_series.cat.categories) + pandas_series.cat.categories = list("qwert") + modin_series.cat.categories = list("qwert") + df_equals(modin_series, pandas_series) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, 
ids=test_data_categorical_keys +) +def test_cat_ordered(data): + modin_series, pandas_series = create_test_series(data.copy()) + assert modin_series.cat.ordered == pandas_series.cat.ordered + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +def test_cat_codes(data): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.codes + modin_result = modin_series.cat.codes + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_rename_categories(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.rename_categories(list("qwert"), inplace=inplace) + modin_result = modin_series.cat.rename_categories(list("qwert"), inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("ordered", bool_arg_values, ids=bool_arg_keys) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_reorder_categories(data, ordered, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.reorder_categories( + list("tades"), ordered=ordered, inplace=inplace + ) + modin_result = modin_series.cat.reorder_categories( + list("tades"), ordered=ordered, inplace=inplace + ) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_add_categories(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = 
pandas_series.cat.add_categories(list("qw"), inplace=inplace) + modin_result = modin_series.cat.add_categories(list("qw"), inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_remove_categories(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.remove_categories(list("at"), inplace=inplace) + modin_result = modin_series.cat.remove_categories(list("at"), inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_remove_unused_categories(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_series[1] = np.nan + pandas_result = pandas_series.cat.remove_unused_categories(inplace=inplace) + modin_series[1] = np.nan + modin_result = modin_series.cat.remove_unused_categories(inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("ordered", bool_arg_values, ids=bool_arg_keys) +@pytest.mark.parametrize("rename", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_set_categories(data, ordered, rename, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.set_categories( + list("qwert"), ordered=ordered, rename=rename, inplace=inplace + ) + modin_result = modin_series.cat.set_categories( + list("qwert"), ordered=ordered, rename=rename, inplace=inplace + ) + df_equals(modin_series, pandas_series) + 
df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_as_ordered(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.as_ordered(inplace=inplace) + modin_result = modin_series.cat.as_ordered(inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "data", test_data_categorical_values, ids=test_data_categorical_keys +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_cat_as_unordered(data, inplace): + modin_series, pandas_series = create_test_series(data.copy()) + pandas_result = pandas_series.cat.as_unordered(inplace=inplace) + modin_result = modin_series.cat.as_unordered(inplace=inplace) + df_equals(modin_series, pandas_series) + df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index f9d045a9bf6..4e7f37ad281 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -173,6 +173,14 @@ test_data_with_duplicates_values = list(test_data_with_duplicates.values()) test_data_with_duplicates_keys = list(test_data_with_duplicates.keys()) +test_data_categorical = { + "ordered": pandas.Categorical(list("testdata"), ordered=True), + "unordered": pandas.Categorical(list("testdata"), ordered=False), +} + +test_data_categorical_values = list(test_data_categorical.values()) +test_data_categorical_keys = list(test_data_categorical.keys()) + numeric_dfs = [ "empty_data", "columns_only", From 36106b42eb0d731545be53f23ce6a2a7f6f1bc46 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Fri, 21 Aug 2020 11:32:49 +0300 Subject: [PATCH 067/120] REFACTOR-#1938: avoid index access in a simple column reference (#1939) Signed-off-by: ienkovich --- modin/pandas/base.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 
deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index bd2f693246f..14d908553e4 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3442,9 +3442,14 @@ def __getitem__(self, key): return self._default_to_pandas("__getitem__", key) # see if we can slice the rows # This lets us reuse code in Pandas to error check - indexer = convert_to_index_sliceable( - getattr(pandas, type(self).__name__)(index=self.index), key - ) + indexer = None + if isinstance(key, slice) or ( + isinstance(key, str) + and (not hasattr(self, "columns") or key not in self.columns) + ): + indexer = convert_to_index_sliceable( + getattr(pandas, type(self).__name__)(index=self.index), key + ) if indexer is not None: return self._getitem_slice(indexer) else: From 39afc07455dd0dad1807cdb1b8b1d18c4c3d8775 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Sat, 22 Aug 2020 14:43:31 +0300 Subject: [PATCH 068/120] FEAT-#1936: avoid empty frame checks for lazy backend (#1937) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 2 ++ modin/pandas/base.py | 4 ++-- modin/pandas/concat.py | 3 ++- modin/pandas/dataframe.py | 4 ++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index afb46b10050..59d64543080 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -45,6 +45,8 @@ def default_to_pandas(self, pandas_op, *args, **kwargs): # some of these abstract methods, but for the sake of generality they are # treated differently. 
+ lazy_execution = False + # Metadata modification abstract methods @abc.abstractmethod def add_prefix(self, prefix, axis=1): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 14d908553e4..b569d0dc9d7 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3438,7 +3438,7 @@ def __ge__(self, right): return self.ge(right) def __getitem__(self, key): - if len(self) == 0: + if not self._query_compiler.lazy_execution and len(self) == 0: return self._default_to_pandas("__getitem__", key) # see if we can slice the rows # This lets us reuse code in Pandas to error check @@ -3574,7 +3574,7 @@ def __getattribute__(self, item): "_create_or_update_from_compiler", "_update_inplace", ] - if item not in default_behaviors: + if item not in default_behaviors and not self._query_compiler.lazy_execution: method = object.__getattribute__(self, item) is_callable = callable(method) # We default to pandas on empty DataFrames. This avoids a large amount of diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py index f7988328960..806b624435a 100644 --- a/modin/pandas/concat.py +++ b/modin/pandas/concat.py @@ -106,7 +106,8 @@ def concat( list_of_objs = [ obj._query_compiler for obj in list_of_objs - if len(obj.index) or len(obj.columns) + if (not obj._query_compiler.lazy_execution and len(obj.index)) + or len(obj.columns) ] if keys is not None: if all_series: diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 18075d4df54..4065bf35876 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1134,7 +1134,7 @@ def insert(self, loc, column, value, allow_duplicates=False): # TODO: Remove broadcast of Series value = value._to_pandas() - if len(self.index) == 0: + if not self._query_compiler.lazy_execution and len(self.index) == 0: try: value = pandas.Series(value) except (TypeError, ValueError, IndexError): @@ -2577,7 +2577,7 @@ def setitem_without_string_columns(df): if not isinstance(value, Series): value = list(value) - if 
len(self.index) == 0: + if not self._query_compiler.lazy_execution and len(self.index) == 0: new_self = DataFrame({key: value}, columns=self.columns) self._update_inplace(new_self._query_compiler) else: From c03f98c4db4fece38093425e5c54b68b96a2d73f Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Mon, 24 Aug 2020 10:22:32 -0400 Subject: [PATCH 069/120] FIX-#1898: Support DataFrame.__setitem__ with boolean mask. (#1899) Signed-off-by: Itamar Turner-Trauring --- modin/pandas/dataframe.py | 17 +++++++++++++++-- modin/pandas/test/test_dataframe.py | 27 +++++++++++++++++++++++++++ modin/pandas/utils.py | 9 +++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 4065bf35876..de377078c56 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -33,7 +33,13 @@ import warnings from modin.error_message import ErrorMessage -from .utils import from_pandas, from_non_pandas, to_pandas, _inherit_docstrings +from .utils import ( + from_pandas, + from_non_pandas, + to_pandas, + _inherit_docstrings, + hashable, +) from .iterator import PartitionIterator from .series import Series from .base import BasePandasDataset @@ -2515,7 +2521,7 @@ def __setattr__(self, key, value): object.__setattr__(self, key, value) def __setitem__(self, key, value): - if key not in self.columns: + if hashable(key) and key not in self.columns: # Handle new column case first if isinstance(value, Series): if len(self.columns) == 0: @@ -2554,6 +2560,13 @@ def __setitem__(self, key, value): if not isinstance(key, str): + if isinstance(key, DataFrame) or isinstance(key, np.ndarray): + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array must be same shape as DataFrame") + key = DataFrame(key, columns=self.columns) + return self.mask(key, value, inplace=True) + def setitem_without_string_columns(df): # Arrow makes memory-mapped objects immutable, so copy will allow them # to 
be mutable again. diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index d8b63ffea58..8f1ac522462 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5369,6 +5369,33 @@ def test___setitem__(self, data): df_equals(modin_df, pandas_df) + def test___setitem__mask(self): + # DataFrame mask: + data = test_data["int_data"] + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + mean = int((RAND_HIGH + RAND_LOW) / 2) + pandas_df[pandas_df > mean] = -50 + modin_df[modin_df > mean] = -50 + + df_equals(modin_df, pandas_df) + + # Array mask: + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + array = (pandas_df > mean).to_numpy() + + modin_df[array] = -50 + pandas_df[array] = -50 + + df_equals(modin_df, pandas_df) + + # Array mask of wrong size: + with pytest.raises(ValueError): + array = np.array([[1, 2], [3, 4]]) + modin_df[array] = 20 + @pytest.mark.parametrize( "data", [ diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index d4c17fbeece..d2c346d05ff 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -146,3 +146,12 @@ def wrapper(*args, **kwargs): wrapper.__name__ = func.__name__ return wrapper + + +def hashable(obj): + """Return whether the object is hashable.""" + try: + hash(obj) + except TypeError: + return False + return True From 5d59c99ffba35df7eedaec71eac924a35a8cd908 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Mon, 24 Aug 2020 19:57:24 +0300 Subject: [PATCH 070/120] FIX-#1784: removed columns sort from 'df_equals' (#1896) Signed-off-by: Dmitry Chigarev --- modin/pandas/test/utils.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 4e7f37ad281..352013b4b26 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -496,26 +496,15 @@ def 
df_equals(df1, df2): return False if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame): - try: - assert_frame_equal( - df1.sort_index(axis=1), - df2.sort_index(axis=1), - check_dtype=False, - check_datetimelike_compat=True, - check_index_type=False, - check_column_type=False, - check_categorical=False, - ) - except Exception: - assert_frame_equal( - df1, - df2, - check_dtype=False, - check_datetimelike_compat=True, - check_index_type=False, - check_column_type=False, - check_categorical=False, - ) + assert_frame_equal( + df1, + df2, + check_dtype=False, + check_datetimelike_compat=True, + check_index_type=False, + check_column_type=False, + check_categorical=False, + ) df_categories_equals(df1, df2) elif isinstance(df1, types_for_almost_equals) and isinstance( df2, types_for_almost_equals From 82849bb9ccfb0cde899a02386f6f3b9bc336a333 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 25 Aug 2020 16:43:27 +0300 Subject: [PATCH 071/120] REFACTOR-#1941: move index access from DataFrame.insert to backend (#1942) Signed-off-by: ienkovich --- modin/backends/pandas/query_compiler.py | 2 ++ modin/pandas/dataframe.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index a0e9db57d0a..0c92ca12b80 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1898,6 +1898,8 @@ def insert(self, loc, column, value): value = value.reindex(self.index) else: value = list(value) + else: + value = [value] * len(self.index) def insert(df, internal_indices=[]): internal_idx = int(internal_indices[0]) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index de377078c56..b0e15435de5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1159,9 +1159,11 @@ def insert(self, loc, column, value, allow_duplicates=False): data=value, columns=[column], index=self.index )._query_compiler 
else: - if not is_list_like(value): - value = np.full(len(self.index), value) - if not isinstance(value, pandas.Series) and len(value) != len(self.index): + if ( + is_list_like(value) + and not isinstance(value, pandas.Series) + and len(value) != len(self.index) + ): raise ValueError("Length of values does not match length of index") if not allow_duplicates and column in self.columns: raise ValueError("cannot insert {0}, already exists".format(column)) From 66a4a87f9c894a5b8687740512767463777e064a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 26 Aug 2020 13:39:52 +0300 Subject: [PATCH 072/120] TEST-#1955: speed up TestDataFrameDefault test (#1956) * TEST-#1955: speed up TestDataFrameDefault test Signed-off-by: Anatoly Myachev * TEST-#1955: use random_state Signed-off-by: Anatoly Myachev --- modin/pandas/test/test_dataframe.py | 120 ++++++++++++++++++---------- modin/pandas/test/utils.py | 8 ++ 2 files changed, 86 insertions(+), 42 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 8f1ac522462..b20e7bb37b1 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -61,6 +61,7 @@ test_data_small_keys, udf_func_values, udf_func_keys, + generate_multiindex, ) pd.DEFAULT_NPARTITIONS = 4 @@ -2333,11 +2334,31 @@ def test_infer_objects(self): pd.DataFrame(data).infer_objects() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("verbose", [None, True, False]) - @pytest.mark.parametrize("max_cols", [None, 10, 99999999]) - @pytest.mark.parametrize("memory_usage", [None, True, False, "deep"]) - @pytest.mark.parametrize("null_counts", [None, True, False]) - def test_info(self, data, verbose, max_cols, memory_usage, null_counts): + def test_info_default_param(self, data): + with io.StringIO() as first, io.StringIO() as second: + eval_general( + pd.DataFrame(data), + 
pandas.DataFrame(data), + verbose=None, + max_cols=None, + memory_usage=None, + null_counts=None, + operation=lambda df, **kwargs: df.info(**kwargs), + buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, + ) + modin_info = first.getvalue().splitlines() + pandas_info = second.getvalue().splitlines() + + assert modin_info[0] == str(pd.DataFrame) + assert pandas_info[0] == str(pandas.DataFrame) + assert modin_info[1:] == pandas_info[1:] + + @pytest.mark.parametrize("verbose", [True, False]) + @pytest.mark.parametrize("max_cols", [10, 99999999]) + @pytest.mark.parametrize("memory_usage", [True, False, "deep"]) + @pytest.mark.parametrize("null_counts", [True, False]) + def test_info(self, verbose, max_cols, memory_usage, null_counts): + data = test_data_values[0] with io.StringIO() as first, io.StringIO() as second: eval_general( pd.DataFrame(data), @@ -2361,39 +2382,41 @@ def test_interpolate(self): with pytest.warns(UserWarning): pd.DataFrame(data).interpolate() + def test_kurt_kurtosis_equals(self): + # It's optimization. If failed, df.kurt should be tested explicitly + # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`. 
+ data = test_data_values[0] + df_modin = pd.DataFrame(data) + assert df_modin.kurt == df_modin.kurtosis + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", bool_arg_values, ids=bool_arg_keys) - @pytest.mark.parametrize("level", [None, -1, 0, 1]) @pytest.mark.parametrize("numeric_only", bool_arg_values, ids=bool_arg_keys) - def test_kurt_kurtosis(self, axis, skipna, level, numeric_only): - func_kwargs = { - "axis": axis, - "skipna": skipna, - "level": level, - "numeric_only": numeric_only, - } + def test_kurt_kurtosis(self, axis, skipna, numeric_only): data = test_data_values[0] df_modin = pd.DataFrame(data) df_pandas = pandas.DataFrame(data) eval_general( - df_modin, df_pandas, lambda df: df.kurtosis(**func_kwargs), + df_modin, + df_pandas, + lambda df: df.kurtosis( + axis=axis, skipna=skipna, level=None, numeric_only=numeric_only + ), ) - if level is not None: - cols_number = len(data.keys()) - arrays = [ - np.random.choice(["bar", "baz", "foo", "qux"], cols_number), - np.random.choice(["one", "two"], cols_number), - ] - index = pd.MultiIndex.from_tuples( - list(zip(*arrays)), names=["first", "second"] - ) - df_modin.columns = index - df_pandas.columns = index - eval_general( - df_modin, df_pandas, lambda df: df.kurtosis(**func_kwargs), - ) + @pytest.mark.parametrize("level", [-1, 0, 1]) + def test_kurt_kurtosis_level(self, level): + data = test_data_values[0] + df_modin = pd.DataFrame(data) + df_pandas = pandas.DataFrame(data) + + index = generate_multiindex(len(data.keys())) + df_modin.columns = index + df_pandas.columns = index + eval_general( + df_modin, df_pandas, lambda df: df.kurtosis(axis=1, level=level), + ) def test_last(self): modin_index = pd.date_range("2010-04-09", periods=400, freq="2D") @@ -2415,12 +2438,23 @@ def test_lookup(self): @pytest.mark.parametrize("data", test_data_values) @pytest.mark.parametrize("axis", [None, 0, 1]) @pytest.mark.parametrize("skipna", [None, True, False]) - 
@pytest.mark.parametrize("level", [0, -1, None]) - def test_mad(self, level, data, axis, skipna): + def test_mad(self, data, axis, skipna): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) df_equals( - modin_df.mad(axis=axis, skipna=skipna, level=level), - pandas_df.mad(axis=axis, skipna=skipna, level=level), + modin_df.mad(axis=axis, skipna=skipna, level=None), + pandas_df.mad(axis=axis, skipna=skipna, level=None), + ) + + @pytest.mark.parametrize("level", [-1, 0, 1]) + def test_mad_level(self, level): + data = test_data_values[0] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + index = generate_multiindex(len(data.keys())) + modin_df.columns = index + pandas_df.columns = index + eval_general( + modin_df, pandas_df, lambda df: df.mad(axis=1, level=level), ) def test_mask(self): @@ -2677,19 +2711,21 @@ def test_style(self): pd.DataFrame(data).style @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis1", [0, 1, "columns", "index"]) - @pytest.mark.parametrize("axis2", [0, 1, "columns", "index"]) + @pytest.mark.parametrize("axis1", [0, 1]) + @pytest.mark.parametrize("axis2", [0, 1]) def test_swapaxes(self, data, axis1, axis2): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) - try: - pandas_result = pandas_df.swapaxes(axis1, axis2) - except Exception as e: - with pytest.raises(type(e)): - modin_df.swapaxes(axis1, axis2) - else: - modin_result = modin_df.swapaxes(axis1, axis2) - df_equals(modin_result, pandas_result) + + pandas_result = pandas_df.swapaxes(axis1, axis2) + modin_result = modin_df.swapaxes(axis1, axis2) + df_equals(modin_result, pandas_result) + + def test_swapaxes_axes_names(self): + modin_df = pd.DataFrame(test_data_values[0]) + modin_result1 = modin_df.swapaxes(0, 1) + modin_result2 = modin_df.swapaxes("columns", "index") + df_equals(modin_result1, modin_result2) def test_swaplevel(self): data = np.random.randint(1, 100, 12) diff --git 
a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 352013b4b26..8c794b0cfc6 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -681,6 +681,14 @@ def generate_multiindex(index): return df1, df2 +def generate_multiindex(cols_number): + arrays = [ + random_state.choice(["bar", "baz", "foo", "qux"], cols_number), + random_state.choice(["one", "two"], cols_number), + ] + return pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["first", "second"]) + + def generate_none_dfs(): df = pandas.DataFrame( { From f8732275901e97e956d5dee6277442b49eca682b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 26 Aug 2020 13:52:58 +0300 Subject: [PATCH 073/120] TEST-#1950: improve TestDataFrameIter test time (#1947) * TEST-#1950: improve TestDataFrameIter test time Signed-off-by: Anatoly Myachev * TEST-#1950: speed up some iter tests Signed-off-by: Anatoly Myachev --- modin/pandas/test/test_dataframe.py | 46 ++++++++++------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index b20e7bb37b1..23695eaf20b 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5487,10 +5487,9 @@ def test_index_order(self): class TestDataFrameIter: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_items(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) + def test_items(self): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) modin_items = modin_df.items() pandas_items = pandas_df.items() @@ -5500,10 +5499,9 @@ def test_items(self, data): df_equals(pandas_series, modin_series) assert pandas_index == modin_index - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_iteritems(self, data): - modin_df = pd.DataFrame(data) - pandas_df = 
pandas.DataFrame(data) + def test_iteritems(self): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) modin_items = modin_df.iteritems() pandas_items = pandas_df.iteritems() @@ -5513,10 +5511,9 @@ def test_iteritems(self, data): df_equals(pandas_series, modin_series) assert pandas_index == modin_index - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_iterrows(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) + def test_iterrows(self): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) modin_iterrows = modin_df.iterrows() pandas_iterrows = pandas_df.iterrows() @@ -5528,16 +5525,9 @@ def test_iterrows(self, data): @pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) @pytest.mark.parametrize("index", [True, False]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_itertuples(self, name, index, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # test default - modin_it_default = modin_df.itertuples() - pandas_it_default = pandas_df.itertuples() - for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): - np.testing.assert_equal(modin_row, pandas_row) + def test_itertuples(self, name, index): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) modin_it_custom = modin_df.itertuples(index=index, name=name) pandas_it_custom = pandas_df.itertuples(index=index, name=name) @@ -5552,20 +5542,14 @@ def test_itertuples(self, name, index, data): ) modin_df.columns = mi_index_modin pandas_df.columns = mi_index_pandas - modin_it_default = modin_df.itertuples() - pandas_it_default = pandas_df.itertuples() - for modin_row, pandas_row in zip(modin_it_default, pandas_it_default): - np.testing.assert_equal(modin_row, pandas_row) - modin_it_custom = modin_df.itertuples(index=index, name=name) 
pandas_it_custom = pandas_df.itertuples(index=index, name=name) for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): np.testing.assert_equal(modin_row, pandas_row) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___iter__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) + def test___iter__(self): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) modin_iterator = modin_df.__iter__() From 9b27e9347a4fe711154f329228085d4f5495f544 Mon Sep 17 00:00:00 2001 From: YarShev Date: Wed, 26 Aug 2020 15:44:48 +0300 Subject: [PATCH 074/120] FEAT-#1196: Support `replace` method for `DataFrame` and `Series` (#1943) Signed-off-by: Igoshev, Yaroslav --- docs/supported_apis/dataframe_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 ++ modin/backends/pandas/query_compiler.py | 1 + modin/pandas/base.py | 19 ------- modin/pandas/dataframe.py | 58 ++++++++++++++++++++ modin/pandas/series.py | 59 +++++++++++++++++++++ modin/pandas/test/test_dataframe.py | 34 ++++++++++-- modin/pandas/test/test_series.py | 15 ++++-- 8 files changed, 164 insertions(+), 28 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 7ff26076737..1b35b23480e 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -301,7 +301,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``reorder_levels`` | `reorder_levels`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``replace`` | `replace`_ | D | | +| ``replace`` | `replace`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``resample`` | `resample`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 59d64543080..9accf099c4f 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -501,6 +501,10 @@ def notna(self): def round(self, **kwargs): pass + @abc.abstractmethod + def replace(self, **kwargs): + pass + @abc.abstractmethod def series_view(self, **kwargs): pass diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 0c92ca12b80..0a8059fff2f 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1054,6 +1054,7 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs): negative = MapFunction.register(pandas.DataFrame.__neg__) notna = MapFunction.register(pandas.DataFrame.notna, dtypes=np.bool) round = MapFunction.register(pandas.DataFrame.round) + replace = MapFunction.register(pandas.DataFrame.replace) series_view = MapFunction.register( lambda df, *args, **kwargs: pandas.DataFrame( df.squeeze(axis=1).view(*args, **kwargs) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index b569d0dc9d7..175ec39ca67 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2176,25 +2176,6 @@ def reorder_levels(self, order, 
axis=0): new_labels = self.axes[axis].reorder_levels(order) return self.set_axis(new_labels, axis=axis, inplace=False) - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method="pad", - ): - return self._default_to_pandas( - "replace", - to_replace=to_replace, - value=value, - inplace=inplace, - limit=limit, - regex=regex, - method=method, - ) - def resample( self, rule, diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b0e15435de5..84ec19b0918 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1872,6 +1872,64 @@ def rename( if not inplace: return obj + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + """ + Replace values given in `to_replace` with `value`. + + Values of the DaraFrame are replaced with other values dynamically. + This differs from updating with .loc or .iloc, which require + you to specify a location to update with some value. + + Parameters + ---------- + to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + inplace : bool, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. + limit : int, default None + Maximum size gap to forward or backward fill. + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. 
Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. + method : {{'pad', 'ffill', 'bfill', `None`}} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + + Returns + ------- + DataFrame + Object after replacement. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + new_query_compiler = self._query_compiler.replace( + to_replace=to_replace, + value=value, + inplace=False, + limit=limit, + regex=regex, + method=method, + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + def _set_axis_name(self, name, axis=0, inplace=False): """Alter the name or names of the axis. diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 7d86adcd3a7..46290d7f56a 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -15,6 +15,7 @@ import numpy as np import pandas from pandas.core.common import apply_if_callable, is_bool_indexer +from pandas.util._validators import validate_bool_kwarg import pandas._libs.lib as lib from pandas.core.dtypes.common import ( is_dict_like, @@ -1304,6 +1305,64 @@ def quantile(self, q=0.5, interpolation="linear"): def reorder_levels(self, order): return super(Series, self).reorder_levels(order) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + """ + Replace values given in `to_replace` with `value`. + + Values of the Series are replaced with other values dynamically. + This differs from updating with .loc or .iloc, which require + you to specify a location to update with some value. + + Parameters + ---------- + to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. 
+ For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + inplace : bool, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. + limit : int, default None + Maximum size gap to forward or backward fill. + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. + method : {{'pad', 'ffill', 'bfill', `None`}} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + + Returns + ------- + Series + Object after replacement. 
+ """ + inplace = validate_bool_kwarg(inplace, "inplace") + new_query_compiler = self._query_compiler.replace( + to_replace=to_replace, + value=value, + inplace=False, + limit=limit, + regex=regex, + method=method, + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + def searchsorted(self, value, side="left", sorter=None): return self._default_to_pandas( pandas.Series.searchsorted, value, side=side, sorter=sorter diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 23695eaf20b..4ebf4fc313a 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2546,9 +2546,37 @@ def test_plot(self, request, data): assert np.array_equal(left.get_xdata(), right.get_xdata()) def test_replace(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).replace() + modin_df = pd.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} + ) + pandas_df = pandas.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} + ) + modin_result = modin_df.replace({"A": 0, "B": 5}, 100) + pandas_result = pandas_df.replace({"A": 0, "B": 5}, 100) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.replace({"A": {0: 100, 4: 400}}) + pandas_result = pandas_df.replace({"A": {0: 100, 4: 400}}) + df_equals(modin_result, pandas_result) + + modin_df = pd.DataFrame( + {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} + ) + pandas_df = pandas.DataFrame( + {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} + ) + modin_result = modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) + pandas_result = pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.replace(regex=[r"^ba.$", "foo"], value="new") + pandas_result = pandas_df.replace(regex=[r"^ba.$", "foo"], value="new") + df_equals(modin_result, 
pandas_result) + + modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) + pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) + df_equals(modin_df, pandas_df) @pytest.mark.parametrize("rule", ["5T", pandas.offsets.Hour()]) @pytest.mark.parametrize("axis", [0, "columns"]) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 5c4f21b2a9a..88b89549812 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2400,11 +2400,16 @@ def test_repeat_lists(data, repeats): ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_replace(data): - modin_series, _ = create_test_series(data) # noqa: F841 - with pytest.warns(UserWarning): - modin_series.replace(0, 5) +def test_replace(): + modin_series = pd.Series([0, 1, 2, 3, 4]) + pandas_series = pandas.Series([0, 1, 2, 3, 4]) + modin_result = modin_series.replace(0, 5) + pandas_result = pandas_series.replace(0, 5) + df_equals(modin_result, pandas_result) + + modin_result = modin_series.replace([1, 2], method="bfill") + pandas_result = pandas_series.replace([1, 2], method="bfill") + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("closed", ["left", "right"]) From 843816ecbccb6435ba7f29688301f4198277cd27 Mon Sep 17 00:00:00 2001 From: YarShev Date: Wed, 26 Aug 2020 15:49:30 +0300 Subject: [PATCH 075/120] FEAT-#1922: Support cases for `DataFrame.join` (#1923) when `on` is set to `left` and `inner` Signed-off-by: Igoshev, Yaroslav --- docs/supported_apis/dataframe_supported.rst | 3 +- modin/backends/base/query_compiler.py | 21 ++++++ modin/backends/pandas/query_compiler.py | 35 ++++++++++ modin/pandas/dataframe.py | 23 ++++--- modin/pandas/test/test_dataframe.py | 71 ++++++++++++++++++++- 5 files changed, 139 insertions(+), 14 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 1b35b23480e..1ad585ce228 100644 --- 
a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -197,7 +197,8 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``itertuples`` | `itertuples`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``join`` | `join`_ | Y | | +| ``join`` | `join`_ | P | When ``on`` is set to ``right`` or ``outer`` | +| | | | it defaults to pandas | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``keys`` | `keys`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 9accf099c4f..f25ad9b6791 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -315,6 +315,27 @@ def merge(self, right, **kwargs): """ pass + @abc.abstractmethod + def join(self, right, **kwargs): + """ + Join columns of another DataFrame. + + Parameters + ---------- + right : BaseQueryCompiler + The query compiler of the right DataFrame to join with. + + Returns + ------- + BaseQueryCompiler + A new query compiler that contains result of the join. + + Notes + ----- + See pd.DataFrame.join for more info on kwargs. 
+ """ + pass + # END Abstract inter-data operations # Abstract Transpose diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 0a8059fff2f..281ac366730 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -448,6 +448,41 @@ def map_func(left, right=right, kwargs=kwargs): else: return self.default_to_pandas(pandas.DataFrame.merge, right, **kwargs) + def join(self, right, **kwargs): + """ + Join columns of another DataFrame. + + Parameters + ---------- + right : BaseQueryCompiler + The query compiler of the right DataFrame to join with. + + Returns + ------- + BaseQueryCompiler + A new query compiler that contains result of the join. + + Notes + ----- + See pd.DataFrame.join for more info on kwargs. + """ + on = kwargs.get("on", None) + how = kwargs.get("how", "left") + sort = kwargs.get("sort", False) + + if how in ["left", "inner"]: + right = right.to_pandas() + + def map_func(left, right=right, kwargs=kwargs): + return pandas.DataFrame.join(left, right, **kwargs) + + new_self = self.__constructor__( + self._modin_frame._apply_full_axis(1, map_func) + ) + return new_self.sort_rows_by_column_values(on) if sort else new_self + else: + return self.default_to_pandas(pandas.DataFrame.join, right, **kwargs) + # END Inter-Data operations # Reindex/reset_index (may shuffle data) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 84ec19b0918..922a8e37cb5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1311,22 +1311,21 @@ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): DataFrame A dataframe containing columns from both the caller and other. 
""" - if on is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.join, - other, - on=on, - how=how, - lsuffix=lsuffix, - rsuffix=rsuffix, - sort=sort, - ) if isinstance(other, Series): if other.name is None: raise ValueError("Other Series must have a name") other = DataFrame({other.name: other}) + if on is not None: + return self.__constructor__( + query_compiler=self._query_compiler.join( + other._query_compiler, + on=on, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + sort=sort, + ) + ) if isinstance(other, DataFrame): # Joining the empty DataFrames with either index or columns is # fast. It gives us proper error checking for the edge cases that diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 4ebf4fc313a..21c5dfc2e16 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5864,7 +5864,76 @@ def test_combine(self, data): pandas_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 ) - def test_join(self): + @pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ], + ) + def test_join(self, test_data, test_data2): + modin_df = pd.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + ) + pandas_df = pandas.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pandas.Index( + [i 
for i in range(1, test_data.shape[0] + 1)], name="key" + ), + ) + modin_df2 = pd.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + ) + pandas_df2 = pandas.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pandas.Index( + [i for i in range(1, test_data2.shape[0] + 1)], name="key" + ), + ) + + hows = ["inner", "left", "right", "outer"] + ons = ["col33", "col34"] + sorts = [False, True] + for i in range(4): + for j in range(2): + modin_result = modin_df.join( + modin_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + pandas_result = pandas_df.join( + pandas_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + df_equals(modin_result, pandas_result) + frame_data = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], From 81cf80c45febec2187b14dc428d7a6848ca98bb5 Mon Sep 17 00:00:00 2001 From: YarShev Date: Thu, 27 Aug 2020 17:33:12 +0300 Subject: [PATCH 076/120] REFACTOR-#1973: Refactor code in accordance with (#1974) formatting style of new released version of black Signed-off-by: Igoshev, Yaroslav --- modin/backends/base/query_compiler.py | 3 +- modin/backends/pandas/parsers.py | 6 +- modin/backends/pandas/query_compiler.py | 31 +++-- modin/engines/base/frame/axis_partition.py | 74 ++++++------ modin/engines/base/frame/data.py | 20 +++- modin/engines/base/frame/partition.py | 12 +- .../io/column_stores/column_store_reader.py | 7 +- .../pandas_on_dask/frame/axis_partition.py | 8 +- .../dask/pandas_on_dask/frame/partition.py | 12 +- .../pandas_on_python/frame/axis_partition.py | 8 +- .../pandas_on_python/frame/partition.py | 12 +- .../ray/pandas_on_ray/frame/axis_partition.py | 8 +- modin/experimental/cloud/rpyc_proxy.py | 4 +- .../engines/pandas_on_ray/io_exp.py | 2 +- .../experimental/engines/pandas_on_ray/sql.py | 20 
++-- .../pyarrow_on_ray/frame/axis_partition.py | 8 +- modin/experimental/pandas/io_exp.py | 2 +- modin/pandas/base.py | 109 +++++++++++++----- modin/pandas/general.py | 6 +- modin/pandas/indexing.py | 9 +- modin/pandas/io.py | 2 +- modin/pandas/plotting.py | 2 +- modin/pandas/series.py | 22 ++-- modin/pandas/test/test_concat.py | 3 +- modin/pandas/test/test_dataframe.py | 31 +++-- modin/pandas/test/test_io.py | 3 +- modin/pandas/test/test_rolling.py | 29 +++-- modin/pandas/test/test_series.py | 23 ++-- 28 files changed, 299 insertions(+), 177 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index f25ad9b6791..ffed651c639 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -88,8 +88,7 @@ def concat(self, axis, other, **kwargs): # Data Management Methods @abc.abstractmethod def free(self): - """In the future, this will hopefully trigger a cleanup of this object. - """ + """In the future, this will hopefully trigger a cleanup of this object.""" # TODO create a way to clean up this object. 
pass diff --git a/modin/backends/pandas/parsers.py b/modin/backends/pandas/parsers.py index fe71ff86d1a..e08da2a78ce 100644 --- a/modin/backends/pandas/parsers.py +++ b/modin/backends/pandas/parsers.py @@ -73,8 +73,10 @@ def single_worker_read(cls, fname, **kwargs): pandas_frame = cls.parse(fname, **kwargs) if isinstance(pandas_frame, pandas.io.parsers.TextFileReader): pd_read = pandas_frame.read - pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( - pd_read(*args, **kwargs), cls.frame_cls + pandas_frame.read = ( + lambda *args, **kwargs: cls.query_compiler_cls.from_pandas( + pd_read(*args, **kwargs), cls.frame_cls + ) ) return pandas_frame elif isinstance(pandas_frame, (OrderedDict, dict)): diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 281ac366730..0239a4c5108 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -144,7 +144,7 @@ def caller(df, *args, **kwargs): class PandasQueryCompiler(BaseQueryCompiler): """This class implements the logic necessary for operating on partitions - with a Pandas backend. This logic is specific to Pandas.""" + with a Pandas backend. This logic is specific to Pandas.""" def __init__(self, modin_frame): self._modin_frame = modin_frame @@ -272,8 +272,7 @@ def concat(self, axis, other, **kwargs): # Data Management Methods def free(self): - """In the future, this will hopefully trigger a cleanup of this object. - """ + """In the future, this will hopefully trigger a cleanup of this object.""" # TODO create a way to clean up this object. return @@ -1172,7 +1171,9 @@ def unique(self): The unique values returned as a NumPy array. 
""" new_modin_frame = self._modin_frame._apply_full_axis( - 0, lambda x: x.squeeze(axis=1).unique(), new_columns=self.columns, + 0, + lambda x: x.squeeze(axis=1).unique(), + new_columns=self.columns, ) return self.__constructor__(new_modin_frame) @@ -1728,7 +1729,9 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): ) new_index = pandas.RangeIndex(len(self.index) * len(value_vars)) new_modin_frame = self._modin_frame.__constructor__( - new_parts, index=new_index, columns=id_vars + [var_name, value_name], + new_parts, + index=new_index, + columns=id_vars + [var_name, value_name], ) result = self.__constructor__(new_modin_frame) # this assigment needs to propagate correct indices into partitions @@ -2267,11 +2270,19 @@ def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): broadcast_values2 = broadcast_values2.reset_index(drop=True) # Index may contain duplicates new_index1 = broadcast_values1.sort_values( - by=columns, axis=0, ascending=ascending, kind=kind, na_position=na_position, + by=columns, + axis=0, + ascending=ascending, + kind=kind, + na_position=na_position, ).index # Index without duplicates new_index2 = broadcast_values2.sort_values( - by=columns, axis=0, ascending=ascending, kind=kind, na_position=na_position, + by=columns, + axis=0, + ascending=ascending, + kind=kind, + na_position=na_position, ).index result = self.reset_index(drop=True).reindex(0, new_index2) @@ -2307,7 +2318,11 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): ) broadcast_values.columns = self.columns new_columns = broadcast_values.sort_values( - by=rows, axis=1, ascending=ascending, kind=kind, na_position=na_position, + by=rows, + axis=1, + ascending=ascending, + kind=kind, + na_position=na_position, ).columns return self.reindex(1, new_columns) diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py index f7ba343b9a2..bb8a92c1006 100644 --- 
a/modin/engines/base/frame/axis_partition.py +++ b/modin/engines/base/frame/axis_partition.py @@ -19,23 +19,23 @@ class BaseFrameAxisPartition(object): # pragma: no cover """This abstract class represents the Parent class for any - `ColumnPartition` or `RowPartition` class. This class is intended to - simplify the way that operations are performed - - Note 0: The procedures that use this class and its methods assume that - they have some global knowledge about the entire axis. This may - require the implementation to use concatenation or append on the - list of block partitions in this object. - - Note 1: The `BaseFrameManager` object that controls these objects - (through the API exposed here) has an invariant that requires that - this object is never returned from a function. It assumes that - there will always be `BaseFramePartition` object stored and structures - itself accordingly. - - The abstract methods that need implemented are `apply` and `shuffle`. - The children classes must also implement `instance_type` and `partition_type` - (see below). + `ColumnPartition` or `RowPartition` class. This class is intended to + simplify the way that operations are performed + + Note 0: The procedures that use this class and its methods assume that + they have some global knowledge about the entire axis. This may + require the implementation to use concatenation or append on the + list of block partitions in this object. + + Note 1: The `BaseFrameManager` object that controls these objects + (through the API exposed here) has an invariant that requires that + this object is never returned from a function. It assumes that + there will always be `BaseFramePartition` object stored and structures + itself accordingly. + + The abstract methods that need implemented are `apply` and `shuffle`. + The children classes must also implement `instance_type` and `partition_type` + (see below). 
""" def apply( @@ -99,15 +99,15 @@ def _wrap_partitions(self, partitions): class PandasFrameAxisPartition(BaseFrameAxisPartition): """This abstract class is created to simplify and consolidate the code for - AxisPartitions that run pandas. Because much of the code is similar, this allows - us to reuse this code. + AxisPartitions that run pandas. Because much of the code is similar, this allows + us to reuse this code. - Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition` - objects and creates something interpretable as a pandas DataFrame. + Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition` + objects and creates something interpretable as a pandas DataFrame. - See `modin.engines.ray.pandas_on_ray.axis_partition.PandasOnRayFrameAxisPartition` - for an example on how to override/use this class when the implementation needs - to be augmented. + See `modin.engines.ray.pandas_on_ray.axis_partition.PandasOnRayFrameAxisPartition` + for an example on how to override/use this class when the implementation needs + to be augmented. """ def apply( @@ -181,19 +181,19 @@ def deploy_axis_func( ): """Deploy a function along a full axis in Ray. - Args: - axis: The axis to perform the function along. - func: The function to perform. - num_splits: The number of splits to return - (see `split_result_of_axis_func_pandas`) - kwargs: A dictionary of keyword arguments. - maintain_partitioning: If True, keep the old partitioning if possible. - If False, create a new partition layout. - partitions: All partitions that make up the full axis (row or column) - - Returns: - A list of Pandas DataFrames. - """ + Args: + axis: The axis to perform the function along. + func: The function to perform. + num_splits: The number of splits to return + (see `split_result_of_axis_func_pandas`) + kwargs: A dictionary of keyword arguments. + maintain_partitioning: If True, keep the old partitioning if possible. + If False, create a new partition layout. 
+ partitions: All partitions that make up the full axis (row or column) + + Returns: + A list of Pandas DataFrames. + """ # Pop these off first because they aren't expected by the function. manual_partition = kwargs.pop("manual_partition", False) lengths = kwargs.pop("_lengths", None) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 347ade92166..e5becdf4189 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -288,7 +288,8 @@ def _validate_axis_equality(self, axis: int, force: bool = False): self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches) else: self._set_axis( - axis, self.axes[axis], + axis, + self.axes[axis], ) def _validate_internal_indices(self, mode=None, **kwargs): @@ -1176,7 +1177,12 @@ def filter_full_axis(self, axis, func): ) def _apply_full_axis( - self, axis, func, new_index=None, new_columns=None, dtypes=None, + self, + axis, + func, + new_index=None, + new_columns=None, + dtypes=None, ): """ Perform a function across an entire axis. 
@@ -1272,8 +1278,14 @@ def _apply_full_axis_select_indices( # Get the indices for the axis being applied to (it is the opposite of axis # being applied over) dict_indices = self._get_dict_of_block_index(axis ^ 1, numeric_indices) - new_partitions = self._frame_mgr_cls.apply_func_to_select_indices_along_full_axis( - axis, self._partitions, func, dict_indices, keep_remaining=keep_remaining + new_partitions = ( + self._frame_mgr_cls.apply_func_to_select_indices_along_full_axis( + axis, + self._partitions, + func, + dict_indices, + keep_remaining=keep_remaining, + ) ) # TODO Infer columns and index from `keep_remaining` and `apply_indices` if new_index is None: diff --git a/modin/engines/base/frame/partition.py b/modin/engines/base/frame/partition.py index 3a716be214d..4dc04c8f469 100644 --- a/modin/engines/base/frame/partition.py +++ b/modin/engines/base/frame/partition.py @@ -16,14 +16,14 @@ class BaseFramePartition(object): # pragma: no cover """This abstract class holds the data and metadata for a single partition. - The methods required for implementing this abstract class are listed in - the section immediately following this. + The methods required for implementing this abstract class are listed in + the section immediately following this. - The API exposed by the children of this object is used in - `BaseFrameManager`. + The API exposed by the children of this object is used in + `BaseFrameManager`. - Note: These objects are treated as immutable by `BaseFrameManager` - subclasses. There is no logic for updating inplace. + Note: These objects are treated as immutable by `BaseFrameManager` + subclasses. There is no logic for updating inplace. """ # Abstract methods and fields. 
These must be implemented in order to diff --git a/modin/engines/base/io/column_stores/column_store_reader.py b/modin/engines/base/io/column_stores/column_store_reader.py index dcb22a96b16..e13cf8619f4 100644 --- a/modin/engines/base/io/column_stores/column_store_reader.py +++ b/modin/engines/base/io/column_stores/column_store_reader.py @@ -113,7 +113,12 @@ def build_query_compiler(cls, path, columns, **kwargs): dtypes = cls.build_dtypes(partition_ids[-1], columns) new_query_compiler = cls.query_compiler_cls( cls.frame_cls( - remote_parts, index, columns, row_lens, column_widths, dtypes=dtypes, + remote_parts, + index, + columns, + row_lens, + column_widths, + dtypes=dtypes, ) ) return new_query_compiler diff --git a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py index 0b4d354b50a..5c69e51e46c 100644 --- a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py +++ b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py @@ -79,8 +79,8 @@ def deploy_func_between_two_axis_partitions( class PandasOnDaskFrameColumnPartition(PandasOnDaskFrameAxisPartition): """The column partition implementation for Multiprocess. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 0 @@ -88,8 +88,8 @@ class PandasOnDaskFrameColumnPartition(PandasOnDaskFrameAxisPartition): class PandasOnDaskFrameRowPartition(PandasOnDaskFrameAxisPartition): """The row partition implementation for Multiprocess. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. 
""" axis = 1 diff --git a/modin/engines/dask/pandas_on_dask/frame/partition.py b/modin/engines/dask/pandas_on_dask/frame/partition.py index a702173abc0..4ea55c2a2be 100644 --- a/modin/engines/dask/pandas_on_dask/frame/partition.py +++ b/modin/engines/dask/pandas_on_dask/frame/partition.py @@ -30,14 +30,14 @@ def apply_list_of_funcs(funcs, df): class PandasOnDaskFramePartition(BaseFramePartition): """This abstract class holds the data and metadata for a single partition. - The methods required for implementing this abstract class are listed in - the section immediately following this. + The methods required for implementing this abstract class are listed in + the section immediately following this. - The API exposed by the children of this object is used in - `BaseFrameManager`. + The API exposed by the children of this object is used in + `BaseFrameManager`. - Note: These objects are treated as immutable by `BaseFrameManager` - subclasses. There is no logic for updating inplace. + Note: These objects are treated as immutable by `BaseFrameManager` + subclasses. There is no logic for updating inplace. """ def __init__(self, future, length=None, width=None, call_queue=None): diff --git a/modin/engines/python/pandas_on_python/frame/axis_partition.py b/modin/engines/python/pandas_on_python/frame/axis_partition.py index d0bbc8c2be5..5467111e87c 100644 --- a/modin/engines/python/pandas_on_python/frame/axis_partition.py +++ b/modin/engines/python/pandas_on_python/frame/axis_partition.py @@ -30,8 +30,8 @@ def __init__(self, list_of_blocks): class PandasOnPythonFrameColumnPartition(PandasOnPythonFrameAxisPartition): """The column partition implementation for Ray. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. 
""" axis = 0 @@ -39,8 +39,8 @@ class PandasOnPythonFrameColumnPartition(PandasOnPythonFrameAxisPartition): class PandasOnPythonFrameRowPartition(PandasOnPythonFrameAxisPartition): """The row partition implementation for Ray. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 1 diff --git a/modin/engines/python/pandas_on_python/frame/partition.py b/modin/engines/python/pandas_on_python/frame/partition.py index d2cfc8f238a..f6014e7aa1b 100644 --- a/modin/engines/python/pandas_on_python/frame/partition.py +++ b/modin/engines/python/pandas_on_python/frame/partition.py @@ -19,14 +19,14 @@ class PandasOnPythonFramePartition(BaseFramePartition): """This abstract class holds the data and metadata for a single partition. - The methods required for implementing this abstract class are listed in - the section immediately following this. + The methods required for implementing this abstract class are listed in + the section immediately following this. - The API exposed by the children of this object is used in - `BaseFrameManager`. + The API exposed by the children of this object is used in + `BaseFrameManager`. - Note: These objects are treated as immutable by `BaseFrameManager` - subclasses. There is no logic for updating inplace. + Note: These objects are treated as immutable by `BaseFrameManager` + subclasses. There is no logic for updating inplace. 
""" def __init__(self, data, length=None, width=None, call_queue=None): diff --git a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py index 8aba64c431b..515bbd495ea 100644 --- a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py +++ b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py @@ -72,8 +72,8 @@ def _wrap_partitions(self, partitions): class PandasOnRayFrameColumnPartition(PandasOnRayFrameAxisPartition): """The column partition implementation for Ray. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 0 @@ -81,8 +81,8 @@ class PandasOnRayFrameColumnPartition(PandasOnRayFrameAxisPartition): class PandasOnRayFrameRowPartition(PandasOnRayFrameAxisPartition): """The row partition implementation for Ray. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 1 diff --git a/modin/experimental/cloud/rpyc_proxy.py b/modin/experimental/cloud/rpyc_proxy.py index 17d627d64f0..d7ec8ae824b 100644 --- a/modin/experimental/cloud/rpyc_proxy.py +++ b/modin/experimental/cloud/rpyc_proxy.py @@ -614,7 +614,9 @@ def make_dataframe_groupby_wrapper(DataFrameGroupBy): Look for deatils in make_dataframe_wrapper() and _deliveringWrapper(). 
""" DeliveringDataFrameGroupBy = _deliveringWrapper( - DataFrameGroupBy, ["agg", "aggregate", "apply"], target_name="DataFrameGroupBy", + DataFrameGroupBy, + ["agg", "aggregate", "apply"], + target_name="DataFrameGroupBy", ) return DeliveringDataFrameGroupBy diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py index a3116c22958..c093e93708c 100644 --- a/modin/experimental/engines/pandas_on_ray/io_exp.py +++ b/modin/experimental/engines/pandas_on_ray/io_exp.py @@ -70,7 +70,7 @@ def read_sql( upper_bound=None, max_sessions=None, ): - """ Read SQL query or database table into a DataFrame. + """Read SQL query or database table into a DataFrame. Args: sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. diff --git a/modin/experimental/engines/pandas_on_ray/sql.py b/modin/experimental/engines/pandas_on_ray/sql.py index 2b24e822737..1fa74e367d8 100644 --- a/modin/experimental/engines/pandas_on_ray/sql.py +++ b/modin/experimental/engines/pandas_on_ray/sql.py @@ -16,7 +16,7 @@ def is_distributed(partition_column, lower_bound, upper_bound): - """ Check if is possible distribute a query given that args + """Check if is possible distribute a query given that args Args: partition_column: column used to share the data between the workers @@ -45,7 +45,7 @@ def is_distributed(partition_column, lower_bound, upper_bound): def is_table(engine, sql): - """ Check with the given sql arg is query or table + """Check with the given sql arg is query or table Args: engine: SQLAlchemy connection engine @@ -60,7 +60,7 @@ def is_table(engine, sql): def get_table_metadata(engine, table): - """ Extract all useful infos from the given table + """Extract all useful infos from the given table Args: engine: SQLAlchemy connection engine @@ -76,7 +76,7 @@ def get_table_metadata(engine, table): def get_table_columns(metadata): - """ Extract columns names and python typos from metadata + 
"""Extract columns names and python typos from metadata Args: metadata: Table metadata @@ -92,7 +92,7 @@ def get_table_columns(metadata): def build_query_from_table(name): - """ Create a query given the table name + """Create a query given the table name Args: name: Table name @@ -104,7 +104,7 @@ def build_query_from_table(name): def check_query(query): - """ Check query sanity + """Check query sanity Args: query: query string @@ -120,7 +120,7 @@ def check_query(query): def get_query_columns(engine, query): - """ Extract columns names and python typos from query + """Extract columns names and python typos from query Args: engine: SQLAlchemy connection engine @@ -140,7 +140,7 @@ def get_query_columns(engine, query): def check_partition_column(partition_column, cols): - """ Check partition_column existence and type + """Check partition_column existence and type Args: partition_column: partition_column name @@ -163,7 +163,7 @@ def check_partition_column(partition_column, cols): def get_query_info(sql, con, partition_column): - """ Return a columns name list and the query string + """Return a columns name list and the query string Args: sql: SQL query or table name @@ -189,7 +189,7 @@ def get_query_info(sql, con, partition_column): def query_put_bounders(query, partition_column, start, end): - """ Put bounders in the query + """Put bounders in the query Args: query: SQL query string diff --git a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py index 4896f772adb..be82e790e7b 100644 --- a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py +++ b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py @@ -80,8 +80,8 @@ def shuffle(self, func, num_splits=None, **kwargs): class PyarrowOnRayFrameColumnPartition(PyarrowOnRayFrameAxisPartition): """The column partition implementation for Ray. 
All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 0 @@ -89,8 +89,8 @@ class PyarrowOnRayFrameColumnPartition(PyarrowOnRayFrameAxisPartition): class PyarrowOnRayFrameRowPartition(PyarrowOnRayFrameAxisPartition): """The row partition implementation for Ray. All of the implementation - for this class is in the parent class, and this class defines the axis - to perform the computation over. + for this class is in the parent class, and this class defines the axis + to perform the computation over. """ axis = 1 diff --git a/modin/experimental/pandas/io_exp.py b/modin/experimental/pandas/io_exp.py index a4a58d94439..a0d37c2beb0 100644 --- a/modin/experimental/pandas/io_exp.py +++ b/modin/experimental/pandas/io_exp.py @@ -32,7 +32,7 @@ def read_sql( upper_bound=None, max_sessions=None, ): - """ Read SQL query or database table into a DataFrame. + """Read SQL query or database table into a DataFrame. Args: sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 175ec39ca67..29f3ad892af 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -42,9 +42,9 @@ class BasePandasDataset(object): """This object is the base for most of the common code that exists in - DataFrame/Series. Since both objects share the same underlying representation, - and the algorithms are the same, we use this object to define the general - behavior of those objects and then use those objects to define the output type. + DataFrame/Series. Since both objects share the same underlying representation, + and the algorithms are the same, we use this object to define the general + behavior of those objects and then use those objects to define the output type. 
""" # Siblings are other objects that share the same query compiler. We use this list @@ -606,7 +606,12 @@ def apply( elif not callable(func) and not is_list_like(func): raise TypeError("{} object is not callable".format(type(func))) query_compiler = self._query_compiler.apply( - func, axis, args=args, raw=raw, result_type=result_type, **kwds, + func, + axis, + args=args, + raw=raw, + result_type=result_type, + **kwds, ) return query_compiler @@ -1077,19 +1082,19 @@ def droplevel(self, level, axis=0): def drop_duplicates(self, keep="first", inplace=False, **kwargs): """Return DataFrame with duplicate rows removed, optionally only considering certain columns - Args: - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns - keep : {'first', 'last', False}, default 'first' - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. - inplace : boolean, default False - Whether to drop duplicates in place or to return a copy - - Returns: - deduplicated : DataFrame + Args: + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + keep : {'first', 'last', False}, default 'first' + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. 
+ inplace : boolean, default False + Whether to drop duplicates in place or to return a copy + + Returns: + deduplicated : DataFrame """ inplace = validate_bool_kwarg(inplace, "inplace") subset = kwargs.get("subset", None) @@ -1147,8 +1152,7 @@ def expanding(self, min_periods=1, center=False, axis=0): ) def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for fillna(method='ffill') - """ + """Synonym for fillna(method='ffill')""" return self.fillna( method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace ) @@ -3656,7 +3660,12 @@ def apply(self, func, *args, **kwargs): query_comp_op = self._query_compiler.resample_app_ser dataframe = DataFrame( - query_compiler=query_comp_op(self.resample_args, func, *args, **kwargs,) + query_compiler=query_comp_op( + self.resample_args, + func, + *args, + **kwargs, + ) ) if is_list_like(func) or isinstance(self._dataframe, DataFrame): return dataframe @@ -3675,7 +3684,12 @@ def aggregate(self, func, *args, **kwargs): query_comp_op = self._query_compiler.resample_agg_ser dataframe = DataFrame( - query_compiler=query_comp_op(self.resample_args, func, *args, **kwargs,) + query_compiler=query_comp_op( + self.resample_args, + func, + *args, + **kwargs, + ) ) if is_list_like(func) or isinstance(self._dataframe, DataFrame): return dataframe @@ -3786,21 +3800,30 @@ def nunique(self, _method="nunique", *args, **kwargs): def first(self, _method="first", *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_first( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) def last(self, _method="last", *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_last( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) def max(self, _method="max", *args, **kwargs): return self._dataframe.__constructor__( 
query_compiler=self._query_compiler.resample_max( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) @@ -3821,7 +3844,10 @@ def median(self, _method="median", *args, **kwargs): def min(self, _method="min", *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_min( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) @@ -3831,13 +3857,19 @@ def ohlc(self, _method="ohlc", *args, **kwargs): if isinstance(self._dataframe, DataFrame): return DataFrame( query_compiler=self._query_compiler.resample_ohlc_df( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) else: return DataFrame( query_compiler=self._query_compiler.resample_ohlc_ser( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) @@ -3858,7 +3890,10 @@ def size(self): def sem(self, _method="sem", *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.resample_sem( - self.resample_args, _method, *args, **kwargs, + self.resample_args, + _method, + *args, + **kwargs, ) ) @@ -4077,18 +4112,30 @@ def apply( ): return self._dataframe.__constructor__( query_compiler=self._query_compiler.rolling_apply( - self.rolling_args, func, raw, engine, engine_kwargs, args, kwargs, + self.rolling_args, + func, + raw, + engine, + engine_kwargs, + args, + kwargs, ) ) def aggregate( - self, func, *args, **kwargs, + self, + func, + *args, + **kwargs, ): from .dataframe import DataFrame dataframe = DataFrame( query_compiler=self._query_compiler.rolling_aggregate( - self.rolling_args, func, *args, **kwargs, + self.rolling_args, + func, + *args, + **kwargs, ) ) if isinstance(self._dataframe, DataFrame): diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 3737ce7e008..c8b635ef4f2 100644 --- a/modin/pandas/general.py +++ 
b/modin/pandas/general.py @@ -372,5 +372,9 @@ def value_counts( It slightly differ from pandas where indices are located in random order. """ return Series(values).value_counts( - sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, + sort=sort, + ascending=ascending, + normalize=normalize, + bins=bins, + dropna=dropna, ) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 0b3c3d872e0..dd313c379a7 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -101,8 +101,7 @@ def _parse_tuple(tup): def _compute_ndim(row_loc, col_loc): - """Compute the ndim of result from locators - """ + """Compute the ndim of result from locators""" row_scaler = is_scalar(row_loc) or is_tuple(row_loc) col_scaler = is_scalar(col_loc) or is_tuple(col_loc) @@ -117,8 +116,7 @@ def _compute_ndim(row_loc, col_loc): class _LocationIndexerBase(object): - """Base class for location indexer like loc and iloc - """ + """Base class for location indexer like loc and iloc""" def __init__(self, modin_df): self.df = modin_df @@ -226,8 +224,7 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape): ) def _write_items(self, row_lookup, col_lookup, item): - """Perform remote write and replace blocks. - """ + """Perform remote write and replace blocks.""" new_qc = self.qc.write_items(row_lookup, col_lookup, item) self.df._create_or_update_from_compiler(new_qc, inplace=True) diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 75878855555..f8dceb5db8c 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -345,7 +345,7 @@ def read_sql( columns=None, chunksize=None, ): - """ Read SQL query or database table into a DataFrame. + """Read SQL query or database table into a DataFrame. Args: sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. 
diff --git a/modin/pandas/plotting.py b/modin/pandas/plotting.py index 1d0f629f023..3ba9af5d7d4 100644 --- a/modin/pandas/plotting.py +++ b/modin/pandas/plotting.py @@ -34,7 +34,7 @@ def __dir__(self): def __getattribute__(self, item): """This method will override the parameters passed and convert any Modin - DataFrames to pandas so that they can be plotted normally + DataFrames to pandas so that they can be plotted normally """ if hasattr(pdplot, item): func = getattr(pdplot, item) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 46290d7f56a..c2c308be80d 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -51,17 +51,17 @@ def __init__( query_compiler=None, ): """ - One-dimensional ndarray with axis labels (including time series). - - Args: - data: Contains data stored in Series. - index: Values must be hashable and have the same length as `data`. - dtype: Data type for the output Series. If not specified, this will be - inferred from `data`. - name: The name to give to the Series. - copy: Copy input data. - query_compiler: A query compiler object to create the Series from. - """ + One-dimensional ndarray with axis labels (including time series). + + Args: + data: Contains data stored in Series. + index: Values must be hashable and have the same length as `data`. + dtype: Data type for the output Series. If not specified, this will be + inferred from `data`. + name: The name to give to the Series. + copy: Copy input data. + query_compiler: A query compiler object to create the Series from. 
+ """ if isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index f1caf30fc81..9747c3d5a27 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -213,6 +213,7 @@ def test_sort_order(sort, join, axis): pandas_concat = pandas.concat([pandas_df, pandas_df2], join=join, sort=sort) modin_concat = pd.concat([modin_df, modin_df2], join=join, sort=sort) df_equals( - pandas_concat, modin_concat, + pandas_concat, + modin_concat, ) assert list(pandas_concat.columns) == list(modin_concat.columns) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 21c5dfc2e16..3aa43bce7ae 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2188,7 +2188,8 @@ def test_cov(self): df_equals(modin_result, pandas_result) @pytest.mark.skipif( - os.name == "nt", reason="AssertionError: numpy array are different", + os.name == "nt", + reason="AssertionError: numpy array are different", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dot(self, data): @@ -2237,7 +2238,8 @@ def test_dot(self, data): df_equals(modin_result, pandas_result) @pytest.mark.skipif( - os.name == "nt", reason="AssertionError: numpy array are different", + os.name == "nt", + reason="AssertionError: numpy array are different", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_matmul(self, data): @@ -2415,7 +2417,9 @@ def test_kurt_kurtosis_level(self, level): df_modin.columns = index df_pandas.columns = index eval_general( - df_modin, df_pandas, lambda df: df.kurtosis(axis=1, level=level), + df_modin, + df_pandas, + lambda df: df.kurtosis(axis=1, level=level), ) def test_last(self): @@ -2454,7 +2458,9 @@ def test_mad_level(self, level): modin_df.columns = index pandas_df.columns = index eval_general( - modin_df, pandas_df, 
lambda df: df.mad(axis=1, level=level), + modin_df, + pandas_df, + lambda df: df.mad(axis=1, level=level), ) def test_mask(self): @@ -2644,10 +2650,12 @@ def test_resample(self, rule, axis, closed, label, on, level): pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), ) df_equals( - pandas_resampler.aggregate("max"), modin_resampler.aggregate("max"), + pandas_resampler.aggregate("max"), + modin_resampler.aggregate("max"), ) df_equals( - modin_resampler.apply("sum"), pandas_resampler.apply("sum"), + modin_resampler.apply("sum"), + pandas_resampler.apply("sum"), ) df_equals( modin_resampler.get_group(name=list(modin_resampler.groups)[0]), @@ -2660,7 +2668,8 @@ def test_resample(self, rule, axis, closed, label, on, level): # Upsampling from level= or on= selection is not supported if on is None and level is None: df_equals( - modin_resampler.interpolate(), pandas_resampler.interpolate(), + modin_resampler.interpolate(), + pandas_resampler.interpolate(), ) df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) df_equals( @@ -5474,7 +5483,9 @@ def test___setitem__mask(self): ids=["empty", "empty_columns"], ) @pytest.mark.parametrize( - "value", [np.array(["one", "two"]), [11, 22]], ids=["ndarray", "list"], + "value", + [np.array(["one", "two"]), [11, 22]], + ids=["ndarray", "list"], ) @pytest.mark.parametrize("convert_to_series", [False, True]) @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) @@ -5766,7 +5777,9 @@ def test_inplace_series_ops(self, data): modin_df[col0].fillna(0, inplace=True) df_equals(modin_df, pandas_df) - def test___setattr__(self,): + def test___setattr__( + self, + ): pandas_df = pandas.DataFrame([1, 2, 3]) modin_df = pd.DataFrame([1, 2, 3]) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index a4e9fa55090..ec307a5a911 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -527,7 +527,8 @@ def test_from_json_lines(): @pytest.mark.parametrize( - 
"data", [json_short_string, json_short_bytes, json_long_string, json_long_bytes], + "data", + [json_short_string, json_short_bytes, json_long_string, json_long_bytes], ) def test_read_json_string_bytes(data): with pytest.warns(UserWarning): diff --git a/modin/pandas/test/test_rolling.py b/modin/pandas/test/test_rolling.py index 66193903faf..8c41f7323bd 100644 --- a/modin/pandas/test/test_rolling.py +++ b/modin/pandas/test/test_rolling.py @@ -39,10 +39,16 @@ def test_dataframe(data, window, min_periods, win_type): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) pandas_rolled = pandas_df.rolling( - window=window, min_periods=min_periods, win_type=win_type, center=True, + window=window, + min_periods=min_periods, + win_type=win_type, + center=True, ) modin_rolled = modin_df.rolling( - window=window, min_periods=min_periods, win_type=win_type, center=True, + window=window, + min_periods=min_periods, + win_type=win_type, + center=True, ) # Testing of Window class if win_type is not None: @@ -115,7 +121,8 @@ def test_dataframe_dt_index(axis, on, closed, window): df_equals(modin_rolled.count(), pandas_rolled.count()) df_equals(modin_rolled.skew(), pandas_rolled.skew()) df_equals( - modin_rolled.apply(np.sum, raw=True), pandas_rolled.apply(np.sum, raw=True), + modin_rolled.apply(np.sum, raw=True), + pandas_rolled.apply(np.sum, raw=True), ) df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) @@ -129,10 +136,16 @@ def test_series(data, window, min_periods, win_type): modin_series, pandas_series = create_test_series(data) pandas_rolled = pandas_series.rolling( - window=window, min_periods=min_periods, win_type=win_type, center=True, + window=window, + min_periods=min_periods, + win_type=win_type, + center=True, ) modin_rolled = modin_series.rolling( - window=window, min_periods=min_periods, win_type=win_type, center=True, + window=window, + min_periods=min_periods, + 
win_type=win_type, + center=True, ) # Testing of Window class if win_type is not None: @@ -151,7 +164,8 @@ def test_series(data, window, min_periods, win_type): df_equals(modin_rolled.min(), pandas_rolled.min()) df_equals(modin_rolled.max(), pandas_rolled.max()) df_equals( - modin_rolled.corr(modin_series), pandas_rolled.corr(pandas_series), + modin_rolled.corr(modin_series), + pandas_rolled.corr(pandas_series), ) df_equals( modin_rolled.cov(modin_series, True), pandas_rolled.cov(pandas_series, True) @@ -165,7 +179,8 @@ def test_series(data, window, min_periods, win_type): df_equals(modin_rolled.apply(np.sum), pandas_rolled.apply(np.sum)) df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum)) df_equals( - modin_rolled.agg([np.sum, np.mean]), pandas_rolled.agg([np.sum, np.mean]), + modin_rolled.agg([np.sum, np.mean]), + pandas_rolled.agg([np.sum, np.mean]), ) df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1)) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 88b89549812..1a3461fd4b9 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1437,7 +1437,8 @@ def test_dt(): modin_series.dt.to_pydatetime(), pandas_series.dt.to_pydatetime() ) df_equals( - modin_series.dt.tz_localize(None), pandas_series.dt.tz_localize(None), + modin_series.dt.tz_localize(None), + pandas_series.dt.tz_localize(None), ) df_equals( modin_series.dt.tz_convert(tz="Europe/Berlin"), @@ -1823,7 +1824,9 @@ def test_kurt_kurtosis(data, axis, skipna, level, numeric_only, method): modin_series, pandas_series = create_test_series(data) eval_general( - modin_series, pandas_series, lambda df: df.kurtosis(**func_kwargs), + modin_series, + pandas_series, + lambda df: df.kurtosis(**func_kwargs), ) @@ -2396,7 +2399,9 @@ def test_repeat(data, repeats): ) def test_repeat_lists(data, repeats): eval_general( - pd.Series(data), pandas.Series(data), lambda df: df.repeat(repeats), + pd.Series(data), + 
pandas.Series(data), + lambda df: df.repeat(repeats), ) @@ -2461,10 +2466,12 @@ def test_resample(closed, label, level): pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), ) df_equals( - pandas_resampler.aggregate("max"), modin_resampler.aggregate("max"), + pandas_resampler.aggregate("max"), + modin_resampler.aggregate("max"), ) df_equals( - modin_resampler.apply("sum"), pandas_resampler.apply("sum"), + modin_resampler.apply("sum"), + pandas_resampler.apply("sum"), ) df_equals( modin_resampler.get_group(name=list(modin_resampler.groups)[0]), @@ -2476,7 +2483,8 @@ def test_resample(closed, label, level): # Upsampling from level= or on= selection is not supported if level is None: df_equals( - modin_resampler.interpolate(), pandas_resampler.interpolate(), + modin_resampler.interpolate(), + pandas_resampler.interpolate(), ) df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) df_equals( @@ -3022,7 +3030,8 @@ def test_tz_localize(): pandas_series.tz_localize("America/Los_Angeles"), ) df_equals( - modin_series.tz_localize("UTC"), pandas_series.tz_localize("UTC"), + modin_series.tz_localize("UTC"), + pandas_series.tz_localize("UTC"), ) From b867edfba345e5f3c28398274b876d7b31f5a5ba Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Fri, 28 Aug 2020 11:03:50 +0300 Subject: [PATCH 077/120] FEAT-#1838: Lazy map evaluation at Pandas backend (#1940) --- modin/engines/base/frame/data.py | 8 +++++--- modin/engines/base/frame/partition_manager.py | 12 ++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index e5becdf4189..b6bfdec1801 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -706,7 +706,9 @@ def astype(self, col_dtypes): def astype_builder(df): return df.astype({k: v for k, v in col_dtypes.items() if k in df}) - new_frame = self._frame_mgr_cls.map_partitions(self._partitions, 
astype_builder) + new_frame = self._frame_mgr_cls.lazy_map_partitions( + self._partitions, astype_builder + ) return self.__constructor__( new_frame, self.index, @@ -1041,7 +1043,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): else: reduce_func = self._build_mapreduce_func(axis, reduce_func) - map_parts = self._frame_mgr_cls.map_partitions(self._partitions, map_func) + map_parts = self._frame_mgr_cls.lazy_map_partitions(self._partitions, map_func) reduce_parts = self._frame_mgr_cls.map_axis_partitions( axis, map_parts, reduce_func ) @@ -1079,7 +1081,7 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): ------- A new dataframe. """ - new_partitions = self._frame_mgr_cls.map_partitions(self._partitions, func) + new_partitions = self._frame_mgr_cls.lazy_map_partitions(self._partitions, func) if dtypes == "copy": dtypes = self._dtypes elif dtypes is not None: diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 9208b3b3741..40925001d93 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -83,7 +83,7 @@ def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): new_partitions = np.array( [ [ - part.apply( + part.add_to_apply_calls( map_func, other=by_parts[col_idx].get() if axis @@ -191,7 +191,7 @@ def broadcast_apply(cls, axis, apply_func, left, right): return np.array( [ [ - part.apply( + part.add_to_apply_calls( apply_func, r=right_parts[col_idx].get() if axis @@ -451,7 +451,9 @@ def _apply_func_to_list_of_partitions_broadcast( ): preprocessed_func = cls.preprocess_func(func) return [ - obj.apply(preprocessed_func, other=[o.get() for o in broadcasted], **kwargs) + obj.add_to_apply_calls( + preprocessed_func, other=[o.get() for o in broadcasted], **kwargs + ) for obj, broadcasted in zip(partitions, other.T) ] @@ -469,7 +471,9 @@ def _apply_func_to_list_of_partitions(cls, 
func, partitions, **kwargs): A list of BaseFramePartition objects. """ preprocessed_func = cls.preprocess_func(func) - return [obj.apply(preprocessed_func, **kwargs) for obj in partitions] + return [ + obj.add_to_apply_calls(preprocessed_func, **kwargs) for obj in partitions + ] @classmethod def apply_func_to_select_indices( From 4ea6f9347a2c97041481a966e376cdb25c1977ff Mon Sep 17 00:00:00 2001 From: ienkovich Date: Fri, 28 Aug 2020 11:12:39 +0300 Subject: [PATCH 078/120] REFACTOR-#1934: move MultiIndex checks to backend (#1935) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 17 ++++++++++++++++ modin/backends/pandas/query_compiler.py | 26 ++++++++++++++++++++----- modin/pandas/base.py | 8 ++++---- modin/pandas/dataframe.py | 6 +++--- modin/pandas/indexing.py | 4 ++-- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index ffed651c639..1f4d6d596f0 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1250,3 +1250,20 @@ def delitem(self, key): return self.drop(columns=[key]) # END __delitem__ + + @abc.abstractmethod + def has_multiindex(self, axis=0): + """ + Check if specified axis is indexed by MultiIndex. + + Parameters + ---------- + axis : 0 or 1, default 0 + The axis to check (0 - index, 1 - columns). + + Returns + ------- + bool + True if index at specified axis is MultiIndex and False otherwise. 
+ """ + pass diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 0239a4c5108..490cdf8db3f 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -514,7 +514,7 @@ def reset_index(self, **kwargs): drop = kwargs.get("drop", False) level = kwargs.get("level", None) # TODO Implement level - if level is not None or isinstance(self.index, pandas.MultiIndex): + if level is not None or self.has_multiindex(): return self.default_to_pandas(pandas.DataFrame.reset_index, **kwargs) if not drop: new_column_name = ( @@ -1626,10 +1626,7 @@ def sort_index(self, **kwargs): sort_remaining = kwargs.pop("sort_remaining", True) kwargs["inplace"] = False - if level is not None or ( - (axis == 0 and isinstance(self.index, pandas.MultiIndex)) - or (axis == 1 and isinstance(self.columns, pandas.MultiIndex)) - ): + if level is not None or self.has_multiindex(axis=axis): return self.default_to_pandas( pandas.DataFrame.sort_index, axis=axis, @@ -2331,3 +2328,22 @@ def cat_codes(self): return self.default_to_pandas(lambda df: df[df.columns[0]].cat.codes) # END Cat operations + + def has_multiindex(self, axis=0): + """ + Check if specified axis is indexed by MultiIndex. + + Parameters + ---------- + axis : 0 or 1, default 0 + The axis to check (0 - index, 1 - columns). + + Returns + ------- + bool + True if index at specified axis is MultiIndex and False otherwise. 
+ """ + if axis == 0: + return isinstance(self.index, pandas.MultiIndex) + assert axis == 1 + return isinstance(self.columns, pandas.MultiIndex) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 29f3ad892af..00fb5123648 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -753,7 +753,7 @@ def count(self, axis=0, level=None, numeric_only=False): self._validate_dtypes(numeric_only=numeric_only) if level is not None: - if not isinstance(self.axes[axis], pandas.MultiIndex): + if not self._query_compiler.has_multiindex(axis=axis): # error thrown by pandas raise TypeError("Can only count levels on hierarchical columns.") @@ -2046,11 +2046,11 @@ def reindex( level is not None or ( (columns is not None or axis == 1) - and isinstance(self.columns, pandas.MultiIndex) + and self._query_compiler.has_multiindex(axis=1) ) or ( (index is not None or axis == 0) - and isinstance(self.index, pandas.MultiIndex) + and self._query_compiler.has_multiindex() ) ): return self._default_to_pandas( @@ -2279,7 +2279,7 @@ def reset_index( # exist. 
if ( not drop - and not isinstance(self.index, pandas.MultiIndex) + and not self._query_compiler.has_multiindex() and all(n in self.columns for n in ["level_0", "index"]) ): raise ValueError("cannot insert level_0, already exists") diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 922a8e37cb5..a03481f3ac7 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -432,7 +432,7 @@ def groupby( drop = by in self.columns idx_name = by if ( - isinstance(self.axes[axis], pandas.MultiIndex) + self._query_compiler.has_multiindex(axis=axis) and by in self.axes[axis].names ): # In this case we pass the string value of the name through to the @@ -2070,7 +2070,7 @@ def set_index( names = [] if append: names = [x for x in self.index.names] - if isinstance(self.index, pandas.MultiIndex): + if self._query_compiler.has_multiindex(): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: @@ -2482,7 +2482,7 @@ def _getitem(self, key): """ key = apply_if_callable(key, self) # Shortcut if key is an actual column - is_mi_columns = isinstance(self.columns, pandas.MultiIndex) + is_mi_columns = self._query_compiler.has_multiindex(axis=1) try: if key in self.columns and not is_mi_columns: return self._getitem_column(key) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index dd313c379a7..84d32115622 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -335,7 +335,7 @@ def _compute_lookup(self, row_loc, col_loc): row_lookup = self.qc.index.get_indexer_for( self.qc.index.to_series().loc[row_loc] ) - elif isinstance(self.qc.index, pandas.MultiIndex): + elif self.qc.has_multiindex(): if isinstance(row_loc, pandas.MultiIndex): row_lookup = self.qc.index.get_indexer_for(row_loc) else: @@ -349,7 +349,7 @@ def _compute_lookup(self, row_loc, col_loc): col_lookup = self.qc.columns.get_indexer_for( self.qc.columns.to_series().loc[col_loc] ) - elif isinstance(self.qc.columns, pandas.MultiIndex): 
+ elif self.qc.has_multiindex(axis=1): if isinstance(col_loc, pandas.MultiIndex): col_lookup = self.qc.columns.get_indexer_for(col_loc) else: From f35b7a4f54727319bd7a3be125082ae1a24e2729 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Fri, 28 Aug 2020 12:05:57 +0300 Subject: [PATCH 079/120] FIX-#1679: Assignment df.loc["row"]["col"] fixed in case of one row (#1951) Signed-off-by: Dmitry Chigarev --- modin/engines/base/frame/partition_manager.py | 4 +++ modin/pandas/test/test_dataframe.py | 30 +++++++------------ 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 40925001d93..f1340fd1e5b 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -302,6 +302,10 @@ def concat(cls, axis, left_parts, right_parts): A new BaseFrameManager object, the type of object that called this. 
""" if type(right_parts) is list: + # `np.array` with partitions of empty ModinFrame has a shape (0,) + # but `np.concatenate` can concatenate arrays only if its shapes at + # specified axis are equals, so filtering empty frames to avoid concat error + right_parts = [o for o in right_parts if o.size != 0] return np.concatenate([left_parts] + right_parts, axis=axis) else: return np.append(left_parts, right_parts, axis=axis) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 3aa43bce7ae..7362b137faf 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -4677,26 +4677,16 @@ def test_loc_multi_index(self): df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index]) df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]]) - def test_loc_assignment(self): - modin_df = pd.DataFrame( - index=["row1", "row2", "row3"], columns=["col1", "col2"] - ) - pandas_df = pandas.DataFrame( - index=["row1", "row2", "row3"], columns=["col1", "col2"] - ) - modin_df.loc["row1"]["col1"] = 11 - modin_df.loc["row2"]["col1"] = 21 - modin_df.loc["row3"]["col1"] = 31 - modin_df.loc["row1"]["col2"] = 12 - modin_df.loc["row2"]["col2"] = 22 - modin_df.loc["row3"]["col2"] = 32 - pandas_df.loc["row1"]["col1"] = 11 - pandas_df.loc["row2"]["col1"] = 21 - pandas_df.loc["row3"]["col1"] = 31 - pandas_df.loc["row1"]["col2"] = 12 - pandas_df.loc["row2"]["col2"] = 22 - pandas_df.loc["row3"]["col2"] = 32 - df_equals(modin_df, pandas_df) + @pytest.mark.parametrize("index", [["row1", "row2", "row3"], ["row1"]]) + @pytest.mark.parametrize("columns", [["col1", "col2"], ["col1"]]) + def test_loc_assignment(self, index, columns): + md_df, pd_df = create_test_dfs(index=index, columns=columns) + for i, ind in enumerate(index): + for j, col in enumerate(columns): + value_to_assign = int(str(i) + str(j)) + md_df.loc[ind][col] = value_to_assign + pd_df.loc[ind][col] = value_to_assign + df_equals(md_df, 
pd_df) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_loc_nested_assignment(self, data): From dd42a40679575be87fc770a3fc10886fb3e0b5e0 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Fri, 28 Aug 2020 14:14:25 +0300 Subject: [PATCH 080/120] FEAT-#1291 #1187: Add `DataFrame.unstack`, `Series.unstack` (#1649) Signed-off-by: Alexey Prutskov --- docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 + modin/backends/pandas/query_compiler.py | 87 +++++++++++++++++++-- modin/pandas/base.py | 3 - modin/pandas/dataframe.py | 31 ++++++++ modin/pandas/series.py | 23 ++++++ modin/pandas/test/test_dataframe.py | 76 +++++++++++++++++- modin/pandas/test/test_series.py | 46 +++++++---- 9 files changed, 244 insertions(+), 30 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 1ad585ce228..5725788b2fb 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -430,7 +430,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``tz_localize`` | `tz_localize`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``unstack`` | `unstack`_ | D | | +| ``unstack`` | `unstack`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``update`` | `update`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index 962aef16f87..3c89d23ade6 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -462,7 +462,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``unique`` | Y | +-----------------------------+---------------------------------+ -| ``unstack`` | D | +| ``unstack`` | Y | +-----------------------------+---------------------------------+ | ``update`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 1f4d6d596f0..c13be057439 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1194,6 +1194,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args): # END Manual Partitioning methods + @abc.abstractmethod + def unstack(self, level, fill_value): + pass + @abc.abstractmethod def get_dummies(self, columns, **kwargs): """Convert categorical variables to dummy variables for certain columns. 
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 490cdf8db3f..a5493311ffa 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -793,7 +793,7 @@ def resample_app_ser(self, resample_args, func, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), func=func, *args, - **kwargs + **kwargs, ) def resample_app_df(self, resample_args, func, *args, **kwargs): @@ -806,7 +806,7 @@ def resample_agg_ser(self, resample_args, func, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), func=func, *args, - **kwargs + **kwargs, ) def resample_agg_df(self, resample_args, func, *args, **kwargs): @@ -851,7 +851,7 @@ def resample_interpolate( limit_direction, limit_area, downcast, - **kwargs + **kwargs, ): return self._resample_func( resample_args, @@ -862,7 +862,7 @@ def resample_interpolate( limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, - **kwargs + **kwargs, ) def resample_count(self, resample_args): @@ -910,7 +910,7 @@ def resample_ohlc_ser(self, resample_args, _method, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), _method=_method, *args, - **kwargs + **kwargs, ) def resample_ohlc_df(self, resample_args, _method, *args, **kwargs): @@ -1075,6 +1075,79 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs): ) return self.__constructor__(new_modin_frame) + def unstack(self, level, fill_value): + if not isinstance(self.index, pandas.MultiIndex) or ( + isinstance(self.index, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.index.nlevels + ): + axis = 1 + new_columns = ["__reduced__"] + need_reindex = True + else: + axis = 0 + new_columns = None + need_reindex = False + + def map_func(df): + return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value)) + + is_all_multi_list = False + if ( + isinstance(self.index, pandas.MultiIndex) + and isinstance(self.columns, pandas.MultiIndex) + and is_list_like(level) 
+ and len(level) == self.index.nlevels + ): + is_all_multi_list = True + real_cols_bkp = self.columns + obj = self.copy() + obj.columns = np.arange(len(obj.columns)) + else: + obj = self + + new_modin_frame = obj._modin_frame._apply_full_axis( + axis, map_func, new_columns=new_columns + ) + result = self.__constructor__(new_modin_frame) + + if is_all_multi_list: + result = result.sort_index() + index_level_values = [lvl for lvl in obj.index.levels] + columns_level_values = [ + real_cols_bkp.get_level_values(lvl).unique() + for lvl in np.arange(real_cols_bkp.nlevels) + ] + result.index = pandas.MultiIndex.from_product( + [*columns_level_values, *index_level_values] + ) + return result + + if need_reindex: + if isinstance(self.index, pandas.MultiIndex): + index_level_values = [ + self.index.get_level_values(lvl).unique() + for lvl in np.arange(self.index.nlevels) + ] + new_index = pandas.MultiIndex.from_product( + [self.columns, *index_level_values] + ) + else: + if isinstance(self.columns, pandas.MultiIndex): + columns_level_values = [ + self.columns.get_level_values(lvl).unique() + for lvl in np.arange(self.columns.nlevels) + ] + new_index = pandas.MultiIndex.from_product( + [*columns_level_values, self.index] + ) + else: + new_index = pandas.MultiIndex.from_product( + [self.columns, self.index] + ) + result = result.reindex(0, new_index) + return result + # Map partitions operations # These operations are operations that apply a function to every partition. abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy") @@ -1632,7 +1705,7 @@ def sort_index(self, **kwargs): axis=axis, level=level, sort_remaining=sort_remaining, - **kwargs + **kwargs, ) # sort_index can have ascending be None and behaves as if it is False. 
@@ -2147,7 +2220,7 @@ def compute_groupby(df): try: agg_func( pandas.DataFrame(index=[1], columns=[1]).groupby(level=0), - **agg_args + **agg_args, ) except Exception as e: raise type(e)("No numeric types to aggregate.") diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 00fb5123648..71450b75182 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3333,9 +3333,6 @@ def tz_localize( ) return self.set_axis(labels=new_labels, axis=axis, inplace=not copy) - def unstack(self, level=-1, fill_value=None): - return self._default_to_pandas("unstack", level=level, fill_value=fill_value) - def var( self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs ): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index a03481f3ac7..cf1700c01c5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1680,6 +1680,37 @@ def slice_shift(self, periods=1, axis=0): new_df.columns = new_columns return new_df + def unstack(self, level=-1, fill_value=None): + """ + Pivot a level of the (necessarily hierarchical) index labels. + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + The level involved will automatically get sorted. + Parameters + ---------- + level : int, str, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name. + fill_value : int, str or dict + Replace NaN with this value if the unstack produces missing values. 
+ Returns + ------- + Series or DataFrame + """ + if not isinstance(self.index, pandas.MultiIndex) or ( + isinstance(self.index, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.index.nlevels + ): + return self._reduce_dimension( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + else: + return DataFrame( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + def pivot(self, index=None, columns=None, values=None): return self._default_to_pandas( pandas.DataFrame.pivot, index=index, columns=columns, values=values diff --git a/modin/pandas/series.py b/modin/pandas/series.py index c2c308be80d..939806c3bda 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1079,6 +1079,29 @@ def slice_shift(self, periods=1, axis=0): ) ) + def unstack(self, level=-1, fill_value=None): + """ + Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + The level involved will automatically get sorted. + Parameters + ---------- + level : int, str, or list of these, default last level + Level(s) to unstack, can pass level name. + fill_value : scalar value, default None + Value to use when replacing NaN values. + Returns + ------- + DataFrame + Unstacked Series. 
+ """ + from .dataframe import DataFrame + + result = DataFrame( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + + return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result + @property def plot( self, diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 7362b137faf..b96a1eef2b6 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2950,10 +2950,78 @@ def test_tz_localize(self): pandas_df.tz_localize("America/Los_Angeles", axis=0), ) - def test_unstack(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).unstack() + @pytest.mark.parametrize( + "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"] + ) + @pytest.mark.parametrize( + "is_multi_col", [True, False], ids=["col_multi", "col_index"] + ) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_unstack(self, data, is_multi_idx, is_multi_col): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if is_multi_idx: + if len(pandas_df.index) == 256: + index = pd.MultiIndex.from_product( + [ + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_df.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + else: + index = pandas_df.index + + if is_multi_col: + if len(pandas_df.columns) == 64: + columns = pd.MultiIndex.from_product( + [ + ["A", "B", "C", "D"], + ["xx", "yy", "zz", "LAST"], + [10, 20, 30, 40], + ] + ) + elif len(pandas_df.columns) == 100: + columns = pd.MultiIndex.from_product( + [ + ["xx", "yy", "zz", "LAST"], + ["A", "B", "C", "D", "F"], + ["I", "J", "K", "L", "INDEX"], + ] + ) + else: + columns = pandas_df.columns + + pandas_df.columns = columns + pandas_df.index = index + + modin_df.columns = columns + modin_df.index = index 
+ + df_equals(modin_df.unstack(), pandas_df.unstack()) + + if is_multi_idx: + df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) + df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) + df_equals( + modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2]) + ) + + if len(pandas_df.index) == 256: + df_equals( + modin_df.unstack(level=[0, 1, 2, 3]), + pandas_df.unstack(level=[0, 1, 2, 3]), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___array__(self, data): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 1a3461fd4b9..b866e585ef3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3073,21 +3073,39 @@ def test_unique(data): assert_array_equal(modin_result, pandas_result) -def test_unstack(): - s = pd.Series( - np.random.randint(1, 100, 12), - index=pd.MultiIndex.from_tuples( +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_unstack(data): + modin_series, pandas_series = create_test_series(data) + if len(pandas_series.index) == 256: + index = pd.MultiIndex.from_product( [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - with pytest.warns(UserWarning): - s.unstack() + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_series.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + + modin_series = pd.Series(data[next(iter(data.keys()))], index=index) + pandas_series = pandas.Series(data[next(iter(data.keys()))], index=index) + + df_equals(modin_series.unstack(), pandas_series.unstack()) + df_equals(modin_series.unstack(level=0), pandas_series.unstack(level=0)) + 
df_equals(modin_series.unstack(level=[0, 1]), pandas_series.unstack(level=[0, 1])) + + if len(pandas_series.index) == 256: + df_equals( + modin_series.unstack(level=[0, 1, 2]), + pandas_series.unstack(level=[0, 1, 2]), + ) @pytest.mark.parametrize( From 14827d382cb56f4b3925374936e2a250bc8cff68 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Fri, 28 Aug 2020 16:40:50 +0300 Subject: [PATCH 081/120] FIX-#1646: Fix representation of `Series` with datetimelike index (#1954) Signed-off-by: Alexey Prutskov --- modin/pandas/series.py | 11 ++++++++--- modin/pandas/test/test_series.py | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 939806c3bda..ce8c1459d88 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -308,6 +308,11 @@ def __repr__(self): if isinstance(temp_df, pandas.DataFrame) and not temp_df.empty: temp_df = temp_df.iloc[:, 0] temp_str = repr(temp_df) + freq_str = ( + "Freq: {}, ".format(self.index.freqstr) + if isinstance(self.index, pandas.DatetimeIndex) + else "" + ) if self.name is not None: name_str = "Name: {}, ".format(str(self.name)) else: @@ -322,9 +327,9 @@ def __repr__(self): else temp_str.rsplit("dtype: ", 1)[-1] ) if len(self) == 0: - return "Series([], {}{}".format(name_str, dtype_str) - return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( - name_str, len_str, dtype_str + return "Series([], {}{}{}".format(freq_str, name_str, dtype_str) + return temp_str.rsplit("\n", 1)[0] + "\n{}{}{}{}".format( + freq_str, name_str, len_str, dtype_str ) def __round__(self, decimals=0): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index b866e585ef3..57dc0c8a3de 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -474,14 +474,24 @@ def test___pow__(data): inter_df_math_helper(modin_series, pandas_series, "__pow__") -def test___repr___empty(): - modin_series, pandas_series 
= pd.Series(), pandas.Series() - assert repr(modin_series) == repr(pandas_series) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test___repr__(data): - modin_series, pandas_series = create_test_series(data) +@pytest.mark.parametrize("name", ["Dates", None]) +@pytest.mark.parametrize( + "dt_index", [True, False], ids=["dt_index_true", "dt_index_false"] +) +@pytest.mark.parametrize( + "data", [*test_data_values, "empty"], ids=[*test_data_keys, "empty"] +) +def test___repr__(name, dt_index, data): + if data == "empty": + modin_series, pandas_series = pd.Series(), pandas.Series() + else: + modin_series, pandas_series = create_test_series(data) + pandas_series.name = modin_series.name = name + if dt_index: + index = pandas.date_range( + "1/1/2000", periods=len(pandas_series.index), freq="T" + ) + pandas_series.index = modin_series.index = index assert repr(modin_series) == repr(pandas_series) From 633a8b0442471559a90c6217ba5839fb4c7c6f6a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 31 Aug 2020 11:47:14 +0300 Subject: [PATCH 082/120] TEST-#1961: speed up TestDataFrameReduction_* test (#1962) * TEST-#1961: speed up TestDataFrameReduction_A test Signed-off-by: Anatoly Myachev * TEST-#1961: draft speed up test_count Signed-off-by: Anatoly Myachev * TEST-#1961: combine Reduction_A and Reduction_B Signed-off-by: Anatoly Myachev * TEST-#1961: refactor some reduction tests Signed-off-by: Anatoly Myachev * TEST-#1961: speed up test_sum Signed-off-by: Anatoly Myachev * TEST-#1961: speed up test_prod Signed-off-by: Anatoly Myachev * TEST-#1961: refactor some reduction tests Signed-off-by: Anatoly Myachev * TEST-#1961: add new dataset into utils.py Signed-off-by: Anatoly Myachev * TEST-#1961: return skipif for test_prod Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 4 +- .github/workflows/push.yml | 4 +- modin/pandas/test/test_dataframe.py | 688 ++++++++-------------------- 
modin/pandas/test/utils.py | 20 +- 4 files changed, 203 insertions(+), 513 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee1f36e637c..d31ccc3e720 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,7 +180,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -284,7 +284,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 74a45317429..4a1c95faf26 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -44,7 +44,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -110,7 +110,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["Reduction_A", "Reduction_B", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] env: MODIN_ENGINE: 
${{matrix.engine}} MODIN_MEMORY: 1000000000 diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index b96a1eef2b6..11ee3d513dd 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -57,11 +57,10 @@ int_arg_values, eval_general, create_test_dfs, - test_data_small_values, - test_data_small_keys, udf_func_values, udf_func_keys, generate_multiindex, + test_data_diff_dtype, ) pd.DEFAULT_NPARTITIONS = 4 @@ -3071,316 +3070,125 @@ def test_hasattr_sparse(self, data): assert modin_result == pandas_result -class TestDataFrameReduction_A: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +class TestDataFrameReduction: + @pytest.mark.parametrize("method", ["all", "any"]) + @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + def test_all_any(self, data, axis, skipna, is_transposed, method): + eval_general( + *create_test_dfs(data), + lambda df: getattr((df.T if is_transposed else df), method)( + axis=axis, skipna=skipna, bool_only=None + ), + ) + + @pytest.mark.parametrize("method", ["all", "any"]) @pytest.mark.parametrize( "bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys) ) - def test_all(self, data, axis, skipna, bool_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.all(axis=axis, skipna=skipna, bool_only=bool_only) - except Exception as e: - with pytest.raises(type(e)): - modin_df.all(axis=axis, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.all(axis=axis, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) - - # Test when axis is None. 
This will get repeated but easier than using list in parameterize decorator - try: - pandas_result = pandas_df.all(axis=None, skipna=skipna, bool_only=bool_only) - except Exception as e: - with pytest.raises(type(e)): - modin_df.all(axis=None, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.all(axis=None, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) + def test_all_any_specific(self, bool_only, method): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: getattr(df, method)(bool_only=bool_only), + ) - try: - pandas_result = pandas_df.T.all( - axis=axis, skipna=skipna, bool_only=bool_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("method", ["all", "any"]) + @pytest.mark.parametrize("level", [-1, 0, 1]) + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("data", [test_data["int_data"]]) + def test_all_any_level(self, data, axis, level, method): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - # Test when axis is None. 
This will get repeated but easier than using list in parameterize decorator - try: - pandas_result = pandas_df.T.all( - axis=None, skipna=skipna, bool_only=bool_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only) + if axis == 0: + new_idx = generate_multiindex(len(modin_df.index)) + modin_df.index = new_idx + pandas_df.index = new_idx else: - modin_result = modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) - - # test level - modin_df_multi_level = modin_df.copy() - pandas_df_multi_level = pandas_df.copy() - axis = modin_df._get_axis_number(axis) if axis is not None else 0 - levels = 3 - axis_names_list = [["a", "b", "c"], None] - for axis_names in axis_names_list: - if axis == 0: - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.index))], - names=axis_names, - ) - modin_df_multi_level.index = new_idx - pandas_df_multi_level.index = new_idx - else: - new_col = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.columns))], - names=axis_names, - ) - modin_df_multi_level.columns = new_col - pandas_df_multi_level.columns = new_col + new_col = generate_multiindex(len(modin_df.columns)) + modin_df.columns = new_col + pandas_df.columns = new_col - for level in list(range(levels)) + (axis_names if axis_names else []): - try: - pandas_multi_level_result = pandas_df_multi_level.all( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - - except Exception as e: - with pytest.raises(type(e)): - modin_df_multi_level.all( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - else: - modin_multi_level_result = modin_df_multi_level.all( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - - df_equals(modin_multi_level_result, pandas_multi_level_result) + eval_general( + modin_df, + pandas_df, + lambda df: getattr(df, method)(axis=axis, 
level=level), + ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + def test_count(self, data, axis): + eval_general( + *create_test_dfs(data), + lambda df: df.count(axis=axis), + ) + @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys) + "numeric_only", + [ + pytest.param(True, marks=pytest.mark.xfail(reason="See #1965 for details")), + False, + None, + ], ) - def test_any(self, data, axis, skipna, bool_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.any(axis=axis, skipna=skipna, bool_only=bool_only) - except Exception as e: - with pytest.raises(type(e)): - modin_df.any(axis=axis, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.any(axis=axis, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.any(axis=None, skipna=skipna, bool_only=bool_only) - except Exception as e: - with pytest.raises(type(e)): - modin_df.any(axis=None, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.any(axis=None, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) + def test_count_specific(self, numeric_only): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.count(numeric_only=numeric_only), + ) - try: - pandas_result = pandas_df.T.any( - axis=axis, skipna=skipna, bool_only=bool_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only) - else: - modin_result = modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("level", [-1, 0, 1]) + 
@pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("data", [test_data["int_data"]]) + def test_count_level(self, data, axis, level): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - try: - pandas_result = pandas_df.T.any( - axis=None, skipna=skipna, bool_only=bool_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only) + if axis == 0: + new_idx = generate_multiindex(len(modin_df.index)) + modin_df.index = new_idx + pandas_df.index = new_idx else: - modin_result = modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only) - df_equals(modin_result, pandas_result) - - # test level - modin_df_multi_level = modin_df.copy() - pandas_df_multi_level = pandas_df.copy() - axis = modin_df._get_axis_number(axis) if axis is not None else 0 - levels = 3 - axis_names_list = [["a", "b", "c"], None] - for axis_names in axis_names_list: - if axis == 0: - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.index))], - names=axis_names, - ) - modin_df_multi_level.index = new_idx - pandas_df_multi_level.index = new_idx - else: - new_col = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.columns))], - names=axis_names, - ) - modin_df_multi_level.columns = new_col - pandas_df_multi_level.columns = new_col - - for level in list(range(levels)) + (axis_names if axis_names else []): - try: - pandas_multi_level_result = pandas_df_multi_level.any( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - - except Exception as e: - with pytest.raises(type(e)): - modin_df_multi_level.any( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - else: - modin_multi_level_result = modin_df_multi_level.any( - axis=axis, bool_only=bool_only, level=level, skipna=skipna - ) - - df_equals(modin_multi_level_result, pandas_multi_level_result) + new_col = generate_multiindex(len(modin_df.columns)) + 
modin_df.columns = new_col + pandas_df.columns = new_col + eval_general( + modin_df, + pandas_df, + lambda df: df.count(axis=axis, level=level), + ) -class TestDataFrameReduction_B: + @pytest.mark.parametrize("percentiles", [None, 0.10, 0.11, 0.44, 0.78, 0.99]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + def test_describe(self, data, percentiles): + eval_general( + *create_test_dfs(data), + lambda df: df.describe(percentiles=percentiles), + ) + @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) + "exclude,include", + [ + ([np.float64], None), + (np.float64, None), + (None, [np.timedelta64, np.datetime64, np.object, np.bool]), + (None, "all"), + (None, np.number), + ], ) - def test_count(self, request, data, axis, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_result = modin_df.count(axis=axis, numeric_only=numeric_only) - pandas_result = pandas_df.count(axis=axis, numeric_only=numeric_only) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.T.count(axis=axis, numeric_only=numeric_only) - pandas_result = pandas_df.T.count(axis=axis, numeric_only=numeric_only) - df_equals(modin_result, pandas_result) - - # test level - modin_df_multi_level = modin_df.copy() - pandas_df_multi_level = pandas_df.copy() - axis = modin_df._get_axis_number(axis) if axis is not None else 0 - levels = 3 - axis_names_list = [["a", "b", "c"], None] - for axis_names in axis_names_list: - if axis == 0: - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.index))], - names=axis_names, - ) - modin_df_multi_level.index = new_idx - pandas_df_multi_level.index = new_idx - try: # test error - pandas_df_multi_level.count( - axis=1, numeric_only=numeric_only, level=0 - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df_multi_level.count( 
- axis=1, numeric_only=numeric_only, level=0 - ) - else: - new_col = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.columns))], - names=axis_names, - ) - modin_df_multi_level.columns = new_col - pandas_df_multi_level.columns = new_col - try: # test error - pandas_df_multi_level.count( - axis=0, numeric_only=numeric_only, level=0 - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df_multi_level.count( - axis=0, numeric_only=numeric_only, level=0 - ) - - for level in list(range(levels)) + (axis_names if axis_names else []): - modin_multi_level_result = modin_df_multi_level.count( - axis=axis, numeric_only=numeric_only, level=level - ) - pandas_multi_level_result = pandas_df_multi_level.count( - axis=axis, numeric_only=numeric_only, level=level - ) - df_equals(modin_multi_level_result, pandas_multi_level_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_describe(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.describe(), pandas_df.describe()) - percentiles = [0.10, 0.11, 0.44, 0.78, 0.99] - df_equals( - modin_df.describe(percentiles=percentiles), - pandas_df.describe(percentiles=percentiles), + def test_describe_specific(self, exclude, include): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.drop("str_col", axis=1).describe( + exclude=exclude, include=include + ), ) - try: - pandas_result = pandas_df.describe(exclude=[np.float64]) - except Exception as e: - with pytest.raises(type(e)): - modin_df.describe(exclude=[np.float64]) - else: - modin_result = modin_df.describe(exclude=[np.float64]) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.describe(exclude=np.float64) - except Exception as e: - with pytest.raises(type(e)): - modin_df.describe(exclude=np.float64) - else: - modin_result = modin_df.describe(exclude=np.float64) - df_equals(modin_result, pandas_result) 
- - try: - pandas_result = pandas_df.describe( - include=[np.timedelta64, np.datetime64, np.object, np.bool] - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.describe( - include=[np.timedelta64, np.datetime64, np.object, np.bool] - ) - else: - modin_result = modin_df.describe( - include=[np.timedelta64, np.datetime64, np.object, np.bool] - ) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.describe(include=str(modin_df.dtypes.values[0])) - pandas_result = pandas_df.describe(include=str(pandas_df.dtypes.values[0])) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.describe(include=[np.number]) - pandas_result = pandas_df.describe(include=[np.number]) - df_equals(modin_result, pandas_result) - - df_equals(modin_df.describe(include="all"), pandas_df.describe(include="all")) - + @pytest.mark.parametrize("data", [test_data["int_data"]]) + def test_describe_str(self, data): modin_df = pd.DataFrame(data).applymap(str) pandas_df = pandas.DataFrame(data).applymap(str) @@ -3396,217 +3204,67 @@ def test_describe(self, data): ) def test_describe_dtypes(self): - modin_df = pd.DataFrame( - { - "col1": list("abc"), - "col2": list("abc"), - "col3": list("abc"), - "col4": [1, 2, 3], - } - ) - pandas_df = pandas.DataFrame( - { - "col1": list("abc"), - "col2": list("abc"), - "col3": list("abc"), - "col4": [1, 2, 3], - } - ) - - modin_result = modin_df.describe() - pandas_result = pandas_df.describe() - - df_equals(modin_result, pandas_result) + data = { + "col1": list("abc"), + "col2": list("abc"), + "col3": list("abc"), + "col4": [1, 2, 3], + } + eval_general(*create_test_dfs(data), lambda df: df.describe()) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("method", ["idxmin", "idxmax"]) + @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize( "skipna", bool_arg_values, 
ids=arg_keys("skipna", bool_arg_keys) ) - def test_idxmax(self, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - pandas_result = pandas_df.idxmax(axis=axis, skipna=skipna) - modin_result = modin_df.idxmax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - pandas_result = pandas_df.T.idxmax(axis=axis, skipna=skipna) - modin_result = modin_df.T.idxmax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - def test_idxmin(self, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_result = modin_df.idxmin(axis=axis, skipna=skipna) - pandas_result = pandas_df.idxmin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.T.idxmin(axis=axis, skipna=skipna) - pandas_result = pandas_df.T.idxmin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + def test_idxmin_idxmax(self, data, axis, skipna, is_transposed, method): + eval_general( + *create_test_dfs(data), + lambda df: getattr((df.T if is_transposed else df), method)( + axis=axis, skipna=skipna + ), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_last_valid_index(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert modin_df.last_valid_index() == (pandas_df.last_valid_index()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, 
ids=arg_keys("numeric_only", bool_arg_keys) - ) - def test_max(self, request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.max( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.max(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.max( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.max( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.max(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.max( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - def test_mean(self, request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.mean( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.mean(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.mean( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.mean( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.mean(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - 
modin_result = modin_df.T.mean( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + assert modin_df.last_valid_index() == pandas_df.last_valid_index() - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "index", bool_arg_values, ids=arg_keys("index", bool_arg_keys) ) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_memory_usage(self, data, index): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - modin_result = modin_df.memory_usage(index=index) - pandas_result = pandas_df.memory_usage(index=index) - df_equals(modin_result, pandas_result) + eval_general(*create_test_dfs(data), lambda df: df.memory_usage(index=index)) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("method", ["min", "max", "mean"]) + @pytest.mark.parametrize("is_transposed", [False, True]) @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) ) @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) - def test_min(self, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.min( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.min(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.min( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.min( - 
axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.min(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.min( + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + def test_min_max_mean( + self, data, axis, skipna, numeric_only, is_transposed, method + ): + eval_general( + *create_test_dfs(data), + lambda df: getattr((df.T if is_transposed else df), method)( axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) + ), + ) @pytest.mark.skipif( os.name == "nt", reason="Windows has a memory issue for large numbers on this test", ) @pytest.mark.parametrize( - "data", - test_data_values + test_data_small_values, - ids=test_data_keys + test_data_small_keys, - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - @pytest.mark.parametrize( - "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) - ) - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "operation", + "method", [ "prod", pytest.param( @@ -3614,61 +3272,80 @@ def test_min(self, data, axis, skipna, numeric_only): marks=pytest.mark.skipif( pandas.DataFrame.product == pandas.DataFrame.prod and pd.DataFrame.product == pd.DataFrame.prod, - reason="That operation was already tested.", + reason="That method was already tested.", ), ), ], ) + @pytest.mark.parametrize("is_transposed", [False, True]) + @pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) + ) + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) def 
test_prod( self, - request, data, axis, skipna, - numeric_only, - min_count, is_transposed, - operation, + method, ): eval_general( *create_test_dfs(data), - lambda df, *args, **kwargs: getattr( - df.T if is_transposed else df, operation - )(*args, **kwargs), - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, + lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)( + axis=axis, + skipna=skipna, + ), ) @pytest.mark.parametrize( - "data", - test_data_values + test_data_small_values, - ids=test_data_keys + test_data_small_keys, - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) + "numeric_only", + [ + pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), + False, + True, + ], ) @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) + def test_prod_specific(self, min_count, numeric_only): + if min_count == 5 and numeric_only: + pytest.xfail("see #1953 for details") + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.prod(min_count=min_count, numeric_only=numeric_only), + ) + @pytest.mark.parametrize("is_transposed", [False, True]) - def test_sum( - self, request, data, axis, skipna, numeric_only, min_count, is_transposed - ): + @pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) + ) + @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) + @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + def test_sum(self, data, axis, skipna, is_transposed): eval_general( *create_test_dfs(data), - lambda df, *args, **kwargs: (df.T if is_transposed else df).sum( - *args, **kwargs + lambda df: (df.T if is_transposed else df).sum( + axis=axis, + skipna=skipna, ), - 
axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, + ) + + @pytest.mark.parametrize( + "numeric_only", + [ + pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), + False, + True, + ], + ) + @pytest.mark.parametrize("min_count", int_arg_values) + def test_sum_specific(self, min_count, numeric_only): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.sum(min_count=min_count, numeric_only=numeric_only), ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -4485,7 +4162,6 @@ def test_var(self, request, data, axis, skipna, numeric_only, ddof): modin_result = modin_df.T.var( axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof ) - df_equals(modin_result, pandas_result) class TestDataFrameIndexing: diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 8c794b0cfc6..8e6cf1f2042 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -127,6 +127,13 @@ test_data_values = list(test_data.values()) test_data_keys = list(test_data.keys()) +test_bool_data = { + "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): random_state.choice( + [True, False], size=(NROWS) + ) + for i in range(NCOLS) +} + test_data_with_duplicates = { "no_duplicates": { "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): range(NROWS) @@ -167,6 +174,13 @@ } } +test_data_diff_dtype = { + "int_col": [-5, 2, 7, 16], + "float_col": [np.NaN, -9.4, 10.1, np.NaN], + "str_col": ["a", np.NaN, "c", "d"], + "bool_col": [False, True, True, False], +} + test_data_small_values = list(test_data_small.values()) test_data_small_keys = list(test_data_small.keys()) @@ -681,10 +695,10 @@ def generate_multiindex(index): return df1, df2 -def generate_multiindex(cols_number): +def generate_multiindex(elements_number): arrays = [ - random_state.choice(["bar", "baz", "foo", "qux"], cols_number), - random_state.choice(["one", "two"], cols_number), + random_state.choice(["bar", "baz", "foo", 
"qux"], elements_number), + random_state.choice(["one", "two"], elements_number), ] return pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["first", "second"]) From 3ddd5c0bee9252e98f721e62091d051bf0b608dc Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Mon, 31 Aug 2020 17:50:18 +0300 Subject: [PATCH 083/120] FEAT-#1189: Add `DataFrame.stack` (#1673) Signed-off-by: Alexey Prutskov --- docs/supported_apis/dataframe_supported.rst | 2 +- modin/backends/base/query_compiler.py | 3 + modin/backends/pandas/query_compiler.py | 17 ++++++ modin/pandas/dataframe.py | 44 ++++++++++++- modin/pandas/series.py | 2 + modin/pandas/test/test_dataframe.py | 68 +++++++++++++++++++-- 6 files changed, 128 insertions(+), 8 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 5725788b2fb..45ed1992442 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -352,7 +352,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``squeeze`` | `squeeze`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``stack`` | `stack`_ | D | | +| ``stack`` | `stack`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``std`` | `std`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index c13be057439..98d085736c4 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -542,6 +542,9 @@ def unique(self, **kwargs): def value_counts(self, **kwargs): pass + def stack(self, level, dropna): + pass + # Abstract map partitions across select indices @abc.abstractmethod def astype(self, col_dtypes, **kwargs): diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index a5493311ffa..74f462722ee 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -1148,6 +1148,23 @@ def map_func(df): result = result.reindex(0, new_index) return result + def stack(self, level, dropna): + if not isinstance(self.columns, pandas.MultiIndex) or ( + isinstance(self.columns, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.columns.nlevels + ): + new_columns = ["__reduced__"] + else: + new_columns = None + + new_modin_frame = self._modin_frame._apply_full_axis( + 1, + lambda df: pandas.DataFrame(df.stack(level=level, dropna=dropna)), + new_columns=new_columns, + ) + return self.__constructor__(new_modin_frame) + # Map partitions operations # These operations are operations that 
apply a function to every partition. abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy") diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index cf1700c01c5..4d86a1e8ba4 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1688,12 +1688,14 @@ def unstack(self, level=-1, fill_value=None): If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). The level involved will automatically get sorted. + Parameters ---------- level : int, str, or list of these, default -1 (last level) Level(s) of index to unstack, can pass level name. fill_value : int, str or dict Replace NaN with this value if the unstack produces missing values. + Returns ------- Series or DataFrame @@ -2162,9 +2164,45 @@ def squeeze(self, axis=None): return self.copy() def stack(self, level=-1, dropna=True): - return self._default_to_pandas( - pandas.DataFrame.stack, level=level, dropna=dropna - ) + """ + Stack the prescribed level(s) from columns to index. + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. 
+ + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + """ + if not isinstance(self.columns, pandas.MultiIndex) or ( + isinstance(self.columns, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.columns.nlevels + ): + return self._reduce_dimension( + query_compiler=self._query_compiler.stack(level, dropna) + ) + else: + return DataFrame(query_compiler=self._query_compiler.stack(level, dropna)) def sub(self, other, axis="columns", level=None, fill_value=None): return self._binary_op( diff --git a/modin/pandas/series.py b/modin/pandas/series.py index ce8c1459d88..b9c41f60454 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1088,12 +1088,14 @@ def unstack(self, level=-1, fill_value=None): """ Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. The level involved will automatically get sorted. + Parameters ---------- level : int, str, or list of these, default last level Level(s) to unstack, can pass level name. fill_value : scalar value, default None Value to use when replacing NaN values. 
+ Returns ------- DataFrame diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 11ee3d513dd..54886286603 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2736,10 +2736,70 @@ def test_slice_shift(self, data, index, axis, periods): pandas_df.slice_shift(periods=periods, axis=axis), ) - def test_stack(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).stack() + @pytest.mark.parametrize( + "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"] + ) + @pytest.mark.parametrize( + "is_multi_col", [True, False], ids=["col_multi", "col_index"] + ) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_stack(self, data, is_multi_idx, is_multi_col): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if is_multi_idx: + if len(pandas_df.index) == 256: + index = pd.MultiIndex.from_product( + [ + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_df.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + else: + index = pandas_df.index + + if is_multi_col: + if len(pandas_df.columns) == 64: + columns = pd.MultiIndex.from_product( + [ + ["A", "B", "C", "D"], + ["xx", "yy", "zz", "LAST"], + [10, 20, 30, 40], + ] + ) + elif len(pandas_df.columns) == 100: + columns = pd.MultiIndex.from_product( + [ + ["xx", "yy", "zz", "LAST"], + ["A", "B", "C", "D", "F"], + ["I", "J", "K", "L", "INDEX"], + ] + ) + else: + columns = pandas_df.columns + + pandas_df.columns = columns + pandas_df.index = index + + modin_df.columns = columns + modin_df.index = index + + df_equals(modin_df.stack(), pandas_df.stack()) + + if is_multi_col: + df_equals(modin_df.stack(level=0), pandas_df.stack(level=0)) + df_equals(modin_df.stack(level=[0, 1]), 
pandas_df.stack(level=[0, 1])) + df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])) def test_style(self): data = test_data_values[0] From 363da6dfcf1998fd1ade5917e8743c8579fb6753 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Mon, 31 Aug 2020 18:26:50 +0300 Subject: [PATCH 084/120] FEAT-#1201: pivot implementation via unstack (#1645) Signed-off-by: Dmitry Chigarev --- docs/supported_apis/dataframe_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 ++ modin/backends/pandas/query_compiler.py | 49 +++++++++++++++++++++ modin/pandas/dataframe.py | 23 +++++++++- modin/pandas/test/test_dataframe.py | 26 ++++++----- modin/pandas/test/test_general.py | 6 +-- 6 files changed, 94 insertions(+), 16 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 45ed1992442..fc364e540bf 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -268,7 +268,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pipe`` | `pipe`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``pivot`` | `pivot`_ | D | | +| ``pivot`` | `pivot`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``pivot_table`` | `pivot_table`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 98d085736c4..fea95f65fd8 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1201,6 +1201,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args): def unstack(self, level, fill_value): pass + @abc.abstractmethod + def pivot(self, index, columns, values): + pass + @abc.abstractmethod def get_dummies(self, columns, **kwargs): """Convert categorical variables to dummy variables for certain columns. 
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 74f462722ee..bdf05597049 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2253,6 +2253,55 @@ def compute_groupby(df): # END Manual Partitioning methods + def pivot(self, index, columns, values): + from pandas.core.reshape.pivot import _convert_by + + def __convert_by(by): + if isinstance(by, pandas.Index): + by = list(by) + by = _convert_by(by) + if ( + len(by) > 0 + and (not is_list_like(by[0]) or isinstance(by[0], tuple)) + and not all([key in self.columns for key in by]) + ): + by = [by] + return by + + index, columns, values = map(__convert_by, [index, columns, values]) + is_custom_index = ( + len(index) == 1 + and is_list_like(index[0]) + and not isinstance(index[0], tuple) + ) + + if is_custom_index or len(index) == 0: + to_reindex = columns + else: + to_reindex = index + columns + + if len(values) != 0: + obj = self.getitem_column_array(to_reindex + values) + else: + obj = self + + if is_custom_index: + obj.index = index + + reindexed = self.__constructor__( + obj._modin_frame._apply_full_axis( + 1, + lambda df: df.set_index(to_reindex, append=(len(to_reindex) == 1)), + new_columns=obj.columns.drop(to_reindex), + ) + ) + + unstacked = reindexed.unstack(level=columns, fill_value=None) + if len(reindexed.columns) == 1 and unstacked.columns.nlevels > 1: + unstacked.columns = unstacked.columns.droplevel(0) + + return unstacked + # Get_dummies def get_dummies(self, columns, **kwargs): """Convert categorical variables to dummy variables for certain columns. 
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 4d86a1e8ba4..9e9bfa384b8 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1714,8 +1714,27 @@ def unstack(self, level=-1, fill_value=None): ) def pivot(self, index=None, columns=None, values=None): - return self._default_to_pandas( - pandas.DataFrame.pivot, index=index, columns=columns, values=values + """ + Return reshaped DataFrame organized by given index / column values. + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. + Parameters + ---------- + index : str or object, optional + Column to use to make new frame's index. If None, uses + existing index. + columns : str or object + Column to use to make new frame's columns. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. 
+ """ + return self.__constructor__( + query_compiler=self._query_compiler.pivot( + index=index, columns=columns, values=values + ) ) def pivot_table( diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 54886286603..e3f1f61db6b 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2493,17 +2493,23 @@ def test_pct_change(self): with pytest.warns(UserWarning): pd.DataFrame(data).pct_change() - def test_pivot(self): - df = pd.DataFrame( - { - "foo": ["one", "one", "one", "two", "two", "two"], - "bar": ["A", "B", "C", "A", "B", "C"], - "baz": [1, 2, 3, 4, 5, 6], - "zoo": ["x", "y", "z", "q", "w", "t"], - } + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + @pytest.mark.parametrize( + "index", [lambda df: df.columns[0], lambda df: df[df.columns[0]].values, None] + ) + @pytest.mark.parametrize("columns", [lambda df: df.columns[len(df.columns) // 2]]) + @pytest.mark.parametrize( + "values", [lambda df: df.columns[-1], lambda df: df.columns[-2:], None] + ) + def test_pivot(self, data, index, columns, values): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.pivot(*args, **kwargs), + index=index, + columns=columns, + values=values, + check_exception_type=None, ) - with pytest.warns(UserWarning): - df.pivot(index="foo", columns="bar", values="baz") def test_pivot_table(self): df = pd.DataFrame( diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index 95006d9d61e..4b5ab621d5b 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -225,9 +225,9 @@ def test_pivot(): "zoo": ["x", "y", "z", "q", "w", "t"], } ) - with pytest.warns(UserWarning): - df = pd.pivot(test_df, index="foo", columns="bar", values="baz") - assert isinstance(df, pd.DataFrame) + + df = pd.pivot(test_df, index="foo", columns="bar", values="baz") + assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): 
pd.pivot(test_df["bar"], index="foo", columns="bar", values="baz") From f6b60404f66a98c1371791178f86613e362e52f9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 1 Sep 2020 12:32:49 +0300 Subject: [PATCH 085/120] TEST-#1985: optimize common test dataset (#1984) * TEST-#9999: remove float_data from test_data Signed-off-by: Anatoly Myachev * TEST-#9999: merge sparse_nan_data, dense_nan_data in one dataset Signed-off-by: Anatoly Myachev * TEST-#9999: remove 100x100 dataset from test_data Signed-off-by: Anatoly Myachev * TEST-#9999: remove with_index_column dataset from test_data Signed-off-by: Anatoly Myachev * TEST-#1985: remove explicit restriction on nan for test_data Signed-off-by: Anatoly Myachev --- modin/pandas/test/test_dataframe.py | 166 +++++++++++++--------------- modin/pandas/test/test_series.py | 10 +- modin/pandas/test/utils.py | 41 ++----- 3 files changed, 87 insertions(+), 130 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index e3f1f61db6b..67401100ac2 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -375,10 +375,7 @@ def test_rmul(self, data): def test_rpow(self, request, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) - # TODO: Revert to others once we have an efficient way of preprocessing for positive values - # We need to check that negative integers are not used efficiently - if "100x100" not in request.node.name: - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rpow") + self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rpow") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_rsub(self, data): @@ -597,25 +594,23 @@ def test_add_suffix(self, data): df_equals(new_modin_df.columns, new_pandas_df.columns) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_at(self, request, data): + def 
test_at(self, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) - # We skip nan datasets because nan != nan - if "nan" not in request.node.name: - key1 = modin_df.columns[0] - # Scaler - assert modin_df.at[0, key1] == pandas_df.at[0, key1] + key1 = modin_df.columns[0] + # Scaler + df_equals(modin_df.at[0, key1], pandas_df.at[0, key1]) - # Series - df_equals(modin_df.loc[0].at[key1], pandas_df.loc[0].at[key1]) + # Series + df_equals(modin_df.loc[0].at[key1], pandas_df.loc[0].at[key1]) - # Write Item - modin_df_copy = modin_df.copy() - pandas_df_copy = pandas_df.copy() - modin_df_copy.at[1, key1] = modin_df.at[0, key1] - pandas_df_copy.at[1, key1] = pandas_df.at[0, key1] - df_equals(modin_df_copy, pandas_df_copy) + # Write Item + modin_df_copy = modin_df.copy() + pandas_df_copy = pandas_df.copy() + modin_df_copy.at[1, key1] = modin_df.at[0, key1] + pandas_df_copy.at[1, key1] = pandas_df.at[0, key1] + df_equals(modin_df_copy, pandas_df_copy) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_axes(self, data): @@ -2888,28 +2883,17 @@ def test_take(self): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_records(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # Skips nan because only difference is nan instead of NaN - if not name_contains(request.node.name, ["nan"]): - try: - pandas_result = pandas_df.to_records() - except Exception as e: - with pytest.raises(type(e)): - modin_df.to_records() - else: - modin_result = modin_df.to_records() - assert np.array_equal(modin_result, pandas_result) + eval_general( + *create_test_dfs(data), + lambda df: df.dropna().to_records(), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_to_string(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - # Skips nan because only difference is nan instead of NaN - if not 
name_contains(request.node.name, ["nan"]): - assert modin_df.to_string() == to_pandas(modin_df).to_string() + def test_to_string(self, data): + eval_general( + *create_test_dfs(data), + lambda df: df.to_string(), + ) def test_to_timestamp(self): idx = pd.date_range("1/1/2012", periods=5, freq="M") @@ -3143,7 +3127,7 @@ class TestDataFrameReduction: "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_all_any(self, data, axis, skipna, is_transposed, method): eval_general( *create_test_dfs(data), @@ -3185,7 +3169,7 @@ def test_all_any_level(self, data, axis, level, method): ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_count(self, data, axis): eval_general( *create_test_dfs(data), @@ -3284,7 +3268,7 @@ def test_describe_dtypes(self): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_idxmin_idxmax(self, data, axis, skipna, is_transposed, method): eval_general( *create_test_dfs(data), @@ -3314,7 +3298,7 @@ def test_memory_usage(self, data, index): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_min_max_mean( self, data, axis, skipna, numeric_only, is_transposed, method ): @@ -3348,7 +3332,7 @@ def test_min_max_mean( "skipna", bool_arg_values, ids=arg_keys("skipna", 
bool_arg_keys) ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_prod( self, data, @@ -3389,7 +3373,7 @@ def test_prod_specific(self, min_count, numeric_only): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["dense_nan_data"]]) + @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) def test_sum(self, data, axis, skipna, is_transposed): eval_general( *create_test_dfs(data), @@ -4347,65 +4331,63 @@ def test_keys(self, data): df_equals(modin_df.keys(), pandas_df.keys()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_loc(self, request, data): + def test_loc(self, data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) - # We skip nan datasets because nan != nan - if "nan" not in request.node.name: - key1 = modin_df.columns[0] - key2 = modin_df.columns[1] - # Scaler - assert modin_df.loc[0, key1] == pandas_df.loc[0, key1] + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + # Scaler + df_equals(modin_df.loc[0, key1], pandas_df.loc[0, key1]) - # Series - df_equals(modin_df.loc[0], pandas_df.loc[0]) - df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1]) - df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1]) + # Series + df_equals(modin_df.loc[0], pandas_df.loc[0]) + df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1]) + df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1]) - # DataFrame - df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]]) + # DataFrame + df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]]) - # List-like of booleans - indices = [i % 3 == 0 for i in range(len(modin_df.index))] - columns = [i % 5 == 0 for i in range(len(modin_df.columns))] - modin_result = modin_df.loc[indices, 
columns] - pandas_result = pandas_df.loc[indices, columns] - df_equals(modin_result, pandas_result) + # List-like of booleans + indices = [i % 3 == 0 for i in range(len(modin_df.index))] + columns = [i % 5 == 0 for i in range(len(modin_df.columns))] + modin_result = modin_df.loc[indices, columns] + pandas_result = pandas_df.loc[indices, columns] + df_equals(modin_result, pandas_result) - modin_result = modin_df.loc[:, columns] - pandas_result = pandas_df.loc[:, columns] - df_equals(modin_result, pandas_result) + modin_result = modin_df.loc[:, columns] + pandas_result = pandas_df.loc[:, columns] + df_equals(modin_result, pandas_result) - modin_result = modin_df.loc[indices] - pandas_result = pandas_df.loc[indices] - df_equals(modin_result, pandas_result) + modin_result = modin_df.loc[indices] + pandas_result = pandas_df.loc[indices] + df_equals(modin_result, pandas_result) - # See issue #80 - # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']]) - df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2]) + # See issue #80 + # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']]) + df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2]) - # From issue #421 - df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]]) - df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :]) + # From issue #421 + df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]]) + df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :]) - # From issue #1023 - key1 = modin_df.columns[0] - key2 = modin_df.columns[-2] - df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2]) + # From issue #1023 + key1 = modin_df.columns[0] + key2 = modin_df.columns[-2] + df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2]) - # Write Item - modin_df_copy = modin_df.copy() - pandas_df_copy = pandas_df.copy() - modin_df_copy.loc[[1, 2]] = 42 - pandas_df_copy.loc[[1, 2]] = 42 - 
df_equals(modin_df_copy, pandas_df_copy) + # Write Item + modin_df_copy = modin_df.copy() + pandas_df_copy = pandas_df.copy() + modin_df_copy.loc[[1, 2]] = 42 + pandas_df_copy.loc[[1, 2]] = 42 + df_equals(modin_df_copy, pandas_df_copy) - # From issue #1775 - df_equals( - modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], - pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], - ) + # From issue #1775 + df_equals( + modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + ) # From issue #1374 with pytest.raises(KeyError): @@ -5307,8 +5289,8 @@ def test___len__(self, data): def test_index_order(self): # see #1708 and #1869 for details df_modin, df_pandas = ( - pd.DataFrame(test_data["dense_nan_data"]), - pandas.DataFrame(test_data["dense_nan_data"]), + pd.DataFrame(test_data["float_nan_data"]), + pandas.DataFrame(test_data["float_nan_data"]), ) rows_number = len(df_modin.index) level_0 = np.random.choice([x for x in range(10)], rows_number) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 57dc0c8a3de..95fa93ec505 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1851,7 +1851,7 @@ def test_last(): def test_index_order(): # see #1708 and #1869 for details - s_modin, s_pandas = create_test_series(test_data["dense_nan_data"]) + s_modin, s_pandas = create_test_series(test_data["float_nan_data"]) rows_number = len(s_modin.index) level_0 = np.random.choice([x for x in range(10)], rows_number) level_1 = np.random.choice([x for x in range(10)], rows_number) @@ -2921,10 +2921,10 @@ def test_to_numpy(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_string(request, data): - modin_series, pandas_series = create_test_series(data) - # Skips nan because only difference is nan instead of NaN - if not name_contains(request.node.name, ["nan"]): - assert 
modin_series.to_string() == pandas_series.to_string() + eval_general( + *create_test_series(data), + lambda df: df.to_string(), + ) def test_to_timestamp(): diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 8e6cf1f2042..a412a730704 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -12,7 +12,6 @@ # governing permissions and limitations under the License. import pytest -import copy import numpy as np import pandas from pandas.util.testing import ( @@ -45,24 +44,11 @@ ) for i in range(NCOLS) }, - "float_data": { - "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): random_state.uniform( - RAND_LOW, RAND_HIGH, size=(NROWS) - ) - for i in range(NCOLS) - }, - "sparse_nan_data": { + "float_nan_data": { "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ - x if j != i else np.NaN - for j, x in enumerate( - random_state.uniform(RAND_LOW, RAND_HIGH, size=(NROWS)) - ) - ] - for i in range(NCOLS) - }, - "dense_nan_data": { - "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ - x if j % 4 == 0 else np.NaN + x + if (j % 4 == 0 and i > NCOLS // 2) or (j != i and i <= NCOLS // 2) + else np.NaN for j, x in enumerate( random_state.uniform(RAND_LOW, RAND_HIGH, size=(NROWS)) ) @@ -108,19 +94,10 @@ # "col1": "foo", # "col2": True, # }, - "100x100": { - "col{}".format((i - 50) % 100 + 1): random_state.randint( - RAND_LOW, RAND_HIGH, size=(100) - ) - for i in range(100) - }, } -# Create a dataframe based on integer dataframe but with one column called "index". Because of bug #1481 it cannot be -# created in normal way and has to be copied from dataset that works. -# TODO(gshimansky): when bug #1481 is fixed replace this dataframe initialization with ordinary one. 
-test_data["with_index_column"] = copy.copy(test_data["int_data"]) -test_data["with_index_column"]["index"] = test_data["with_index_column"].pop( +# See details in #1403 +test_data["int_data"]["index"] = test_data["int_data"].pop( "col{}".format(int(NCOLS / 2)) ) @@ -199,11 +176,8 @@ "empty_data", "columns_only", "int_data", - "float_data", - "sparse_nan_data", - "dense_nan_data", + "float_nan_data", "with_index_column", - "100x100", ] no_numeric_dfs = ["datetime_timedelta_data"] @@ -462,6 +436,7 @@ def df_equals(df1, df2): types_for_almost_equals = ( pandas.core.indexes.range.RangeIndex, pandas.core.indexes.base.Index, + np.recarray, ) # Gets AttributError if modin's groupby object is not import like this From 790b1962dd379ea340e27ff1d9c75a0b014d7ca0 Mon Sep 17 00:00:00 2001 From: binarycrayon <10211+binarycrayon@users.noreply.github.com> Date: Tue, 1 Sep 2020 09:01:15 -0700 Subject: [PATCH 086/120] FEAT-#1958: pin ray to 0.8.7 (#1966) Signed-off-by: Yudi Xue <10211+binarycrayon@users.noreply.github.com> --- .github/workflows/ci.yml | 2 +- .github/workflows/push.yml | 2 +- environment.yml | 2 +- modin/__init__.py | 2 +- modin/engines/ray/utils.py | 4 ++-- modin/experimental/cloud/ray-autoscaler.yml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d31ccc3e720..fb002dfd251 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,7 +77,7 @@ jobs: use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
auto-update-conda: true - shell: bash -l {0} - run: pip install ray==0.8.6 + run: pip install ray==0.8.7 - name: Conda environment shell: bash -l {0} run: | diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 4a1c95faf26..9a0b2cee586 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -30,7 +30,7 @@ jobs: use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! auto-update-conda: true - shell: bash -l {0} - run: pip install ray==0.8.6 + run: pip install ray==0.8.7 - name: Conda environment shell: bash -l {0} run: | diff --git a/environment.yml b/environment.yml index 4bc191caafa..b98335b021b 100644 --- a/environment.yml +++ b/environment.yml @@ -28,5 +28,5 @@ dependencies: - pytest-xdist - coverage<5.0 - pip: - - ray==0.8.6 + - ray==0.8.7 - rpyc diff --git a/modin/__init__.py b/modin/__init__.py index 664ddadeba0..ecc2a83abbd 100644 --- a/modin/__init__.py +++ b/modin/__init__.py @@ -53,7 +53,7 @@ def get_execution_engine(): except ImportError: pass else: - if version.parse(ray.__version__) != version.parse("0.8.6"): + if version.parse(ray.__version__) != version.parse("0.8.7"): raise ImportError( "Please `pip install modin[ray]` to install compatible Ray version." ) diff --git a/modin/engines/ray/utils.py b/modin/engines/ray/utils.py index 51f96d6f298..70d4bc3fe1f 100644 --- a/modin/engines/ray/utils.py +++ b/modin/engines/ray/utils.py @@ -103,7 +103,7 @@ def initialize_ray( # We only start ray in a cluster setting for the head node. 
ray.init( address=redis_address or "auto", - include_webui=False, + include_dashboard=False, ignore_reinit_error=True, redis_password=redis_password, logging_level=100, @@ -136,7 +136,7 @@ def initialize_ray( object_store_memory = int(object_store_memory) ray.init( num_cpus=int(num_cpus), - include_webui=False, + include_dashboard=False, ignore_reinit_error=True, plasma_directory=plasma_directory, object_store_memory=object_store_memory, diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml index f4ea45f7ca8..9051337128c 100644 --- a/modin/experimental/cloud/ray-autoscaler.yml +++ b/modin/experimental/cloud/ray-autoscaler.yml @@ -123,7 +123,7 @@ setup_commands: conda activate modin conda install python==3.7.6 - pip install modin "ray==0.8.6" cloudpickle + pip install modin "ray==0.8.7" cloudpickle echo 'export MODIN_RAY_CLUSTER=True' >> ~/.bashrc # Consider uncommenting these if you also want to run apt-get commands during setup diff --git a/requirements.txt b/requirements.txt index 0e09fa3b4af..e1ccda9eb8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ numpy pyarrow<0.17 dask[complete]>=2.12.0,<=2.19.0 distributed>=2.12.0,<=2.19.0 -ray==0.8.6 +ray==0.8.7 psutil==5.6.6 xarray Jinja2 diff --git a/setup.py b/setup.py index 907f8f48753..52fd07f5df3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def is_pure(self): dask_deps = ["dask>=2.12.0", "distributed>=2.12.0"] -ray_deps = ["ray==0.8.6", "pyarrow<0.17"] +ray_deps = ["ray==0.8.7", "pyarrow<0.17"] remote_deps = ["rpyc", "cloudpickle", "boto3"] all_deps = dask_deps + ray_deps + remote_deps From ff6ff0d47dd1331c4a7a16907e6541bb9af68023 Mon Sep 17 00:00:00 2001 From: Alina Bykovskaya Date: Wed, 2 Sep 2020 01:25:56 +0300 Subject: [PATCH 087/120] REFACTOR-#1527: split test_dataframe.py into several others (#1995) Signed-off-by: Alina --- .github/workflows/ci.yml | 10 +- .github/workflows/push.yml | 8 +- modin/pandas/test/dataframe/__init__.py | 
12 + modin/pandas/test/dataframe/test_binary.py | 386 ++ modin/pandas/test/dataframe/test_default.py | 1175 ++++ modin/pandas/test/dataframe/test_indexing.py | 1154 ++++ modin/pandas/test/dataframe/test_iter.py | 389 ++ modin/pandas/test/dataframe/test_join_sort.py | 524 ++ .../test/dataframe/test_map_metadata.py | 1307 ++++ modin/pandas/test/dataframe/test_reduction.py | 340 + modin/pandas/test/dataframe/test_udf.py | 443 ++ modin/pandas/test/dataframe/test_window.py | 874 +++ modin/pandas/test/test_dataframe.py | 6118 +---------------- 13 files changed, 6621 insertions(+), 6119 deletions(-) create mode 100644 modin/pandas/test/dataframe/__init__.py create mode 100644 modin/pandas/test/dataframe/test_binary.py create mode 100644 modin/pandas/test/dataframe/test_default.py create mode 100644 modin/pandas/test/dataframe/test_indexing.py create mode 100644 modin/pandas/test/dataframe/test_iter.py create mode 100644 modin/pandas/test/dataframe/test_join_sort.py create mode 100644 modin/pandas/test/dataframe/test_map_metadata.py create mode 100644 modin/pandas/test/dataframe/test_reduction.py create mode 100644 modin/pandas/test/dataframe/test_udf.py create mode 100644 modin/pandas/test/dataframe/test_window.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb002dfd251..5b534b62bef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,7 +180,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -210,7 +210,7 @@ jobs: if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - shell: bash -l {0} - run: pytest modin/pandas/test/ -k "TestDataFrame${{matrix.part}}" + run: pytest 
modin/pandas/test/dataframe/test_${{matrix.part}}.py if: matrix.part != 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py @@ -269,7 +269,7 @@ jobs: conda info conda list - shell: bash -l {0} - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrameMapMetadata + run: python -m pytest modin/pandas/test/dataframe/test_map_metadata.py - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py - shell: bash -l {0} @@ -284,7 +284,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -312,7 +312,7 @@ jobs: conda info conda list - shell: bash -l {0} - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} + run: python -m pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py if: matrix.part != 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 9a0b2cee586..527358d18d3 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -44,7 +44,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -74,7 +74,7 @@ jobs: if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - shell: bash -l {0} - run: pytest modin/pandas/test/ -k "TestDataFrame${{matrix.part}}" + run: pytest 
modin/pandas/test/dataframe/test_${{matrix.part}}.py if: matrix.part != 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py @@ -110,7 +110,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["Reduction", "Binary", "MapMetadata", "UDF", "Default", "Window", "Indexing", "Iter", "JoinSort", 3] + part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -138,7 +138,7 @@ jobs: conda info conda list - shell: bash -l {0} - run: python -m pytest modin/pandas/test/test_dataframe.py::TestDataFrame${{matrix.part}} + run: python -m pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py if: matrix.part != 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py diff --git a/modin/pandas/test/dataframe/__init__.py b/modin/pandas/test/dataframe/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/pandas/test/dataframe/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py new file mode 100644 index 00000000000..c24164dcc65 --- /dev/null +++ b/modin/pandas/test/dataframe/test_binary.py @@ -0,0 +1,386 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import pandas +import matplotlib +import modin.pandas as pd + +from modin.pandas.test.utils import ( + random_state, + RAND_LOW, + RAND_HIGH, + df_equals, + test_data_values, + test_data_keys, + eval_general, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use("Agg") + + +def inter_df_math_helper(modin_df, pandas_df, op): + # Test dataframe to dataframe + try: + pandas_result = getattr(pandas_df, op)(pandas_df) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(modin_df) + else: + modin_result = getattr(modin_df, op)(modin_df) + df_equals(modin_result, pandas_result) + + # Test dataframe to int + try: + pandas_result = getattr(pandas_df, op)(4) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(4) + else: + modin_result = getattr(modin_df, op)(4) + df_equals(modin_result, pandas_result) + + # Test dataframe to float + try: + pandas_result = getattr(pandas_df, op)(4.0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(4.0) + else: + modin_result = getattr(modin_df, op)(4.0) + df_equals(modin_result, pandas_result) + + # Test transposed dataframes to float + try: + pandas_result = getattr(pandas_df.T, op)(4.0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df.T, op)(4.0) + else: + modin_result = getattr(modin_df.T, op)(4.0) + df_equals(modin_result, pandas_result) + + frame_data = { + "{}_other".format(modin_df.columns[0]): [0, 2], + modin_df.columns[0]: [0, 19], + modin_df.columns[1]: [1, 1], + } + modin_df2 = pd.DataFrame(frame_data) + pandas_df2 = pandas.DataFrame(frame_data) + + # Test dataframe to different dataframe shape + try: + pandas_result = getattr(pandas_df, op)(pandas_df2) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(modin_df2) + else: + modin_result = getattr(modin_df, op)(modin_df2) + df_equals(modin_result, pandas_result) + + # Test dataframe fill value + try: + pandas_result = getattr(pandas_df, op)(pandas_df2, fill_value=0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(modin_df2, fill_value=0) + else: + modin_result = getattr(modin_df, op)(modin_df2, fill_value=0) + df_equals(modin_result, pandas_result) + + # 
Test dataframe to list + list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[1])) + try: + pandas_result = getattr(pandas_df, op)(list_test, axis=1) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(list_test, axis=1) + else: + modin_result = getattr(modin_df, op)(list_test, axis=1) + df_equals(modin_result, pandas_result) + + # Test dataframe to series axis=0 + series_test_modin = modin_df[modin_df.columns[0]] + series_test_pandas = pandas_df[pandas_df.columns[0]] + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=0) + else: + modin_result = getattr(modin_df, op)(series_test_modin, axis=0) + df_equals(modin_result, pandas_result) + + # Test dataframe to series axis=1 + series_test_modin = modin_df.iloc[0] + series_test_pandas = pandas_df.iloc[0] + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=1) + else: + modin_result = getattr(modin_df, op)(series_test_modin, axis=1) + df_equals(modin_result, pandas_result) + + # Test dataframe to list axis=1 + series_test_modin = series_test_pandas = list(pandas_df.iloc[0]) + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=1) + else: + modin_result = getattr(modin_df, op)(series_test_modin, axis=1) + df_equals(modin_result, pandas_result) + + # Test dataframe to list axis=0 + series_test_modin = series_test_pandas = list(pandas_df[pandas_df.columns[0]]) + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=0) + else: + modin_result = getattr(modin_df, 
op)(series_test_modin, axis=0) + df_equals(modin_result, pandas_result) + + # Test dataframe to series missing values + series_test_modin = modin_df.iloc[0, :-2] + series_test_pandas = pandas_df.iloc[0, :-2] + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=1) + else: + modin_result = getattr(modin_df, op)(series_test_modin, axis=1) + df_equals(modin_result, pandas_result) + + # Test dataframe to series with different index + series_test_modin = modin_df[modin_df.columns[0]].reset_index(drop=True) + series_test_pandas = pandas_df[pandas_df.columns[0]].reset_index(drop=True) + try: + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(series_test_modin, axis=0) + else: + modin_result = getattr(modin_df, op)(series_test_modin, axis=0) + df_equals(modin_result, pandas_result) + + # Level test + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df.index] + ) + modin_df_multi_level = modin_df.copy() + modin_df_multi_level.index = new_idx + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + + +@pytest.mark.parametrize( + "function", + [ + "add", + "div", + "divide", + "floordiv", + "mod", + "mul", + "multiply", + "pow", + "sub", + "subtract", + "truediv", + "__div__", + "__add__", + "__radd__", + "__mul__", + "__rmul__", + "__pow__", + "__rpow__", + "__sub__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__mod__", + "__rmod__", + "__rdiv__", + ], +) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_math_functions(data, function): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_helper(modin_df, 
pandas_df, function) + + +@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) +@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_comparison(data, op, other): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + eval_general( + modin_df, + pandas_df, + operation=lambda df, **kwargs: getattr(df, op)( + df if other == "as_left" else other + ), + ) + + +@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_multi_level_comparison(data, op): + modin_df_multi_level = pd.DataFrame(data) + + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df_multi_level.index] + ) + modin_df_multi_level.index = new_idx + + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + + +# Test dataframe right operations +def inter_df_math_right_ops_helper(modin_df, pandas_df, op): + try: + pandas_result = getattr(pandas_df, op)(4) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(4) + else: + modin_result = getattr(modin_df, op)(4) + df_equals(modin_result, pandas_result) + + try: + pandas_result = getattr(pandas_df, op)(4.0) + except Exception as e: + with pytest.raises(type(e)): + getattr(modin_df, op)(4.0) + else: + modin_result = getattr(modin_df, op)(4.0) + df_equals(modin_result, pandas_result) + + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df.index] + ) + modin_df_multi_level = modin_df.copy() + modin_df_multi_level.index = new_idx + + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + + +@pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) +def test_radd(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "radd") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rdiv(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rdiv") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rfloordiv(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rfloordiv") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rmod(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rmod") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rmul(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rmul") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rpow(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rpow") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rsub(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rsub") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rtruediv(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + inter_df_math_right_ops_helper(modin_df, pandas_df, "rtruediv") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___rsub__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + 
inter_df_math_right_ops_helper(modin_df, pandas_df, "__rsub__") + + +# END test dataframe right operations + + +def test_equals(): + frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]} + modin_df1 = pd.DataFrame(frame_data) + modin_df2 = pd.DataFrame(frame_data) + + assert modin_df1.equals(modin_df2) + + df_equals(modin_df1, modin_df2) + df_equals(modin_df1, pd.DataFrame(modin_df1)) + + frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]} + modin_df3 = pd.DataFrame(frame_data, index=list("abcd")) + + assert not modin_df1.equals(modin_df3) + + with pytest.raises(AssertionError): + df_equals(modin_df3, modin_df1) + + with pytest.raises(AssertionError): + df_equals(modin_df3, modin_df2) + + assert modin_df1.equals(modin_df2._query_compiler.to_pandas()) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py new file mode 100644 index 00000000000..bd486163653 --- /dev/null +++ b/modin/pandas/test/dataframe/test_default.py @@ -0,0 +1,1175 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import pytest +import numpy as np +import pandas +import os +import matplotlib +import modin.pandas as pd +from modin.pandas.utils import to_pandas +from numpy.testing import assert_array_equal +import io + +from modin.pandas.test.utils import ( + df_equals, + name_contains, + test_data_values, + test_data_keys, + numeric_dfs, + axis_keys, + axis_values, + bool_arg_keys, + bool_arg_values, + eval_general, + create_test_dfs, + generate_multiindex, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +def test_align(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).align(pd.DataFrame(data)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_numpy(data): + modin_frame = pd.DataFrame(data) + pandas_frame = pandas.DataFrame(data) + assert_array_equal(modin_frame.values, pandas_frame.values) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_partition_to_numpy(data): + frame = pd.DataFrame(data) + for partition in frame._query_compiler._modin_frame._partitions.flatten().tolist(): + assert_array_equal(partition.to_pandas().values, partition.to_numpy()) + + +def test_asfreq(): + index = pd.date_range("1/1/2000", periods=4, freq="T") + series = pd.Series([0.0, None, 2.0, 3.0], index=index) + df = pd.DataFrame({"s": series}) + with pytest.warns(UserWarning): + # We are only testing that this defaults to pandas, so we will just check for + # the warning + df.asfreq(freq="30S") + + +def test_asof(): + df = pd.DataFrame( + {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}, + index=pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ), + ) + with pytest.warns(UserWarning): + df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) + + +def test_assign(): + data = 
test_data_values[0] + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_result = modin_df.assign(new_column=pd.Series(modin_df.iloc[:, 0])) + pandas_result = pandas_df.assign(new_column=pandas.Series(pandas_df.iloc[:, 0])) + df_equals(modin_result, pandas_result) + modin_result = modin_df.assign( + new_column=pd.Series(modin_df.iloc[:, 0]), + new_column2=pd.Series(modin_df.iloc[:, 1]), + ) + pandas_result = pandas_df.assign( + new_column=pandas.Series(pandas_df.iloc[:, 0]), + new_column2=pandas.Series(pandas_df.iloc[:, 1]), + ) + df_equals(modin_result, pandas_result) + + +def test_at_time(): + i = pd.date_range("2008-01-01", periods=1000, freq="12H") + modin_df = pd.DataFrame({"A": list(range(1000)), "B": list(range(1000))}, index=i) + pandas_df = pandas.DataFrame( + {"A": list(range(1000)), "B": list(range(1000))}, index=i + ) + df_equals(modin_df.at_time("12:00"), pandas_df.at_time("12:00")) + df_equals(modin_df.at_time("3:00"), pandas_df.at_time("3:00")) + df_equals(modin_df.T.at_time("12:00", axis=1), pandas_df.T.at_time("12:00", axis=1)) + + +def test_between_time(): + i = pd.date_range("2008-01-01", periods=1000, freq="12H") + modin_df = pd.DataFrame({"A": list(range(1000)), "B": list(range(1000))}, index=i) + pandas_df = pandas.DataFrame( + {"A": list(range(1000)), "B": list(range(1000))}, index=i + ) + df_equals( + modin_df.between_time("12:00", "17:00"), + pandas_df.between_time("12:00", "17:00"), + ) + df_equals( + modin_df.between_time("3:00", "4:00"), + pandas_df.between_time("3:00", "4:00"), + ) + df_equals( + modin_df.T.between_time("12:00", "17:00", axis=1), + pandas_df.T.between_time("12:00", "17:00", axis=1), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_bfill(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.bfill(), pandas_df.bfill()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_bool(data): 
+ modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(ValueError): + modin_df.bool() + modin_df.__bool__() + + single_bool_pandas_df = pandas.DataFrame([True]) + single_bool_modin_df = pd.DataFrame([True]) + + assert single_bool_pandas_df.bool() == single_bool_modin_df.bool() + + with pytest.raises(ValueError): + # __bool__ always raises this error for DataFrames + single_bool_modin_df.__bool__() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_boxplot(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + assert modin_df.boxplot() == to_pandas(modin_df).boxplot() + + +def test_combine_first(): + data1 = {"A": [None, 0], "B": [None, 4]} + modin_df1 = pd.DataFrame(data1) + pandas_df1 = pandas.DataFrame(data1) + data2 = {"A": [1, 1], "B": [3, 3]} + modin_df2 = pd.DataFrame(data2) + pandas_df2 = pandas.DataFrame(data2) + df_equals(modin_df1.combine_first(modin_df2), pandas_df1.combine_first(pandas_df2)) + + +def test_corr(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).corr() + + +def test_corrwith(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).corrwith(pd.DataFrame(data)) + + +def test_cov(): + data = test_data_values[0] + modin_result = pd.DataFrame(data).cov() + pandas_result = pandas.DataFrame(data).cov() + df_equals(modin_result, pandas_result) + + +@pytest.mark.skipif( + os.name == "nt", + reason="AssertionError: numpy array are different", +) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dot(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + col_len = len(modin_df.columns) + + # Test list input + arr = np.arange(col_len) + modin_result = modin_df.dot(arr) + pandas_result = pandas_df.dot(arr) + df_equals(modin_result, pandas_result) + + # Test bad dimensions + with pytest.raises(ValueError): + 
modin_result = modin_df.dot(np.arange(col_len + 10)) + + # Test series input + modin_series = pd.Series(np.arange(col_len), index=modin_df.columns) + pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) + modin_result = modin_df.dot(modin_series) + pandas_result = pandas_df.dot(pandas_series) + df_equals(modin_result, pandas_result) + + # Test dataframe input + modin_result = modin_df.dot(modin_df.T) + pandas_result = pandas_df.dot(pandas_df.T) + df_equals(modin_result, pandas_result) + + # Test when input series index doesn't line up with columns + with pytest.raises(ValueError): + modin_result = modin_df.dot(pd.Series(np.arange(col_len))) + + # Test case when left dataframe has size (n x 1) + # and right dataframe has size (1 x n) + modin_df = pd.DataFrame(modin_series) + pandas_df = pandas.DataFrame(pandas_series) + modin_result = modin_df.dot(modin_df.T) + pandas_result = pandas_df.dot(pandas_df.T) + df_equals(modin_result, pandas_result) + + # Test case when left dataframe has size (1 x 1) + # and right dataframe has size (1 x n) + modin_result = pd.DataFrame([1]).dot(modin_df.T) + pandas_result = pandas.DataFrame([1]).dot(pandas_df.T) + df_equals(modin_result, pandas_result) + + +@pytest.mark.skipif( + os.name == "nt", + reason="AssertionError: numpy array are different", +) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_matmul(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + col_len = len(modin_df.columns) + + # Test list input + arr = np.arange(col_len) + modin_result = modin_df @ arr + pandas_result = pandas_df @ arr + df_equals(modin_result, pandas_result) + + # Test bad dimensions + with pytest.raises(ValueError): + modin_result = modin_df @ np.arange(col_len + 10) + + # Test series input + modin_series = pd.Series(np.arange(col_len), index=modin_df.columns) + pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) + modin_result = modin_df @ 
modin_series + pandas_result = pandas_df @ pandas_series + df_equals(modin_result, pandas_result) + + # Test dataframe input + modin_result = modin_df @ modin_df.T + pandas_result = pandas_df @ pandas_df.T + df_equals(modin_result, pandas_result) + + # Test when input series index doesn't line up with columns + with pytest.raises(ValueError): + modin_result = modin_df @ pd.Series(np.arange(col_len)) + + +def test_ewm(): + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + with pytest.warns(UserWarning): + df.ewm(com=0.5).mean() + + +def test_expanding(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).expanding() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_explode(data): + modin_df = pd.DataFrame(data) + with pytest.warns(UserWarning): + modin_df.explode(modin_df.columns[0]) + + +def test_first(): + i = pd.date_range("2010-04-09", periods=400, freq="2D") + modin_df = pd.DataFrame({"A": list(range(400)), "B": list(range(400))}, index=i) + pandas_df = pandas.DataFrame( + {"A": list(range(400)), "B": list(range(400))}, index=i + ) + df_equals(modin_df.first("3D"), pandas_df.first("3D")) + df_equals(modin_df.first("20D"), pandas_df.first("20D")) + + +@pytest.mark.skip(reason="Defaulting to Pandas") +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_from_dict(data): + modin_df = pd.DataFrame(data) # noqa F841 + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(NotImplementedError): + pd.DataFrame.from_dict(None) + + +@pytest.mark.skip(reason="Defaulting to Pandas") +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_from_items(data): + modin_df = pd.DataFrame(data) # noqa F841 + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(NotImplementedError): + pd.DataFrame.from_items(None) + + +@pytest.mark.skip(reason="Defaulting to Pandas") +@pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) +def test_from_records(data): + modin_df = pd.DataFrame(data) # noqa F841 + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(NotImplementedError): + pd.DataFrame.from_records(None) + + +def test_hist(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).hist(None) + + +def test_infer_objects(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).infer_objects() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_info_default_param(data): + with io.StringIO() as first, io.StringIO() as second: + eval_general( + pd.DataFrame(data), + pandas.DataFrame(data), + verbose=None, + max_cols=None, + memory_usage=None, + null_counts=None, + operation=lambda df, **kwargs: df.info(**kwargs), + buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, + ) + modin_info = first.getvalue().splitlines() + pandas_info = second.getvalue().splitlines() + + assert modin_info[0] == str(pd.DataFrame) + assert pandas_info[0] == str(pandas.DataFrame) + assert modin_info[1:] == pandas_info[1:] + + +@pytest.mark.parametrize("verbose", [True, False]) +@pytest.mark.parametrize("max_cols", [10, 99999999]) +@pytest.mark.parametrize("memory_usage", [True, False, "deep"]) +@pytest.mark.parametrize("null_counts", [True, False]) +def test_info(verbose, max_cols, memory_usage, null_counts): + data = test_data_values[0] + with io.StringIO() as first, io.StringIO() as second: + eval_general( + pd.DataFrame(data), + pandas.DataFrame(data), + operation=lambda df, **kwargs: df.info(**kwargs), + verbose=verbose, + max_cols=max_cols, + memory_usage=memory_usage, + null_counts=null_counts, + buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, + ) + modin_info = first.getvalue().splitlines() + pandas_info = second.getvalue().splitlines() + + assert modin_info[0] == str(pd.DataFrame) + assert pandas_info[0] == str(pandas.DataFrame) + assert 
modin_info[1:] == pandas_info[1:] + + +def test_interpolate(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).interpolate() + + +def test_kurt_kurtosis_equals(): + # It's optimization. If failed, df.kurt should be tested explicitly + # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`. + data = test_data_values[0] + df_modin = pd.DataFrame(data) + assert df_modin.kurt == df_modin.kurtosis + + +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("skipna", bool_arg_values, ids=bool_arg_keys) +@pytest.mark.parametrize("numeric_only", bool_arg_values, ids=bool_arg_keys) +def test_kurt_kurtosis(axis, skipna, numeric_only): + data = test_data_values[0] + df_modin = pd.DataFrame(data) + df_pandas = pandas.DataFrame(data) + + eval_general( + df_modin, + df_pandas, + lambda df: df.kurtosis( + axis=axis, skipna=skipna, level=None, numeric_only=numeric_only + ), + ) + + +@pytest.mark.parametrize("level", [-1, 0, 1]) +def test_kurt_kurtosis_level(level): + data = test_data_values[0] + df_modin = pd.DataFrame(data) + df_pandas = pandas.DataFrame(data) + + index = generate_multiindex(len(data.keys())) + df_modin.columns = index + df_pandas.columns = index + eval_general( + df_modin, + df_pandas, + lambda df: df.kurtosis(axis=1, level=level), + ) + + +def test_last(): + modin_index = pd.date_range("2010-04-09", periods=400, freq="2D") + pandas_index = pandas.date_range("2010-04-09", periods=400, freq="2D") + modin_df = pd.DataFrame( + {"A": list(range(400)), "B": list(range(400))}, index=modin_index + ) + pandas_df = pandas.DataFrame( + {"A": list(range(400)), "B": list(range(400))}, index=pandas_index + ) + df_equals(modin_df.last("3D"), pandas_df.last("3D")) + df_equals(modin_df.last("20D"), pandas_df.last("20D")) + + +def test_lookup(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).lookup([0, 1], ["col1", "col2"]) + + +@pytest.mark.parametrize("data", 
test_data_values) +@pytest.mark.parametrize("axis", [None, 0, 1]) +@pytest.mark.parametrize("skipna", [None, True, False]) +def test_mad(data, axis, skipna): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + df_equals( + modin_df.mad(axis=axis, skipna=skipna, level=None), + pandas_df.mad(axis=axis, skipna=skipna, level=None), + ) + + +@pytest.mark.parametrize("level", [-1, 0, 1]) +def test_mad_level(level): + data = test_data_values[0] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + index = generate_multiindex(len(data.keys())) + modin_df.columns = index + pandas_df.columns = index + eval_general( + modin_df, + pandas_df, + lambda df: df.mad(axis=1, level=level), + ) + + +def test_mask(): + df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) + m = df % 3 == 0 + with pytest.warns(UserWarning): + try: + df.mask(~m, -df) + except ValueError: + pass + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "id_vars", [lambda df: df.columns[0], lambda df: df.columns[:4], None] +) +@pytest.mark.parametrize( + "value_vars", [lambda df: df.columns[-1], lambda df: df.columns[-4:], None] +) +def test_melt(data, id_vars, value_vars): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.melt(*args, **kwargs) + .sort_values(["variable", "value"]) + .reset_index(drop=True), + id_vars=id_vars, + value_vars=value_vars, + ) + + +def test_pct_change(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).pct_change() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "index", [lambda df: df.columns[0], lambda df: df[df.columns[0]].values, None] +) +@pytest.mark.parametrize("columns", [lambda df: df.columns[len(df.columns) // 2]]) +@pytest.mark.parametrize( + "values", [lambda df: df.columns[-1], lambda df: df.columns[-2:], None] +) +def test_pivot(data, index, columns, 
values): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.pivot(*args, **kwargs), + index=index, + columns=columns, + values=values, + check_exception_type=None, + ) + + +def test_pivot_table(): + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + with pytest.warns(UserWarning): + df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc=np.sum) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_plot(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if name_contains(request.node.name, numeric_dfs): + # We have to test this way because equality in plots means same object. + zipped_plot_lines = zip(modin_df.plot().lines, pandas_df.plot().lines) + for left, right in zipped_plot_lines: + if isinstance(left.get_xdata(), np.ma.core.MaskedArray) and isinstance( + right.get_xdata(), np.ma.core.MaskedArray + ): + assert all((left.get_xdata() == right.get_xdata()).data) + else: + assert np.array_equal(left.get_xdata(), right.get_xdata()) + if isinstance(left.get_ydata(), np.ma.core.MaskedArray) and isinstance( + right.get_ydata(), np.ma.core.MaskedArray + ): + assert all((left.get_ydata() == right.get_ydata()).data) + else: + assert np.array_equal(left.get_ydata(), right.get_ydata()) + + +def test_replace(): + modin_df = pd.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} + ) + pandas_df = pandas.DataFrame( + {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} + ) + modin_result = modin_df.replace({"A": 0, "B": 5}, 100) + pandas_result = pandas_df.replace({"A": 0, "B": 5}, 100) + 
df_equals(modin_result, pandas_result) + + modin_result = modin_df.replace({"A": {0: 100, 4: 400}}) + pandas_result = pandas_df.replace({"A": {0: 100, 4: 400}}) + df_equals(modin_result, pandas_result) + + modin_df = pd.DataFrame({"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]}) + pandas_df = pandas.DataFrame( + {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} + ) + modin_result = modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) + pandas_result = pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.replace(regex=[r"^ba.$", "foo"], value="new") + pandas_result = pandas_df.replace(regex=[r"^ba.$", "foo"], value="new") + df_equals(modin_result, pandas_result) + + modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) + pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("rule", ["5T", pandas.offsets.Hour()]) +@pytest.mark.parametrize("axis", [0, "columns"]) +@pytest.mark.parametrize("closed", ["left", "right"]) +@pytest.mark.parametrize("label", ["right", "left"]) +@pytest.mark.parametrize("on", [None, "DateColumn"]) +@pytest.mark.parametrize("level", [None, 1]) +def test_resample(rule, axis, closed, label, on, level): + freq = "H" + base = 2 + index = pandas.date_range("31/12/2000", periods=12, freq=freq) + data = {"A": range(12), "B": range(12)} + + pandas_df = pandas.DataFrame(data, index=index) + modin_df = pd.DataFrame(data, index=index) + + if on is not None and axis == 0: + pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") + modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") + else: + on = None + + if axis == "columns": + pandas_df = pandas_df.T + modin_df = modin_df.T + + if level is not None and axis == 0 and on is None: + index = pandas.MultiIndex.from_product( + [["a", "b", "c"], pandas.date_range("31/12/2000", periods=4, 
freq=freq)] + ) + pandas_df.index = index + modin_df.index = index + else: + level = None + + pandas_resampler = pandas_df.resample( + rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level + ) + modin_resampler = modin_df.resample( + rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level + ) + + df_equals(modin_resampler.count(), pandas_resampler.count()) + df_equals(modin_resampler.var(0), pandas_resampler.var(0)) + df_equals(modin_resampler.sum(), pandas_resampler.sum()) + df_equals(modin_resampler.std(), pandas_resampler.std()) + df_equals(modin_resampler.sem(), pandas_resampler.sem()) + df_equals(modin_resampler.size(), pandas_resampler.size()) + df_equals(modin_resampler.prod(), pandas_resampler.prod()) + if on is None: + df_equals(modin_resampler.ohlc(), pandas_resampler.ohlc()) + df_equals(modin_resampler.min(), pandas_resampler.min()) + df_equals(modin_resampler.median(), pandas_resampler.median()) + df_equals(modin_resampler.mean(), pandas_resampler.mean()) + df_equals(modin_resampler.max(), pandas_resampler.max()) + df_equals(modin_resampler.last(), pandas_resampler.last()) + df_equals(modin_resampler.first(), pandas_resampler.first()) + df_equals(modin_resampler.nunique(), pandas_resampler.nunique()) + df_equals( + modin_resampler.pipe(lambda x: x.max() - x.min()), + pandas_resampler.pipe(lambda x: x.max() - x.min()), + ) + df_equals( + modin_resampler.transform(lambda x: (x - x.mean()) / x.std()), + pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), + ) + df_equals( + pandas_resampler.aggregate("max"), + modin_resampler.aggregate("max"), + ) + df_equals( + modin_resampler.apply("sum"), + pandas_resampler.apply("sum"), + ) + df_equals( + modin_resampler.get_group(name=list(modin_resampler.groups)[0]), + pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]), + ) + assert pandas_resampler.indices == modin_resampler.indices + assert pandas_resampler.groups == modin_resampler.groups + 
df_equals(modin_resampler.quantile(), pandas_resampler.quantile()) + if axis == 0: + # Upsampling from level= or on= selection is not supported + if on is None and level is None: + df_equals( + modin_resampler.interpolate(), + pandas_resampler.interpolate(), + ) + df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) + df_equals( + modin_resampler.fillna(method="nearest"), + pandas_resampler.fillna(method="nearest"), + ) + df_equals(modin_resampler.pad(), pandas_resampler.pad()) + df_equals(modin_resampler.nearest(), pandas_resampler.nearest()) + df_equals(modin_resampler.bfill(), pandas_resampler.bfill()) + df_equals(modin_resampler.backfill(), pandas_resampler.backfill()) + df_equals(modin_resampler.ffill(), pandas_resampler.ffill()) + df_equals( + pandas_resampler.apply(["sum", "mean", "max"]), + modin_resampler.apply(["sum", "mean", "max"]), + ) + df_equals( + modin_resampler.aggregate(["sum", "mean", "max"]), + pandas_resampler.aggregate(["sum", "mean", "max"]), + ) + + +def test_sem(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).sem() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("index", ["default", "ndarray"]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) +def test_shift(data, index, axis, periods): + if index == "default": + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + elif index == "ndarray": + data_column_length = len(data[next(iter(data))]) + index_data = np.arange(2, data_column_length + 2) + modin_df = pd.DataFrame(data, index=index_data) + pandas_df = pandas.DataFrame(data, index=index_data) + + df_equals( + modin_df.shift(periods=periods, axis=axis), + pandas_df.shift(periods=periods, axis=axis), + ) + df_equals( + modin_df.shift(periods=periods, axis=axis, fill_value=777), + pandas_df.shift(periods=periods, axis=axis, fill_value=777), + ) + + 
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("index", ["default", "ndarray"]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) +def test_slice_shift(data, index, axis, periods): + if index == "default": + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + elif index == "ndarray": + data_column_length = len(data[next(iter(data))]) + index_data = np.arange(2, data_column_length + 2) + modin_df = pd.DataFrame(data, index=index_data) + pandas_df = pandas.DataFrame(data, index=index_data) + + df_equals( + modin_df.slice_shift(periods=periods, axis=axis), + pandas_df.slice_shift(periods=periods, axis=axis), + ) + + +@pytest.mark.parametrize("is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]) +@pytest.mark.parametrize("is_multi_col", [True, False], ids=["col_multi", "col_index"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_stack(data, is_multi_idx, is_multi_col): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if is_multi_idx: + if len(pandas_df.index) == 256: + index = pd.MultiIndex.from_product( + [ + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_df.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + else: + index = pandas_df.index + + if is_multi_col: + if len(pandas_df.columns) == 64: + columns = pd.MultiIndex.from_product( + [ + ["A", "B", "C", "D"], + ["xx", "yy", "zz", "LAST"], + [10, 20, 30, 40], + ] + ) + elif len(pandas_df.columns) == 100: + columns = pd.MultiIndex.from_product( + [ + ["xx", "yy", "zz", "LAST"], + ["A", "B", "C", "D", "F"], + ["I", "J", "K", "L", "INDEX"], + ] + ) + else: + columns = pandas_df.columns + + pandas_df.columns = columns + pandas_df.index = 
index + + modin_df.columns = columns + modin_df.index = index + + df_equals(modin_df.stack(), pandas_df.stack()) + + if is_multi_col: + df_equals(modin_df.stack(level=0), pandas_df.stack(level=0)) + df_equals(modin_df.stack(level=[0, 1]), pandas_df.stack(level=[0, 1])) + df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])) + + +def test_style(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).style + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis1", [0, 1]) +@pytest.mark.parametrize("axis2", [0, 1]) +def test_swapaxes(data, axis1, axis2): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + pandas_result = pandas_df.swapaxes(axis1, axis2) + modin_result = modin_df.swapaxes(axis1, axis2) + df_equals(modin_result, pandas_result) + + +def test_swapaxes_axes_names(): + modin_df = pd.DataFrame(test_data_values[0]) + modin_result1 = modin_df.swapaxes(0, 1) + modin_result2 = modin_df.swapaxes("columns", "index") + df_equals(modin_result1, modin_result2) + + +def test_swaplevel(): + data = np.random.randint(1, 100, 12) + modin_df = pd.DataFrame( + data, + index=pd.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + pandas_df = pandas.DataFrame( + data, + index=pandas.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + df_equals( + modin_df.swaplevel("Number", "Color"), + pandas_df.swaplevel("Number", "Color"), + ) + df_equals(modin_df.swaplevel(), pandas_df.swaplevel()) + df_equals(modin_df.swaplevel(0, 1), pandas_df.swaplevel(0, 1)) + + +def test_take(): + modin_df = pd.DataFrame( + [ + ("falcon", "bird", 389.0), + ("parrot", "bird", 24.0), + ("lion", 
"mammal", 80.5), + ("monkey", "mammal", np.nan), + ], + columns=["name", "class", "max_speed"], + index=[0, 2, 3, 1], + ) + pandas_df = pandas.DataFrame( + [ + ("falcon", "bird", 389.0), + ("parrot", "bird", 24.0), + ("lion", "mammal", 80.5), + ("monkey", "mammal", np.nan), + ], + columns=["name", "class", "max_speed"], + index=[0, 2, 3, 1], + ) + df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3])) + df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_records(request, data): + eval_general( + *create_test_dfs(data), + lambda df: df.dropna().to_records(), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_string(data): + eval_general( + *create_test_dfs(data), + lambda df: df.to_string(), + ) + + +def test_to_timestamp(): + idx = pd.date_range("1/1/2012", periods=5, freq="M") + df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) + + with pytest.warns(UserWarning): + df.to_period().to_timestamp() + + +def test_to_xarray(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).to_xarray() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_truncate(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + before = 1 + after = len(modin_df) - 3 + df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) + + before = 1 + after = 3 + df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) + + before = modin_df.columns[1] + after = modin_df.columns[-3] + try: + pandas_result = pandas_df.truncate(before, after, axis=1) + except Exception as e: + with pytest.raises(type(e)): + modin_df.truncate(before, after, axis=1) + else: + modin_result = modin_df.truncate(before, after, axis=1) + df_equals(modin_result, pandas_result) + + before = modin_df.columns[1] + after = 
modin_df.columns[3] + try: + pandas_result = pandas_df.truncate(before, after, axis=1) + except Exception as e: + with pytest.raises(type(e)): + modin_df.truncate(before, after, axis=1) + else: + modin_result = modin_df.truncate(before, after, axis=1) + df_equals(modin_result, pandas_result) + + before = None + after = None + df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) + try: + pandas_result = pandas_df.truncate(before, after, axis=1) + except Exception as e: + with pytest.raises(type(e)): + modin_df.truncate(before, after, axis=1) + else: + modin_result = modin_df.truncate(before, after, axis=1) + df_equals(modin_result, pandas_result) + + +def test_tshift(): + idx = pd.date_range("1/1/2012", periods=5, freq="M") + data = np.random.randint(0, 100, size=(len(idx), 4)) + modin_df = pd.DataFrame(data, index=idx) + pandas_df = pandas.DataFrame(data, index=idx) + df_equals(modin_df.tshift(4), pandas_df.tshift(4)) + + +def test_tz_convert(): + modin_idx = pd.date_range( + "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" + ) + pandas_idx = pandas.date_range( + "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" + ) + data = np.random.randint(0, 100, size=(len(modin_idx), 4)) + modin_df = pd.DataFrame(data, index=modin_idx) + pandas_df = pandas.DataFrame(data, index=pandas_idx) + modin_result = modin_df.tz_convert("UTC", axis=0) + pandas_result = pandas_df.tz_convert("UTC", axis=0) + df_equals(modin_result, pandas_result) + + modin_multi = pd.MultiIndex.from_arrays([modin_idx, range(len(modin_idx))]) + pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, range(len(modin_idx))]) + modin_series = pd.DataFrame(data, index=modin_multi) + pandas_series = pandas.DataFrame(data, index=pandas_multi) + df_equals( + modin_series.tz_convert("UTC", axis=0, level=0), + pandas_series.tz_convert("UTC", axis=0, level=0), + ) + + +def test_tz_localize(): + idx = pd.date_range("1/1/2012", periods=400, freq="2D") + data = 
np.random.randint(0, 100, size=(len(idx), 4)) + modin_df = pd.DataFrame(data, index=idx) + pandas_df = pandas.DataFrame(data, index=idx) + df_equals(modin_df.tz_localize("UTC", axis=0), pandas_df.tz_localize("UTC", axis=0)) + df_equals( + modin_df.tz_localize("America/Los_Angeles", axis=0), + pandas_df.tz_localize("America/Los_Angeles", axis=0), + ) + + +@pytest.mark.parametrize("is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]) +@pytest.mark.parametrize("is_multi_col", [True, False], ids=["col_multi", "col_index"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_unstack(data, is_multi_idx, is_multi_col): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if is_multi_idx: + if len(pandas_df.index) == 256: + index = pd.MultiIndex.from_product( + [ + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_df.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + else: + index = pandas_df.index + + if is_multi_col: + if len(pandas_df.columns) == 64: + columns = pd.MultiIndex.from_product( + [ + ["A", "B", "C", "D"], + ["xx", "yy", "zz", "LAST"], + [10, 20, 30, 40], + ] + ) + elif len(pandas_df.columns) == 100: + columns = pd.MultiIndex.from_product( + [ + ["xx", "yy", "zz", "LAST"], + ["A", "B", "C", "D", "F"], + ["I", "J", "K", "L", "INDEX"], + ] + ) + else: + columns = pandas_df.columns + + pandas_df.columns = columns + pandas_df.index = index + + modin_df.columns = columns + modin_df.index = index + + df_equals(modin_df.unstack(), pandas_df.unstack()) + + if is_multi_idx: + df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) + df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) + df_equals(modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])) + + if len(pandas_df.index) == 256: + 
df_equals( + modin_df.unstack(level=[0, 1, 2, 3]), + pandas_df.unstack(level=[0, 1, 2, 3]), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___array__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + assert_array_equal(modin_df.__array__(), pandas_df.__array__()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___bool__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.__bool__() + except Exception as e: + with pytest.raises(type(e)): + modin_df.__bool__() + else: + modin_result = modin_df.__bool__() + df_equals(modin_result, pandas_result) + + +def test___getstate__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).__getstate__() + + +def test___setstate__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + try: + pd.DataFrame(data).__setstate__(None) + except TypeError: + pass + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_hasattr_sparse(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + try: + pandas_result = hasattr(pandas_df, "sparse") + except Exception as e: + with pytest.raises(type(e)): + hasattr(modin_df, "sparse") + else: + modin_result = hasattr(modin_df, "sparse") + assert modin_result == pandas_result diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py new file mode 100644 index 00000000000..4a669949ddc --- /dev/null +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -0,0 +1,1154 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import numpy as np +import pandas +import pandas.util.testing as tm +import matplotlib +import modin.pandas as pd +import sys + +from modin.pandas.test.utils import ( + RAND_LOW, + RAND_HIGH, + df_equals, + arg_keys, + name_contains, + test_data, + test_data_values, + test_data_keys, + axis_keys, + axis_values, + int_arg_keys, + int_arg_values, + create_test_dfs, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use("Agg") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_first_valid_index(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + assert modin_df.first_valid_index() == (pandas_df.first_valid_index()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) +def test_head(data, n): + # Test normal dataframe head + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.head(n), pandas_df.head(n)) + df_equals(modin_df.head(len(modin_df) + 1), pandas_df.head(len(pandas_df) + 1)) + + # Test head when we call it from a QueryCompilerView + modin_result = modin_df.loc[:, ["col1", "col3", "col3"]].head(n) + pandas_result = pandas_df.loc[:, ["col1", "col3", "col3"]].head(n) + df_equals(modin_result, pandas_result) + + +@pytest.mark.skip(reason="Defaulting to Pandas") +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iat(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(NotImplementedError): + modin_df.iat() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iloc(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if not name_contains(request.node.name, ["empty_data"]): + # Scalar + np.testing.assert_equal(modin_df.iloc[0, 1], pandas_df.iloc[0, 1]) + + # Series + df_equals(modin_df.iloc[0], pandas_df.iloc[0]) + df_equals(modin_df.iloc[1:, 0], pandas_df.iloc[1:, 0]) + df_equals(modin_df.iloc[1:2, 0], pandas_df.iloc[1:2, 0]) + + # DataFrame + df_equals(modin_df.iloc[[1, 2]], pandas_df.iloc[[1, 2]]) + # See issue #80 + # df_equals(modin_df.iloc[[1, 2], [1, 0]], pandas_df.iloc[[1, 2], [1, 0]]) + df_equals(modin_df.iloc[1:2, 0:2], pandas_df.iloc[1:2, 0:2]) + + # Issue #43 + modin_df.iloc[0:3, :] + + # Write Item 
+ modin_df.iloc[[1, 2]] = 42 + pandas_df.iloc[[1, 2]] = 42 + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_df.iloc[0] = modin_df.iloc[1] + pandas_df.iloc[0] = pandas_df.iloc[1] + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_df.iloc[:, 0] = modin_df.iloc[:, 1] + pandas_df.iloc[:, 0] = pandas_df.iloc[:, 1] + df_equals(modin_df, pandas_df) + + # From issue #1775 + df_equals( + modin_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], + pandas_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], + ) + else: + with pytest.raises(IndexError): + modin_df.iloc[0, 1] + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_index(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.index, pandas_df.index) + modin_df_cp = modin_df.copy() + pandas_df_cp = pandas_df.copy() + + modin_df_cp.index = [str(i) for i in modin_df_cp.index] + pandas_df_cp.index = [str(i) for i in pandas_df_cp.index] + df_equals(modin_df_cp.index, pandas_df_cp.index) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_indexing_duplicate_axis(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_df.index = pandas_df.index = [i // 3 for i in range(len(modin_df))] + assert any(modin_df.index.duplicated()) + assert any(pandas_df.index.duplicated()) + + df_equals(modin_df.iloc[0], pandas_df.iloc[0]) + df_equals(modin_df.loc[0], pandas_df.loc[0]) + df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4]) + df_equals( + modin_df.loc[0, modin_df.columns[0:4]], + pandas_df.loc[0, pandas_df.columns[0:4]], + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_keys(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.keys(), pandas_df.keys()) + + 
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_loc(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + # Scaler + df_equals(modin_df.loc[0, key1], pandas_df.loc[0, key1]) + + # Series + df_equals(modin_df.loc[0], pandas_df.loc[0]) + df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1]) + df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1]) + + # DataFrame + df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]]) + + # List-like of booleans + indices = [i % 3 == 0 for i in range(len(modin_df.index))] + columns = [i % 5 == 0 for i in range(len(modin_df.columns))] + modin_result = modin_df.loc[indices, columns] + pandas_result = pandas_df.loc[indices, columns] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[:, columns] + pandas_result = pandas_df.loc[:, columns] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[indices] + pandas_result = pandas_df.loc[indices] + df_equals(modin_result, pandas_result) + + # See issue #80 + # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']]) + df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2]) + + # From issue #421 + df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]]) + df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :]) + + # From issue #1023 + key1 = modin_df.columns[0] + key2 = modin_df.columns[-2] + df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2]) + + # Write Item + modin_df_copy = modin_df.copy() + pandas_df_copy = pandas_df.copy() + modin_df_copy.loc[[1, 2]] = 42 + pandas_df_copy.loc[[1, 2]] = 42 + df_equals(modin_df_copy, pandas_df_copy) + + # From issue #1775 + df_equals( + modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], + ) + + # From issue #1374 + with pytest.raises(KeyError): 
+ modin_df.loc["NO_EXIST"] + + +def test_loc_multi_index(): + modin_df = pd.read_csv( + "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0 + ) + pandas_df = pandas.read_csv( + "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0 + ) + + df_equals(modin_df.loc[1], pandas_df.loc[1]) + df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"]) + df_equals( + modin_df.loc[1, ("Presidents", "Pure mentions")], + pandas_df.loc[1, ("Presidents", "Pure mentions")], + ) + assert ( + modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] + == pandas_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] + ) + df_equals(modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2), "Presidents"]) + + tuples = [ + ("bar", "one"), + ("bar", "two"), + ("bar", "three"), + ("bar", "four"), + ("baz", "one"), + ("baz", "two"), + ("baz", "three"), + ("baz", "four"), + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("foo", "four"), + ("qux", "one"), + ("qux", "two"), + ("qux", "three"), + ("qux", "four"), + ] + + modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"]) + frame_data = np.random.randint(0, 100, size=(16, 100)) + modin_df = pd.DataFrame( + frame_data, + index=modin_index, + columns=["col{}".format(i) for i in range(100)], + ) + pandas_df = pandas.DataFrame( + frame_data, + index=pandas_index, + columns=["col{}".format(i) for i in range(100)], + ) + df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"]) + assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"] + df_equals( + modin_df.loc["bar", ("col1", "col2")], + pandas_df.loc["bar", ("col1", "col2")], + ) + + # From issue #1456 + transposed_modin = modin_df.T + transposed_pandas = pandas_df.T + df_equals( + transposed_modin.loc[transposed_modin.index[:-2], :], + transposed_pandas.loc[transposed_pandas.index[:-2], :], + ) 
+ + # From issue #1610 + df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index]) + df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]]) + + +@pytest.mark.parametrize("index", [["row1", "row2", "row3"], ["row1"]]) +@pytest.mark.parametrize("columns", [["col1", "col2"], ["col1"]]) +def test_loc_assignment(index, columns): + md_df, pd_df = create_test_dfs(index=index, columns=columns) + for i, ind in enumerate(index): + for j, col in enumerate(columns): + value_to_assign = int(str(i) + str(j)) + md_df.loc[ind][col] = value_to_assign + pd_df.loc[ind][col] = value_to_assign + df_equals(md_df, pd_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_loc_nested_assignment(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + + modin_df[key1].loc[0] = 500 + pandas_df[key1].loc[0] = 500 + df_equals(modin_df, pandas_df) + + modin_df[key2].loc[0] = None + pandas_df[key2].loc[0] = None + df_equals(modin_df, pandas_df) + + +def test_iloc_assignment(): + modin_df = pd.DataFrame(index=["row1", "row2", "row3"], columns=["col1", "col2"]) + pandas_df = pandas.DataFrame( + index=["row1", "row2", "row3"], columns=["col1", "col2"] + ) + modin_df.iloc[0]["col1"] = 11 + modin_df.iloc[1]["col1"] = 21 + modin_df.iloc[2]["col1"] = 31 + modin_df.iloc[0]["col2"] = 12 + modin_df.iloc[1]["col2"] = 22 + modin_df.iloc[2]["col2"] = 32 + pandas_df.iloc[0]["col1"] = 11 + pandas_df.iloc[1]["col1"] = 21 + pandas_df.iloc[2]["col1"] = 31 + pandas_df.iloc[0]["col2"] = 12 + pandas_df.iloc[1]["col2"] = 22 + pandas_df.iloc[2]["col2"] = 32 + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iloc_nested_assignment(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + key1 = modin_df.columns[0] + key2 = modin_df.columns[1] + + modin_df[key1].iloc[0] = 500 + 
pandas_df[key1].iloc[0] = 500 + df_equals(modin_df, pandas_df) + + modin_df[key2].iloc[0] = None + pandas_df[key2].iloc[0] = None + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_pop(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if "empty_data" not in request.node.name: + key = modin_df.columns[0] + temp_modin_df = modin_df.copy() + temp_pandas_df = pandas_df.copy() + modin_popped = temp_modin_df.pop(key) + pandas_popped = temp_pandas_df.pop(key) + df_equals(modin_popped, pandas_popped) + df_equals(temp_modin_df, temp_pandas_df) + + +def test_reindex(): + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + df_equals(modin_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])) + df_equals(modin_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2])) + df_equals( + modin_df.reindex(["col1", "col3", "col4", "col2"], axis=1), + pandas_df.reindex(["col1", "col3", "col4", "col2"], axis=1), + ) + df_equals( + modin_df.reindex(["col1", "col7", "col4", "col8"], axis=1), + pandas_df.reindex(["col1", "col7", "col4", "col8"], axis=1), + ) + df_equals( + modin_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]), + pandas_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]), + ) + df_equals( + modin_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0), + pandas_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0), + ) + + +def test_reindex_like(): + df1 = pd.DataFrame( + [ + [24.3, 75.7, "high"], + [31, 87.8, "high"], + [22, 71.6, "medium"], + [35, 95, "medium"], + ], + columns=["temp_celsius", "temp_fahrenheit", "windspeed"], + index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), + ) + df2 = pd.DataFrame( + [[28, "low"], [30, "low"], 
[35.1, "medium"]], + columns=["temp_celsius", "windspeed"], + index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]), + ) + with pytest.warns(UserWarning): + df2.reindex_like(df1) + + +def test_rename_sanity(): + test_data = pandas.DataFrame(tm.getSeriesData()) + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} + + modin_df = pd.DataFrame(test_data) + df_equals(modin_df.rename(columns=mapping), test_data.rename(columns=mapping)) + + renamed2 = test_data.rename(columns=str.lower) + df_equals(modin_df.rename(columns=str.lower), renamed2) + + modin_df = pd.DataFrame(renamed2) + df_equals(modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper)) + + # index + data = {"A": {"foo": 0, "bar": 1}} + + # gets sorted alphabetical + df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + tm.assert_index_equal( + modin_df.rename(index={"foo": "bar", "bar": "foo"}).index, + df.rename(index={"foo": "bar", "bar": "foo"}).index, + ) + + tm.assert_index_equal( + modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index + ) + + # Using the `mapper` functionality with `axis` + tm.assert_index_equal( + modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index + ) + tm.assert_index_equal( + modin_df.rename(str.upper, axis=1).columns, + df.rename(str.upper, axis=1).columns, + ) + + # have to pass something + with pytest.raises(TypeError): + modin_df.rename() + + # partial columns + renamed = test_data.rename(columns={"C": "foo", "D": "bar"}) + modin_df = pd.DataFrame(test_data) + tm.assert_index_equal( + modin_df.rename(columns={"C": "foo", "D": "bar"}).index, + test_data.rename(columns={"C": "foo", "D": "bar"}).index, + ) + + # other axis + renamed = test_data.T.rename(index={"C": "foo", "D": "bar"}) + tm.assert_index_equal( + test_data.T.rename(index={"C": "foo", "D": "bar"}).index, + modin_df.T.rename(index={"C": "foo", "D": "bar"}).index, + ) + + # index with name + index = pandas.Index(["foo", "bar"], name="name") 
+ renamer = pandas.DataFrame(data, index=index) + modin_df = pd.DataFrame(data, index=index) + + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, modin_renamed.index) + + assert renamed.index.name == modin_renamed.index.name + + +def test_rename_multiindex(): + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) + + frame_data = [(0, 0), (1, 1)] + df = pandas.DataFrame(frame_data, index=index, columns=columns) + modin_df = pd.DataFrame(frame_data, index=index, columns=columns) + + # + # without specifying level -> accross all levels + renamed = df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + modin_renamed = modin_df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + tm.assert_index_equal(renamed.index, modin_renamed.index) + + renamed = df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + assert renamed.index.names == modin_renamed.index.names + assert renamed.columns.names == modin_renamed.columns.names + + # + # with specifying a level + + # dict + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) + modin_renamed = modin_df.rename( + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0 + ) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") + modin_renamed = modin_df.rename( + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz" + ) + tm.assert_index_equal(renamed.columns, 
modin_renamed.columns) + + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) + modin_renamed = modin_df.rename( + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1 + ) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") + modin_renamed = modin_df.rename( + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz" + ) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + + # function + func = str.upper + renamed = df.rename(columns=func, level=0) + modin_renamed = modin_df.rename(columns=func, level=0) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + renamed = df.rename(columns=func, level="fizz") + modin_renamed = modin_df.rename(columns=func, level="fizz") + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + + renamed = df.rename(columns=func, level=1) + modin_renamed = modin_df.rename(columns=func, level=1) + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + renamed = df.rename(columns=func, level="buzz") + modin_renamed = modin_df.rename(columns=func, level="buzz") + tm.assert_index_equal(renamed.columns, modin_renamed.columns) + + # index + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) + modin_renamed = modin_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) + tm.assert_index_equal(modin_renamed.index, renamed.index) + + +@pytest.mark.skip(reason="Pandas does not pass this test") +def test_rename_nocopy(): + test_data = pandas.DataFrame(tm.getSeriesData()) + modin_df = pd.DataFrame(test_data) + modin_renamed = modin_df.rename(columns={"C": "foo"}, copy=False) + modin_renamed["foo"] = 1 + assert (modin_df["C"] == 1).all() + + +def test_rename_inplace(): + test_data = pandas.DataFrame(tm.getSeriesData()) + modin_df = pd.DataFrame(test_data) + + df_equals( + modin_df.rename(columns={"C": "foo"}), + test_data.rename(columns={"C": "foo"}), + ) + + frame = 
test_data.copy() + modin_frame = modin_df.copy() + frame.rename(columns={"C": "foo"}, inplace=True) + modin_frame.rename(columns={"C": "foo"}, inplace=True) + + df_equals(modin_frame, frame) + + +def test_rename_bug(): + # rename set ref_locs, and set_index was not resetting + frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) + df = df.set_index(["a", "b"]) + df.columns = ["2001-01-01"] + + modin_df = modin_df.rename(columns={0: "a"}) + modin_df = modin_df.rename(columns={1: "b"}) + modin_df = modin_df.set_index(["a", "b"]) + modin_df.columns = ["2001-01-01"] + + df_equals(modin_df, df) + + +def test_rename_axis(): + data = {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]} + index = ["dog", "cat", "monkey"] + modin_df = pd.DataFrame(data, index) + pandas_df = pandas.DataFrame(data, index) + df_equals(modin_df.rename_axis("animal"), pandas_df.rename_axis("animal")) + df_equals( + modin_df.rename_axis("limbs", axis="columns"), + pandas_df.rename_axis("limbs", axis="columns"), + ) + + modin_df.rename_axis("limbs", axis="columns", inplace=True) + pandas_df.rename_axis("limbs", axis="columns", inplace=True) + df_equals(modin_df, pandas_df) + + new_index = pd.MultiIndex.from_product( + [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"] + ) + modin_df.index = new_index + pandas_df.index = new_index + + df_equals( + modin_df.rename_axis(index={"type": "class"}), + pandas_df.rename_axis(index={"type": "class"}), + ) + df_equals( + modin_df.rename_axis(columns=str.upper), + pandas_df.rename_axis(columns=str.upper), + ) + df_equals( + modin_df.rename_axis(columns=[str.upper(o) for o in modin_df.columns.names]), + pandas_df.rename_axis(columns=[str.upper(o) for o in pandas_df.columns.names]), + ) + + with pytest.raises(ValueError): + df_equals( + modin_df.rename_axis(str.upper, axis=1), + 
pandas_df.rename_axis(str.upper, axis=1), + ) + + +def test_rename_axis_inplace(): + test_frame = pandas.DataFrame(tm.getSeriesData()) + modin_df = pd.DataFrame(test_frame) + + result = test_frame.copy() + modin_result = modin_df.copy() + no_return = result.rename_axis("foo", inplace=True) + modin_no_return = modin_result.rename_axis("foo", inplace=True) + + assert no_return is modin_no_return + df_equals(modin_result, result) + + result = test_frame.copy() + modin_result = modin_df.copy() + no_return = result.rename_axis("bar", axis=1, inplace=True) + modin_no_return = modin_result.rename_axis("bar", axis=1, inplace=True) + + assert no_return is modin_no_return + df_equals(modin_result, result) + + +def test_reorder_levels(): + data = np.random.randint(1, 100, 12) + modin_df = pd.DataFrame( + data, + index=pd.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + pandas_df = pandas.DataFrame( + data, + index=pandas.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + df_equals( + modin_df.reorder_levels(["Letter", "Color", "Number"]), + pandas_df.reorder_levels(["Letter", "Color", "Number"]), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_reset_index(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_result = modin_df.reset_index(inplace=False) + pandas_result = pandas_df.reset_index(inplace=False) + df_equals(modin_result, pandas_result) + + modin_df_cp = modin_df.copy() + pd_df_cp = pandas_df.copy() + modin_df_cp.reset_index(inplace=True) + pd_df_cp.reset_index(inplace=True) + df_equals(modin_df_cp, pd_df_cp) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
+@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_sample(data, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + with pytest.raises(ValueError): + modin_df.sample(n=3, frac=0.4, axis=axis) + + with pytest.raises(KeyError): + modin_df.sample(frac=0.5, weights="CoLuMn_No_ExIsT", axis=0) + + with pytest.raises(ValueError): + modin_df.sample(frac=0.5, weights=modin_df.columns[0], axis=1) + + with pytest.raises(ValueError): + modin_df.sample( + frac=0.5, weights=[0.5 for _ in range(len(modin_df.index[:-1]))], axis=0 + ) + + with pytest.raises(ValueError): + modin_df.sample( + frac=0.5, + weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], + axis=1, + ) + + with pytest.raises(ValueError): + modin_df.sample(n=-3, axis=axis) + + with pytest.raises(ValueError): + modin_df.sample(frac=0.2, weights=pandas.Series(), axis=axis) + + if isinstance(axis, str): + num_axis = pandas.DataFrame()._get_axis_number(axis) + else: + num_axis = axis + + # weights that sum to 1 + sums = sum(i % 2 for i in range(len(modin_df.axes[num_axis]))) + weights = [i % 2 / sums for i in range(len(modin_df.axes[num_axis]))] + + modin_result = modin_df.sample( + frac=0.5, random_state=42, weights=weights, axis=axis + ) + pandas_result = pandas_df.sample( + frac=0.5, random_state=42, weights=weights, axis=axis + ) + df_equals(modin_result, pandas_result) + + # weights that don't sum to 1 + weights = [i % 2 for i in range(len(modin_df.axes[num_axis]))] + modin_result = modin_df.sample( + frac=0.5, random_state=42, weights=weights, axis=axis + ) + pandas_result = pandas_df.sample( + frac=0.5, random_state=42, weights=weights, axis=axis + ) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.sample(n=0, axis=axis) + pandas_result = pandas_df.sample(n=0, axis=axis) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.sample(frac=0.5, random_state=42, axis=axis) + pandas_result = pandas_df.sample(frac=0.5, 
random_state=42, axis=axis) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.sample(n=2, random_state=42, axis=axis) + pandas_result = pandas_df.sample(n=2, random_state=42, axis=axis) + df_equals(modin_result, pandas_result) + + # issue #1692, numpy RandomState object + # We must create a new random state for each iteration because the values that + # are selected will be impacted if the object has already been used. + random_state = np.random.RandomState(42) + modin_result = modin_df.sample(frac=0.5, random_state=random_state, axis=axis) + + random_state = np.random.RandomState(42) + pandas_result = pandas_df.sample(frac=0.5, random_state=random_state, axis=axis) + df_equals(modin_result, pandas_result) + + +def test_select_dtypes(): + frame_data = { + "test1": list("abc"), + "test2": np.arange(3, 6).astype("u1"), + "test3": np.arange(8.0, 11.0, dtype="float64"), + "test4": [True, False, True], + "test5": pandas.date_range("now", periods=3).values, + "test6": list(range(5, 8)), + } + df = pandas.DataFrame(frame_data) + rd = pd.DataFrame(frame_data) + + include = np.float, "integer" + exclude = (np.bool_,) + r = rd.select_dtypes(include=include, exclude=exclude) + + e = df[["test2", "test3", "test6"]] + df_equals(r, e) + + r = rd.select_dtypes(include=np.bool_) + e = df[["test4"]] + df_equals(r, e) + + r = rd.select_dtypes(exclude=np.bool_) + e = df[["test1", "test2", "test3", "test5", "test6"]] + df_equals(r, e) + + try: + pd.DataFrame().select_dtypes() + assert False + except ValueError: + assert True + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) +def test_tail(data, n): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.tail(n), pandas_df.tail(n)) + df_equals(modin_df.tail(len(modin_df)), pandas_df.tail(len(pandas_df))) + + +def test_xs(): + d = { + "num_legs": [4, 4, 2, 2], + "num_wings": [0, 
0, 2, 2], + "class": ["mammal", "mammal", "mammal", "bird"], + "animal": ["cat", "dog", "bat", "penguin"], + "locomotion": ["walks", "walks", "flies", "walks"], + } + df = pd.DataFrame(data=d) + df = df.set_index(["class", "animal", "locomotion"]) + with pytest.warns(UserWarning): + df.xs("mammal") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___getitem__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + key = modin_df.columns[0] + modin_col = modin_df.__getitem__(key) + assert isinstance(modin_col, pd.Series) + + pd_col = pandas_df[key] + df_equals(pd_col, modin_col) + + slices = [ + (None, -1), + (-1, None), + (1, 2), + (1, None), + (None, 1), + (1, -1), + (-3, -1), + (1, -1, 2), + ] + + # slice test + for slice_param in slices: + s = slice(*slice_param) + df_equals(modin_df[s], pandas_df[s]) + + # Test empty + df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10]) + + +def test_getitem_empty_mask(): + # modin-project/modin#517 + modin_frames = [] + pandas_frames = [] + data1 = np.random.randint(0, 100, size=(100, 4)) + mdf1 = pd.DataFrame(data1, columns=list("ABCD")) + pdf1 = pandas.DataFrame(data1, columns=list("ABCD")) + modin_frames.append(mdf1) + pandas_frames.append(pdf1) + + data2 = np.random.randint(0, 100, size=(100, 4)) + mdf2 = pd.DataFrame(data2, columns=list("ABCD")) + pdf2 = pandas.DataFrame(data2, columns=list("ABCD")) + modin_frames.append(mdf2) + pandas_frames.append(pdf2) + + data3 = np.random.randint(0, 100, size=(100, 4)) + mdf3 = pd.DataFrame(data3, columns=list("ABCD")) + pdf3 = pandas.DataFrame(data3, columns=list("ABCD")) + modin_frames.append(mdf3) + pandas_frames.append(pdf3) + + modin_data = pd.concat(modin_frames) + pandas_data = pandas.concat(pandas_frames) + df_equals( + modin_data[[False for _ in modin_data.index]], + pandas_data[[False for _ in modin_data.index]], + ) + + +def test_getitem_datetime_slice(): + data = {"data": range(1000)} + index = 
pd.date_range("2017/1/4", periods=1000) + modin_df = pd.DataFrame(data=data, index=index) + pandas_df = pandas.DataFrame(data=data, index=index) + + s = slice("2017-01-06", "2017-01-09") + df_equals(modin_df[s], pandas_df[s]) + + +def test_getitem_same_name(): + data = [ + [1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12], + [13, 14, 15, 16], + [17, 18, 19, 20], + ] + columns = ["c1", "c2", "c1", "c3"] + modin_df = pd.DataFrame(data, columns=columns) + pandas_df = pandas.DataFrame(data, columns=columns) + df_equals(modin_df["c1"], pandas_df["c1"]) + df_equals(modin_df["c2"], pandas_df["c2"]) + df_equals(modin_df[["c1", "c2"]], pandas_df[["c1", "c2"]]) + df_equals(modin_df["c3"], pandas_df["c3"]) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___getattr__(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + if "empty_data" not in request.node.name: + key = modin_df.columns[0] + col = modin_df.__getattr__(key) + + col = modin_df.__getattr__("col1") + assert isinstance(col, pd.Series) + + col = getattr(modin_df, "col1") + assert isinstance(col, pd.Series) + + # Check that lookup in column doesn't override other attributes + df2 = modin_df.rename(index=str, columns={key: "columns"}) + assert isinstance(df2.columns, pandas.Index) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___setitem__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_df.__setitem__(modin_df.columns[-1], 1) + pandas_df.__setitem__(pandas_df.columns[-1], 1) + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_df[modin_df.columns[-1]] = pd.DataFrame(modin_df[modin_df.columns[0]]) + pandas_df[pandas_df.columns[-1]] = pandas.DataFrame(pandas_df[pandas_df.columns[0]]) + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + rows = 
len(modin_df) + arr = np.arange(rows * 2).reshape(-1, 2) + modin_df[modin_df.columns[-1]] = arr + pandas_df[pandas_df.columns[-1]] = arr + df_equals(pandas_df, modin_df) + + with pytest.raises(ValueError, match=r"Wrong number of items passed"): + modin_df["___NON EXISTENT COLUMN"] = arr + + modin_df[modin_df.columns[0]] = np.arange(len(modin_df)) + pandas_df[pandas_df.columns[0]] = np.arange(len(pandas_df)) + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(columns=modin_df.columns) + pandas_df = pandas.DataFrame(columns=pandas_df.columns) + + for col in modin_df.columns: + modin_df[col] = np.arange(1000) + + for col in pandas_df.columns: + pandas_df[col] = np.arange(1000) + + df_equals(modin_df, pandas_df) + + # Test series assignment to column + modin_df = pd.DataFrame(columns=modin_df.columns) + pandas_df = pandas.DataFrame(columns=pandas_df.columns) + modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]] + pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]] + df_equals(modin_df, pandas_df) + + if not sys.version_info.major == 3 and sys.version_info.minor > 6: + # This test doesn't work correctly on Python 3.6 + # Test 2d ndarray assignment to column + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values + pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values + df_equals(modin_df, pandas_df) + assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0])) + + # Transpose test + modin_df = pd.DataFrame(data).T + pandas_df = pandas.DataFrame(data).T + + # We default to pandas on non-string column names + if not all(isinstance(c, str) for c in modin_df.columns): + with pytest.warns(UserWarning): + modin_df[modin_df.columns[0]] = 0 + else: + modin_df[modin_df.columns[0]] = 0 + + pandas_df[pandas_df.columns[0]] = 0 + + df_equals(modin_df, pandas_df) + + modin_df.columns = [str(i) for i in modin_df.columns] + pandas_df.columns = [str(i) 
for i in pandas_df.columns] + + modin_df[modin_df.columns[0]] = 0 + pandas_df[pandas_df.columns[0]] = 0 + + df_equals(modin_df, pandas_df) + + modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345 + pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345 + + df_equals(modin_df, pandas_df) + + +def test___setitem__mask(): + # DataFrame mask: + data = test_data["int_data"] + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + mean = int((RAND_HIGH + RAND_LOW) / 2) + pandas_df[pandas_df > mean] = -50 + modin_df[modin_df > mean] = -50 + + df_equals(modin_df, pandas_df) + + # Array mask: + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + array = (pandas_df > mean).to_numpy() + + modin_df[array] = -50 + pandas_df[array] = -50 + + df_equals(modin_df, pandas_df) + + # Array mask of wrong size: + with pytest.raises(ValueError): + array = np.array([[1, 2], [3, 4]]) + modin_df[array] = 20 + + +@pytest.mark.parametrize( + "data", + [ + {}, + pytest.param( + {"id": [], "max_speed": [], "health": []}, + marks=pytest.mark.xfail( + reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now" + ), + ), + ], + ids=["empty", "empty_columns"], +) +@pytest.mark.parametrize( + "value", + [np.array(["one", "two"]), [11, 22]], + ids=["ndarray", "list"], +) +@pytest.mark.parametrize("convert_to_series", [False, True]) +@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) +def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value + modin_df[new_col_id] = pd.Series(value) if convert_to_series else value + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___len__(data): + modin_df = pd.DataFrame(data) + 
pandas_df = pandas.DataFrame(data) + + assert len(modin_df) == len(pandas_df) + + +def test_index_order(): + # see #1708 and #1869 for details + df_modin, df_pandas = ( + pd.DataFrame(test_data["float_nan_data"]), + pandas.DataFrame(test_data["float_nan_data"]), + ) + rows_number = len(df_modin.index) + level_0 = np.random.choice([x for x in range(10)], rows_number) + level_1 = np.random.choice([x for x in range(10)], rows_number) + index = pandas.MultiIndex.from_arrays([level_0, level_1]) + + df_modin.index = index + df_pandas.index = index + + for func in ["all", "any", "mad", "count"]: + df_equals( + getattr(df_modin, func)(level=0).index, + getattr(df_pandas, func)(level=0).index, + ) diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py new file mode 100644 index 00000000000..b1372396407 --- /dev/null +++ b/modin/pandas/test/dataframe/test_iter.py @@ -0,0 +1,389 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import pytest + +import numpy as np +import pandas +import matplotlib +import modin.pandas as pd +import io + +from modin.pandas.test.utils import ( + random_state, + RAND_LOW, + RAND_HIGH, + df_equals, + test_data_values, + test_data_keys, + create_test_dfs, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +def test_items(): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_items = modin_df.items() + pandas_items = pandas_df.items() + for modin_item, pandas_item in zip(modin_items, pandas_items): + modin_index, modin_series = modin_item + pandas_index, pandas_series = pandas_item + df_equals(pandas_series, modin_series) + assert pandas_index == modin_index + + +def test_iteritems(): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_items = modin_df.iteritems() + pandas_items = pandas_df.iteritems() + for modin_item, pandas_item in zip(modin_items, pandas_items): + modin_index, modin_series = modin_item + pandas_index, pandas_series = pandas_item + df_equals(pandas_series, modin_series) + assert pandas_index == modin_index + + +def test_iterrows(): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_iterrows = modin_df.iterrows() + pandas_iterrows = pandas_df.iterrows() + for modin_row, pandas_row in zip(modin_iterrows, pandas_iterrows): + modin_index, modin_series = modin_row + pandas_index, pandas_series = pandas_row + df_equals(pandas_series, modin_series) + assert pandas_index == modin_index + + +@pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) +@pytest.mark.parametrize("index", [True, False]) +def test_itertuples(name, index): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_it_custom = modin_df.itertuples(index=index, name=name) + 
pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) + + mi_index_modin = pd.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(modin_df.columns))] + ) + mi_index_pandas = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in range(len(pandas_df.columns))] + ) + modin_df.columns = mi_index_modin + pandas_df.columns = mi_index_pandas + modin_it_custom = modin_df.itertuples(index=index, name=name) + pandas_it_custom = pandas_df.itertuples(index=index, name=name) + for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): + np.testing.assert_equal(modin_row, pandas_row) + + +def test___iter__(): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_iterator = modin_df.__iter__() + + # Check that modin_iterator implements the iterator interface + assert hasattr(modin_iterator, "__iter__") + assert hasattr(modin_iterator, "next") or hasattr(modin_iterator, "__next__") + + pd_iterator = pandas_df.__iter__() + assert list(modin_iterator) == list(pd_iterator) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___contains__(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + result = False + key = "Not Exist" + assert result == modin_df.__contains__(key) + assert result == (key in modin_df) + + if "empty_data" not in request.node.name: + result = True + key = pandas_df.columns[0] + assert result == modin_df.__contains__(key) + assert result == (key in modin_df) + + +def test__options_display(): + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + pandas.options.display.max_rows = 10 + pandas.options.display.max_columns = 10 + x = repr(pandas_df) + pd.options.display.max_rows = 5 + 
pd.options.display.max_columns = 5 + y = repr(modin_df) + assert x != y + pd.options.display.max_rows = 10 + pd.options.display.max_columns = 10 + y = repr(modin_df) + assert x == y + + # test for old fixed max values + pandas.options.display.max_rows = 75 + pandas.options.display.max_columns = 75 + x = repr(pandas_df) + pd.options.display.max_rows = 75 + pd.options.display.max_columns = 75 + y = repr(modin_df) + assert x == y + + +def test___finalize__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).__finalize__(None) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___copy__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_df_copy, pandas_df_copy = modin_df.__copy__(), pandas_df.__copy__() + df_equals(modin_df_copy, pandas_df_copy) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___deepcopy__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_df_copy, pandas_df_copy = ( + modin_df.__deepcopy__(), + pandas_df.__deepcopy__(), + ) + df_equals(modin_df_copy, pandas_df_copy) + + +def test___repr__(): + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + assert repr(pandas_df) == repr(modin_df) + + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + assert repr(pandas_df) == repr(modin_df) + + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + assert repr(pandas_df) == repr(modin_df) + + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + assert repr(pandas_df) == repr(modin_df) + 
+ # ___repr___ method has a different code path depending on + # whether the number of rows is >60; and a different code path + # depending on the number of columns is >20. + # Previous test cases already check the case when cols>20 + # and rows>60. The cases that follow exercise the other three + # combinations. + # rows <= 60, cols > 20 + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + assert repr(pandas_df) == repr(modin_df) + + # rows <= 60, cols <= 20 + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + assert repr(pandas_df) == repr(modin_df) + + # rows > 60, cols <= 20 + frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10)) + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + assert repr(pandas_df) == repr(modin_df) + + # Empty + pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)]) + modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)]) + + assert repr(pandas_df) == repr(modin_df) + + # From Issue #1705 + string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf" +"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6 +"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 +"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 +"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" + pandas_df = 
pandas.read_csv(io.StringIO(string_data)) + modin_df = pd.read_csv(io.StringIO(string_data)) + assert repr(pandas_df) == repr(modin_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_reset_index_with_multi_index(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if len(modin_df.columns) > len(pandas_df.columns): + col0 = modin_df.columns[0] + col1 = modin_df.columns[1] + modin_cols = modin_df.groupby([col0, col1]).count().reset_index().columns + pandas_cols = pandas_df.groupby([col0, col1]).count().reset_index().columns + + assert modin_cols.equals(pandas_cols) + + +def test_reset_index_with_named_index(): + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + + modin_df.index.name = pandas_df.index.name = "NAME_OF_INDEX" + df_equals(modin_df, pandas_df) + df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False)) + + modin_df.reset_index(drop=True, inplace=True) + pandas_df.reset_index(drop=True, inplace=True) + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(test_data_values[0]) + pandas_df = pandas.DataFrame(test_data_values[0]) + modin_df.index.name = pandas_df.index.name = "NEW_NAME" + df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_inplace_series_ops(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if len(modin_df.columns) > len(pandas_df.columns): + col0 = modin_df.columns[0] + col1 = modin_df.columns[1] + pandas_df[col1].dropna(inplace=True) + modin_df[col1].dropna(inplace=True) + df_equals(modin_df, pandas_df) + + pandas_df[col0].fillna(0, inplace=True) + modin_df[col0].fillna(0, inplace=True) + df_equals(modin_df, pandas_df) + + +def test___setattr__(): + pandas_df = pandas.DataFrame([1, 2, 3]) + modin_df = pd.DataFrame([1, 2, 3]) + + pandas_df.new_col = [4, 5, 6] + 
modin_df.new_col = [4, 5, 6] + + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_isin(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + val = [1, 2, 3, 4] + pandas_result = pandas_df.isin(val) + modin_result = modin_df.isin(val) + + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_constructor(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + df_equals(pandas_df, modin_df) + + pandas_df = pandas.DataFrame({k: pandas.Series(v) for k, v in data.items()}) + modin_df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()}) + df_equals(pandas_df, modin_df) + + +@pytest.mark.parametrize( + "data", + [ + np.arange(1, 10000, dtype=np.float32), + [ + pd.Series([1, 2, 3], dtype="int32"), + pandas.Series([4, 5, 6], dtype="int64"), + np.array([7, 8, 9], dtype=np.float32), + ], + pandas.Categorical([1, 2, 3, 4, 5]), + ], +) +def test_constructor_dtypes(data): + md_df, pd_df = create_test_dfs(data) + df_equals(md_df, pd_df) + + +def test_constructor_columns_and_index(): + modin_df = pd.DataFrame( + [[1, 1, 10], [2, 4, 20], [3, 7, 30]], + index=[1, 2, 3], + columns=["id", "max_speed", "health"], + ) + pandas_df = pandas.DataFrame( + [[1, 1, 10], [2, 4, 20], [3, 7, 30]], + index=[1, 2, 3], + columns=["id", "max_speed", "health"], + ) + df_equals(modin_df, pandas_df) + df_equals(pd.DataFrame(modin_df), pandas.DataFrame(pandas_df)) + df_equals( + pd.DataFrame(modin_df, columns=["max_speed", "health"]), + pandas.DataFrame(pandas_df, columns=["max_speed", "health"]), + ) + df_equals( + pd.DataFrame(modin_df, index=[1, 2]), + pandas.DataFrame(pandas_df, index=[1, 2]), + ) + df_equals( + pd.DataFrame(modin_df, index=[1, 2], columns=["health"]), + pandas.DataFrame(pandas_df, index=[1, 2], columns=["health"]), + ) + df_equals( + pd.DataFrame(modin_df.iloc[:, 0], index=[1, 2, 3]), + 
pandas.DataFrame(pandas_df.iloc[:, 0], index=[1, 2, 3]), + ) + df_equals( + pd.DataFrame(modin_df.iloc[:, 0], columns=["NO_EXIST"]), + pandas.DataFrame(pandas_df.iloc[:, 0], columns=["NO_EXIST"]), + ) + with pytest.raises(NotImplementedError): + pd.DataFrame(modin_df, index=[1, 2, 99999]) + with pytest.raises(NotImplementedError): + pd.DataFrame(modin_df, columns=["NO_EXIST"]) diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py new file mode 100644 index 00000000000..b7232de38bb --- /dev/null +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -0,0 +1,524 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import numpy as np +import pandas +import matplotlib +import modin.pandas as pd +from modin.pandas.utils import to_pandas + +from modin.pandas.test.utils import ( + random_state, + df_equals, + arg_keys, + name_contains, + test_data_values, + test_data_keys, + numeric_dfs, + axis_keys, + axis_values, + bool_arg_keys, + bool_arg_values, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use("Agg") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_combine(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + modin_df.combine(modin_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2) + pandas_df.combine( + pandas_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 + ) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ], +) +def test_join(test_data, test_data2): + modin_df = pd.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + ) + pandas_df = pandas.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pandas.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + ) + modin_df2 = pd.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + ) + pandas_df2 = pandas.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pandas.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + ) + + hows = ["inner", "left", "right", "outer"] + ons = ["col33", "col34"] + sorts = [False, True] + for i in range(4): + for j in range(2): + modin_result = modin_df.join( + modin_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + 
pandas_result = pandas_df.join( + pandas_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + df_equals(modin_result, pandas_result) + + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6], + } + + modin_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) + + frame_data2 = {"col5": [0], "col6": [1]} + modin_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) + + join_types = ["left", "right", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join(modin_df2, how=how) + pandas_join = pandas_df.join(pandas_df2, how=how) + df_equals(modin_join, pandas_join) + + frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} + + modin_df3 = pd.DataFrame(frame_data3) + pandas_df3 = pandas.DataFrame(frame_data3) + + join_types = ["left", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join([modin_df2, modin_df3], how=how) + pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) + df_equals(modin_join, pandas_join) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + ), + ( + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), + np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), + ), + ], +) +def test_merge(test_data, test_data2): + modin_df = pd.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + ) + pandas_df = pandas.DataFrame( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pandas.Index([i for i in range(1, 
test_data.shape[0] + 1)], name="key"), + ) + modin_df2 = pd.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + ) + pandas_df2 = pandas.DataFrame( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pandas.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + ) + + hows = ["left", "inner"] + ons = ["col33", ["col33", "col34"]] + sorts = [False, True] + for i in range(2): + for j in range(2): + modin_result = modin_df.merge( + modin_df2, how=hows[i], on=ons[j], sort=sorts[j] + ) + pandas_result = pandas_df.merge( + pandas_df2, how=hows[i], on=ons[j], sort=sorts[j] + ) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.merge( + modin_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + pandas_result = pandas_df.merge( + pandas_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + df_equals(modin_result, pandas_result) + + # Test for issue #1771 + modin_df = pd.DataFrame({"name": np.arange(40)}) + modin_df2 = pd.DataFrame({"name": [39], "position": [0]}) + pandas_df = pandas.DataFrame({"name": np.arange(40)}) + pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]}) + modin_result = modin_df.merge(modin_df2, on="name", how="inner") + pandas_result = pandas_df.merge(pandas_df2, on="name", how="inner") + df_equals(modin_result, pandas_result) + + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6], + } + + modin_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) + + frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} + modin_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) + + join_types = ["outer", "inner"] + for how in join_types: + # Defaults + modin_result = modin_df.merge(modin_df2, how=how) + pandas_result = 
pandas_df.merge(pandas_df2, how=how) + df_equals(modin_result, pandas_result) + + # left_on and right_index + modin_result = modin_df.merge( + modin_df2, how=how, left_on="col1", right_index=True + ) + pandas_result = pandas_df.merge( + pandas_df2, how=how, left_on="col1", right_index=True + ) + df_equals(modin_result, pandas_result) + + # left_index and right_on + modin_result = modin_df.merge( + modin_df2, how=how, left_index=True, right_on="col1" + ) + pandas_result = pandas_df.merge( + pandas_df2, how=how, left_index=True, right_on="col1" + ) + df_equals(modin_result, pandas_result) + + # left_on and right_on col1 + modin_result = modin_df.merge( + modin_df2, how=how, left_on="col1", right_on="col1" + ) + pandas_result = pandas_df.merge( + pandas_df2, how=how, left_on="col1", right_on="col1" + ) + df_equals(modin_result, pandas_result) + + # left_on and right_on col2 + modin_result = modin_df.merge( + modin_df2, how=how, left_on="col2", right_on="col2" + ) + pandas_result = pandas_df.merge( + pandas_df2, how=how, left_on="col2", right_on="col2" + ) + df_equals(modin_result, pandas_result) + + # left_index and right_index + modin_result = modin_df.merge( + modin_df2, how=how, left_index=True, right_index=True + ) + pandas_result = pandas_df.merge( + pandas_df2, how=how, left_index=True, right_index=True + ) + df_equals(modin_result, pandas_result) + + # Named Series promoted to DF + s = pd.Series(frame_data2.get("col1")) + with pytest.raises(ValueError): + modin_df.merge(s) + + s = pd.Series(frame_data2.get("col1"), name="col1") + df_equals(modin_df.merge(s), modin_df.merge(modin_df2[["col1"]])) + + with pytest.raises(TypeError): + modin_df.merge("Non-valid type") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) +) +@pytest.mark.parametrize("na_position", ["first", "last"], 
ids=["first", "last"]) +@pytest.mark.parametrize( + "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) +) +def test_sort_index(data, axis, ascending, na_position, sort_remaining): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + # Change index value so sorting will actually make a difference + if axis == "rows" or axis == 0: + length = len(modin_df.index) + modin_df.index = [(i - length / 2) % length for i in range(length)] + pandas_df.index = [(i - length / 2) % length for i in range(length)] + # Add NaNs to sorted index + if axis == "rows" or axis == 0: + length = len(modin_df.index) + modin_df.index = [ + np.nan if i % 2 == 0 else modin_df.index[i] for i in range(length) + ] + pandas_df.index = [ + np.nan if i % 2 == 0 else pandas_df.index[i] for i in range(length) + ] + else: + length = len(modin_df.columns) + modin_df.columns = [ + np.nan if i % 2 == 0 else modin_df.columns[i] for i in range(length) + ] + pandas_df.columns = [ + np.nan if i % 2 == 0 else pandas_df.columns[i] for i in range(length) + ] + + modin_result = modin_df.sort_index( + axis=axis, ascending=ascending, na_position=na_position, inplace=False + ) + pandas_result = pandas_df.sort_index( + axis=axis, ascending=ascending, na_position=na_position, inplace=False + ) + df_equals(modin_result, pandas_result) + + modin_df_cp = modin_df.copy() + pandas_df_cp = pandas_df.copy() + modin_df_cp.sort_index( + axis=axis, ascending=ascending, na_position=na_position, inplace=True + ) + pandas_df_cp.sort_index( + axis=axis, ascending=ascending, na_position=na_position, inplace=True + ) + df_equals(modin_df_cp, pandas_df_cp) + + # MultiIndex + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_df.index = pd.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(modin_df))] + ) + pandas_df.index = pandas.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(pandas_df))] + ) + modin_df.columns = 
pd.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(modin_df.columns))] + ) + pandas_df.columns = pd.MultiIndex.from_tuples( + [(i // 10, i // 5, i) for i in range(len(pandas_df.columns))] + ) + + with pytest.warns(UserWarning): + df_equals(modin_df.sort_index(level=0), pandas_df.sort_index(level=0)) + with pytest.warns(UserWarning): + df_equals(modin_df.sort_index(axis=0), pandas_df.sort_index(axis=0)) + with pytest.warns(UserWarning): + df_equals(modin_df.sort_index(axis=1), pandas_df.sort_index(axis=1)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) +) +@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) +def test_sort_values(request, data, axis, ascending, na_position): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if "empty_data" not in request.node.name and ( + (axis == 0 or axis == "over rows") + or name_contains(request.node.name, numeric_dfs) + ): + index = modin_df.index if axis == 1 or axis == "columns" else modin_df.columns + key = index[0] + modin_result = modin_df.sort_values( + key, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=False, + ) + pandas_result = pandas_df.sort_values( + key, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=False, + ) + df_equals(modin_result, pandas_result) + + modin_df_cp = modin_df.copy() + pandas_df_cp = pandas_df.copy() + modin_df_cp.sort_values( + key, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=True, + ) + pandas_df_cp.sort_values( + key, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=True, + ) + df_equals(modin_df_cp, pandas_df_cp) + + keys = [key, index[-1]] + modin_result = modin_df.sort_values( + keys, + axis=axis, + ascending=ascending, + 
na_position=na_position, + inplace=False, + ) + pandas_result = pandas_df.sort_values( + keys, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=False, + ) + df_equals(modin_result, pandas_result) + + modin_df_cp = modin_df.copy() + pandas_df_cp = pandas_df.copy() + modin_df_cp.sort_values( + keys, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=True, + ) + pandas_df_cp.sort_values( + keys, + axis=axis, + ascending=ascending, + na_position=na_position, + inplace=True, + ) + df_equals(modin_df_cp, pandas_df_cp) + + +def test_sort_values_with_duplicates(): + modin_df = pd.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) + pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) + + key = modin_df.columns[0] + modin_result = modin_df.sort_values(key, inplace=False) + pandas_result = pandas_df.sort_values(key, inplace=False) + df_equals(modin_result, pandas_result) + + modin_df.sort_values(key, inplace=True) + pandas_df.sort_values(key, inplace=True) + df_equals(modin_df, pandas_df) + + +def test_sort_values_with_string_index(): + modin_df = pd.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) + pandas_df = pandas.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) + + key = modin_df.columns[0] + modin_result = modin_df.sort_values(key, inplace=False) + pandas_result = pandas_df.sort_values(key, inplace=False) + df_equals(modin_result, pandas_result) + + modin_df.sort_values(key, inplace=True) + pandas_df.sort_values(key, inplace=True) + df_equals(modin_df, pandas_df) + + +def test_where(): + frame_data = random_state.randn(100, 10) + pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij")) + modin_df = pd.DataFrame(frame_data, columns=list("abcdefghij")) + pandas_cond_df = pandas_df % 5 < 2 + modin_cond_df = modin_df % 5 < 2 + + pandas_result = pandas_df.where(pandas_cond_df, -pandas_df) + modin_result = modin_df.where(modin_cond_df, -modin_df) + assert all((to_pandas(modin_result) 
== pandas_result).all()) + + other = pandas_df.loc[3] + pandas_result = pandas_df.where(pandas_cond_df, other, axis=1) + modin_result = modin_df.where(modin_cond_df, other, axis=1) + assert all((to_pandas(modin_result) == pandas_result).all()) + + other = pandas_df["e"] + pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) + modin_result = modin_df.where(modin_cond_df, other, axis=0) + assert all((to_pandas(modin_result) == pandas_result).all()) + + pandas_result = pandas_df.where(pandas_df < 2, True) + modin_result = modin_df.where(modin_df < 2, True) + assert all((to_pandas(modin_result) == pandas_result).all()) diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py new file mode 100644 index 00000000000..a24a80c92bb --- /dev/null +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -0,0 +1,1307 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import pytest +import numpy as np +import pandas +import pandas.util.testing as tm +import matplotlib +import modin.pandas as pd + +from modin.pandas.test.utils import ( + random_state, + RAND_LOW, + RAND_HIGH, + df_equals, + df_is_empty, + arg_keys, + name_contains, + test_data_values, + test_data_keys, + test_data_with_duplicates_values, + test_data_with_duplicates_keys, + numeric_dfs, + test_func_keys, + test_func_values, + indices_keys, + indices_values, + axis_keys, + axis_values, + bool_arg_keys, + bool_arg_values, + int_arg_keys, + int_arg_values, + eval_general, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +def eval_insert(modin_df, pandas_df, **kwargs): + _kwargs = {"loc": 0, "col": "New column"} + _kwargs.update(kwargs) + + eval_general( + modin_df, + pandas_df, + operation=lambda df, **kwargs: df.insert(**kwargs), + **_kwargs, + ) + + +def test_indexing(): + modin_df = pd.DataFrame( + dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]), index=["a", "b", "c"] + ) + pandas_df = pandas.DataFrame( + dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]), index=["a", "b", "c"] + ) + + modin_result = modin_df + pandas_result = pandas_df + df_equals(modin_result, pandas_result) + + modin_result = modin_df["b"] + pandas_result = pandas_df["b"] + df_equals(modin_result, pandas_result) + + modin_result = modin_df[["b"]] + pandas_result = pandas_df[["b"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df[["b", "a"]] + pandas_result = pandas_df[["b", "a"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc["b"] + pandas_result = pandas_df.loc["b"] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[["b"]] + pandas_result = pandas_df.loc[["b"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[["b", "a"]] + pandas_result = pandas_df.loc[["b", "a"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[["b", 
"a"], ["a", "c"]] + pandas_result = pandas_df.loc[["b", "a"], ["a", "c"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[:, ["a", "c"]] + pandas_result = pandas_df.loc[:, ["a", "c"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[:, ["c"]] + pandas_result = pandas_df.loc[:, ["c"]] + df_equals(modin_result, pandas_result) + + modin_result = modin_df.loc[[]] + pandas_result = pandas_df.loc[[]] + df_equals(modin_result, pandas_result) + + +def test_empty_df(): + df = pd.DataFrame(index=["a", "b"]) + df_is_empty(df) + tm.assert_index_equal(df.index, pd.Index(["a", "b"])) + assert len(df.columns) == 0 + + df = pd.DataFrame(columns=["a", "b"]) + df_is_empty(df) + assert len(df.index) == 0 + tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) + + df = pd.DataFrame() + df_is_empty(df) + assert len(df.index) == 0 + assert len(df.columns) == 0 + + df = pd.DataFrame(index=["a", "b"]) + df_is_empty(df) + tm.assert_index_equal(df.index, pd.Index(["a", "b"])) + assert len(df.columns) == 0 + + df = pd.DataFrame(columns=["a", "b"]) + df_is_empty(df) + assert len(df.index) == 0 + tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) + + df = pd.DataFrame() + df_is_empty(df) + assert len(df.index) == 0 + assert len(df.columns) == 0 + + df = pd.DataFrame() + pd_df = pandas.DataFrame() + df["a"] = [1, 2, 3, 4, 5] + pd_df["a"] = [1, 2, 3, 4, 5] + df_equals(df, pd_df) + + df = pd.DataFrame() + pd_df = pandas.DataFrame() + df["a"] = list("ABCDEF") + pd_df["a"] = list("ABCDEF") + df_equals(df, pd_df) + + df = pd.DataFrame() + pd_df = pandas.DataFrame() + df["a"] = pd.Series([1, 2, 3, 4, 5]) + pd_df["a"] = pandas.Series([1, 2, 3, 4, 5]) + df_equals(df, pd_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_abs(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.abs() + except Exception as e: + with pytest.raises(type(e)): + 
modin_df.abs() + else: + modin_result = modin_df.abs() + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_add_prefix(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + test_prefix = "TEST" + new_modin_df = modin_df.add_prefix(test_prefix) + new_pandas_df = pandas_df.add_prefix(test_prefix) + df_equals(new_modin_df.columns, new_pandas_df.columns) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys) +def test_applymap(request, data, testfunc): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + with pytest.raises(ValueError): + x = 2 + modin_df.applymap(x) + + try: + pandas_result = pandas_df.applymap(testfunc) + except Exception as e: + with pytest.raises(type(e)): + modin_df.applymap(testfunc) + else: + modin_result = modin_df.applymap(testfunc) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys) +def test_applymap_numeric(request, data, testfunc): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if name_contains(request.node.name, numeric_dfs): + try: + pandas_result = pandas_df.applymap(testfunc) + except Exception as e: + with pytest.raises(type(e)): + modin_df.applymap(testfunc) + else: + modin_result = modin_df.applymap(testfunc) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_add_suffix(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + test_suffix = "TEST" + new_modin_df = modin_df.add_suffix(test_suffix) + new_pandas_df = pandas_df.add_suffix(test_suffix) + + df_equals(new_modin_df.columns, new_pandas_df.columns) + + +@pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) +def test_at(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + key1 = modin_df.columns[0] + # Scaler + df_equals(modin_df.at[0, key1], pandas_df.at[0, key1]) + + # Series + df_equals(modin_df.loc[0].at[key1], pandas_df.loc[0].at[key1]) + + # Write Item + modin_df_copy = modin_df.copy() + pandas_df_copy = pandas_df.copy() + modin_df_copy.at[1, key1] = modin_df.at[0, key1] + pandas_df_copy.at[1, key1] = pandas_df.at[0, key1] + df_equals(modin_df_copy, pandas_df_copy) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_axes(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + for modin_axis, pd_axis in zip(modin_df.axes, pandas_df.axes): + assert np.array_equal(modin_axis, pd_axis) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_copy(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + # pandas_df is unused but there so there won't be confusing list comprehension + # stuff in the pytest.mark.parametrize + new_modin_df = modin_df.copy() + + assert new_modin_df is not modin_df + assert np.array_equal( + new_modin_df._query_compiler._modin_frame._partitions, + modin_df._query_compiler._modin_frame._partitions, + ) + assert new_modin_df is not modin_df + df_equals(new_modin_df, modin_df) + + # Shallow copy tests + modin_df = pd.DataFrame(data) + modin_df_cp = modin_df.copy(False) + + modin_df[modin_df.columns[0]] = 0 + df_equals(modin_df, modin_df_cp) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dtypes(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.dtypes, pandas_df.dtypes) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("key", indices_values, ids=indices_keys) +def test_get(data, key): + modin_df = pd.DataFrame(data) + pandas_df = 
pandas.DataFrame(data) + + df_equals(modin_df.get(key), pandas_df.get(key)) + df_equals( + modin_df.get(key, default="default"), pandas_df.get(key, default="default") + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "dummy_na", bool_arg_values, ids=arg_keys("dummy_na", bool_arg_keys) +) +@pytest.mark.parametrize( + "drop_first", bool_arg_values, ids=arg_keys("drop_first", bool_arg_keys) +) +def test_get_dummies(request, data, dummy_na, drop_first): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas.get_dummies( + pandas_df, dummy_na=dummy_na, drop_first=drop_first + ) + except Exception as e: + with pytest.raises(type(e)): + pd.get_dummies(modin_df, dummy_na=dummy_na, drop_first=drop_first) + else: + modin_result = pd.get_dummies( + modin_df, dummy_na=dummy_na, drop_first=drop_first + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_isna(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + pandas_result = pandas_df.isna() + modin_result = modin_df.isna() + + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_isnull(data): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + pandas_result = pandas_df.isnull() + modin_result = modin_df.isnull() + + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_append(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + data_to_append = {"append_a": 2, "append_b": 1000} + + ignore_idx_values = [True, False] + + for ignore in ignore_idx_values: + try: + pandas_result = pandas_df.append(data_to_append, ignore_index=ignore) + except Exception as e: + with pytest.raises(type(e)): + modin_df.append(data_to_append, 
ignore_index=ignore) + else: + modin_result = modin_df.append(data_to_append, ignore_index=ignore) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.append(pandas_df.iloc[-1]) + except Exception as e: + with pytest.raises(type(e)): + modin_df.append(modin_df.iloc[-1]) + else: + modin_result = modin_df.append(modin_df.iloc[-1]) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.append(list(pandas_df.iloc[-1])) + except Exception as e: + with pytest.raises(type(e)): + modin_df.append(list(modin_df.iloc[-1])) + else: + modin_result = modin_df.append(list(modin_df.iloc[-1])) + # Pandas has bug where sort=False is ignored + # (https://github.com/pandas-dev/pandas/issues/35092), but Modin + # now does the right thing, so for now manually sort to workaround + # this. Once the Pandas bug is fixed and Modin upgrades to that + # Pandas release, this sort will cause the test to fail, and the + # next two lines should be deleted. + assert list(modin_result.columns) == list(modin_df.columns) + [0] + modin_result = modin_result[[0] + sorted(modin_df.columns)] + df_equals(modin_result, pandas_result) + + verify_integrity_values = [True, False] + + for verify_integrity in verify_integrity_values: + try: + pandas_result = pandas_df.append( + [pandas_df, pandas_df], verify_integrity=verify_integrity + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.append([modin_df, modin_df], verify_integrity=verify_integrity) + else: + modin_result = modin_df.append( + [modin_df, modin_df], verify_integrity=verify_integrity + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.append( + pandas_df, verify_integrity=verify_integrity + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.append(modin_df, verify_integrity=verify_integrity) + else: + modin_result = modin_df.append(modin_df, verify_integrity=verify_integrity) + df_equals(modin_result, pandas_result) + + +def 
test_astype(): + td = pandas.DataFrame(tm.getSeriesData()) + modin_df = pd.DataFrame(td.values, index=td.index, columns=td.columns) + expected_df = pandas.DataFrame(td.values, index=td.index, columns=td.columns) + + modin_df_casted = modin_df.astype(np.int32) + expected_df_casted = expected_df.astype(np.int32) + df_equals(modin_df_casted, expected_df_casted) + + modin_df_casted = modin_df.astype(np.float64) + expected_df_casted = expected_df.astype(np.float64) + df_equals(modin_df_casted, expected_df_casted) + + modin_df_casted = modin_df.astype(str) + expected_df_casted = expected_df.astype(str) + df_equals(modin_df_casted, expected_df_casted) + + modin_df_casted = modin_df.astype("category") + expected_df_casted = expected_df.astype("category") + df_equals(modin_df_casted, expected_df_casted) + + dtype_dict = {"A": np.int32, "B": np.int64, "C": str} + modin_df_casted = modin_df.astype(dtype_dict) + expected_df_casted = expected_df.astype(dtype_dict) + df_equals(modin_df_casted, expected_df_casted) + + # Ignore lint because this is testing bad input + bad_dtype_dict = {"B": np.int32, "B": np.int64, "B": str} # noqa F601 + modin_df_casted = modin_df.astype(bad_dtype_dict) + expected_df_casted = expected_df.astype(bad_dtype_dict) + df_equals(modin_df_casted, expected_df_casted) + + modin_df = pd.DataFrame(index=["row1"], columns=["col1"]) + modin_df["col1"]["row1"] = 11 + modin_df_casted = modin_df.astype(int) + expected_df = pandas.DataFrame(index=["row1"], columns=["col1"]) + expected_df["col1"]["row1"] = 11 + expected_df_casted = expected_df.astype(int) + df_equals(modin_df_casted, expected_df_casted) + + with pytest.raises(KeyError): + modin_df.astype({"not_exists": np.uint8}) + + +def test_astype_category(): + modin_df = pd.DataFrame( + {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} + ) + pandas_df = pandas.DataFrame( + {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} + ) + + modin_result = modin_df.astype({"col1": "category"}) + 
pandas_result = pandas_df.astype({"col1": "category"}) + df_equals(modin_result, pandas_result) + assert modin_result.dtypes.equals(pandas_result.dtypes) + + modin_result = modin_df.astype("category") + pandas_result = pandas_df.astype("category") + df_equals(modin_result, pandas_result) + assert modin_result.dtypes.equals(pandas_result.dtypes) + + +@pytest.mark.xfail( + reason="Categorical dataframe created in memory don't work yet and categorical dtype is lost" +) +def test_astype_category_large(): + series_length = 10_000 + modin_df = pd.DataFrame( + { + "col1": ["str{0}".format(i) for i in range(0, series_length)], + "col2": [i for i in range(0, series_length)], + } + ) + pandas_df = pandas.DataFrame( + { + "col1": ["str{0}".format(i) for i in range(0, series_length)], + "col2": [i for i in range(0, series_length)], + } + ) + + modin_result = modin_df.astype({"col1": "category"}) + pandas_result = pandas_df.astype({"col1": "category"}) + df_equals(modin_result, pandas_result) + assert modin_result.dtypes.equals(pandas_result.dtypes) + + modin_result = modin_df.astype("category") + pandas_result = pandas_df.astype("category") + df_equals(modin_result, pandas_result) + assert modin_result.dtypes.equals(pandas_result.dtypes) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_clip(request, data, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if name_contains(request.node.name, numeric_dfs): + ind_len = ( + len(modin_df.index) + if not pandas.DataFrame()._get_axis_number(axis) + else len(modin_df.columns) + ) + # set bounds + lower, upper = np.sort(random_state.random_integers(RAND_LOW, RAND_HIGH, 2)) + lower_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len) + upper_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len) + + # test only upper scalar bound + modin_result = modin_df.clip(None, upper, axis=axis) + 
pandas_result = pandas_df.clip(None, upper, axis=axis) + df_equals(modin_result, pandas_result) + + # test lower and upper scalar bound + modin_result = modin_df.clip(lower, upper, axis=axis) + pandas_result = pandas_df.clip(lower, upper, axis=axis) + df_equals(modin_result, pandas_result) + + # test lower and upper list bound on each column + modin_result = modin_df.clip(lower_list, upper_list, axis=axis) + pandas_result = pandas_df.clip(lower_list, upper_list, axis=axis) + df_equals(modin_result, pandas_result) + + # test only upper list bound on each column + modin_result = modin_df.clip(np.nan, upper_list, axis=axis) + pandas_result = pandas_df.clip(np.nan, upper_list, axis=axis) + df_equals(modin_result, pandas_result) + + with pytest.raises(ValueError): + modin_df.clip(lower=[1, 2, 3], axis=None) + + +def test_drop(): + frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} + simple = pandas.DataFrame(frame_data) + modin_simple = pd.DataFrame(frame_data) + df_equals(modin_simple.drop("A", axis=1), simple[["B"]]) + df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]]) + df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) + df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) + + pytest.raises(ValueError, modin_simple.drop, 5) + pytest.raises(ValueError, modin_simple.drop, "C", 1) + pytest.raises(ValueError, modin_simple.drop, [1, 5]) + pytest.raises(ValueError, modin_simple.drop, ["A", "C"], 1) + + # errors = 'ignore' + df_equals(modin_simple.drop(5, errors="ignore"), simple) + df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]) + df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple) + df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]) + + # non-unique + nu_df = pandas.DataFrame( + zip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"] + ) + modin_nu_df = pd.DataFrame(nu_df) + df_equals(modin_nu_df.drop("a", axis=1), nu_df[["b"]]) 
+ df_equals(modin_nu_df.drop("b", axis="columns"), nu_df["a"]) + df_equals(modin_nu_df.drop([]), nu_df) + + nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + modin_nu_df = pd.DataFrame(nu_df) + df_equals(modin_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + df_equals(modin_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) + + # inplace cache issue + frame_data = random_state.randn(10, 3) + df = pandas.DataFrame(frame_data, columns=list("abc")) + modin_df = pd.DataFrame(frame_data, columns=list("abc")) + expected = df[~(df.b > 0)] + modin_df.drop(labels=df[df.b > 0].index, inplace=True) + df_equals(modin_df, expected) + + midx = pd.MultiIndex( + levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], + codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) + df = pd.DataFrame( + index=midx, + columns=["big", "small"], + data=[ + [45, 30], + [200, 100], + [1.5, 1], + [30, 20], + [250, 150], + [1.5, 0.8], + [320, 250], + [1, 0.8], + [0.3, 0.2], + ], + ) + with pytest.warns(UserWarning): + df.drop(index="length", level=1) + + +def test_drop_api_equivalence(): + # equivalence of the labels/axis and index/columns API's + frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] + + modin_df = pd.DataFrame(frame_data, index=["a", "b", "c"], columns=["d", "e", "f"]) + + modin_df1 = modin_df.drop("a") + modin_df2 = modin_df.drop(index="a") + df_equals(modin_df1, modin_df2) + + modin_df1 = modin_df.drop("d", 1) + modin_df2 = modin_df.drop(columns="d") + df_equals(modin_df1, modin_df2) + + modin_df1 = modin_df.drop(labels="e", axis=1) + modin_df2 = modin_df.drop(columns="e") + df_equals(modin_df1, modin_df2) + + modin_df1 = modin_df.drop(["a"], axis=0) + modin_df2 = modin_df.drop(index=["a"]) + df_equals(modin_df1, modin_df2) + + modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1) + modin_df2 = modin_df.drop(index=["a"], columns=["d"]) + df_equals(modin_df1, modin_df2) + + with pytest.raises(ValueError): + 
modin_df.drop(labels="a", index="b") + + with pytest.raises(ValueError): + modin_df.drop(labels="a", columns="b") + + with pytest.raises(ValueError): + modin_df.drop(axis=1) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_drop_transpose(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + modin_result = modin_df.T.drop(columns=[0, 1, 2]) + pandas_result = pandas_df.T.drop(columns=[0, 1, 2]) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.T.drop(index=["col3", "col1"]) + pandas_result = pandas_df.T.drop(index=["col3", "col1"]) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) + pandas_result = pandas_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) + df_equals(modin_result, pandas_result) + + +def test_droplevel(): + df = ( + pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + .set_index([0, 1]) + .rename_axis(["a", "b"]) + ) + df.columns = pd.MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) + df.droplevel("a") + df.droplevel("level_2", axis=1) + + +@pytest.mark.parametrize( + "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys +) +@pytest.mark.parametrize( + "keep", ["last", "first", False], ids=["last", "first", "False"] +) +@pytest.mark.parametrize( + "subset", + [None, "col1", "name", ("col1", "col3"), ["col1", "col3", "col7"]], + ids=["None", "string", "name", "tuple", "list"], +) +def test_drop_duplicates(data, keep, subset): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_df.drop_duplicates(keep=keep, inplace=False, subset=subset) + except Exception as e: + with pytest.raises(type(e)): + modin_df.drop_duplicates(keep=keep, inplace=False, subset=subset) + else: + df_equals( + pandas_df.drop_duplicates(keep=keep, inplace=False, subset=subset), + modin_df.drop_duplicates(keep=keep, inplace=False, 
subset=subset), + ) + + try: + pandas_results = pandas_df.drop_duplicates( + keep=keep, inplace=True, subset=subset + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.drop_duplicates(keep=keep, inplace=True, subset=subset) + else: + modin_results = modin_df.drop_duplicates(keep=keep, inplace=True, subset=subset) + df_equals(modin_results, pandas_results) + + +def test_drop_duplicates_with_missing_index_values(): + data = { + "columns": ["value", "time", "id"], + "index": [ + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + ], + "data": [ + ["3", 1279213398000.0, 88.0], + ["3", 1279204682000.0, 88.0], + ["0", 1245772835000.0, 448.0], + ["0", 1270564258000.0, 32.0], + ["0", 1267106669000.0, 118.0], + ["7", 1300621123000.0, 5.0], + ["0", 1251130752000.0, 957.0], + ["0", 1311683506000.0, 62.0], + ["9", 1283692698000.0, 89.0], + ["9", 1270234253000.0, 64.0], + ["0", 1285088818000.0, 50.0], + ["0", 1218212725000.0, 695.0], + ["2", 1383933968000.0, 348.0], + ["0", 1368227625000.0, 257.0], + ["1", 1454514093000.0, 446.0], + ["1", 1428497427000.0, 134.0], + ["1", 1459184936000.0, 568.0], + ["1", 1502293302000.0, 599.0], + ["1", 1491833358000.0, 829.0], + ["1", 1485431534000.0, 806.0], + ["8", 1351800505000.0, 101.0], + ["0", 1357247721000.0, 916.0], + ["0", 1335804423000.0, 370.0], + ["24", 1327547726000.0, 720.0], + ["0", 1332334140000.0, 415.0], + ["0", 1309543100000.0, 30.0], + ["18", 1309541141000.0, 30.0], + ["0", 1298979435000.0, 48.0], + ["14", 1276098160000.0, 59.0], + ["0", 1233936302000.0, 109.0], + ], + } + + pandas_df = pandas.DataFrame( + data["data"], index=data["index"], columns=data["columns"] + ) + modin_df = pd.DataFrame(data["data"], index=data["index"], columns=data["columns"]) + modin_result = modin_df.sort_values(["id", "time"]).drop_duplicates(["id"]) + pandas_result = pandas_df.sort_values(["id", 
"time"]).drop_duplicates(["id"]) + df_equals(modin_result, pandas_result) + + +def test_drop_duplicates_after_sort(): + data = [ + {"value": 1, "time": 2}, + {"value": 1, "time": 1}, + {"value": 2, "time": 1}, + {"value": 2, "time": 2}, + ] + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_result = modin_df.sort_values(["value", "time"]).drop_duplicates(["value"]) + pandas_result = pandas_df.sort_values(["value", "time"]).drop_duplicates(["value"]) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"]) +def test_dropna(data, axis, how): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + with pytest.raises(ValueError): + modin_df.dropna(axis=axis, how="invalid") + + with pytest.raises(TypeError): + modin_df.dropna(axis=axis, how=None, thresh=None) + + with pytest.raises(KeyError): + modin_df.dropna(axis=axis, subset=["NotExists"], how=how) + + modin_result = modin_df.dropna(axis=axis, how=how) + pandas_result = pandas_df.dropna(axis=axis, how=how) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dropna_inplace(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + pandas_result = pandas_df.dropna() + modin_df.dropna(inplace=True) + df_equals(modin_df, pandas_result) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + pandas_df.dropna(thresh=2, inplace=True) + modin_df.dropna(thresh=2, inplace=True) + df_equals(modin_df, pandas_df) + + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + pandas_df.dropna(axis=1, how="any", inplace=True) + modin_df.dropna(axis=1, how="any", inplace=True) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) +def test_dropna_multiple_axes(data): + modin_df = pd.DataFrame(data) + + with pytest.raises(TypeError): + modin_df.dropna(how="all", axis=[0, 1]) + with pytest.raises(TypeError): + modin_df.dropna(how="all", axis=(0, 1)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dropna_subset(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if "empty_data" not in request.node.name: + column_subset = modin_df.columns[0:2] + df_equals( + modin_df.dropna(how="all", subset=column_subset), + pandas_df.dropna(how="all", subset=column_subset), + ) + df_equals( + modin_df.dropna(how="any", subset=column_subset), + pandas_df.dropna(how="any", subset=column_subset), + ) + + row_subset = modin_df.index[0:2] + df_equals( + modin_df.dropna(how="all", axis=1, subset=row_subset), + pandas_df.dropna(how="all", axis=1, subset=row_subset), + ) + df_equals( + modin_df.dropna(how="any", axis=1, subset=row_subset), + pandas_df.dropna(how="any", axis=1, subset=row_subset), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dropna_subset_error(data): + modin_df = pd.DataFrame(data) + + # pandas_df is unused so there won't be confusing list comprehension + # stuff in the pytest.mark.parametrize + with pytest.raises(KeyError): + modin_df.dropna(subset=list("EF")) + + if len(modin_df.columns) < 5: + with pytest.raises(KeyError): + modin_df.dropna(axis=1, subset=[4, 5]) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "astype", + [ + "category", + pytest.param( + "int32", + marks=pytest.mark.xfail( + reason="Modin astype() does not raises ValueError at non-numeric argument when Pandas does." 
+ ), + ), + "float", + ], +) +def test_insert_dtypes(data, astype): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + # categories with NaN works incorrect for now + if astype == "category" and pandas_df.iloc[:, 0].isnull().any(): + return + + eval_insert( + modin_df, + pandas_df, + col="TypeSaver", + value=lambda df: df.iloc[:, 0].astype(astype), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("loc", int_arg_values, ids=arg_keys("loc", int_arg_keys)) +def test_insert_loc(data, loc): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + value = modin_df.iloc[:, 0] + + eval_insert(modin_df, pandas_df, loc=loc, value=value) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_insert(data): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + eval_insert( + modin_df, pandas_df, col="Duplicate", value=lambda df: df[df.columns[0]] + ) + eval_insert(modin_df, pandas_df, col="Scalar", value=100) + eval_insert( + pd.DataFrame(columns=list("ab")), + pandas.DataFrame(columns=list("ab")), + col=lambda df: df.columns[0], + value=lambda df: df[df.columns[0]], + ) + eval_insert( + pd.DataFrame(index=modin_df.index), + pandas.DataFrame(index=pandas_df.index), + col=lambda df: df.columns[0], + value=lambda df: df[df.columns[0]], + ) + eval_insert( + modin_df, + pandas_df, + col="DataFrame insert", + value=lambda df: df[[df.columns[0]]], + ) + + # Bad inserts + eval_insert(modin_df, pandas_df, col="Bad Column", value=lambda df: df) + eval_insert( + modin_df, + pandas_df, + col="Too Short", + value=lambda df: list(df[df.columns[0]])[:-1], + ) + eval_insert( + modin_df, + pandas_df, + col=lambda df: df.columns[0], + value=lambda df: df[df.columns[0]], + ) + eval_insert( + modin_df, + pandas_df, + loc=lambda df: len(df.columns) + 100, + col="Bad Loc", + value=100, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
+def test_ndim(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + assert modin_df.ndim == pandas_df.ndim + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_notna(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.notna(), pandas_df.notna()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_notnull(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.notnull(), pandas_df.notnull()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_round(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.round(), pandas_df.round()) + df_equals(modin_df.round(1), pandas_df.round(1)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_set_axis(data, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + x = pandas.DataFrame()._get_axis_number(axis) + index = modin_df.columns if x else modin_df.index + labels = ["{0}_{1}".format(index[i], i) for i in range(modin_df.shape[x])] + + modin_result = modin_df.set_axis(labels, axis=axis, inplace=False) + pandas_result = pandas_df.set_axis(labels, axis=axis, inplace=False) + df_equals(modin_result, pandas_result) + + modin_df_copy = modin_df.copy() + modin_df.set_axis(labels, axis=axis, inplace=True) + + # Check that the copy and original are different + try: + df_equals(modin_df, modin_df_copy) + except AssertionError: + assert True + else: + assert False + + pandas_df.set_axis(labels, axis=axis, inplace=True) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("drop", bool_arg_values, ids=arg_keys("drop", bool_arg_keys)) +@pytest.mark.parametrize( + "append", 
bool_arg_values, ids=arg_keys("append", bool_arg_keys) +) +def test_set_index(request, data, drop, append): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if "empty_data" not in request.node.name: + key = modin_df.columns[0] + modin_result = modin_df.set_index(key, drop=drop, append=append, inplace=False) + pandas_result = pandas_df.set_index( + key, drop=drop, append=append, inplace=False + ) + df_equals(modin_result, pandas_result) + + modin_df_copy = modin_df.copy() + modin_df.set_index(key, drop=drop, append=append, inplace=True) + + # Check that the copy and original are different + try: + df_equals(modin_df, modin_df_copy) + except AssertionError: + assert True + else: + assert False + + pandas_df.set_index(key, drop=drop, append=append, inplace=True) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_shape(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + assert modin_df.shape == pandas_df.shape + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_size(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + assert modin_df.size == pandas_df.size + + +def test_squeeze(): + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + frame_data_2 = {"col1": [0, 1, 2, 3]} + frame_data_3 = { + "col1": [0], + "col2": [4], + "col3": [8], + "col4": [12], + "col5": [0], + } + frame_data_4 = {"col1": [2]} + frame_data_5 = {"col1": ["string"]} + # Different data for different cases + pandas_df = pandas.DataFrame(frame_data).squeeze() + modin_df = pd.DataFrame(frame_data).squeeze() + df_equals(modin_df, pandas_df) + + pandas_df_2 = pandas.DataFrame(frame_data_2).squeeze() + modin_df_2 = pd.DataFrame(frame_data_2).squeeze() + df_equals(modin_df_2, pandas_df_2) + + pandas_df_3 = 
pandas.DataFrame(frame_data_3).squeeze() + modin_df_3 = pd.DataFrame(frame_data_3).squeeze() + df_equals(modin_df_3, pandas_df_3) + + pandas_df_4 = pandas.DataFrame(frame_data_4).squeeze() + modin_df_4 = pd.DataFrame(frame_data_4).squeeze() + df_equals(modin_df_4, pandas_df_4) + + pandas_df_5 = pandas.DataFrame(frame_data_5).squeeze() + modin_df_5 = pd.DataFrame(frame_data_5).squeeze() + df_equals(modin_df_5, pandas_df_5) + + data = [ + [ + pd.Timestamp("2019-01-02"), + pd.Timestamp("2019-01-03"), + pd.Timestamp("2019-01-04"), + pd.Timestamp("2019-01-05"), + ], + [1, 1, 1, 2], + ] + df = pd.DataFrame(data, index=["date", "value"]).T + pf = pandas.DataFrame(data, index=["date", "value"]).T + df.set_index("date", inplace=True) + pf.set_index("date", inplace=True) + df_equals(df.iloc[0], pf.iloc[0]) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_transpose(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.T, pandas_df.T) + df_equals(modin_df.transpose(), pandas_df.transpose()) + + # Test for map across full axis for select indices + df_equals(modin_df.T.dropna(), pandas_df.T.dropna()) + # Test for map across full axis + df_equals(modin_df.T.nunique(), pandas_df.T.nunique()) + # Test for map across blocks + df_equals(modin_df.T.notna(), pandas_df.T.notna()) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ( + {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, + {"B": ["d", "e", "f", "g", "h", "i"]}, + ), + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}), + ], +) +def test_update(data, other_data): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + other_modin_df, other_pandas_df = ( + pd.DataFrame(other_data), + pandas.DataFrame(other_data), + ) + modin_df.update(other_modin_df) + pandas_df.update(other_pandas_df) + df_equals(modin_df, pandas_df) + + with 
pytest.raises(ValueError): + modin_df.update(other_modin_df, errors="raise") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___neg__(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.__neg__() + except Exception as e: + with pytest.raises(type(e)): + modin_df.__neg__() + else: + modin_result = modin_df.__neg__() + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___invert__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + try: + pandas_result = ~pandas_df + except Exception as e: + with pytest.raises(type(e)): + repr(~modin_df) + else: + modin_result = ~modin_df + df_equals(modin_result, pandas_result) + + +def test___hash__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + try: + pd.DataFrame(data).__hash__() + except TypeError: + pass + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___delitem__(request, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if "empty_data" not in request.node.name: + key = pandas_df.columns[0] + + modin_df = modin_df.copy() + pandas_df = pandas_df.copy() + modin_df.__delitem__(key) + pandas_df.__delitem__(key) + df_equals(modin_df, pandas_df) + + # Issue 2027 + last_label = pandas_df.iloc[:, -1].name + modin_df.__delitem__(last_label) + pandas_df.__delitem__(last_label) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___nonzero__(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(ValueError): + # Always raises ValueError + modin_df.__nonzero__() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___abs__(request, data): + modin_df = pd.DataFrame(data) + pandas_df = 
pandas.DataFrame(data) + + try: + pandas_result = abs(pandas_df) + except Exception as e: + with pytest.raises(type(e)): + abs(modin_df) + else: + modin_result = abs(modin_df) + df_equals(modin_result, pandas_result) + + +def test___round__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + pd.DataFrame(data).__round__() diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py new file mode 100644 index 00000000000..dc3587a58c0 --- /dev/null +++ b/modin/pandas/test/dataframe/test_reduction.py @@ -0,0 +1,340 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import numpy as np +import pandas +import os +import matplotlib +import modin.pandas as pd + +from modin.pandas.test.utils import ( + df_equals, + arg_keys, + test_data, + test_data_values, + test_data_keys, + axis_keys, + axis_values, + bool_arg_keys, + bool_arg_values, + int_arg_keys, + int_arg_values, + eval_general, + create_test_dfs, + generate_multiindex, + test_data_diff_dtype, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use("Agg") + + +@pytest.mark.parametrize("method", ["all", "any"]) +@pytest.mark.parametrize("is_transposed", [False, True]) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_all_any(data, axis, skipna, is_transposed, method): + eval_general( + *create_test_dfs(data), + lambda df: getattr((df.T if is_transposed else df), method)( + axis=axis, skipna=skipna, bool_only=None + ), + ) + + +@pytest.mark.parametrize("method", ["all", "any"]) +@pytest.mark.parametrize( + "bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys) +) +def test_all_any_specific(bool_only, method): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: getattr(df, method)(bool_only=bool_only), + ) + + +@pytest.mark.parametrize("method", ["all", "any"]) +@pytest.mark.parametrize("level", [-1, 0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("data", [test_data["int_data"]]) +def test_all_any_level(data, axis, level, method): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + if axis == 0: + new_idx = generate_multiindex(len(modin_df.index)) + modin_df.index = new_idx + pandas_df.index = new_idx + else: + new_col = generate_multiindex(len(modin_df.columns)) + modin_df.columns = new_col + pandas_df.columns = new_col + + eval_general( + modin_df, + pandas_df, + lambda df: getattr(df, method)(axis=axis, level=level), + ) + + +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_count(data, axis): + eval_general( + *create_test_dfs(data), + lambda df: df.count(axis=axis), + ) + + +@pytest.mark.parametrize( + "numeric_only", + [ + pytest.param(True, marks=pytest.mark.xfail(reason="See #1965 for details")), + False, + None, + ], +) +def 
test_count_specific(numeric_only): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.count(numeric_only=numeric_only), + ) + + +@pytest.mark.parametrize("level", [-1, 0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("data", [test_data["int_data"]]) +def test_count_level(data, axis, level): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + if axis == 0: + new_idx = generate_multiindex(len(modin_df.index)) + modin_df.index = new_idx + pandas_df.index = new_idx + else: + new_col = generate_multiindex(len(modin_df.columns)) + modin_df.columns = new_col + pandas_df.columns = new_col + + eval_general( + modin_df, + pandas_df, + lambda df: df.count(axis=axis, level=level), + ) + + +@pytest.mark.parametrize("percentiles", [None, 0.10, 0.11, 0.44, 0.78, 0.99]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_describe(data, percentiles): + eval_general( + *create_test_dfs(data), + lambda df: df.describe(percentiles=percentiles), + ) + + +@pytest.mark.parametrize( + "exclude,include", + [ + ([np.float64], None), + (np.float64, None), + (None, [np.timedelta64, np.datetime64, np.object, np.bool]), + (None, "all"), + (None, np.number), + ], +) +def test_describe_specific(exclude, include): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.drop("str_col", axis=1).describe( + exclude=exclude, include=include + ), + ) + + +@pytest.mark.parametrize("data", [test_data["int_data"]]) +def test_describe_str(data): + modin_df = pd.DataFrame(data).applymap(str) + pandas_df = pandas.DataFrame(data).applymap(str) + + try: + df_equals(modin_df.describe(), pandas_df.describe()) + except AssertionError: + # We have to do this because we choose the highest count slightly differently + # than pandas. Because there is no true guarantee which one will be first, + # If they don't match, make sure that the `freq` is the same at least. 
+ df_equals( + modin_df.describe().loc[["count", "unique", "freq"]], + pandas_df.describe().loc[["count", "unique", "freq"]], + ) + + +def test_describe_dtypes(): + data = { + "col1": list("abc"), + "col2": list("abc"), + "col3": list("abc"), + "col4": [1, 2, 3], + } + eval_general(*create_test_dfs(data), lambda df: df.describe()) + + +@pytest.mark.parametrize("method", ["idxmin", "idxmax"]) +@pytest.mark.parametrize("is_transposed", [False, True]) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_idxmin_idxmax(data, axis, skipna, is_transposed, method): + eval_general( + *create_test_dfs(data), + lambda df: getattr((df.T if is_transposed else df), method)( + axis=axis, skipna=skipna + ), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_last_valid_index(data): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + assert modin_df.last_valid_index() == pandas_df.last_valid_index() + + +@pytest.mark.parametrize("index", bool_arg_values, ids=arg_keys("index", bool_arg_keys)) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_memory_usage(data, index): + eval_general(*create_test_dfs(data), lambda df: df.memory_usage(index=index)) + + +@pytest.mark.parametrize("method", ["min", "max", "mean"]) +@pytest.mark.parametrize("is_transposed", [False, True]) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method): + eval_general( + *create_test_dfs(data), + 
lambda df: getattr((df.T if is_transposed else df), method)( + axis=axis, skipna=skipna, numeric_only=numeric_only + ), + ) + + +@pytest.mark.skipif( + os.name == "nt", + reason="Windows has a memory issue for large numbers on this test", +) +@pytest.mark.parametrize( + "method", + [ + "prod", + pytest.param( + "product", + marks=pytest.mark.skipif( + pandas.DataFrame.product == pandas.DataFrame.prod + and pd.DataFrame.product == pd.DataFrame.prod, + reason="That method was already tested.", + ), + ), + ], +) +@pytest.mark.parametrize("is_transposed", [False, True]) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_prod( + data, + axis, + skipna, + is_transposed, + method, +): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)( + axis=axis, + skipna=skipna, + ), + ) + + +@pytest.mark.parametrize( + "numeric_only", + [ + pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), + False, + True, + ], +) +@pytest.mark.parametrize( + "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) +) +def test_prod_specific(min_count, numeric_only): + if min_count == 5 and numeric_only: + pytest.xfail("see #1953 for details") + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.prod(min_count=min_count, numeric_only=numeric_only), + ) + + +@pytest.mark.parametrize("is_transposed", [False, True]) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("data", [test_data["float_nan_data"]]) +def test_sum(data, axis, skipna, is_transposed): + eval_general( + *create_test_dfs(data), + lambda df: (df.T if is_transposed else df).sum( + axis=axis, + 
skipna=skipna, + ), + ) + + +@pytest.mark.parametrize( + "numeric_only", + [ + pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), + False, + True, + ], +) +@pytest.mark.parametrize("min_count", int_arg_values) +def test_sum_specific(min_count, numeric_only): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: df.sum(min_count=min_count, numeric_only=numeric_only), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_sum_single_column(data): + modin_df = pd.DataFrame(data).iloc[:, [0]] + pandas_df = pandas.DataFrame(data).iloc[:, [0]] + df_equals(modin_df.sum(), pandas_df.sum()) + df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1)) diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py new file mode 100644 index 00000000000..a87243471e5 --- /dev/null +++ b/modin/pandas/test/dataframe/test_udf.py @@ -0,0 +1,443 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import pytest +import numpy as np +import pandas +import matplotlib +import modin.pandas as pd + +from modin.pandas.test.utils import ( + random_state, + df_equals, + name_contains, + test_data_values, + test_data_keys, + numeric_dfs, + query_func_keys, + query_func_values, + agg_func_keys, + agg_func_values, + numeric_agg_funcs, + axis_keys, + axis_values, + eval_general, + create_test_dfs, + udf_func_values, + udf_func_keys, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_agg(data, axis, func): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.agg(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.agg(func, axis) + else: + modin_result = modin_df.agg(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_agg_numeric(request, data, axis, func): + if name_contains(request.node.name, numeric_agg_funcs) and name_contains( + request.node.name, numeric_dfs + ): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.agg(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.agg(func, axis) + else: + modin_result = modin_df.agg(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_aggregate(request, data, func, 
axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.aggregate(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.aggregate(func, axis) + else: + modin_result = modin_df.aggregate(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_aggregate_numeric(request, data, axis, func): + if name_contains(request.node.name, numeric_agg_funcs) and name_contains( + request.node.name, numeric_dfs + ): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.agg(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.agg(func, axis) + else: + modin_result = modin_df.agg(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_aggregate_error_checking(data): + modin_df = pd.DataFrame(data) + + assert modin_df.aggregate("ndim") == 2 + + with pytest.warns(UserWarning): + modin_df.aggregate({modin_df.columns[0]: "sum", modin_df.columns[1]: "mean"}) + + with pytest.warns(UserWarning): + modin_df.aggregate("cumproduct") + + with pytest.raises(ValueError): + modin_df.aggregate("NOT_EXISTS") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_apply(request, data, func, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + with pytest.raises(TypeError): + modin_df.apply({"row": func}, axis=1) + + try: + pandas_result = pandas_df.apply(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.apply(func, axis) + else: + 
modin_result = modin_df.apply(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("level", [None, -1, 0, 1]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "func", + [ + "kurt", + pytest.param( + "count", + marks=pytest.mark.xfail( + reason="count method handle level parameter incorrectly" + ), + ), + pytest.param( + "sum", + marks=pytest.mark.xfail( + reason="sum method handle level parameter incorrectly" + ), + ), + pytest.param( + "mean", + marks=pytest.mark.xfail( + reason="mean method handle level parameter incorrectly" + ), + ), + pytest.param( + "all", + marks=pytest.mark.xfail( + reason="all method handle level parameter incorrectly" + ), + ), + ], +) +def test_apply_text_func_with_level(level, data, func, axis): + func_kwargs = {"level": level, "axis": axis} + rows_number = len(next(iter(data.values()))) # length of the first data column + level_0 = np.random.choice([0, 1, 2], rows_number) + level_1 = np.random.choice([3, 4, 5], rows_number) + index = pd.MultiIndex.from_arrays([level_0, level_1]) + + eval_general( + pd.DataFrame(data, index=index), + pandas.DataFrame(data, index=index), + lambda df, *args, **kwargs: df.apply(func, *args, **kwargs), + **func_kwargs, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_apply_args(data, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + def apply_func(series, y): + try: + return series + y + except TypeError: + return series.map(str) + str(y) + + modin_result = modin_df.apply(apply_func, axis=axis, args=(1,)) + pandas_result = pandas_df.apply(apply_func, axis=axis, args=(1,)) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.apply(apply_func, axis=axis, args=("_A",)) + pandas_result = pandas_df.apply(apply_func, axis=axis, args=("_A",)) + 
df_equals(modin_result, pandas_result) + + +def test_apply_metadata(): + def add(a, b, c): + return a + b + c + + data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]} + + modin_df = pd.DataFrame(data) + modin_df["add"] = modin_df.apply( + lambda row: add(row["A"], row["B"], row["C"]), axis=1 + ) + + pandas_df = pandas.DataFrame(data) + pandas_df["add"] = pandas_df.apply( + lambda row: add(row["A"], row["B"], row["C"]), axis=1 + ) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_apply_numeric(request, data, func, axis): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if name_contains(request.node.name, numeric_dfs): + try: + pandas_result = pandas_df.apply(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_df.apply(func, axis) + else: + modin_result = modin_df.apply(func, axis) + df_equals(modin_result, pandas_result) + + if "empty_data" not in request.node.name: + key = modin_df.columns[0] + modin_result = modin_df.apply(lambda df: df.drop(key), axis=1) + pandas_result = pandas_df.apply(lambda df: df.drop(key), axis=1) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("func", udf_func_values, ids=udf_func_keys) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_apply_udf(data, func): + eval_general( + *create_test_dfs(data), + lambda df, *args, **kwargs: df.apply(*args, **kwargs), + func=func, + other=lambda df: df, + ) + + +def test_eval_df_use_case(): + frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + # test eval for series results + tmp_pandas = df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") + tmp_modin = modin_df.eval("arctan2(sin(a), b)", 
engine="python", parser="pandas") + + assert isinstance(tmp_modin, pd.Series) + df_equals(tmp_modin, tmp_pandas) + + # Test not inplace assignments + tmp_pandas = df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas") + tmp_modin = modin_df.eval( + "e = arctan2(sin(a), b)", engine="python", parser="pandas" + ) + df_equals(tmp_modin, tmp_pandas) + + # Test inplace assignments + df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True) + modin_df.eval( + "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True + ) + # TODO: Use a series equality validator. + df_equals(modin_df, df) + + +def test_eval_df_arithmetic_subexpression(): + frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) + modin_df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) + # TODO: Use a series equality validator. 
+ df_equals(modin_df, df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_filter(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} + df_equals(modin_df.filter(items=by["items"]), pandas_df.filter(items=by["items"])) + + df_equals( + modin_df.filter(regex=by["regex"], axis=0), + pandas_df.filter(regex=by["regex"], axis=0), + ) + df_equals( + modin_df.filter(regex=by["regex"], axis=1), + pandas_df.filter(regex=by["regex"], axis=1), + ) + + df_equals(modin_df.filter(like=by["like"]), pandas_df.filter(like=by["like"])) + + with pytest.raises(TypeError): + modin_df.filter(items=by["items"], regex=by["regex"]) + + with pytest.raises(TypeError): + modin_df.filter() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_pipe(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + n = len(modin_df.index) + a, b, c = 2 % n, 0, 3 % n + col = modin_df.columns[3 % len(modin_df.columns)] + + def h(x): + return x.drop(columns=[col]) + + def g(x, arg1=0): + for _ in range(arg1): + x = x.append(x) + return x + + def f(x, arg2=0, arg3=0): + return x.drop([arg2, arg3]) + + df_equals( + f(g(h(modin_df), arg1=a), arg2=b, arg3=c), + (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) + df_equals( + (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("funcs", query_func_values, ids=query_func_keys) +def test_query(data, funcs): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + with pytest.raises(ValueError): + modin_df.query("") + with pytest.raises(NotImplementedError): + x = 2 # noqa F841 + modin_df.query("col1 < @x") + + try: + pandas_result = pandas_df.query(funcs) + except Exception as e: 
+ with pytest.raises(type(e)): + modin_df.query(funcs) + else: + modin_result = modin_df.query(funcs) + df_equals(modin_result, pandas_result) + + +def test_query_after_insert(): + modin_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) + modin_df["z"] = modin_df.eval("x / y") + modin_df = modin_df.query("z >= 0") + modin_result = modin_df.reset_index(drop=True) + modin_result.columns = ["a", "b", "c"] + + pandas_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) + pandas_df["z"] = pandas_df.eval("x / y") + pandas_df = pandas_df.query("z >= 0") + pandas_result = pandas_df.reset_index(drop=True) + pandas_result.columns = ["a", "b", "c"] + + df_equals(modin_result, pandas_result) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_transform(request, data, func): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.transform(func) + except Exception as e: + with pytest.raises(type(e)): + modin_df.transform(func) + else: + modin_result = modin_df.transform(func) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_transform_numeric(request, data, func): + if name_contains(request.node.name, numeric_agg_funcs) and name_contains( + request.node.name, numeric_dfs + ): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.transform(func) + except Exception as e: + with pytest.raises(type(e)): + modin_df.transform(func) + else: + modin_result = modin_df.transform(func) + df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py new file mode 100644 index 00000000000..621b776def7 --- 
/dev/null +++ b/modin/pandas/test/dataframe/test_window.py @@ -0,0 +1,874 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest +import numpy as np +import pandas +import matplotlib +import modin.pandas as pd + +from modin.pandas.test.utils import ( + random_state, + df_equals, + arg_keys, + name_contains, + test_data_values, + test_data_keys, + no_numeric_dfs, + quantiles_keys, + quantiles_values, + axis_keys, + axis_values, + bool_arg_keys, + bool_arg_values, + int_arg_keys, + int_arg_values, +) + +pd.DEFAULT_NPARTITIONS = 4 + +# Force matplotlib to not use any Xwindows backend. 
+matplotlib.use("Agg") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cummax(request, data, axis, skipna): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.cummax(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.cummax(axis=axis, skipna=skipna) + else: + modin_result = modin_df.cummax(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.cummax(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cummax(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cummax(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_cummax_int_and_float(axis): + data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cummin(request, data, axis, skipna): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.cummin(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.cummin(axis=axis, skipna=skipna) + else: + modin_result = modin_df.cummin(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.cummin(axis=axis, skipna=skipna) + except Exception as e: + with 
pytest.raises(type(e)): + modin_df.T.cummin(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cummin(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_cummin_int_and_float(axis): + data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.cummin(axis=axis), pandas_df.cummin(axis=axis)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cumprod(request, data, axis, skipna): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.cumprod(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.cumprod(axis=axis, skipna=skipna) + else: + modin_result = modin_df.cumprod(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.cumprod(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cumprod(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cumprod(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cumsum(request, data, axis, skipna): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + # pandas exhibits weird behavior for this case + # Remove this case when we can pull the error messages from backend + if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( + axis == 0 or axis == "rows" + ): 
+ with pytest.raises(TypeError): + modin_df.cumsum(axis=axis, skipna=skipna) + else: + try: + pandas_result = pandas_df.cumsum(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.cumsum(axis=axis, skipna=skipna) + else: + modin_result = modin_df.cumsum(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( + axis == 0 or axis == "rows" + ): + with pytest.raises(TypeError): + modin_df.T.cumsum(axis=axis, skipna=skipna) + else: + try: + pandas_result = pandas_df.T.cumsum(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cumsum(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cumsum(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys) +) +def test_diff(request, data, axis, periods): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.diff(axis=axis, periods=periods) + except Exception as e: + with pytest.raises(type(e)): + modin_df.diff(axis=axis, periods=periods) + else: + modin_result = modin_df.diff(axis=axis, periods=periods) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.diff(axis=axis, periods=periods) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.diff(axis=axis, periods=periods) + else: + modin_result = modin_df.T.diff(axis=axis, periods=periods) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "keep", ["last", "first", False], ids=["last", "first", "False"] +) +def test_duplicated(data, keep): + modin_df = pd.DataFrame(data) + 
pandas_df = pandas.DataFrame(data) + + pandas_result = pandas_df.duplicated(keep=keep) + modin_result = modin_df.duplicated(keep=keep) + df_equals(modin_result, pandas_result) + + import random + + subset = random.sample( + list(pandas_df.columns), random.randint(1, len(pandas_df.columns)) + ) + pandas_result = pandas_df.duplicated(keep=keep, subset=subset) + modin_result = modin_df.duplicated(keep=keep, subset=subset) + + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ffill(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.ffill(), pandas_df.ffill()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "method", + ["backfill", "bfill", "pad", "ffill", None], + ids=["backfill", "bfill", "pad", "ffill", "None"], +) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("limit", int_arg_values, ids=int_arg_keys) +def test_fillna(data, method, axis, limit): + # We are not testing when limit is not positive until pandas-27042 gets fixed. + # We are not testing when axis is over rows until pandas-17399 gets fixed. 
+ if limit > 0 and axis != 1 and axis != "columns": + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.fillna(0, method=method, axis=axis, limit=limit) + except Exception as e: + with pytest.raises(type(e)): + modin_df.fillna(0, method=method, axis=axis, limit=limit) + else: + modin_result = modin_df.fillna(0, method=method, axis=axis, limit=limit) + df_equals(modin_result, pandas_result) + + +def test_fillna_sanity(): + # with different dtype + frame_data = [ + ["a", "a", np.nan, "a"], + ["b", "b", np.nan, "b"], + ["c", "c", np.nan, "c"], + ] + df = pandas.DataFrame(frame_data) + + result = df.fillna({2: "foo"}) + modin_df = pd.DataFrame(frame_data).fillna({2: "foo"}) + + df_equals(modin_df, result) + + modin_df = pd.DataFrame(df) + df.fillna({2: "foo"}, inplace=True) + modin_df.fillna({2: "foo"}, inplace=True) + df_equals(modin_df, result) + + frame_data = { + "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")], + "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT], + } + df = pandas.DataFrame(frame_data) + result = df.fillna(value={"Date": df["Date2"]}) + modin_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]}) + df_equals(modin_df, result) + + frame_data = {"A": [pandas.Timestamp("2012-11-11 00:00:00+01:00"), pandas.NaT]} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + df_equals(modin_df.fillna(method="pad"), df.fillna(method="pad")) + + frame_data = {"A": [pandas.NaT, pandas.Timestamp("2012-11-11 00:00:00+01:00")]} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data).fillna(method="bfill") + df_equals(modin_df, df.fillna(method="bfill")) + + +def test_fillna_downcast(): + # infer int64 from float64 + frame_data = {"a": [1.0, np.nan]} + df = pandas.DataFrame(frame_data) + result = df.fillna(0, downcast="infer") + modin_df = pd.DataFrame(frame_data).fillna(0, downcast="infer") + df_equals(modin_df, result) + + # infer int64 from float64 when 
fillna value is a dict + df = pandas.DataFrame(frame_data) + result = df.fillna({"a": 0}, downcast="infer") + modin_df = pd.DataFrame(frame_data).fillna({"a": 0}, downcast="infer") + df_equals(modin_df, result) + + +def test_fillna_inplace(): + frame_data = random_state.randn(10, 4) + df = pandas.DataFrame(frame_data) + df[1][:4] = np.nan + df[3][-4:] = np.nan + + modin_df = pd.DataFrame(df) + df.fillna(value=0, inplace=True) + try: + df_equals(modin_df, df) + except AssertionError: + pass + else: + assert False + + modin_df.fillna(value=0, inplace=True) + df_equals(modin_df, df) + + modin_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True) + assert modin_df is None + + df[1][:4] = np.nan + df[3][-4:] = np.nan + modin_df = pd.DataFrame(df) + df.fillna(method="ffill", inplace=True) + try: + df_equals(modin_df, df) + except AssertionError: + pass + else: + assert False + + modin_df.fillna(method="ffill", inplace=True) + df_equals(modin_df, df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_frame_fillna_limit(data): + pandas_df = pandas.DataFrame(data) + + index = pandas_df.index + + result = pandas_df[:2].reindex(index) + modin_df = pd.DataFrame(result) + df_equals( + modin_df.fillna(method="pad", limit=2), result.fillna(method="pad", limit=2) + ) + + result = pandas_df[-2:].reindex(index) + modin_df = pd.DataFrame(result) + df_equals( + modin_df.fillna(method="backfill", limit=2), + result.fillna(method="backfill", limit=2), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_frame_pad_backfill_limit(data): + pandas_df = pandas.DataFrame(data) + + index = pandas_df.index + + result = pandas_df[:2].reindex(index) + modin_df = pd.DataFrame(result) + df_equals( + modin_df.fillna(method="pad", limit=2), result.fillna(method="pad", limit=2) + ) + + result = pandas_df[-2:].reindex(index) + modin_df = pd.DataFrame(result) + df_equals( + modin_df.fillna(method="backfill", limit=2), + 
result.fillna(method="backfill", limit=2), + ) + + +def test_fillna_dtype_conversion(): + # make sure that fillna on an empty frame works + df = pandas.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + modin_df = pd.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + df_equals(modin_df.fillna("nan"), df.fillna("nan")) + + frame_data = {"A": [1, np.nan], "B": [1.0, 2.0]} + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + for v in ["", 1, np.nan, 1.0]: + df_equals(modin_df.fillna(v), df.fillna(v)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_fillna_skip_certain_blocks(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + # don't try to fill boolean, int blocks + df_equals(modin_df.fillna(np.nan), pandas_df.fillna(np.nan)) + + +def test_fillna_dict_series(): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})) + + df_equals( + modin_df.fillna({"a": 0, "b": 5, "d": 7}), + df.fillna({"a": 0, "b": 5, "d": 7}), + ) + + # Series treated same as dict + df_equals(modin_df.fillna(modin_df.max()), df.fillna(df.max())) + + +def test_fillna_dataframe(): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + df = pandas.DataFrame(frame_data, index=list("VWXYZ")) + modin_df = pd.DataFrame(frame_data, index=list("VWXYZ")) + + # df2 may have different index and columns + df2 = pandas.DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) + modin_df2 = pd.DataFrame(df2) + + # only those columns and indices which are shared get filled + df_equals(modin_df.fillna(modin_df2), df.fillna(df2)) + + 
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_fillna_columns(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals( + modin_df.fillna(method="ffill", axis=1), + pandas_df.fillna(method="ffill", axis=1), + ) + + df_equals( + modin_df.fillna(method="ffill", axis=1), + pandas_df.fillna(method="ffill", axis=1), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_fillna_invalid_method(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) # noqa F841 + + with pytest.raises(ValueError): + modin_df.fillna(method="ffil") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_fillna_invalid_value(data): + modin_df = pd.DataFrame(data) + # list + pytest.raises(TypeError, modin_df.fillna, [1, 2]) + # tuple + pytest.raises(TypeError, modin_df.fillna, (1, 2)) + # frame with series + pytest.raises(TypeError, modin_df.iloc[:, 0].fillna, modin_df) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_fillna_col_reordering(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + df_equals(modin_df.fillna(method="ffill"), pandas_df.fillna(method="ffill")) + + +def test_fillna_datetime_columns(): + frame_data = { + "A": [-1, -2, np.nan], + "B": pd.date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + } + df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) + modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) + df_equals(modin_df.fillna("?"), df.fillna("?")) + + frame_data = { + "A": [-1, -2, np.nan], + "B": [ + pandas.Timestamp("2013-01-01"), + pandas.Timestamp("2013-01-02"), + pandas.NaT, + ], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + } + df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) + modin_df = pd.DataFrame(frame_data, 
index=pd.date_range("20130110", periods=3)) + df_equals(modin_df.fillna("?"), df.fillna("?")) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +def test_median(request, data, axis, skipna, numeric_only): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.median(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.median(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +def test_mode(request, data, axis, numeric_only): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.mode(axis=axis, numeric_only=numeric_only) + except Exception: + with pytest.raises(TypeError): + modin_df.mode(axis=axis, numeric_only=numeric_only) + else: + modin_result = modin_df.mode(axis=axis, numeric_only=numeric_only) + df_equals(modin_result, pandas_result) + + +def 
test_nlargest(): + data = { + "population": [ + 59000000, + 65000000, + 434000, + 434000, + 434000, + 337000, + 11300, + 11300, + 11300, + ], + "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + } + index = [ + "Italy", + "France", + "Malta", + "Maldives", + "Brunei", + "Iceland", + "Nauru", + "Tuvalu", + "Anguilla", + ] + modin_df = pd.DataFrame(data=data, index=index) + pandas_df = pandas.DataFrame(data=data, index=index) + df_equals(modin_df.nlargest(3, "population"), pandas_df.nlargest(3, "population")) + + +def test_nsmallest(): + data = { + "population": [ + 59000000, + 65000000, + 434000, + 434000, + 434000, + 337000, + 11300, + 11300, + 11300, + ], + "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + } + index = [ + "Italy", + "France", + "Malta", + "Maldives", + "Brunei", + "Iceland", + "Nauru", + "Tuvalu", + "Anguilla", + ] + modin_df = pd.DataFrame(data=data, index=index) + pandas_df = pandas.DataFrame(data=data, index=index) + df_equals( + modin_df.nsmallest(n=3, columns="population"), + pandas_df.nsmallest(n=3, columns="population"), + ) + df_equals( + modin_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"), + pandas_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"), + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "dropna", bool_arg_values, ids=arg_keys("dropna", bool_arg_keys) +) +def test_nunique(data, axis, dropna): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + modin_result = modin_df.nunique(axis=axis, dropna=dropna) + pandas_result = pandas_df.nunique(axis=axis, dropna=dropna) + df_equals(modin_result, pandas_result) + + modin_result = modin_df.T.nunique(axis=axis, dropna=dropna) + pandas_result = 
pandas_df.T.nunique(axis=axis, dropna=dropna) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) +def test_quantile(request, data, q): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + if not name_contains(request.node.name, no_numeric_dfs): + df_equals(modin_df.quantile(q), pandas_df.quantile(q)) + df_equals(modin_df.quantile(q, axis=1), pandas_df.quantile(q, axis=1)) + + try: + pandas_result = pandas_df.quantile(q, axis=1, numeric_only=False) + except Exception as e: + with pytest.raises(type(e)): + modin_df.quantile(q, axis=1, numeric_only=False) + else: + modin_result = modin_df.quantile(q, axis=1, numeric_only=False) + df_equals(modin_result, pandas_result) + else: + with pytest.raises(ValueError): + modin_df.quantile(q) + + if not name_contains(request.node.name, no_numeric_dfs): + df_equals(modin_df.T.quantile(q), pandas_df.T.quantile(q)) + df_equals(modin_df.T.quantile(q, axis=1), pandas_df.T.quantile(q, axis=1)) + + try: + pandas_result = pandas_df.T.quantile(q, axis=1, numeric_only=False) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.quantile(q, axis=1, numeric_only=False) + else: + modin_result = modin_df.T.quantile(q, axis=1, numeric_only=False) + df_equals(modin_result, pandas_result) + else: + with pytest.raises(ValueError): + modin_df.T.quantile(q) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +@pytest.mark.parametrize( + "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"] +) +def test_rank(data, axis, numeric_only, na_option): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.rank( + axis=axis, 
numeric_only=numeric_only, na_option=na_option + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.rank(axis=axis, numeric_only=numeric_only, na_option=na_option) + else: + modin_result = modin_df.rank( + axis=axis, numeric_only=numeric_only, na_option=na_option + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +def test_skew(request, data, axis, skipna, numeric_only): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_std(request, data, axis, 
skipna, numeric_only, ddof): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.std(axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof) + else: + modin_result = modin_df.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + else: + modin_result = modin_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_values(data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + np.testing.assert_equal(modin_df.values, pandas_df.values) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_var(request, data, axis, skipna, numeric_only, ddof): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + + try: + pandas_result = pandas_df.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception: + with pytest.raises(TypeError): + modin_df.var(axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof) + else: + modin_result = modin_df.var( + axis=axis, 
skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + else: + modin_result = modin_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 67401100ac2..1b5eb93b842 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -12,55 +12,14 @@ # governing permissions and limitations under the License. import pytest -import numpy as np import pandas -import pandas.util.testing as tm -import os import matplotlib import modin.pandas as pd -from modin.pandas.utils import to_pandas -from numpy.testing import assert_array_equal -import io -import sys -from .utils import ( - random_state, - RAND_LOW, - RAND_HIGH, +from modin.pandas.test.utils import ( df_equals, - df_is_empty, - arg_keys, - name_contains, - test_data, - test_data_values, - test_data_keys, - test_data_with_duplicates_values, - test_data_with_duplicates_keys, - numeric_dfs, - no_numeric_dfs, - test_func_keys, - test_func_values, - query_func_keys, - query_func_values, - agg_func_keys, - agg_func_values, - numeric_agg_funcs, - quantiles_keys, - quantiles_values, - indices_keys, - indices_values, - axis_keys, axis_values, - bool_arg_keys, - bool_arg_values, - int_arg_keys, - int_arg_values, - eval_general, - create_test_dfs, - udf_func_values, - udf_func_keys, - generate_multiindex, - test_data_diff_dtype, + axis_keys, ) pd.DEFAULT_NPARTITIONS = 4 @@ -69,6070 +28,9 @@ matplotlib.use("Agg") -def eval_insert(modin_df, pandas_df, **kwargs): - _kwargs = {"loc": 0, "col": "New column"} - _kwargs.update(kwargs) - - eval_general( - modin_df, - pandas_df, - operation=lambda df, 
**kwargs: df.insert(**kwargs), - **_kwargs, - ) - - -class TestDataFrameBinary: - def inter_df_math_helper(self, modin_df, pandas_df, op): - # Test dataframe to dataframe - try: - pandas_result = getattr(pandas_df, op)(pandas_df) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df) - else: - modin_result = getattr(modin_df, op)(modin_df) - df_equals(modin_result, pandas_result) - - # Test dataframe to int - try: - pandas_result = getattr(pandas_df, op)(4) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4) - else: - modin_result = getattr(modin_df, op)(4) - df_equals(modin_result, pandas_result) - - # Test dataframe to float - try: - pandas_result = getattr(pandas_df, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4.0) - else: - modin_result = getattr(modin_df, op)(4.0) - df_equals(modin_result, pandas_result) - - # Test transposed dataframes to float - try: - pandas_result = getattr(pandas_df.T, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df.T, op)(4.0) - else: - modin_result = getattr(modin_df.T, op)(4.0) - df_equals(modin_result, pandas_result) - - frame_data = { - "{}_other".format(modin_df.columns[0]): [0, 2], - modin_df.columns[0]: [0, 19], - modin_df.columns[1]: [1, 1], - } - modin_df2 = pd.DataFrame(frame_data) - pandas_df2 = pandas.DataFrame(frame_data) - - # Test dataframe to different dataframe shape - try: - pandas_result = getattr(pandas_df, op)(pandas_df2) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df2) - else: - modin_result = getattr(modin_df, op)(modin_df2) - df_equals(modin_result, pandas_result) - - # Test dataframe fill value - try: - pandas_result = getattr(pandas_df, op)(pandas_df2, fill_value=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df2, fill_value=0) - else: - modin_result = getattr(modin_df, op)(modin_df2, 
fill_value=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to list - list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[1])) - try: - pandas_result = getattr(pandas_df, op)(list_test, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(list_test, axis=1) - else: - modin_result = getattr(modin_df, op)(list_test, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to series axis=0 - series_test_modin = modin_df[modin_df.columns[0]] - series_test_pandas = pandas_df[pandas_df.columns[0]] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to series axis=1 - series_test_modin = modin_df.iloc[0] - series_test_pandas = pandas_df.iloc[0] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to list axis=1 - series_test_modin = series_test_pandas = list(pandas_df.iloc[0]) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to list axis=0 - series_test_modin = series_test_pandas = list(pandas_df[pandas_df.columns[0]]) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) 
- else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to series missing values - series_test_modin = modin_df.iloc[0, :-2] - series_test_pandas = pandas_df.iloc[0, :-2] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to series with different index - series_test_modin = modin_df[modin_df.columns[0]].reset_index(drop=True) - series_test_pandas = pandas_df[pandas_df.columns[0]].reset_index(drop=True) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Level test - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in modin_df.index] - ) - modin_df_multi_level = modin_df.copy() - modin_df_multi_level.index = new_idx - # Defaults to pandas - with pytest.warns(UserWarning): - # Operation against self for sanity check - getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) - - @pytest.mark.parametrize( - "function", - [ - "add", - "div", - "divide", - "floordiv", - "mod", - "mul", - "multiply", - "pow", - "sub", - "subtract", - "truediv", - "__div__", - "__add__", - "__radd__", - "__mul__", - "__rmul__", - "__pow__", - "__rpow__", - "__sub__", - "__floordiv__", - "__rfloordiv__", - "__truediv__", - "__rtruediv__", - "__mod__", - "__rmod__", - "__rdiv__", - ], - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_math_functions(self, data, function): - modin_df = pd.DataFrame(data) - pandas_df = 
pandas.DataFrame(data) - self.inter_df_math_helper(modin_df, pandas_df, function) - - @pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) - @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_comparison(self, data, op, other): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - eval_general( - modin_df, - pandas_df, - operation=lambda df, **kwargs: getattr(df, op)( - df if other == "as_left" else other - ), - ) - - @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_multi_level_comparison(self, data, op): - modin_df_multi_level = pd.DataFrame(data) - - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in modin_df_multi_level.index] - ) - modin_df_multi_level.index = new_idx - - # Defaults to pandas - with pytest.warns(UserWarning): - # Operation against self for sanity check - getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) - - # Test dataframe right operations - def inter_df_math_right_ops_helper(self, modin_df, pandas_df, op): - try: - pandas_result = getattr(pandas_df, op)(4) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4) - else: - modin_result = getattr(modin_df, op)(4) - df_equals(modin_result, pandas_result) - - try: - pandas_result = getattr(pandas_df, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4.0) - else: - modin_result = getattr(modin_df, op)(4.0) - df_equals(modin_result, pandas_result) - - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in modin_df.index] - ) - modin_df_multi_level = modin_df.copy() - modin_df_multi_level.index = new_idx - - # Defaults to pandas - with pytest.warns(UserWarning): - # Operation against self for sanity check - getattr(modin_df_multi_level, 
op)(modin_df_multi_level, axis=0, level=1) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_radd(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "radd") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rdiv(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rdiv") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rfloordiv(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rfloordiv") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rmod(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rmod") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rmul(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rmul") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rpow(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rpow") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rsub(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rsub") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_rtruediv(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "rtruediv") - - 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___rsub__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - self.inter_df_math_right_ops_helper(modin_df, pandas_df, "__rsub__") - - # END test dataframe right operations - - def test_equals(self): - frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]} - modin_df1 = pd.DataFrame(frame_data) - modin_df2 = pd.DataFrame(frame_data) - - assert modin_df1.equals(modin_df2) - - df_equals(modin_df1, modin_df2) - df_equals(modin_df1, pd.DataFrame(modin_df1)) - - frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]} - modin_df3 = pd.DataFrame(frame_data, index=list("abcd")) - - assert not modin_df1.equals(modin_df3) - - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df1) - - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df2) - - assert modin_df1.equals(modin_df2._query_compiler.to_pandas()) - - -class TestDataFrameMapMetadata: - def test_indexing(self): - modin_df = pd.DataFrame( - dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]), index=["a", "b", "c"] - ) - pandas_df = pandas.DataFrame( - dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]), index=["a", "b", "c"] - ) - - modin_result = modin_df - pandas_result = pandas_df - df_equals(modin_result, pandas_result) - - modin_result = modin_df["b"] - pandas_result = pandas_df["b"] - df_equals(modin_result, pandas_result) - - modin_result = modin_df[["b"]] - pandas_result = pandas_df[["b"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df[["b", "a"]] - pandas_result = pandas_df[["b", "a"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc["b"] - pandas_result = pandas_df.loc["b"] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[["b"]] - pandas_result = pandas_df.loc[["b"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[["b", "a"]] - pandas_result = pandas_df.loc[["b", "a"]] - 
df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[["b", "a"], ["a", "c"]] - pandas_result = pandas_df.loc[["b", "a"], ["a", "c"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[:, ["a", "c"]] - pandas_result = pandas_df.loc[:, ["a", "c"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[:, ["c"]] - pandas_result = pandas_df.loc[:, ["c"]] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[[]] - pandas_result = pandas_df.loc[[]] - df_equals(modin_result, pandas_result) - - def test_empty_df(self): - df = pd.DataFrame(index=["a", "b"]) - df_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(["a", "b"])) - assert len(df.columns) == 0 - - df = pd.DataFrame(columns=["a", "b"]) - df_is_empty(df) - assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) - - df = pd.DataFrame() - df_is_empty(df) - assert len(df.index) == 0 - assert len(df.columns) == 0 - - df = pd.DataFrame(index=["a", "b"]) - df_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(["a", "b"])) - assert len(df.columns) == 0 - - df = pd.DataFrame(columns=["a", "b"]) - df_is_empty(df) - assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) - - df = pd.DataFrame() - df_is_empty(df) - assert len(df.index) == 0 - assert len(df.columns) == 0 - - df = pd.DataFrame() - pd_df = pandas.DataFrame() - df["a"] = [1, 2, 3, 4, 5] - pd_df["a"] = [1, 2, 3, 4, 5] - df_equals(df, pd_df) - - df = pd.DataFrame() - pd_df = pandas.DataFrame() - df["a"] = list("ABCDEF") - pd_df["a"] = list("ABCDEF") - df_equals(df, pd_df) - - df = pd.DataFrame() - pd_df = pandas.DataFrame() - df["a"] = pd.Series([1, 2, 3, 4, 5]) - pd_df["a"] = pandas.Series([1, 2, 3, 4, 5]) - df_equals(df, pd_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_abs(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - 
pandas_result = pandas_df.abs() - except Exception as e: - with pytest.raises(type(e)): - modin_df.abs() - else: - modin_result = modin_df.abs() - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_add_prefix(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - test_prefix = "TEST" - new_modin_df = modin_df.add_prefix(test_prefix) - new_pandas_df = pandas_df.add_prefix(test_prefix) - df_equals(new_modin_df.columns, new_pandas_df.columns) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys) - def test_applymap(self, request, data, testfunc): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - with pytest.raises(ValueError): - x = 2 - modin_df.applymap(x) - - try: - pandas_result = pandas_df.applymap(testfunc) - except Exception as e: - with pytest.raises(type(e)): - modin_df.applymap(testfunc) - else: - modin_result = modin_df.applymap(testfunc) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys) - def test_applymap_numeric(self, request, data, testfunc): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if name_contains(request.node.name, numeric_dfs): - try: - pandas_result = pandas_df.applymap(testfunc) - except Exception as e: - with pytest.raises(type(e)): - modin_df.applymap(testfunc) - else: - modin_result = modin_df.applymap(testfunc) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_add_suffix(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - test_suffix = "TEST" - new_modin_df = modin_df.add_suffix(test_suffix) - new_pandas_df = pandas_df.add_suffix(test_suffix) 
- - df_equals(new_modin_df.columns, new_pandas_df.columns) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_at(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - key1 = modin_df.columns[0] - # Scaler - df_equals(modin_df.at[0, key1], pandas_df.at[0, key1]) - - # Series - df_equals(modin_df.loc[0].at[key1], pandas_df.loc[0].at[key1]) - - # Write Item - modin_df_copy = modin_df.copy() - pandas_df_copy = pandas_df.copy() - modin_df_copy.at[1, key1] = modin_df.at[0, key1] - pandas_df_copy.at[1, key1] = pandas_df.at[0, key1] - df_equals(modin_df_copy, pandas_df_copy) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_axes(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - for modin_axis, pd_axis in zip(modin_df.axes, pandas_df.axes): - assert np.array_equal(modin_axis, pd_axis) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_copy(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - # pandas_df is unused but there so there won't be confusing list comprehension - # stuff in the pytest.mark.parametrize - new_modin_df = modin_df.copy() - - assert new_modin_df is not modin_df - assert np.array_equal( - new_modin_df._query_compiler._modin_frame._partitions, - modin_df._query_compiler._modin_frame._partitions, - ) - assert new_modin_df is not modin_df - df_equals(new_modin_df, modin_df) - - # Shallow copy tests - modin_df = pd.DataFrame(data) - modin_df_cp = modin_df.copy(False) - - modin_df[modin_df.columns[0]] = 0 - df_equals(modin_df, modin_df_cp) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dtypes(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.dtypes, pandas_df.dtypes) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - 
@pytest.mark.parametrize("key", indices_values, ids=indices_keys) - def test_get(self, data, key): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.get(key), pandas_df.get(key)) - df_equals( - modin_df.get(key, default="default"), pandas_df.get(key, default="default") - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "dummy_na", bool_arg_values, ids=arg_keys("dummy_na", bool_arg_keys) - ) - @pytest.mark.parametrize( - "drop_first", bool_arg_values, ids=arg_keys("drop_first", bool_arg_keys) - ) - def test_get_dummies(self, request, data, dummy_na, drop_first): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas.get_dummies( - pandas_df, dummy_na=dummy_na, drop_first=drop_first - ) - except Exception as e: - with pytest.raises(type(e)): - pd.get_dummies(modin_df, dummy_na=dummy_na, drop_first=drop_first) - else: - modin_result = pd.get_dummies( - modin_df, dummy_na=dummy_na, drop_first=drop_first - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_isna(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - pandas_result = pandas_df.isna() - modin_result = modin_df.isna() - - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_isnull(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - pandas_result = pandas_df.isnull() - modin_result = modin_df.isnull() - - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_append(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - data_to_append = {"append_a": 2, "append_b": 1000} - - ignore_idx_values = [True, False] - - for ignore in ignore_idx_values: - try: - 
pandas_result = pandas_df.append(data_to_append, ignore_index=ignore) - except Exception as e: - with pytest.raises(type(e)): - modin_df.append(data_to_append, ignore_index=ignore) - else: - modin_result = modin_df.append(data_to_append, ignore_index=ignore) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.append(pandas_df.iloc[-1]) - except Exception as e: - with pytest.raises(type(e)): - modin_df.append(modin_df.iloc[-1]) - else: - modin_result = modin_df.append(modin_df.iloc[-1]) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.append(list(pandas_df.iloc[-1])) - except Exception as e: - with pytest.raises(type(e)): - modin_df.append(list(modin_df.iloc[-1])) - else: - modin_result = modin_df.append(list(modin_df.iloc[-1])) - # Pandas has bug where sort=False is ignored - # (https://github.com/pandas-dev/pandas/issues/35092), but Modin - # now does the right thing, so for now manually sort to workaround - # this. Once the Pandas bug is fixed and Modin upgrades to that - # Pandas release, this sort will cause the test to fail, and the - # next two lines should be deleted. 
- assert list(modin_result.columns) == list(modin_df.columns) + [0] - modin_result = modin_result[[0] + sorted(modin_df.columns)] - df_equals(modin_result, pandas_result) - - verify_integrity_values = [True, False] - - for verify_integrity in verify_integrity_values: - try: - pandas_result = pandas_df.append( - [pandas_df, pandas_df], verify_integrity=verify_integrity - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.append( - [modin_df, modin_df], verify_integrity=verify_integrity - ) - else: - modin_result = modin_df.append( - [modin_df, modin_df], verify_integrity=verify_integrity - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.append( - pandas_df, verify_integrity=verify_integrity - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.append(modin_df, verify_integrity=verify_integrity) - else: - modin_result = modin_df.append( - modin_df, verify_integrity=verify_integrity - ) - df_equals(modin_result, pandas_result) - - def test_astype(self): - td = pandas.DataFrame(tm.getSeriesData()) - modin_df = pd.DataFrame(td.values, index=td.index, columns=td.columns) - expected_df = pandas.DataFrame(td.values, index=td.index, columns=td.columns) - - modin_df_casted = modin_df.astype(np.int32) - expected_df_casted = expected_df.astype(np.int32) - df_equals(modin_df_casted, expected_df_casted) - - modin_df_casted = modin_df.astype(np.float64) - expected_df_casted = expected_df.astype(np.float64) - df_equals(modin_df_casted, expected_df_casted) - - modin_df_casted = modin_df.astype(str) - expected_df_casted = expected_df.astype(str) - df_equals(modin_df_casted, expected_df_casted) - - modin_df_casted = modin_df.astype("category") - expected_df_casted = expected_df.astype("category") - df_equals(modin_df_casted, expected_df_casted) - - dtype_dict = {"A": np.int32, "B": np.int64, "C": str} - modin_df_casted = modin_df.astype(dtype_dict) - expected_df_casted = expected_df.astype(dtype_dict) - 
df_equals(modin_df_casted, expected_df_casted) - - # Ignore lint because this is testing bad input - bad_dtype_dict = {"B": np.int32, "B": np.int64, "B": str} # noqa F601 - modin_df_casted = modin_df.astype(bad_dtype_dict) - expected_df_casted = expected_df.astype(bad_dtype_dict) - df_equals(modin_df_casted, expected_df_casted) - - modin_df = pd.DataFrame(index=["row1"], columns=["col1"]) - modin_df["col1"]["row1"] = 11 - modin_df_casted = modin_df.astype(int) - expected_df = pandas.DataFrame(index=["row1"], columns=["col1"]) - expected_df["col1"]["row1"] = 11 - expected_df_casted = expected_df.astype(int) - df_equals(modin_df_casted, expected_df_casted) - - with pytest.raises(KeyError): - modin_df.astype({"not_exists": np.uint8}) - - def test_astype_category(self): - modin_df = pd.DataFrame( - {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} - ) - pandas_df = pandas.DataFrame( - {"col1": ["A", "A", "B", "B", "A"], "col2": [1, 2, 3, 4, 5]} - ) - - modin_result = modin_df.astype({"col1": "category"}) - pandas_result = pandas_df.astype({"col1": "category"}) - df_equals(modin_result, pandas_result) - assert modin_result.dtypes.equals(pandas_result.dtypes) - - modin_result = modin_df.astype("category") - pandas_result = pandas_df.astype("category") - df_equals(modin_result, pandas_result) - assert modin_result.dtypes.equals(pandas_result.dtypes) - - @pytest.mark.xfail( - reason="Categorical dataframe created in memory don't work yet and categorical dtype is lost" - ) - def test_astype_category_large(self): - series_length = 10_000 - modin_df = pd.DataFrame( - { - "col1": ["str{0}".format(i) for i in range(0, series_length)], - "col2": [i for i in range(0, series_length)], - } - ) - pandas_df = pandas.DataFrame( - { - "col1": ["str{0}".format(i) for i in range(0, series_length)], - "col2": [i for i in range(0, series_length)], - } - ) - - modin_result = modin_df.astype({"col1": "category"}) - pandas_result = pandas_df.astype({"col1": "category"}) - 
df_equals(modin_result, pandas_result) - assert modin_result.dtypes.equals(pandas_result.dtypes) - - modin_result = modin_df.astype("category") - pandas_result = pandas_df.astype("category") - df_equals(modin_result, pandas_result) - assert modin_result.dtypes.equals(pandas_result.dtypes) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_clip(self, request, data, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if name_contains(request.node.name, numeric_dfs): - ind_len = ( - len(modin_df.index) - if not pandas.DataFrame()._get_axis_number(axis) - else len(modin_df.columns) - ) - # set bounds - lower, upper = np.sort(random_state.random_integers(RAND_LOW, RAND_HIGH, 2)) - lower_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len) - upper_list = random_state.random_integers(RAND_LOW, RAND_HIGH, ind_len) - - # test only upper scalar bound - modin_result = modin_df.clip(None, upper, axis=axis) - pandas_result = pandas_df.clip(None, upper, axis=axis) - df_equals(modin_result, pandas_result) - - # test lower and upper scalar bound - modin_result = modin_df.clip(lower, upper, axis=axis) - pandas_result = pandas_df.clip(lower, upper, axis=axis) - df_equals(modin_result, pandas_result) - - # test lower and upper list bound on each column - modin_result = modin_df.clip(lower_list, upper_list, axis=axis) - pandas_result = pandas_df.clip(lower_list, upper_list, axis=axis) - df_equals(modin_result, pandas_result) - - # test only upper list bound on each column - modin_result = modin_df.clip(np.nan, upper_list, axis=axis) - pandas_result = pandas_df.clip(np.nan, upper_list, axis=axis) - df_equals(modin_result, pandas_result) - - with pytest.raises(ValueError): - modin_df.clip(lower=[1, 2, 3], axis=None) - - def test_drop(self): - frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} - simple = pandas.DataFrame(frame_data) - modin_simple = 
pd.DataFrame(frame_data) - df_equals(modin_simple.drop("A", axis=1), simple[["B"]]) - df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]]) - df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) - - pytest.raises(ValueError, modin_simple.drop, 5) - pytest.raises(ValueError, modin_simple.drop, "C", 1) - pytest.raises(ValueError, modin_simple.drop, [1, 5]) - pytest.raises(ValueError, modin_simple.drop, ["A", "C"], 1) - - # errors = 'ignore' - df_equals(modin_simple.drop(5, errors="ignore"), simple) - df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]) - df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple) - df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]) - - # non-unique - nu_df = pandas.DataFrame( - zip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"] - ) - modin_nu_df = pd.DataFrame(nu_df) - df_equals(modin_nu_df.drop("a", axis=1), nu_df[["b"]]) - df_equals(modin_nu_df.drop("b", axis="columns"), nu_df["a"]) - df_equals(modin_nu_df.drop([]), nu_df) - - nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"])) - nu_df.columns = list("abc") - modin_nu_df = pd.DataFrame(nu_df) - df_equals(modin_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) - df_equals(modin_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) - - # inplace cache issue - frame_data = random_state.randn(10, 3) - df = pandas.DataFrame(frame_data, columns=list("abc")) - modin_df = pd.DataFrame(frame_data, columns=list("abc")) - expected = df[~(df.b > 0)] - modin_df.drop(labels=df[df.b > 0].index, inplace=True) - df_equals(modin_df, expected) - - midx = pd.MultiIndex( - levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], - codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ) - df = pd.DataFrame( - index=midx, - columns=["big", "small"], - data=[ - [45, 30], - [200, 100], - [1.5, 1], - 
[30, 20], - [250, 150], - [1.5, 0.8], - [320, 250], - [1, 0.8], - [0.3, 0.2], - ], - ) - with pytest.warns(UserWarning): - df.drop(index="length", level=1) - - def test_drop_api_equivalence(self): - # equivalence of the labels/axis and index/columns API's - frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] - - modin_df = pd.DataFrame( - frame_data, index=["a", "b", "c"], columns=["d", "e", "f"] - ) - - modin_df1 = modin_df.drop("a") - modin_df2 = modin_df.drop(index="a") - df_equals(modin_df1, modin_df2) - - modin_df1 = modin_df.drop("d", 1) - modin_df2 = modin_df.drop(columns="d") - df_equals(modin_df1, modin_df2) - - modin_df1 = modin_df.drop(labels="e", axis=1) - modin_df2 = modin_df.drop(columns="e") - df_equals(modin_df1, modin_df2) - - modin_df1 = modin_df.drop(["a"], axis=0) - modin_df2 = modin_df.drop(index=["a"]) - df_equals(modin_df1, modin_df2) - - modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1) - modin_df2 = modin_df.drop(index=["a"], columns=["d"]) - df_equals(modin_df1, modin_df2) - - with pytest.raises(ValueError): - modin_df.drop(labels="a", index="b") - - with pytest.raises(ValueError): - modin_df.drop(labels="a", columns="b") - - with pytest.raises(ValueError): - modin_df.drop(axis=1) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_drop_transpose(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_result = modin_df.T.drop(columns=[0, 1, 2]) - pandas_result = pandas_df.T.drop(columns=[0, 1, 2]) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.T.drop(index=["col3", "col1"]) - pandas_result = pandas_df.T.drop(index=["col3", "col1"]) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) - pandas_result = pandas_df.T.drop(columns=[0, 1, 2], index=["col3", "col1"]) - df_equals(modin_result, pandas_result) - - def test_droplevel(self): - df = ( - pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 
8], [9, 10, 11, 12]]) - .set_index([0, 1]) - .rename_axis(["a", "b"]) - ) - df.columns = pd.MultiIndex.from_tuples( - [("c", "e"), ("d", "f")], names=["level_1", "level_2"] - ) - df.droplevel("a") - df.droplevel("level_2", axis=1) - - @pytest.mark.parametrize( - "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys - ) - @pytest.mark.parametrize( - "keep", ["last", "first", False], ids=["last", "first", "False"] - ) - @pytest.mark.parametrize( - "subset", - [None, "col1", "name", ("col1", "col3"), ["col1", "col3", "col7"]], - ids=["None", "string", "name", "tuple", "list"], - ) - def test_drop_duplicates(self, data, keep, subset): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_df.drop_duplicates(keep=keep, inplace=False, subset=subset) - except Exception as e: - with pytest.raises(type(e)): - modin_df.drop_duplicates(keep=keep, inplace=False, subset=subset) - else: - df_equals( - pandas_df.drop_duplicates(keep=keep, inplace=False, subset=subset), - modin_df.drop_duplicates(keep=keep, inplace=False, subset=subset), - ) - - try: - pandas_results = pandas_df.drop_duplicates( - keep=keep, inplace=True, subset=subset - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.drop_duplicates(keep=keep, inplace=True, subset=subset) - else: - modin_results = modin_df.drop_duplicates( - keep=keep, inplace=True, subset=subset - ) - df_equals(modin_results, pandas_results) - - def test_drop_duplicates_with_missing_index_values(self): - data = { - "columns": ["value", "time", "id"], - "index": [ - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - ], - "data": [ - ["3", 1279213398000.0, 88.0], - ["3", 1279204682000.0, 88.0], - ["0", 1245772835000.0, 448.0], - ["0", 1270564258000.0, 32.0], - ["0", 1267106669000.0, 118.0], - ["7", 1300621123000.0, 5.0], - ["0", 1251130752000.0, 
957.0], - ["0", 1311683506000.0, 62.0], - ["9", 1283692698000.0, 89.0], - ["9", 1270234253000.0, 64.0], - ["0", 1285088818000.0, 50.0], - ["0", 1218212725000.0, 695.0], - ["2", 1383933968000.0, 348.0], - ["0", 1368227625000.0, 257.0], - ["1", 1454514093000.0, 446.0], - ["1", 1428497427000.0, 134.0], - ["1", 1459184936000.0, 568.0], - ["1", 1502293302000.0, 599.0], - ["1", 1491833358000.0, 829.0], - ["1", 1485431534000.0, 806.0], - ["8", 1351800505000.0, 101.0], - ["0", 1357247721000.0, 916.0], - ["0", 1335804423000.0, 370.0], - ["24", 1327547726000.0, 720.0], - ["0", 1332334140000.0, 415.0], - ["0", 1309543100000.0, 30.0], - ["18", 1309541141000.0, 30.0], - ["0", 1298979435000.0, 48.0], - ["14", 1276098160000.0, 59.0], - ["0", 1233936302000.0, 109.0], - ], - } - - pandas_df = pandas.DataFrame( - data["data"], index=data["index"], columns=data["columns"] - ) - modin_df = pd.DataFrame( - data["data"], index=data["index"], columns=data["columns"] - ) - modin_result = modin_df.sort_values(["id", "time"]).drop_duplicates(["id"]) - pandas_result = pandas_df.sort_values(["id", "time"]).drop_duplicates(["id"]) - df_equals(modin_result, pandas_result) - - def test_drop_duplicates_after_sort(self): - data = [ - {"value": 1, "time": 2}, - {"value": 1, "time": 1}, - {"value": 2, "time": 1}, - {"value": 2, "time": 2}, - ] - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_result = modin_df.sort_values(["value", "time"]).drop_duplicates( - ["value"] - ) - pandas_result = pandas_df.sort_values(["value", "time"]).drop_duplicates( - ["value"] - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"]) - def test_dropna(self, data, axis, how): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - with pytest.raises(ValueError): - 
modin_df.dropna(axis=axis, how="invalid") - - with pytest.raises(TypeError): - modin_df.dropna(axis=axis, how=None, thresh=None) - - with pytest.raises(KeyError): - modin_df.dropna(axis=axis, subset=["NotExists"], how=how) - - modin_result = modin_df.dropna(axis=axis, how=how) - pandas_result = pandas_df.dropna(axis=axis, how=how) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dropna_inplace(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - pandas_result = pandas_df.dropna() - modin_df.dropna(inplace=True) - df_equals(modin_df, pandas_result) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - pandas_df.dropna(thresh=2, inplace=True) - modin_df.dropna(thresh=2, inplace=True) - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - pandas_df.dropna(axis=1, how="any", inplace=True) - modin_df.dropna(axis=1, how="any", inplace=True) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dropna_multiple_axes(self, data): - modin_df = pd.DataFrame(data) - - with pytest.raises(TypeError): - modin_df.dropna(how="all", axis=[0, 1]) - with pytest.raises(TypeError): - modin_df.dropna(how="all", axis=(0, 1)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dropna_subset(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if "empty_data" not in request.node.name: - column_subset = modin_df.columns[0:2] - df_equals( - modin_df.dropna(how="all", subset=column_subset), - pandas_df.dropna(how="all", subset=column_subset), - ) - df_equals( - modin_df.dropna(how="any", subset=column_subset), - pandas_df.dropna(how="any", subset=column_subset), - ) - - row_subset = modin_df.index[0:2] - df_equals( - modin_df.dropna(how="all", axis=1, subset=row_subset), - 
pandas_df.dropna(how="all", axis=1, subset=row_subset), - ) - df_equals( - modin_df.dropna(how="any", axis=1, subset=row_subset), - pandas_df.dropna(how="any", axis=1, subset=row_subset), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dropna_subset_error(self, data): - modin_df = pd.DataFrame(data) - - # pandas_df is unused so there won't be confusing list comprehension - # stuff in the pytest.mark.parametrize - with pytest.raises(KeyError): - modin_df.dropna(subset=list("EF")) - - if len(modin_df.columns) < 5: - with pytest.raises(KeyError): - modin_df.dropna(axis=1, subset=[4, 5]) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "astype", - [ - "category", - pytest.param( - "int32", - marks=pytest.mark.xfail( - reason="Modin astype() does not raises ValueError at non-numeric argument when Pandas does." - ), - ), - "float", - ], - ) - def test_insert_dtypes(self, data, astype): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - - # categories with NaN works incorrect for now - if astype == "category" and pandas_df.iloc[:, 0].isnull().any(): - return - - eval_insert( - modin_df, - pandas_df, - col="TypeSaver", - value=lambda df: df.iloc[:, 0].astype(astype), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("loc", int_arg_values, ids=arg_keys("loc", int_arg_keys)) - def test_insert_loc(self, data, loc): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - value = modin_df.iloc[:, 0] - - eval_insert(modin_df, pandas_df, loc=loc, value=value) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_insert(self, data): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - - eval_insert( - modin_df, pandas_df, col="Duplicate", value=lambda df: df[df.columns[0]] - ) - eval_insert(modin_df, pandas_df, col="Scalar", value=100) - eval_insert( - 
pd.DataFrame(columns=list("ab")), - pandas.DataFrame(columns=list("ab")), - col=lambda df: df.columns[0], - value=lambda df: df[df.columns[0]], - ) - eval_insert( - pd.DataFrame(index=modin_df.index), - pandas.DataFrame(index=pandas_df.index), - col=lambda df: df.columns[0], - value=lambda df: df[df.columns[0]], - ) - eval_insert( - modin_df, - pandas_df, - col="DataFrame insert", - value=lambda df: df[[df.columns[0]]], - ) - - # Bad inserts - eval_insert(modin_df, pandas_df, col="Bad Column", value=lambda df: df) - eval_insert( - modin_df, - pandas_df, - col="Too Short", - value=lambda df: list(df[df.columns[0]])[:-1], - ) - eval_insert( - modin_df, - pandas_df, - col=lambda df: df.columns[0], - value=lambda df: df[df.columns[0]], - ) - eval_insert( - modin_df, - pandas_df, - loc=lambda df: len(df.columns) + 100, - col="Bad Loc", - value=100, - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_ndim(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert modin_df.ndim == pandas_df.ndim - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_notna(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.notna(), pandas_df.notna()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_notnull(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.notnull(), pandas_df.notnull()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_round(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.round(), pandas_df.round()) - df_equals(modin_df.round(1), pandas_df.round(1)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_set_axis(self, data, axis): - modin_df = 
pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - x = pandas.DataFrame()._get_axis_number(axis) - index = modin_df.columns if x else modin_df.index - labels = ["{0}_{1}".format(index[i], i) for i in range(modin_df.shape[x])] - - modin_result = modin_df.set_axis(labels, axis=axis, inplace=False) - pandas_result = pandas_df.set_axis(labels, axis=axis, inplace=False) - df_equals(modin_result, pandas_result) - - modin_df_copy = modin_df.copy() - modin_df.set_axis(labels, axis=axis, inplace=True) - - # Check that the copy and original are different - try: - df_equals(modin_df, modin_df_copy) - except AssertionError: - assert True - else: - assert False - - pandas_df.set_axis(labels, axis=axis, inplace=True) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "drop", bool_arg_values, ids=arg_keys("drop", bool_arg_keys) - ) - @pytest.mark.parametrize( - "append", bool_arg_values, ids=arg_keys("append", bool_arg_keys) - ) - def test_set_index(self, request, data, drop, append): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if "empty_data" not in request.node.name: - key = modin_df.columns[0] - modin_result = modin_df.set_index( - key, drop=drop, append=append, inplace=False - ) - pandas_result = pandas_df.set_index( - key, drop=drop, append=append, inplace=False - ) - df_equals(modin_result, pandas_result) - - modin_df_copy = modin_df.copy() - modin_df.set_index(key, drop=drop, append=append, inplace=True) - - # Check that the copy and original are different - try: - df_equals(modin_df, modin_df_copy) - except AssertionError: - assert True - else: - assert False - - pandas_df.set_index(key, drop=drop, append=append, inplace=True) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_shape(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert 
modin_df.shape == pandas_df.shape - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_size(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert modin_df.size == pandas_df.size - - def test_squeeze(self): - frame_data = { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 10, 11], - "col4": [12, 13, 14, 15], - "col5": [0, 0, 0, 0], - } - frame_data_2 = {"col1": [0, 1, 2, 3]} - frame_data_3 = { - "col1": [0], - "col2": [4], - "col3": [8], - "col4": [12], - "col5": [0], - } - frame_data_4 = {"col1": [2]} - frame_data_5 = {"col1": ["string"]} - # Different data for different cases - pandas_df = pandas.DataFrame(frame_data).squeeze() - modin_df = pd.DataFrame(frame_data).squeeze() - df_equals(modin_df, pandas_df) - - pandas_df_2 = pandas.DataFrame(frame_data_2).squeeze() - modin_df_2 = pd.DataFrame(frame_data_2).squeeze() - df_equals(modin_df_2, pandas_df_2) - - pandas_df_3 = pandas.DataFrame(frame_data_3).squeeze() - modin_df_3 = pd.DataFrame(frame_data_3).squeeze() - df_equals(modin_df_3, pandas_df_3) - - pandas_df_4 = pandas.DataFrame(frame_data_4).squeeze() - modin_df_4 = pd.DataFrame(frame_data_4).squeeze() - df_equals(modin_df_4, pandas_df_4) - - pandas_df_5 = pandas.DataFrame(frame_data_5).squeeze() - modin_df_5 = pd.DataFrame(frame_data_5).squeeze() - df_equals(modin_df_5, pandas_df_5) - - data = [ - [ - pd.Timestamp("2019-01-02"), - pd.Timestamp("2019-01-03"), - pd.Timestamp("2019-01-04"), - pd.Timestamp("2019-01-05"), - ], - [1, 1, 1, 2], - ] - df = pd.DataFrame(data, index=["date", "value"]).T - pf = pandas.DataFrame(data, index=["date", "value"]).T - df.set_index("date", inplace=True) - pf.set_index("date", inplace=True) - df_equals(df.iloc[0], pf.iloc[0]) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_transpose(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.T, pandas_df.T) - 
df_equals(modin_df.transpose(), pandas_df.transpose()) - - # Test for map across full axis for select indices - df_equals(modin_df.T.dropna(), pandas_df.T.dropna()) - # Test for map across full axis - df_equals(modin_df.T.nunique(), pandas_df.T.nunique()) - # Test for map across blocks - df_equals(modin_df.T.notna(), pandas_df.T.notna()) - - @pytest.mark.parametrize( - "data, other_data", - [ - ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), - ( - {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, - {"B": ["d", "e", "f", "g", "h", "i"]}, - ), - ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}), - ], - ) - def test_update(self, data, other_data): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - other_modin_df, other_pandas_df = ( - pd.DataFrame(other_data), - pandas.DataFrame(other_data), - ) - modin_df.update(other_modin_df) - pandas_df.update(other_pandas_df) - df_equals(modin_df, pandas_df) - - with pytest.raises(ValueError): - modin_df.update(other_modin_df, errors="raise") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___neg__(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.__neg__() - except Exception as e: - with pytest.raises(type(e)): - modin_df.__neg__() - else: - modin_result = modin_df.__neg__() - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___invert__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - try: - pandas_result = ~pandas_df - except Exception as e: - with pytest.raises(type(e)): - repr(~modin_df) - else: - modin_result = ~modin_df - df_equals(modin_result, pandas_result) - - def test___hash__(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - try: - pd.DataFrame(data).__hash__() - except TypeError: - pass - - 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___delitem__(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if "empty_data" not in request.node.name: - key = pandas_df.columns[0] - - modin_df = modin_df.copy() - pandas_df = pandas_df.copy() - modin_df.__delitem__(key) - pandas_df.__delitem__(key) - df_equals(modin_df, pandas_df) - - # Issue 2027 - last_label = pandas_df.iloc[:, -1].name - modin_df.__delitem__(last_label) - pandas_df.__delitem__(last_label) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___nonzero__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(ValueError): - # Always raises ValueError - modin_df.__nonzero__() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___abs__(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = abs(pandas_df) - except Exception as e: - with pytest.raises(type(e)): - abs(modin_df) - else: - modin_result = abs(modin_df) - df_equals(modin_result, pandas_result) - - def test___round__(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).__round__() - - -class TestDataFrameUDF: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_agg(self, data, axis, func): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_agg_numeric(self, request, data, axis, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( - request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_aggregate(self, request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.aggregate(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.aggregate(func, axis) - else: - modin_result = modin_df.aggregate(func, axis) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_aggregate_numeric(self, request, data, axis, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( - request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_aggregate_error_checking(self, 
data): - modin_df = pd.DataFrame(data) - - assert modin_df.aggregate("ndim") == 2 - - with pytest.warns(UserWarning): - modin_df.aggregate( - {modin_df.columns[0]: "sum", modin_df.columns[1]: "mean"} - ) - - with pytest.warns(UserWarning): - modin_df.aggregate("cumproduct") - - with pytest.raises(ValueError): - modin_df.aggregate("NOT_EXISTS") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_apply(self, request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - with pytest.raises(TypeError): - modin_df.apply({"row": func}, axis=1) - - try: - pandas_result = pandas_df.apply(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.apply(func, axis) - else: - modin_result = modin_df.apply(func, axis) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("level", [None, -1, 0, 1]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "func", - [ - "kurt", - pytest.param( - "count", - marks=pytest.mark.xfail( - reason="count method handle level parameter incorrectly" - ), - ), - pytest.param( - "sum", - marks=pytest.mark.xfail( - reason="sum method handle level parameter incorrectly" - ), - ), - pytest.param( - "mean", - marks=pytest.mark.xfail( - reason="mean method handle level parameter incorrectly" - ), - ), - pytest.param( - "all", - marks=pytest.mark.xfail( - reason="all method handle level parameter incorrectly" - ), - ), - ], - ) - def test_apply_text_func_with_level(self, level, data, func, axis): - func_kwargs = {"level": level, "axis": axis} - rows_number = len(next(iter(data.values()))) # length of the first data column - level_0 = np.random.choice([0, 1, 2], rows_number) - level_1 = np.random.choice([3, 4, 5], 
rows_number) - index = pd.MultiIndex.from_arrays([level_0, level_1]) - - eval_general( - pd.DataFrame(data, index=index), - pandas.DataFrame(data, index=index), - lambda df, *args, **kwargs: df.apply(func, *args, **kwargs), - **func_kwargs, - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_apply_args(self, data, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - def apply_func(series, y): - try: - return series + y - except TypeError: - return series.map(str) + str(y) - - modin_result = modin_df.apply(apply_func, axis=axis, args=(1,)) - pandas_result = pandas_df.apply(apply_func, axis=axis, args=(1,)) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.apply(apply_func, axis=axis, args=("_A",)) - pandas_result = pandas_df.apply(apply_func, axis=axis, args=("_A",)) - df_equals(modin_result, pandas_result) - - def test_apply_metadata(self): - def add(a, b, c): - return a + b + c - - data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]} - - modin_df = pd.DataFrame(data) - modin_df["add"] = modin_df.apply( - lambda row: add(row["A"], row["B"], row["C"]), axis=1 - ) - - pandas_df = pandas.DataFrame(data) - pandas_df["add"] = pandas_df.apply( - lambda row: add(row["A"], row["B"], row["C"]), axis=1 - ) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_apply_numeric(self, request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if name_contains(request.node.name, numeric_dfs): - try: - pandas_result = pandas_df.apply(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.apply(func, axis) - else: - modin_result = modin_df.apply(func, axis) - 
df_equals(modin_result, pandas_result) - - if "empty_data" not in request.node.name: - key = modin_df.columns[0] - modin_result = modin_df.apply(lambda df: df.drop(key), axis=1) - pandas_result = pandas_df.apply(lambda df: df.drop(key), axis=1) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("func", udf_func_values, ids=udf_func_keys) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_apply_udf(self, data, func): - eval_general( - *create_test_dfs(data), - lambda df, *args, **kwargs: df.apply(*args, **kwargs), - func=func, - other=lambda df: df, - ) - - def test_eval_df_use_case(self): - frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - # test eval for series results - tmp_pandas = df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") - tmp_modin = modin_df.eval( - "arctan2(sin(a), b)", engine="python", parser="pandas" - ) - - assert isinstance(tmp_modin, pd.Series) - df_equals(tmp_modin, tmp_pandas) - - # Test not inplace assignments - tmp_pandas = df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas") - tmp_modin = modin_df.eval( - "e = arctan2(sin(a), b)", engine="python", parser="pandas" - ) - df_equals(tmp_modin, tmp_pandas) - - # Test inplace assignments - df.eval( - "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True - ) - modin_df.eval( - "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True - ) - # TODO: Use a series equality validator. 
- df_equals(modin_df, df) - - def test_eval_df_arithmetic_subexpression(self): - frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) - modin_df.eval( - "not_e = sin(a + b)", engine="python", parser="pandas", inplace=True - ) - # TODO: Use a series equality validator. - df_equals(modin_df, df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_filter(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} - df_equals( - modin_df.filter(items=by["items"]), pandas_df.filter(items=by["items"]) - ) - - df_equals( - modin_df.filter(regex=by["regex"], axis=0), - pandas_df.filter(regex=by["regex"], axis=0), - ) - df_equals( - modin_df.filter(regex=by["regex"], axis=1), - pandas_df.filter(regex=by["regex"], axis=1), - ) - - df_equals(modin_df.filter(like=by["like"]), pandas_df.filter(like=by["like"])) - - with pytest.raises(TypeError): - modin_df.filter(items=by["items"], regex=by["regex"]) - - with pytest.raises(TypeError): - modin_df.filter() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_pipe(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - n = len(modin_df.index) - a, b, c = 2 % n, 0, 3 % n - col = modin_df.columns[3 % len(modin_df.columns)] - - def h(x): - return x.drop(columns=[col]) - - def g(x, arg1=0): - for _ in range(arg1): - x = x.append(x) - return x - - def f(x, arg2=0, arg3=0): - return x.drop([arg2, arg3]) - - df_equals( - f(g(h(modin_df), arg1=a), arg2=b, arg3=c), - (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), - ) - df_equals( - (modin_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), - (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), - ) - - 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("funcs", query_func_values, ids=query_func_keys) - def test_query(self, data, funcs): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - with pytest.raises(ValueError): - modin_df.query("") - with pytest.raises(NotImplementedError): - x = 2 # noqa F841 - modin_df.query("col1 < @x") - - try: - pandas_result = pandas_df.query(funcs) - except Exception as e: - with pytest.raises(type(e)): - modin_df.query(funcs) - else: - modin_result = modin_df.query(funcs) - df_equals(modin_result, pandas_result) - - def test_query_after_insert(self): - modin_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) - modin_df["z"] = modin_df.eval("x / y") - modin_df = modin_df.query("z >= 0") - modin_result = modin_df.reset_index(drop=True) - modin_result.columns = ["a", "b", "c"] - - pandas_df = pd.DataFrame({"x": [-1, 0, 1, None], "y": [1, 2, None, 3]}) - pandas_df["z"] = pandas_df.eval("x / y") - pandas_df = pandas_df.query("z >= 0") - pandas_result = pandas_df.reset_index(drop=True) - pandas_result.columns = ["a", "b", "c"] - - df_equals(modin_result, pandas_result) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_transform(self, request, data, func): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.transform(func) - except Exception as e: - with pytest.raises(type(e)): - modin_df.transform(func) - else: - modin_result = modin_df.transform(func) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) - def test_transform_numeric(self, request, data, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( 
- request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.transform(func) - except Exception as e: - with pytest.raises(type(e)): - modin_df.transform(func) - else: - modin_result = modin_df.transform(func) - df_equals(modin_result, pandas_result) - - -class TestDataFrameDefault: - def test_align(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).align(pd.DataFrame(data)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_to_numpy(self, data): - modin_frame = pd.DataFrame(data) - pandas_frame = pandas.DataFrame(data) - assert_array_equal(modin_frame.values, pandas_frame.values) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_partition_to_numpy(self, data): - frame = pd.DataFrame(data) - for ( - partition - ) in frame._query_compiler._modin_frame._partitions.flatten().tolist(): - assert_array_equal(partition.to_pandas().values, partition.to_numpy()) - - def test_asfreq(self): - index = pd.date_range("1/1/2000", periods=4, freq="T") - series = pd.Series([0.0, None, 2.0, 3.0], index=index) - df = pd.DataFrame({"s": series}) - with pytest.warns(UserWarning): - # We are only testing that this defaults to pandas, so we will just check for - # the warning - df.asfreq(freq="30S") - - def test_asof(self): - df = pd.DataFrame( - {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}, - index=pd.DatetimeIndex( - [ - "2018-02-27 09:01:00", - "2018-02-27 09:02:00", - "2018-02-27 09:03:00", - "2018-02-27 09:04:00", - "2018-02-27 09:05:00", - ] - ), - ) - with pytest.warns(UserWarning): - df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) - - def test_assign(self): - data = test_data_values[0] - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_result = modin_df.assign(new_column=pd.Series(modin_df.iloc[:, 0])) - pandas_result = 
pandas_df.assign(new_column=pandas.Series(pandas_df.iloc[:, 0])) - df_equals(modin_result, pandas_result) - modin_result = modin_df.assign( - new_column=pd.Series(modin_df.iloc[:, 0]), - new_column2=pd.Series(modin_df.iloc[:, 1]), - ) - pandas_result = pandas_df.assign( - new_column=pandas.Series(pandas_df.iloc[:, 0]), - new_column2=pandas.Series(pandas_df.iloc[:, 1]), - ) - df_equals(modin_result, pandas_result) - - def test_at_time(self): - i = pd.date_range("2008-01-01", periods=1000, freq="12H") - modin_df = pd.DataFrame( - {"A": list(range(1000)), "B": list(range(1000))}, index=i - ) - pandas_df = pandas.DataFrame( - {"A": list(range(1000)), "B": list(range(1000))}, index=i - ) - df_equals(modin_df.at_time("12:00"), pandas_df.at_time("12:00")) - df_equals(modin_df.at_time("3:00"), pandas_df.at_time("3:00")) - df_equals( - modin_df.T.at_time("12:00", axis=1), pandas_df.T.at_time("12:00", axis=1) - ) - - def test_between_time(self): - i = pd.date_range("2008-01-01", periods=1000, freq="12H") - modin_df = pd.DataFrame( - {"A": list(range(1000)), "B": list(range(1000))}, index=i - ) - pandas_df = pandas.DataFrame( - {"A": list(range(1000)), "B": list(range(1000))}, index=i - ) - df_equals( - modin_df.between_time("12:00", "17:00"), - pandas_df.between_time("12:00", "17:00"), - ) - df_equals( - modin_df.between_time("3:00", "4:00"), - pandas_df.between_time("3:00", "4:00"), - ) - df_equals( - modin_df.T.between_time("12:00", "17:00", axis=1), - pandas_df.T.between_time("12:00", "17:00", axis=1), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_bfill(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.bfill(), pandas_df.bfill()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_bool(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(ValueError): - modin_df.bool() - 
modin_df.__bool__() - - single_bool_pandas_df = pandas.DataFrame([True]) - single_bool_modin_df = pd.DataFrame([True]) - - assert single_bool_pandas_df.bool() == single_bool_modin_df.bool() - - with pytest.raises(ValueError): - # __bool__ always raises this error for DataFrames - single_bool_modin_df.__bool__() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_boxplot(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - assert modin_df.boxplot() == to_pandas(modin_df).boxplot() - - def test_combine_first(self): - data1 = {"A": [None, 0], "B": [None, 4]} - modin_df1 = pd.DataFrame(data1) - pandas_df1 = pandas.DataFrame(data1) - data2 = {"A": [1, 1], "B": [3, 3]} - modin_df2 = pd.DataFrame(data2) - pandas_df2 = pandas.DataFrame(data2) - df_equals( - modin_df1.combine_first(modin_df2), pandas_df1.combine_first(pandas_df2) - ) - - def test_corr(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).corr() - - def test_corrwith(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).corrwith(pd.DataFrame(data)) - - def test_cov(self): - data = test_data_values[0] - modin_result = pd.DataFrame(data).cov() - pandas_result = pandas.DataFrame(data).cov() - df_equals(modin_result, pandas_result) - - @pytest.mark.skipif( - os.name == "nt", - reason="AssertionError: numpy array are different", - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_dot(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - col_len = len(modin_df.columns) - - # Test list input - arr = np.arange(col_len) - modin_result = modin_df.dot(arr) - pandas_result = pandas_df.dot(arr) - df_equals(modin_result, pandas_result) - - # Test bad dimensions - with pytest.raises(ValueError): - modin_result = modin_df.dot(np.arange(col_len + 10)) - - # Test series input - modin_series = 
pd.Series(np.arange(col_len), index=modin_df.columns) - pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) - modin_result = modin_df.dot(modin_series) - pandas_result = pandas_df.dot(pandas_series) - df_equals(modin_result, pandas_result) - - # Test dataframe input - modin_result = modin_df.dot(modin_df.T) - pandas_result = pandas_df.dot(pandas_df.T) - df_equals(modin_result, pandas_result) - - # Test when input series index doesn't line up with columns - with pytest.raises(ValueError): - modin_result = modin_df.dot(pd.Series(np.arange(col_len))) - - # Test case when left dataframe has size (n x 1) - # and right dataframe has size (1 x n) - modin_df = pd.DataFrame(modin_series) - pandas_df = pandas.DataFrame(pandas_series) - modin_result = modin_df.dot(modin_df.T) - pandas_result = pandas_df.dot(pandas_df.T) - df_equals(modin_result, pandas_result) - - # Test case when left dataframe has size (1 x 1) - # and right dataframe has size (1 x n) - modin_result = pd.DataFrame([1]).dot(modin_df.T) - pandas_result = pandas.DataFrame([1]).dot(pandas_df.T) - df_equals(modin_result, pandas_result) - - @pytest.mark.skipif( - os.name == "nt", - reason="AssertionError: numpy array are different", - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_matmul(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - col_len = len(modin_df.columns) - - # Test list input - arr = np.arange(col_len) - modin_result = modin_df @ arr - pandas_result = pandas_df @ arr - df_equals(modin_result, pandas_result) - - # Test bad dimensions - with pytest.raises(ValueError): - modin_result = modin_df @ np.arange(col_len + 10) - - # Test series input - modin_series = pd.Series(np.arange(col_len), index=modin_df.columns) - pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns) - modin_result = modin_df @ modin_series - pandas_result = pandas_df @ pandas_series - df_equals(modin_result, 
pandas_result) - - # Test dataframe input - modin_result = modin_df @ modin_df.T - pandas_result = pandas_df @ pandas_df.T - df_equals(modin_result, pandas_result) - - # Test when input series index doesn't line up with columns - with pytest.raises(ValueError): - modin_result = modin_df @ pd.Series(np.arange(col_len)) - - def test_ewm(self): - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - with pytest.warns(UserWarning): - df.ewm(com=0.5).mean() - - def test_expanding(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).expanding() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_explode(self, data): - modin_df = pd.DataFrame(data) - with pytest.warns(UserWarning): - modin_df.explode(modin_df.columns[0]) - - def test_first(self): - i = pd.date_range("2010-04-09", periods=400, freq="2D") - modin_df = pd.DataFrame({"A": list(range(400)), "B": list(range(400))}, index=i) - pandas_df = pandas.DataFrame( - {"A": list(range(400)), "B": list(range(400))}, index=i - ) - df_equals(modin_df.first("3D"), pandas_df.first("3D")) - df_equals(modin_df.first("20D"), pandas_df.first("20D")) - - @pytest.mark.skip(reason="Defaulting to Pandas") - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_from_dict(self, data): - modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_dict(None) - - @pytest.mark.skip(reason="Defaulting to Pandas") - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_from_items(self, data): - modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_items(None) - - @pytest.mark.skip(reason="Defaulting to Pandas") - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_from_records(self, data): - 
modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_records(None) - - def test_hist(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).hist(None) - - def test_infer_objects(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).infer_objects() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_info_default_param(self, data): - with io.StringIO() as first, io.StringIO() as second: - eval_general( - pd.DataFrame(data), - pandas.DataFrame(data), - verbose=None, - max_cols=None, - memory_usage=None, - null_counts=None, - operation=lambda df, **kwargs: df.info(**kwargs), - buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, - ) - modin_info = first.getvalue().splitlines() - pandas_info = second.getvalue().splitlines() - - assert modin_info[0] == str(pd.DataFrame) - assert pandas_info[0] == str(pandas.DataFrame) - assert modin_info[1:] == pandas_info[1:] - - @pytest.mark.parametrize("verbose", [True, False]) - @pytest.mark.parametrize("max_cols", [10, 99999999]) - @pytest.mark.parametrize("memory_usage", [True, False, "deep"]) - @pytest.mark.parametrize("null_counts", [True, False]) - def test_info(self, verbose, max_cols, memory_usage, null_counts): - data = test_data_values[0] - with io.StringIO() as first, io.StringIO() as second: - eval_general( - pd.DataFrame(data), - pandas.DataFrame(data), - operation=lambda df, **kwargs: df.info(**kwargs), - verbose=verbose, - max_cols=max_cols, - memory_usage=memory_usage, - null_counts=null_counts, - buf=lambda df: second if isinstance(df, pandas.DataFrame) else first, - ) - modin_info = first.getvalue().splitlines() - pandas_info = second.getvalue().splitlines() - - assert modin_info[0] == str(pd.DataFrame) - assert pandas_info[0] == str(pandas.DataFrame) - assert modin_info[1:] == pandas_info[1:] 
- - def test_interpolate(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).interpolate() - - def test_kurt_kurtosis_equals(self): - # It's optimization. If failed, df.kurt should be tested explicitly - # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`. - data = test_data_values[0] - df_modin = pd.DataFrame(data) - assert df_modin.kurt == df_modin.kurtosis - - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("skipna", bool_arg_values, ids=bool_arg_keys) - @pytest.mark.parametrize("numeric_only", bool_arg_values, ids=bool_arg_keys) - def test_kurt_kurtosis(self, axis, skipna, numeric_only): - data = test_data_values[0] - df_modin = pd.DataFrame(data) - df_pandas = pandas.DataFrame(data) - - eval_general( - df_modin, - df_pandas, - lambda df: df.kurtosis( - axis=axis, skipna=skipna, level=None, numeric_only=numeric_only - ), - ) - - @pytest.mark.parametrize("level", [-1, 0, 1]) - def test_kurt_kurtosis_level(self, level): - data = test_data_values[0] - df_modin = pd.DataFrame(data) - df_pandas = pandas.DataFrame(data) - - index = generate_multiindex(len(data.keys())) - df_modin.columns = index - df_pandas.columns = index - eval_general( - df_modin, - df_pandas, - lambda df: df.kurtosis(axis=1, level=level), - ) - - def test_last(self): - modin_index = pd.date_range("2010-04-09", periods=400, freq="2D") - pandas_index = pandas.date_range("2010-04-09", periods=400, freq="2D") - modin_df = pd.DataFrame( - {"A": list(range(400)), "B": list(range(400))}, index=modin_index - ) - pandas_df = pandas.DataFrame( - {"A": list(range(400)), "B": list(range(400))}, index=pandas_index - ) - df_equals(modin_df.last("3D"), pandas_df.last("3D")) - df_equals(modin_df.last("20D"), pandas_df.last("20D")) - - def test_lookup(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).lookup([0, 1], ["col1", "col2"]) - - @pytest.mark.parametrize("data", test_data_values) 
- @pytest.mark.parametrize("axis", [None, 0, 1]) - @pytest.mark.parametrize("skipna", [None, True, False]) - def test_mad(self, data, axis, skipna): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - df_equals( - modin_df.mad(axis=axis, skipna=skipna, level=None), - pandas_df.mad(axis=axis, skipna=skipna, level=None), - ) - - @pytest.mark.parametrize("level", [-1, 0, 1]) - def test_mad_level(self, level): - data = test_data_values[0] - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - - index = generate_multiindex(len(data.keys())) - modin_df.columns = index - pandas_df.columns = index - eval_general( - modin_df, - pandas_df, - lambda df: df.mad(axis=1, level=level), - ) - - def test_mask(self): - df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) - m = df % 3 == 0 - with pytest.warns(UserWarning): - try: - df.mask(~m, -df) - except ValueError: - pass - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "id_vars", [lambda df: df.columns[0], lambda df: df.columns[:4], None] - ) - @pytest.mark.parametrize( - "value_vars", [lambda df: df.columns[-1], lambda df: df.columns[-4:], None] - ) - def test_melt(self, data, id_vars, value_vars): - eval_general( - *create_test_dfs(data), - lambda df, *args, **kwargs: df.melt(*args, **kwargs) - .sort_values(["variable", "value"]) - .reset_index(drop=True), - id_vars=id_vars, - value_vars=value_vars, - ) - - def test_pct_change(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).pct_change() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "index", [lambda df: df.columns[0], lambda df: df[df.columns[0]].values, None] - ) - @pytest.mark.parametrize("columns", [lambda df: df.columns[len(df.columns) // 2]]) - @pytest.mark.parametrize( - "values", [lambda df: df.columns[-1], lambda df: df.columns[-2:], None] - ) - def test_pivot(self, 
data, index, columns, values): - eval_general( - *create_test_dfs(data), - lambda df, *args, **kwargs: df.pivot(*args, **kwargs), - index=index, - columns=columns, - values=values, - check_exception_type=None, - ) - - def test_pivot_table(self): - df = pd.DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) - with pytest.warns(UserWarning): - df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc=np.sum) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_plot(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if name_contains(request.node.name, numeric_dfs): - # We have to test this way because equality in plots means same object. 
- zipped_plot_lines = zip(modin_df.plot().lines, pandas_df.plot().lines) - for left, right in zipped_plot_lines: - if isinstance(left.get_xdata(), np.ma.core.MaskedArray) and isinstance( - right.get_xdata(), np.ma.core.MaskedArray - ): - assert all((left.get_xdata() == right.get_xdata()).data) - else: - assert np.array_equal(left.get_xdata(), right.get_xdata()) - if isinstance(left.get_ydata(), np.ma.core.MaskedArray) and isinstance( - right.get_ydata(), np.ma.core.MaskedArray - ): - assert all((left.get_ydata() == right.get_ydata()).data) - else: - assert np.array_equal(left.get_xdata(), right.get_xdata()) - - def test_replace(self): - modin_df = pd.DataFrame( - {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} - ) - pandas_df = pandas.DataFrame( - {"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9], "C": ["a", "b", "c", "d", "e"]} - ) - modin_result = modin_df.replace({"A": 0, "B": 5}, 100) - pandas_result = pandas_df.replace({"A": 0, "B": 5}, 100) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.replace({"A": {0: 100, 4: 400}}) - pandas_result = pandas_df.replace({"A": {0: 100, 4: 400}}) - df_equals(modin_result, pandas_result) - - modin_df = pd.DataFrame( - {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} - ) - pandas_df = pandas.DataFrame( - {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]} - ) - modin_result = modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) - pandas_result = pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.replace(regex=[r"^ba.$", "foo"], value="new") - pandas_result = pandas_df.replace(regex=[r"^ba.$", "foo"], value="new") - df_equals(modin_result, pandas_result) - - modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) - pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True) - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("rule", ["5T", 
pandas.offsets.Hour()]) - @pytest.mark.parametrize("axis", [0, "columns"]) - @pytest.mark.parametrize("closed", ["left", "right"]) - @pytest.mark.parametrize("label", ["right", "left"]) - @pytest.mark.parametrize("on", [None, "DateColumn"]) - @pytest.mark.parametrize("level", [None, 1]) - def test_resample(self, rule, axis, closed, label, on, level): - freq = "H" - base = 2 - index = pandas.date_range("31/12/2000", periods=12, freq=freq) - data = {"A": range(12), "B": range(12)} - - pandas_df = pandas.DataFrame(data, index=index) - modin_df = pd.DataFrame(data, index=index) - - if on is not None and axis == 0: - pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") - modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") - else: - on = None - - if axis == "columns": - pandas_df = pandas_df.T - modin_df = modin_df.T - - if level is not None and axis == 0 and on is None: - index = pandas.MultiIndex.from_product( - [["a", "b", "c"], pandas.date_range("31/12/2000", periods=4, freq=freq)] - ) - pandas_df.index = index - modin_df.index = index - else: - level = None - - pandas_resampler = pandas_df.resample( - rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level - ) - modin_resampler = modin_df.resample( - rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level - ) - - df_equals(modin_resampler.count(), pandas_resampler.count()) - df_equals(modin_resampler.var(0), pandas_resampler.var(0)) - df_equals(modin_resampler.sum(), pandas_resampler.sum()) - df_equals(modin_resampler.std(), pandas_resampler.std()) - df_equals(modin_resampler.sem(), pandas_resampler.sem()) - df_equals(modin_resampler.size(), pandas_resampler.size()) - df_equals(modin_resampler.prod(), pandas_resampler.prod()) - if on is None: - df_equals(modin_resampler.ohlc(), pandas_resampler.ohlc()) - df_equals(modin_resampler.min(), pandas_resampler.min()) - df_equals(modin_resampler.median(), pandas_resampler.median()) - 
df_equals(modin_resampler.mean(), pandas_resampler.mean()) - df_equals(modin_resampler.max(), pandas_resampler.max()) - df_equals(modin_resampler.last(), pandas_resampler.last()) - df_equals(modin_resampler.first(), pandas_resampler.first()) - df_equals(modin_resampler.nunique(), pandas_resampler.nunique()) - df_equals( - modin_resampler.pipe(lambda x: x.max() - x.min()), - pandas_resampler.pipe(lambda x: x.max() - x.min()), - ) - df_equals( - modin_resampler.transform(lambda x: (x - x.mean()) / x.std()), - pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), - ) - df_equals( - pandas_resampler.aggregate("max"), - modin_resampler.aggregate("max"), - ) - df_equals( - modin_resampler.apply("sum"), - pandas_resampler.apply("sum"), - ) - df_equals( - modin_resampler.get_group(name=list(modin_resampler.groups)[0]), - pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]), - ) - assert pandas_resampler.indices == modin_resampler.indices - assert pandas_resampler.groups == modin_resampler.groups - df_equals(modin_resampler.quantile(), pandas_resampler.quantile()) - if axis == 0: - # Upsampling from level= or on= selection is not supported - if on is None and level is None: - df_equals( - modin_resampler.interpolate(), - pandas_resampler.interpolate(), - ) - df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) - df_equals( - modin_resampler.fillna(method="nearest"), - pandas_resampler.fillna(method="nearest"), - ) - df_equals(modin_resampler.pad(), pandas_resampler.pad()) - df_equals(modin_resampler.nearest(), pandas_resampler.nearest()) - df_equals(modin_resampler.bfill(), pandas_resampler.bfill()) - df_equals(modin_resampler.backfill(), pandas_resampler.backfill()) - df_equals(modin_resampler.ffill(), pandas_resampler.ffill()) - df_equals( - pandas_resampler.apply(["sum", "mean", "max"]), - modin_resampler.apply(["sum", "mean", "max"]), - ) - df_equals( - modin_resampler.aggregate(["sum", "mean", "max"]), - 
pandas_resampler.aggregate(["sum", "mean", "max"]), - ) - - def test_sem(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).sem() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("index", ["default", "ndarray"]) - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) - def test_shift(self, data, index, axis, periods): - if index == "default": - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - elif index == "ndarray": - data_column_length = len(data[next(iter(data))]) - index_data = np.arange(2, data_column_length + 2) - modin_df = pd.DataFrame(data, index=index_data) - pandas_df = pandas.DataFrame(data, index=index_data) - - df_equals( - modin_df.shift(periods=periods, axis=axis), - pandas_df.shift(periods=periods, axis=axis), - ) - df_equals( - modin_df.shift(periods=periods, axis=axis, fill_value=777), - pandas_df.shift(periods=periods, axis=axis, fill_value=777), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("index", ["default", "ndarray"]) - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("periods", [0, 1, -1, 10, -10, 1000000000, -1000000000]) - def test_slice_shift(self, data, index, axis, periods): - if index == "default": - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - elif index == "ndarray": - data_column_length = len(data[next(iter(data))]) - index_data = np.arange(2, data_column_length + 2) - modin_df = pd.DataFrame(data, index=index_data) - pandas_df = pandas.DataFrame(data, index=index_data) - - df_equals( - modin_df.slice_shift(periods=periods, axis=axis), - pandas_df.slice_shift(periods=periods, axis=axis), - ) - - @pytest.mark.parametrize( - "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"] - ) - @pytest.mark.parametrize( - "is_multi_col", [True, False], 
ids=["col_multi", "col_index"] - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_stack(self, data, is_multi_idx, is_multi_col): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - if is_multi_idx: - if len(pandas_df.index) == 256: - index = pd.MultiIndex.from_product( - [ - ["a", "b", "c", "d"], - ["x", "y", "z", "last"], - ["i", "j", "k", "index"], - [1, 2, 3, 4], - ] - ) - elif len(pandas_df.index) == 100: - index = pd.MultiIndex.from_product( - [ - ["x", "y", "z", "last"], - ["a", "b", "c", "d", "f"], - ["i", "j", "k", "l", "index"], - ] - ) - else: - index = pandas_df.index - - if is_multi_col: - if len(pandas_df.columns) == 64: - columns = pd.MultiIndex.from_product( - [ - ["A", "B", "C", "D"], - ["xx", "yy", "zz", "LAST"], - [10, 20, 30, 40], - ] - ) - elif len(pandas_df.columns) == 100: - columns = pd.MultiIndex.from_product( - [ - ["xx", "yy", "zz", "LAST"], - ["A", "B", "C", "D", "F"], - ["I", "J", "K", "L", "INDEX"], - ] - ) - else: - columns = pandas_df.columns - - pandas_df.columns = columns - pandas_df.index = index - - modin_df.columns = columns - modin_df.index = index - - df_equals(modin_df.stack(), pandas_df.stack()) - - if is_multi_col: - df_equals(modin_df.stack(level=0), pandas_df.stack(level=0)) - df_equals(modin_df.stack(level=[0, 1]), pandas_df.stack(level=[0, 1])) - df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])) - - def test_style(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).style - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis1", [0, 1]) - @pytest.mark.parametrize("axis2", [0, 1]) - def test_swapaxes(self, data, axis1, axis2): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - pandas_result = pandas_df.swapaxes(axis1, axis2) - modin_result = modin_df.swapaxes(axis1, axis2) - df_equals(modin_result, pandas_result) - - def 
test_swapaxes_axes_names(self): - modin_df = pd.DataFrame(test_data_values[0]) - modin_result1 = modin_df.swapaxes(0, 1) - modin_result2 = modin_df.swapaxes("columns", "index") - df_equals(modin_result1, modin_result2) - - def test_swaplevel(self): - data = np.random.randint(1, 100, 12) - modin_df = pd.DataFrame( - data, - index=pd.MultiIndex.from_tuples( - [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - pandas_df = pandas.DataFrame( - data, - index=pandas.MultiIndex.from_tuples( - [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - df_equals( - modin_df.swaplevel("Number", "Color"), - pandas_df.swaplevel("Number", "Color"), - ) - df_equals(modin_df.swaplevel(), pandas_df.swaplevel()) - df_equals(modin_df.swaplevel(0, 1), pandas_df.swaplevel(0, 1)) - - def test_take(self): - modin_df = pd.DataFrame( - [ - ("falcon", "bird", 389.0), - ("parrot", "bird", 24.0), - ("lion", "mammal", 80.5), - ("monkey", "mammal", np.nan), - ], - columns=["name", "class", "max_speed"], - index=[0, 2, 3, 1], - ) - pandas_df = pandas.DataFrame( - [ - ("falcon", "bird", 389.0), - ("parrot", "bird", 24.0), - ("lion", "mammal", 80.5), - ("monkey", "mammal", np.nan), - ], - columns=["name", "class", "max_speed"], - index=[0, 2, 3, 1], - ) - df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3])) - df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_to_records(self, request, data): - eval_general( - *create_test_dfs(data), - lambda df: df.dropna().to_records(), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_to_string(self, data): - eval_general( - *create_test_dfs(data), - lambda df: df.to_string(), - ) - - def 
test_to_timestamp(self): - idx = pd.date_range("1/1/2012", periods=5, freq="M") - df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) - - with pytest.warns(UserWarning): - df.to_period().to_timestamp() - - def test_to_xarray(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).to_xarray() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_truncate(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - before = 1 - after = len(modin_df - 3) - df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) - - before = 1 - after = 3 - df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) - - before = modin_df.columns[1] - after = modin_df.columns[-3] - try: - pandas_result = pandas_df.truncate(before, after, axis=1) - except Exception as e: - with pytest.raises(type(e)): - modin_df.truncate(before, after, axis=1) - else: - modin_result = modin_df.truncate(before, after, axis=1) - df_equals(modin_result, pandas_result) - - before = modin_df.columns[1] - after = modin_df.columns[3] - try: - pandas_result = pandas_df.truncate(before, after, axis=1) - except Exception as e: - with pytest.raises(type(e)): - modin_df.truncate(before, after, axis=1) - else: - modin_result = modin_df.truncate(before, after, axis=1) - df_equals(modin_result, pandas_result) - - before = None - after = None - df_equals(modin_df.truncate(before, after), pandas_df.truncate(before, after)) - try: - pandas_result = pandas_df.truncate(before, after, axis=1) - except Exception as e: - with pytest.raises(type(e)): - modin_df.truncate(before, after, axis=1) - else: - modin_result = modin_df.truncate(before, after, axis=1) - df_equals(modin_result, pandas_result) - - def test_tshift(self): - idx = pd.date_range("1/1/2012", periods=5, freq="M") - data = np.random.randint(0, 100, size=(len(idx), 4)) - modin_df = pd.DataFrame(data, 
index=idx) - pandas_df = pandas.DataFrame(data, index=idx) - df_equals(modin_df.tshift(4), pandas_df.tshift(4)) - - def test_tz_convert(self): - modin_idx = pd.date_range( - "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" - ) - pandas_idx = pandas.date_range( - "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles" - ) - data = np.random.randint(0, 100, size=(len(modin_idx), 4)) - modin_df = pd.DataFrame(data, index=modin_idx) - pandas_df = pandas.DataFrame(data, index=pandas_idx) - modin_result = modin_df.tz_convert("UTC", axis=0) - pandas_result = pandas_df.tz_convert("UTC", axis=0) - df_equals(modin_result, pandas_result) - - modin_multi = pd.MultiIndex.from_arrays([modin_idx, range(len(modin_idx))]) - pandas_multi = pandas.MultiIndex.from_arrays( - [pandas_idx, range(len(modin_idx))] - ) - modin_series = pd.DataFrame(data, index=modin_multi) - pandas_series = pandas.DataFrame(data, index=pandas_multi) - df_equals( - modin_series.tz_convert("UTC", axis=0, level=0), - pandas_series.tz_convert("UTC", axis=0, level=0), - ) - - def test_tz_localize(self): - idx = pd.date_range("1/1/2012", periods=400, freq="2D") - data = np.random.randint(0, 100, size=(len(idx), 4)) - modin_df = pd.DataFrame(data, index=idx) - pandas_df = pandas.DataFrame(data, index=idx) - df_equals( - modin_df.tz_localize("UTC", axis=0), pandas_df.tz_localize("UTC", axis=0) - ) - df_equals( - modin_df.tz_localize("America/Los_Angeles", axis=0), - pandas_df.tz_localize("America/Los_Angeles", axis=0), - ) - - @pytest.mark.parametrize( - "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"] - ) - @pytest.mark.parametrize( - "is_multi_col", [True, False], ids=["col_multi", "col_index"] - ) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_unstack(self, data, is_multi_idx, is_multi_col): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - if is_multi_idx: - if len(pandas_df.index) == 256: - index = 
pd.MultiIndex.from_product( - [ - ["a", "b", "c", "d"], - ["x", "y", "z", "last"], - ["i", "j", "k", "index"], - [1, 2, 3, 4], - ] - ) - elif len(pandas_df.index) == 100: - index = pd.MultiIndex.from_product( - [ - ["x", "y", "z", "last"], - ["a", "b", "c", "d", "f"], - ["i", "j", "k", "l", "index"], - ] - ) - else: - index = pandas_df.index - - if is_multi_col: - if len(pandas_df.columns) == 64: - columns = pd.MultiIndex.from_product( - [ - ["A", "B", "C", "D"], - ["xx", "yy", "zz", "LAST"], - [10, 20, 30, 40], - ] - ) - elif len(pandas_df.columns) == 100: - columns = pd.MultiIndex.from_product( - [ - ["xx", "yy", "zz", "LAST"], - ["A", "B", "C", "D", "F"], - ["I", "J", "K", "L", "INDEX"], - ] - ) - else: - columns = pandas_df.columns - - pandas_df.columns = columns - pandas_df.index = index - - modin_df.columns = columns - modin_df.index = index - - df_equals(modin_df.unstack(), pandas_df.unstack()) - - if is_multi_idx: - df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) - df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) - df_equals( - modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2]) - ) - - if len(pandas_df.index) == 256: - df_equals( - modin_df.unstack(level=[0, 1, 2, 3]), - pandas_df.unstack(level=[0, 1, 2, 3]), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___array__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert_array_equal(modin_df.__array__(), pandas_df.__array__()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___bool__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.__bool__() - except Exception as e: - with pytest.raises(type(e)): - modin_df.__bool__() - else: - modin_result = modin_df.__bool__() - df_equals(modin_result, pandas_result) - - def test___getstate__(self): - data = test_data_values[0] 
- with pytest.warns(UserWarning): - pd.DataFrame(data).__getstate__() - - def test___setstate__(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - try: - pd.DataFrame(data).__setstate__(None) - except TypeError: - pass - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_hasattr_sparse(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - try: - pandas_result = hasattr(pandas_df, "sparse") - except Exception as e: - with pytest.raises(type(e)): - hasattr(modin_df, "sparse") - else: - modin_result = hasattr(modin_df, "sparse") - assert modin_result == pandas_result - - -class TestDataFrameReduction: - @pytest.mark.parametrize("method", ["all", "any"]) - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_all_any(self, data, axis, skipna, is_transposed, method): - eval_general( - *create_test_dfs(data), - lambda df: getattr((df.T if is_transposed else df), method)( - axis=axis, skipna=skipna, bool_only=None - ), - ) - - @pytest.mark.parametrize("method", ["all", "any"]) - @pytest.mark.parametrize( - "bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys) - ) - def test_all_any_specific(self, bool_only, method): - eval_general( - *create_test_dfs(test_data_diff_dtype), - lambda df: getattr(df, method)(bool_only=bool_only), - ) - - @pytest.mark.parametrize("method", ["all", "any"]) - @pytest.mark.parametrize("level", [-1, 0, 1]) - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("data", [test_data["int_data"]]) - def test_all_any_level(self, data, axis, level, method): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - - if axis == 0: - new_idx = generate_multiindex(len(modin_df.index)) - 
modin_df.index = new_idx - pandas_df.index = new_idx - else: - new_col = generate_multiindex(len(modin_df.columns)) - modin_df.columns = new_col - pandas_df.columns = new_col - - eval_general( - modin_df, - pandas_df, - lambda df: getattr(df, method)(axis=axis, level=level), - ) - - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_count(self, data, axis): - eval_general( - *create_test_dfs(data), - lambda df: df.count(axis=axis), - ) - - @pytest.mark.parametrize( - "numeric_only", - [ - pytest.param(True, marks=pytest.mark.xfail(reason="See #1965 for details")), - False, - None, - ], - ) - def test_count_specific(self, numeric_only): - eval_general( - *create_test_dfs(test_data_diff_dtype), - lambda df: df.count(numeric_only=numeric_only), - ) - - @pytest.mark.parametrize("level", [-1, 0, 1]) - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("data", [test_data["int_data"]]) - def test_count_level(self, data, axis, level): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - - if axis == 0: - new_idx = generate_multiindex(len(modin_df.index)) - modin_df.index = new_idx - pandas_df.index = new_idx - else: - new_col = generate_multiindex(len(modin_df.columns)) - modin_df.columns = new_col - pandas_df.columns = new_col - - eval_general( - modin_df, - pandas_df, - lambda df: df.count(axis=axis, level=level), - ) - - @pytest.mark.parametrize("percentiles", [None, 0.10, 0.11, 0.44, 0.78, 0.99]) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_describe(self, data, percentiles): - eval_general( - *create_test_dfs(data), - lambda df: df.describe(percentiles=percentiles), - ) - - @pytest.mark.parametrize( - "exclude,include", - [ - ([np.float64], None), - (np.float64, None), - (None, [np.timedelta64, np.datetime64, np.object, np.bool]), - (None, "all"), - (None, np.number), - ], - ) - def test_describe_specific(self, 
exclude, include): - eval_general( - *create_test_dfs(test_data_diff_dtype), - lambda df: df.drop("str_col", axis=1).describe( - exclude=exclude, include=include - ), - ) - - @pytest.mark.parametrize("data", [test_data["int_data"]]) - def test_describe_str(self, data): - modin_df = pd.DataFrame(data).applymap(str) - pandas_df = pandas.DataFrame(data).applymap(str) - - try: - df_equals(modin_df.describe(), pandas_df.describe()) - except AssertionError: - # We have to do this because we choose the highest count slightly differently - # than pandas. Because there is no true guarantee which one will be first, - # If they don't match, make sure that the `freq` is the same at least. - df_equals( - modin_df.describe().loc[["count", "unique", "freq"]], - pandas_df.describe().loc[["count", "unique", "freq"]], - ) - - def test_describe_dtypes(self): - data = { - "col1": list("abc"), - "col2": list("abc"), - "col3": list("abc"), - "col4": [1, 2, 3], - } - eval_general(*create_test_dfs(data), lambda df: df.describe()) - - @pytest.mark.parametrize("method", ["idxmin", "idxmax"]) - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_idxmin_idxmax(self, data, axis, skipna, is_transposed, method): - eval_general( - *create_test_dfs(data), - lambda df: getattr((df.T if is_transposed else df), method)( - axis=axis, skipna=skipna - ), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_last_valid_index(self, data): - modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - assert modin_df.last_valid_index() == pandas_df.last_valid_index() - - @pytest.mark.parametrize( - "index", bool_arg_values, ids=arg_keys("index", bool_arg_keys) - ) - @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) - def test_memory_usage(self, data, index): - eval_general(*create_test_dfs(data), lambda df: df.memory_usage(index=index)) - - @pytest.mark.parametrize("method", ["min", "max", "mean"]) - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_min_max_mean( - self, data, axis, skipna, numeric_only, is_transposed, method - ): - eval_general( - *create_test_dfs(data), - lambda df: getattr((df.T if is_transposed else df), method)( - axis=axis, skipna=skipna, numeric_only=numeric_only - ), - ) - - @pytest.mark.skipif( - os.name == "nt", - reason="Windows has a memory issue for large numbers on this test", - ) - @pytest.mark.parametrize( - "method", - [ - "prod", - pytest.param( - "product", - marks=pytest.mark.skipif( - pandas.DataFrame.product == pandas.DataFrame.prod - and pd.DataFrame.product == pd.DataFrame.prod, - reason="That method was already tested.", - ), - ), - ], - ) - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_prod( - self, - data, - axis, - skipna, - is_transposed, - method, - ): - eval_general( - *create_test_dfs(data), - lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)( - axis=axis, - skipna=skipna, - ), - ) - - @pytest.mark.parametrize( - "numeric_only", - [ - pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), - False, - True, - ], - ) - @pytest.mark.parametrize( - "min_count", 
int_arg_values, ids=arg_keys("min_count", int_arg_keys) - ) - def test_prod_specific(self, min_count, numeric_only): - if min_count == 5 and numeric_only: - pytest.xfail("see #1953 for details") - eval_general( - *create_test_dfs(test_data_diff_dtype), - lambda df: df.prod(min_count=min_count, numeric_only=numeric_only), - ) - - @pytest.mark.parametrize("is_transposed", [False, True]) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("data", [test_data["float_nan_data"]]) - def test_sum(self, data, axis, skipna, is_transposed): - eval_general( - *create_test_dfs(data), - lambda df: (df.T if is_transposed else df).sum( - axis=axis, - skipna=skipna, - ), - ) - - @pytest.mark.parametrize( - "numeric_only", - [ - pytest.param(None, marks=pytest.mark.xfail(reason="See #1976 for details")), - False, - True, - ], - ) - @pytest.mark.parametrize("min_count", int_arg_values) - def test_sum_specific(self, min_count, numeric_only): - eval_general( - *create_test_dfs(test_data_diff_dtype), - lambda df: df.sum(min_count=min_count, numeric_only=numeric_only), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_sum_single_column(self, data): - modin_df = pd.DataFrame(data).iloc[:, [0]] - pandas_df = pandas.DataFrame(data).iloc[:, [0]] - df_equals(modin_df.sum(), pandas_df.sum()) - df_equals(modin_df.sum(axis=1), pandas_df.sum(axis=1)) - - -class TestDataFrameWindow: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - def test_cummax(self, request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.cummax(axis=axis, skipna=skipna) - except Exception as 
e: - with pytest.raises(type(e)): - modin_df.cummax(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cummax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.cummax(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cummax(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cummax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_cummax_int_and_float(self, axis): - data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - def test_cummin(self, request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.cummin(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cummin(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cummin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.cummin(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cummin(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cummin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_cummin_int_and_float(self, axis): - data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - 
df_equals(modin_df.cummin(axis=axis), pandas_df.cummin(axis=axis)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - def test_cumprod(self, request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.cumprod(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cumprod(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cumprod(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.cumprod(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cumprod(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cumprod(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - def test_cumsum(self, request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # pandas exhibits weird behavior for this case - # Remove this case when we can pull the error messages from backend - if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( - axis == 0 or axis == "rows" - ): - with pytest.raises(TypeError): - modin_df.cumsum(axis=axis, skipna=skipna) - else: - try: - pandas_result = pandas_df.cumsum(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cumsum(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cumsum(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - if name_contains(request.node.name, 
["datetime_timedelta_data"]) and ( - axis == 0 or axis == "rows" - ): - with pytest.raises(TypeError): - modin_df.T.cumsum(axis=axis, skipna=skipna) - else: - try: - pandas_result = pandas_df.T.cumsum(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cumsum(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cumsum(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys) - ) - def test_diff(self, request, data, axis, periods): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.diff(axis=axis, periods=periods) - except Exception as e: - with pytest.raises(type(e)): - modin_df.diff(axis=axis, periods=periods) - else: - modin_result = modin_df.diff(axis=axis, periods=periods) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.diff(axis=axis, periods=periods) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.diff(axis=axis, periods=periods) - else: - modin_result = modin_df.T.diff(axis=axis, periods=periods) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "keep", ["last", "first", False], ids=["last", "first", "False"] - ) - def test_duplicated(self, data, keep): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - pandas_result = pandas_df.duplicated(keep=keep) - modin_result = modin_df.duplicated(keep=keep) - df_equals(modin_result, pandas_result) - - import random - - subset = random.sample( - list(pandas_df.columns), random.randint(1, len(pandas_df.columns)) - ) - pandas_result = pandas_df.duplicated(keep=keep, subset=subset) - modin_result = 
modin_df.duplicated(keep=keep, subset=subset) - - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_ffill(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.ffill(), pandas_df.ffill()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize( - "method", - ["backfill", "bfill", "pad", "ffill", None], - ids=["backfill", "bfill", "pad", "ffill", "None"], - ) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize("limit", int_arg_values, ids=int_arg_keys) - def test_fillna(self, data, method, axis, limit): - # We are not testing when limit is not positive until pandas-27042 gets fixed. - # We are not testing when axis is over rows until pandas-17399 gets fixed. - if limit > 0 and axis != 1 and axis != "columns": - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.fillna( - 0, method=method, axis=axis, limit=limit - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.fillna(0, method=method, axis=axis, limit=limit) - else: - modin_result = modin_df.fillna(0, method=method, axis=axis, limit=limit) - df_equals(modin_result, pandas_result) - - def test_fillna_sanity(self): - # with different dtype - frame_data = [ - ["a", "a", np.nan, "a"], - ["b", "b", np.nan, "b"], - ["c", "c", np.nan, "c"], - ] - df = pandas.DataFrame(frame_data) - - result = df.fillna({2: "foo"}) - modin_df = pd.DataFrame(frame_data).fillna({2: "foo"}) - - df_equals(modin_df, result) - - modin_df = pd.DataFrame(df) - df.fillna({2: "foo"}, inplace=True) - modin_df.fillna({2: "foo"}, inplace=True) - df_equals(modin_df, result) - - frame_data = { - "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")], - "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT], - } - df = pandas.DataFrame(frame_data) - result = df.fillna(value={"Date": 
df["Date2"]}) - modin_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]}) - df_equals(modin_df, result) - - frame_data = {"A": [pandas.Timestamp("2012-11-11 00:00:00+01:00"), pandas.NaT]} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - df_equals(modin_df.fillna(method="pad"), df.fillna(method="pad")) - - frame_data = {"A": [pandas.NaT, pandas.Timestamp("2012-11-11 00:00:00+01:00")]} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data).fillna(method="bfill") - df_equals(modin_df, df.fillna(method="bfill")) - - def test_fillna_downcast(self): - # infer int64 from float64 - frame_data = {"a": [1.0, np.nan]} - df = pandas.DataFrame(frame_data) - result = df.fillna(0, downcast="infer") - modin_df = pd.DataFrame(frame_data).fillna(0, downcast="infer") - df_equals(modin_df, result) - - # infer int64 from float64 when fillna value is a dict - df = pandas.DataFrame(frame_data) - result = df.fillna({"a": 0}, downcast="infer") - modin_df = pd.DataFrame(frame_data).fillna({"a": 0}, downcast="infer") - df_equals(modin_df, result) - - def test_fillna_inplace(self): - frame_data = random_state.randn(10, 4) - df = pandas.DataFrame(frame_data) - df[1][:4] = np.nan - df[3][-4:] = np.nan - - modin_df = pd.DataFrame(df) - df.fillna(value=0, inplace=True) - try: - df_equals(modin_df, df) - except AssertionError: - pass - else: - assert False - - modin_df.fillna(value=0, inplace=True) - df_equals(modin_df, df) - - modin_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True) - assert modin_df is None - - df[1][:4] = np.nan - df[3][-4:] = np.nan - modin_df = pd.DataFrame(df) - df.fillna(method="ffill", inplace=True) - try: - df_equals(modin_df, df) - except AssertionError: - pass - else: - assert False - - modin_df.fillna(method="ffill", inplace=True) - df_equals(modin_df, df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_frame_fillna_limit(self, data): - pandas_df = 
pandas.DataFrame(data) - - index = pandas_df.index - - result = pandas_df[:2].reindex(index) - modin_df = pd.DataFrame(result) - df_equals( - modin_df.fillna(method="pad", limit=2), result.fillna(method="pad", limit=2) - ) - - result = pandas_df[-2:].reindex(index) - modin_df = pd.DataFrame(result) - df_equals( - modin_df.fillna(method="backfill", limit=2), - result.fillna(method="backfill", limit=2), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_frame_pad_backfill_limit(self, data): - pandas_df = pandas.DataFrame(data) - - index = pandas_df.index - - result = pandas_df[:2].reindex(index) - modin_df = pd.DataFrame(result) - df_equals( - modin_df.fillna(method="pad", limit=2), result.fillna(method="pad", limit=2) - ) - - result = pandas_df[-2:].reindex(index) - modin_df = pd.DataFrame(result) - df_equals( - modin_df.fillna(method="backfill", limit=2), - result.fillna(method="backfill", limit=2), - ) - - def test_fillna_dtype_conversion(self): - # make sure that fillna on an empty frame works - df = pandas.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - modin_df = pd.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - df_equals(modin_df.fillna("nan"), df.fillna("nan")) - - frame_data = {"A": [1, np.nan], "B": [1.0, 2.0]} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - for v in ["", 1, np.nan, 1.0]: - df_equals(modin_df.fillna(v), df.fillna(v)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_fillna_skip_certain_blocks(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # don't try to fill boolean, int blocks - df_equals(modin_df.fillna(np.nan), pandas_df.fillna(np.nan)) - - def test_fillna_dict_series(self): - frame_data = { - "a": [np.nan, 1, 2, np.nan, np.nan], - "b": [1, 2, 3, np.nan, np.nan], - "c": [np.nan, 1, 2, 3, 4], - } - df = pandas.DataFrame(frame_data) - modin_df = 
pd.DataFrame(frame_data) - - df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})) - - df_equals( - modin_df.fillna({"a": 0, "b": 5, "d": 7}), - df.fillna({"a": 0, "b": 5, "d": 7}), - ) - - # Series treated same as dict - df_equals(modin_df.fillna(modin_df.max()), df.fillna(df.max())) - - def test_fillna_dataframe(self): - frame_data = { - "a": [np.nan, 1, 2, np.nan, np.nan], - "b": [1, 2, 3, np.nan, np.nan], - "c": [np.nan, 1, 2, 3, 4], - } - df = pandas.DataFrame(frame_data, index=list("VWXYZ")) - modin_df = pd.DataFrame(frame_data, index=list("VWXYZ")) - - # df2 may have different index and columns - df2 = pandas.DataFrame( - { - "a": [np.nan, 10, 20, 30, 40], - "b": [50, 60, 70, 80, 90], - "foo": ["bar"] * 5, - }, - index=list("VWXuZ"), - ) - modin_df2 = pd.DataFrame(df2) - - # only those columns and indices which are shared get filled - df_equals(modin_df.fillna(modin_df2), df.fillna(df2)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_fillna_columns(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals( - modin_df.fillna(method="ffill", axis=1), - pandas_df.fillna(method="ffill", axis=1), - ) - - df_equals( - modin_df.fillna(method="ffill", axis=1), - pandas_df.fillna(method="ffill", axis=1), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_fillna_invalid_method(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(ValueError): - modin_df.fillna(method="ffil") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_fillna_invalid_value(self, data): - modin_df = pd.DataFrame(data) - # list - pytest.raises(TypeError, modin_df.fillna, [1, 2]) - # tuple - pytest.raises(TypeError, modin_df.fillna, (1, 2)) - # frame with series - pytest.raises(TypeError, modin_df.iloc[:, 0].fillna, modin_df) - - @pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) - def test_fillna_col_reordering(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.fillna(method="ffill"), pandas_df.fillna(method="ffill")) - - def test_fillna_datetime_columns(self): - frame_data = { - "A": [-1, -2, np.nan], - "B": pd.date_range("20130101", periods=3), - "C": ["foo", "bar", None], - "D": ["foo2", "bar2", None], - } - df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) - modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) - df_equals(modin_df.fillna("?"), df.fillna("?")) - - frame_data = { - "A": [-1, -2, np.nan], - "B": [ - pandas.Timestamp("2013-01-01"), - pandas.Timestamp("2013-01-02"), - pandas.NaT, - ], - "C": ["foo", "bar", None], - "D": ["foo2", "bar2", None], - } - df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) - modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3)) - df_equals(modin_df.fillna("?"), df.fillna("?")) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - def test_median(self, request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.median(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.median( - axis=axis, skipna=skipna, 
numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.median(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - def test_mode(self, request, data, axis, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.mode(axis=axis, numeric_only=numeric_only) - except Exception: - with pytest.raises(TypeError): - modin_df.mode(axis=axis, numeric_only=numeric_only) - else: - modin_result = modin_df.mode(axis=axis, numeric_only=numeric_only) - df_equals(modin_result, pandas_result) - - def test_nlargest(self): - data = { - "population": [ - 59000000, - 65000000, - 434000, - 434000, - 434000, - 337000, - 11300, - 11300, - 11300, - ], - "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], - "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], - } - index = [ - "Italy", - "France", - "Malta", - "Maldives", - "Brunei", - "Iceland", - "Nauru", - "Tuvalu", - "Anguilla", - ] - modin_df = pd.DataFrame(data=data, index=index) - pandas_df = pandas.DataFrame(data=data, index=index) - df_equals( - modin_df.nlargest(3, "population"), pandas_df.nlargest(3, "population") - ) - - def test_nsmallest(self): - data = { - "population": [ - 59000000, - 65000000, - 434000, - 434000, - 434000, - 337000, - 11300, - 11300, - 11300, - ], - "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], - "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], - } - index = [ - "Italy", - "France", - "Malta", - "Maldives", - "Brunei", - "Iceland", - "Nauru", - 
"Tuvalu", - "Anguilla", - ] - modin_df = pd.DataFrame(data=data, index=index) - pandas_df = pandas.DataFrame(data=data, index=index) - df_equals( - modin_df.nsmallest(n=3, columns="population"), - pandas_df.nsmallest(n=3, columns="population"), - ) - df_equals( - modin_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"), - pandas_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "dropna", bool_arg_values, ids=arg_keys("dropna", bool_arg_keys) - ) - def test_nunique(self, data, axis, dropna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_result = modin_df.nunique(axis=axis, dropna=dropna) - pandas_result = pandas_df.nunique(axis=axis, dropna=dropna) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.T.nunique(axis=axis, dropna=dropna) - pandas_result = pandas_df.T.nunique(axis=axis, dropna=dropna) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) - def test_quantile(self, request, data, q): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if not name_contains(request.node.name, no_numeric_dfs): - df_equals(modin_df.quantile(q), pandas_df.quantile(q)) - df_equals(modin_df.quantile(q, axis=1), pandas_df.quantile(q, axis=1)) - - try: - pandas_result = pandas_df.quantile(q, axis=1, numeric_only=False) - except Exception as e: - with pytest.raises(type(e)): - modin_df.quantile(q, axis=1, numeric_only=False) - else: - modin_result = modin_df.quantile(q, axis=1, numeric_only=False) - df_equals(modin_result, pandas_result) - else: - with pytest.raises(ValueError): - modin_df.quantile(q) - - if not name_contains(request.node.name, no_numeric_dfs): - 
df_equals(modin_df.T.quantile(q), pandas_df.T.quantile(q)) - df_equals(modin_df.T.quantile(q, axis=1), pandas_df.T.quantile(q, axis=1)) - - try: - pandas_result = pandas_df.T.quantile(q, axis=1, numeric_only=False) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.quantile(q, axis=1, numeric_only=False) - else: - modin_result = modin_df.T.quantile(q, axis=1, numeric_only=False) - df_equals(modin_result, pandas_result) - else: - with pytest.raises(ValueError): - modin_df.T.quantile(q) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - @pytest.mark.parametrize( - "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"] - ) - def test_rank(self, data, axis, numeric_only, na_option): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.rank( - axis=axis, numeric_only=numeric_only, na_option=na_option - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.rank(axis=axis, numeric_only=numeric_only, na_option=na_option) - else: - modin_result = modin_df.rank( - axis=axis, numeric_only=numeric_only, na_option=na_option - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - def test_skew(self, request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with 
pytest.raises(TypeError): - modin_df.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) - def test_std(self, request, data, axis, skipna, numeric_only, ddof): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - 
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_values(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - np.testing.assert_equal(modin_df.values, pandas_df.values) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) - ) - @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) - ) - @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) - def test_var(self, request, data, axis, skipna, numeric_only, ddof): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception: - with pytest.raises(TypeError): - modin_df.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - - -class TestDataFrameIndexing: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_first_valid_index(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert modin_df.first_valid_index() == (pandas_df.first_valid_index()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("n", int_arg_values, 
ids=arg_keys("n", int_arg_keys)) - def test_head(self, data, n): - # Test normal dataframe head - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.head(n), pandas_df.head(n)) - df_equals(modin_df.head(len(modin_df) + 1), pandas_df.head(len(pandas_df) + 1)) - - # Test head when we call it from a QueryCompilerView - modin_result = modin_df.loc[:, ["col1", "col3", "col3"]].head(n) - pandas_result = pandas_df.loc[:, ["col1", "col3", "col3"]].head(n) - df_equals(modin_result, pandas_result) - - @pytest.mark.skip(reason="Defaulting to Pandas") - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_iat(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - modin_df.iat() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_iloc(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if not name_contains(request.node.name, ["empty_data"]): - # Scaler - np.testing.assert_equal(modin_df.iloc[0, 1], pandas_df.iloc[0, 1]) - - # Series - df_equals(modin_df.iloc[0], pandas_df.iloc[0]) - df_equals(modin_df.iloc[1:, 0], pandas_df.iloc[1:, 0]) - df_equals(modin_df.iloc[1:2, 0], pandas_df.iloc[1:2, 0]) - - # DataFrame - df_equals(modin_df.iloc[[1, 2]], pandas_df.iloc[[1, 2]]) - # See issue #80 - # df_equals(modin_df.iloc[[1, 2], [1, 0]], pandas_df.iloc[[1, 2], [1, 0]]) - df_equals(modin_df.iloc[1:2, 0:2], pandas_df.iloc[1:2, 0:2]) - - # Issue #43 - modin_df.iloc[0:3, :] - - # Write Item - modin_df.iloc[[1, 2]] = 42 - pandas_df.iloc[[1, 2]] = 42 - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df.iloc[0] = modin_df.iloc[1] - pandas_df.iloc[0] = pandas_df.iloc[1] - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df.iloc[:, 0] = 
modin_df.iloc[:, 1] - pandas_df.iloc[:, 0] = pandas_df.iloc[:, 1] - df_equals(modin_df, pandas_df) - - # From issue #1775 - df_equals( - modin_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], - pandas_df.iloc[lambda df: df.index.get_indexer_for(df.index[:5])], - ) - else: - with pytest.raises(IndexError): - modin_df.iloc[0, 1] - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_index(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.index, pandas_df.index) - modin_df_cp = modin_df.copy() - pandas_df_cp = pandas_df.copy() - - modin_df_cp.index = [str(i) for i in modin_df_cp.index] - pandas_df_cp.index = [str(i) for i in pandas_df_cp.index] - df_equals(modin_df_cp.index, pandas_df_cp.index) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_indexing_duplicate_axis(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df.index = pandas_df.index = [i // 3 for i in range(len(modin_df))] - assert any(modin_df.index.duplicated()) - assert any(pandas_df.index.duplicated()) - - df_equals(modin_df.iloc[0], pandas_df.iloc[0]) - df_equals(modin_df.loc[0], pandas_df.loc[0]) - df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4]) - df_equals( - modin_df.loc[0, modin_df.columns[0:4]], - pandas_df.loc[0, pandas_df.columns[0:4]], - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_keys(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.keys(), pandas_df.keys()) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_loc(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - key1 = modin_df.columns[0] - key2 = modin_df.columns[1] - # Scaler - df_equals(modin_df.loc[0, key1], pandas_df.loc[0, key1]) - - # Series - df_equals(modin_df.loc[0], pandas_df.loc[0]) - 
df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1]) - df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1]) - - # DataFrame - df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]]) - - # List-like of booleans - indices = [i % 3 == 0 for i in range(len(modin_df.index))] - columns = [i % 5 == 0 for i in range(len(modin_df.columns))] - modin_result = modin_df.loc[indices, columns] - pandas_result = pandas_df.loc[indices, columns] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[:, columns] - pandas_result = pandas_df.loc[:, columns] - df_equals(modin_result, pandas_result) - - modin_result = modin_df.loc[indices] - pandas_result = pandas_df.loc[indices] - df_equals(modin_result, pandas_result) - - # See issue #80 - # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']]) - df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2]) - - # From issue #421 - df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]]) - df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :]) - - # From issue #1023 - key1 = modin_df.columns[0] - key2 = modin_df.columns[-2] - df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2]) - - # Write Item - modin_df_copy = modin_df.copy() - pandas_df_copy = pandas_df.copy() - modin_df_copy.loc[[1, 2]] = 42 - pandas_df_copy.loc[[1, 2]] = 42 - df_equals(modin_df_copy, pandas_df_copy) - - # From issue #1775 - df_equals( - modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], - pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))], - ) - - # From issue #1374 - with pytest.raises(KeyError): - modin_df.loc["NO_EXIST"] - - def test_loc_multi_index(self): - modin_df = pd.read_csv( - "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0 - ) - pandas_df = pandas.read_csv( - "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0 - ) - - df_equals(modin_df.loc[1], pandas_df.loc[1]) - df_equals(modin_df.loc[1, 
"Presidents"], pandas_df.loc[1, "Presidents"]) - df_equals( - modin_df.loc[1, ("Presidents", "Pure mentions")], - pandas_df.loc[1, ("Presidents", "Pure mentions")], - ) - assert ( - modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] - == pandas_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] - ) - df_equals( - modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2), "Presidents"] - ) - - tuples = [ - ("bar", "one"), - ("bar", "two"), - ("bar", "three"), - ("bar", "four"), - ("baz", "one"), - ("baz", "two"), - ("baz", "three"), - ("baz", "four"), - ("foo", "one"), - ("foo", "two"), - ("foo", "three"), - ("foo", "four"), - ("qux", "one"), - ("qux", "two"), - ("qux", "three"), - ("qux", "four"), - ] - - modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) - pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"]) - frame_data = np.random.randint(0, 100, size=(16, 100)) - modin_df = pd.DataFrame( - frame_data, - index=modin_index, - columns=["col{}".format(i) for i in range(100)], - ) - pandas_df = pandas.DataFrame( - frame_data, - index=pandas_index, - columns=["col{}".format(i) for i in range(100)], - ) - df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"]) - assert ( - modin_df.loc[("bar", "one"), "col1"] - == pandas_df.loc[("bar", "one"), "col1"] - ) - df_equals( - modin_df.loc["bar", ("col1", "col2")], - pandas_df.loc["bar", ("col1", "col2")], - ) - - # From issue #1456 - transposed_modin = modin_df.T - transposed_pandas = pandas_df.T - df_equals( - transposed_modin.loc[transposed_modin.index[:-2], :], - transposed_pandas.loc[transposed_pandas.index[:-2], :], - ) - - # From issue #1610 - df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index]) - df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]]) - - @pytest.mark.parametrize("index", [["row1", "row2", "row3"], ["row1"]]) - @pytest.mark.parametrize("columns", [["col1", "col2"], ["col1"]]) - 
def test_loc_assignment(self, index, columns): - md_df, pd_df = create_test_dfs(index=index, columns=columns) - for i, ind in enumerate(index): - for j, col in enumerate(columns): - value_to_assign = int(str(i) + str(j)) - md_df.loc[ind][col] = value_to_assign - pd_df.loc[ind][col] = value_to_assign - df_equals(md_df, pd_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_loc_nested_assignment(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - key1 = modin_df.columns[0] - key2 = modin_df.columns[1] - - modin_df[key1].loc[0] = 500 - pandas_df[key1].loc[0] = 500 - df_equals(modin_df, pandas_df) - - modin_df[key2].loc[0] = None - pandas_df[key2].loc[0] = None - df_equals(modin_df, pandas_df) - - def test_iloc_assignment(self): - modin_df = pd.DataFrame( - index=["row1", "row2", "row3"], columns=["col1", "col2"] - ) - pandas_df = pandas.DataFrame( - index=["row1", "row2", "row3"], columns=["col1", "col2"] - ) - modin_df.iloc[0]["col1"] = 11 - modin_df.iloc[1]["col1"] = 21 - modin_df.iloc[2]["col1"] = 31 - modin_df.iloc[0]["col2"] = 12 - modin_df.iloc[1]["col2"] = 22 - modin_df.iloc[2]["col2"] = 32 - pandas_df.iloc[0]["col1"] = 11 - pandas_df.iloc[1]["col1"] = 21 - pandas_df.iloc[2]["col1"] = 31 - pandas_df.iloc[0]["col2"] = 12 - pandas_df.iloc[1]["col2"] = 22 - pandas_df.iloc[2]["col2"] = 32 - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_iloc_nested_assignment(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - key1 = modin_df.columns[0] - key2 = modin_df.columns[1] - - modin_df[key1].iloc[0] = 500 - pandas_df[key1].iloc[0] = 500 - df_equals(modin_df, pandas_df) - - modin_df[key2].iloc[0] = None - pandas_df[key2].iloc[0] = None - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_pop(self, request, data): - modin_df = 
pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if "empty_data" not in request.node.name: - key = modin_df.columns[0] - temp_modin_df = modin_df.copy() - temp_pandas_df = pandas_df.copy() - modin_popped = temp_modin_df.pop(key) - pandas_popped = temp_pandas_df.pop(key) - df_equals(modin_popped, pandas_popped) - df_equals(temp_modin_df, temp_pandas_df) - - def test_reindex(self): - frame_data = { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 10, 11], - "col4": [12, 13, 14, 15], - "col5": [0, 0, 0, 0], - } - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - df_equals(modin_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])) - df_equals(modin_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2])) - df_equals( - modin_df.reindex(["col1", "col3", "col4", "col2"], axis=1), - pandas_df.reindex(["col1", "col3", "col4", "col2"], axis=1), - ) - df_equals( - modin_df.reindex(["col1", "col7", "col4", "col8"], axis=1), - pandas_df.reindex(["col1", "col7", "col4", "col8"], axis=1), - ) - df_equals( - modin_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]), - pandas_df.reindex( - index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"] - ), - ) - df_equals( - modin_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0), - pandas_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0), - ) - - def test_reindex_like(self): - df1 = pd.DataFrame( - [ - [24.3, 75.7, "high"], - [31, 87.8, "high"], - [22, 71.6, "medium"], - [35, 95, "medium"], - ], - columns=["temp_celsius", "temp_fahrenheit", "windspeed"], - index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), - ) - df2 = pd.DataFrame( - [[28, "low"], [30, "low"], [35.1, "medium"]], - columns=["temp_celsius", "windspeed"], - index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]), - ) - with pytest.warns(UserWarning): - df2.reindex_like(df1) - - def test_rename_sanity(self): - test_data = 
pandas.DataFrame(tm.getSeriesData()) - mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} - - modin_df = pd.DataFrame(test_data) - df_equals(modin_df.rename(columns=mapping), test_data.rename(columns=mapping)) - - renamed2 = test_data.rename(columns=str.lower) - df_equals(modin_df.rename(columns=str.lower), renamed2) - - modin_df = pd.DataFrame(renamed2) - df_equals( - modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper) - ) - - # index - data = {"A": {"foo": 0, "bar": 1}} - - # gets sorted alphabetical - df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - tm.assert_index_equal( - modin_df.rename(index={"foo": "bar", "bar": "foo"}).index, - df.rename(index={"foo": "bar", "bar": "foo"}).index, - ) - - tm.assert_index_equal( - modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index - ) - - # Using the `mapper` functionality with `axis` - tm.assert_index_equal( - modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index - ) - tm.assert_index_equal( - modin_df.rename(str.upper, axis=1).columns, - df.rename(str.upper, axis=1).columns, - ) - - # have to pass something - with pytest.raises(TypeError): - modin_df.rename() - - # partial columns - renamed = test_data.rename(columns={"C": "foo", "D": "bar"}) - modin_df = pd.DataFrame(test_data) - tm.assert_index_equal( - modin_df.rename(columns={"C": "foo", "D": "bar"}).index, - test_data.rename(columns={"C": "foo", "D": "bar"}).index, - ) - - # other axis - renamed = test_data.T.rename(index={"C": "foo", "D": "bar"}) - tm.assert_index_equal( - test_data.T.rename(index={"C": "foo", "D": "bar"}).index, - modin_df.T.rename(index={"C": "foo", "D": "bar"}).index, - ) - - # index with name - index = pandas.Index(["foo", "bar"], name="name") - renamer = pandas.DataFrame(data, index=index) - modin_df = pd.DataFrame(data, index=index) - - renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) - modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"}) - 
tm.assert_index_equal(renamed.index, modin_renamed.index) - - assert renamed.index.name == modin_renamed.index.name - - def test_rename_multiindex(self): - tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] - tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] - index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) - columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) - - frame_data = [(0, 0), (1, 1)] - df = pandas.DataFrame(frame_data, index=index, columns=columns) - modin_df = pd.DataFrame(frame_data, index=index, columns=columns) - - # - # without specifying level -> accross all levels - renamed = df.rename( - index={"foo1": "foo3", "bar2": "bar3"}, - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, - ) - modin_renamed = modin_df.rename( - index={"foo1": "foo3", "bar2": "bar3"}, - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, - ) - tm.assert_index_equal(renamed.index, modin_renamed.index) - - renamed = df.rename( - index={"foo1": "foo3", "bar2": "bar3"}, - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, - ) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - assert renamed.index.names == modin_renamed.index.names - assert renamed.columns.names == modin_renamed.columns.names - - # - # with specifying a level - - # dict - renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) - modin_renamed = modin_df.rename( - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0 - ) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") - modin_renamed = modin_df.rename( - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz" - ) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - - renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) - modin_renamed = modin_df.rename( - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1 - ) - 
tm.assert_index_equal(renamed.columns, modin_renamed.columns) - renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") - modin_renamed = modin_df.rename( - columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz" - ) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - - # function - func = str.upper - renamed = df.rename(columns=func, level=0) - modin_renamed = modin_df.rename(columns=func, level=0) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - renamed = df.rename(columns=func, level="fizz") - modin_renamed = modin_df.rename(columns=func, level="fizz") - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - - renamed = df.rename(columns=func, level=1) - modin_renamed = modin_df.rename(columns=func, level=1) - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - renamed = df.rename(columns=func, level="buzz") - modin_renamed = modin_df.rename(columns=func, level="buzz") - tm.assert_index_equal(renamed.columns, modin_renamed.columns) - - # index - renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) - modin_renamed = modin_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) - tm.assert_index_equal(modin_renamed.index, renamed.index) - - @pytest.mark.skip(reason="Pandas does not pass this test") - def test_rename_nocopy(self): - test_data = pandas.DataFrame(tm.getSeriesData()) - modin_df = pd.DataFrame(test_data) - modin_renamed = modin_df.rename(columns={"C": "foo"}, copy=False) - modin_renamed["foo"] = 1 - assert (modin_df["C"] == 1).all() - - def test_rename_inplace(self): - test_data = pandas.DataFrame(tm.getSeriesData()) - modin_df = pd.DataFrame(test_data) - - df_equals( - modin_df.rename(columns={"C": "foo"}), - test_data.rename(columns={"C": "foo"}), - ) - - frame = test_data.copy() - modin_frame = modin_df.copy() - frame.rename(columns={"C": "foo"}, inplace=True) - modin_frame.rename(columns={"C": "foo"}, inplace=True) - - df_equals(modin_frame, frame) - - 
def test_rename_bug(self): - # rename set ref_locs, and set_index was not resetting - frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]} - df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - df = df.rename(columns={0: "a"}) - df = df.rename(columns={1: "b"}) - df = df.set_index(["a", "b"]) - df.columns = ["2001-01-01"] - - modin_df = modin_df.rename(columns={0: "a"}) - modin_df = modin_df.rename(columns={1: "b"}) - modin_df = modin_df.set_index(["a", "b"]) - modin_df.columns = ["2001-01-01"] - - df_equals(modin_df, df) - - def test_rename_axis(self): - data = {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]} - index = ["dog", "cat", "monkey"] - modin_df = pd.DataFrame(data, index) - pandas_df = pandas.DataFrame(data, index) - df_equals(modin_df.rename_axis("animal"), pandas_df.rename_axis("animal")) - df_equals( - modin_df.rename_axis("limbs", axis="columns"), - pandas_df.rename_axis("limbs", axis="columns"), - ) - - modin_df.rename_axis("limbs", axis="columns", inplace=True) - pandas_df.rename_axis("limbs", axis="columns", inplace=True) - df_equals(modin_df, pandas_df) - - new_index = pd.MultiIndex.from_product( - [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"] - ) - modin_df.index = new_index - pandas_df.index = new_index - - df_equals( - modin_df.rename_axis(index={"type": "class"}), - pandas_df.rename_axis(index={"type": "class"}), - ) - df_equals( - modin_df.rename_axis(columns=str.upper), - pandas_df.rename_axis(columns=str.upper), - ) - df_equals( - modin_df.rename_axis( - columns=[str.upper(o) for o in modin_df.columns.names] - ), - pandas_df.rename_axis( - columns=[str.upper(o) for o in pandas_df.columns.names] - ), - ) - - with pytest.raises(ValueError): - df_equals( - modin_df.rename_axis(str.upper, axis=1), - pandas_df.rename_axis(str.upper, axis=1), - ) - - def test_rename_axis_inplace(self): - test_frame = pandas.DataFrame(tm.getSeriesData()) - modin_df = pd.DataFrame(test_frame) - - result = 
test_frame.copy() - modin_result = modin_df.copy() - no_return = result.rename_axis("foo", inplace=True) - modin_no_return = modin_result.rename_axis("foo", inplace=True) - - assert no_return is modin_no_return - df_equals(modin_result, result) - - result = test_frame.copy() - modin_result = modin_df.copy() - no_return = result.rename_axis("bar", axis=1, inplace=True) - modin_no_return = modin_result.rename_axis("bar", axis=1, inplace=True) - - assert no_return is modin_no_return - df_equals(modin_result, result) - - def test_reorder_levels(self): - data = np.random.randint(1, 100, 12) - modin_df = pd.DataFrame( - data, - index=pd.MultiIndex.from_tuples( - [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - pandas_df = pandas.DataFrame( - data, - index=pandas.MultiIndex.from_tuples( - [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - df_equals( - modin_df.reorder_levels(["Letter", "Color", "Number"]), - pandas_df.reorder_levels(["Letter", "Color", "Number"]), - ) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_reset_index(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_result = modin_df.reset_index(inplace=False) - pandas_result = pandas_df.reset_index(inplace=False) - df_equals(modin_result, pandas_result) - - modin_df_cp = modin_df.copy() - pd_df_cp = pandas_df.copy() - modin_df_cp.reset_index(inplace=True) - pd_df_cp.reset_index(inplace=True) - df_equals(modin_df_cp, pd_df_cp) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - def test_sample(self, data, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - with 
pytest.raises(ValueError): - modin_df.sample(n=3, frac=0.4, axis=axis) - - with pytest.raises(KeyError): - modin_df.sample(frac=0.5, weights="CoLuMn_No_ExIsT", axis=0) - - with pytest.raises(ValueError): - modin_df.sample(frac=0.5, weights=modin_df.columns[0], axis=1) - - with pytest.raises(ValueError): - modin_df.sample( - frac=0.5, weights=[0.5 for _ in range(len(modin_df.index[:-1]))], axis=0 - ) - - with pytest.raises(ValueError): - modin_df.sample( - frac=0.5, - weights=[0.5 for _ in range(len(modin_df.columns[:-1]))], - axis=1, - ) - - with pytest.raises(ValueError): - modin_df.sample(n=-3, axis=axis) - - with pytest.raises(ValueError): - modin_df.sample(frac=0.2, weights=pandas.Series(), axis=axis) - - if isinstance(axis, str): - num_axis = pandas.DataFrame()._get_axis_number(axis) - else: - num_axis = axis - - # weights that sum to 1 - sums = sum(i % 2 for i in range(len(modin_df.axes[num_axis]))) - weights = [i % 2 / sums for i in range(len(modin_df.axes[num_axis]))] - - modin_result = modin_df.sample( - frac=0.5, random_state=42, weights=weights, axis=axis - ) - pandas_result = pandas_df.sample( - frac=0.5, random_state=42, weights=weights, axis=axis - ) - df_equals(modin_result, pandas_result) - - # weights that don't sum to 1 - weights = [i % 2 for i in range(len(modin_df.axes[num_axis]))] - modin_result = modin_df.sample( - frac=0.5, random_state=42, weights=weights, axis=axis - ) - pandas_result = pandas_df.sample( - frac=0.5, random_state=42, weights=weights, axis=axis - ) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.sample(n=0, axis=axis) - pandas_result = pandas_df.sample(n=0, axis=axis) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.sample(frac=0.5, random_state=42, axis=axis) - pandas_result = pandas_df.sample(frac=0.5, random_state=42, axis=axis) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.sample(n=2, random_state=42, axis=axis) - pandas_result = pandas_df.sample(n=2, 
random_state=42, axis=axis) - df_equals(modin_result, pandas_result) - - # issue #1692, numpy RandomState object - # We must create a new random state for each iteration because the values that - # are selected will be impacted if the object has already been used. - random_state = np.random.RandomState(42) - modin_result = modin_df.sample(frac=0.5, random_state=random_state, axis=axis) - - random_state = np.random.RandomState(42) - pandas_result = pandas_df.sample(frac=0.5, random_state=random_state, axis=axis) - df_equals(modin_result, pandas_result) - - def test_select_dtypes(self): - frame_data = { - "test1": list("abc"), - "test2": np.arange(3, 6).astype("u1"), - "test3": np.arange(8.0, 11.0, dtype="float64"), - "test4": [True, False, True], - "test5": pandas.date_range("now", periods=3).values, - "test6": list(range(5, 8)), - } - df = pandas.DataFrame(frame_data) - rd = pd.DataFrame(frame_data) - - include = np.float, "integer" - exclude = (np.bool_,) - r = rd.select_dtypes(include=include, exclude=exclude) - - e = df[["test2", "test3", "test6"]] - df_equals(r, e) - - r = rd.select_dtypes(include=np.bool_) - e = df[["test4"]] - df_equals(r, e) - - r = rd.select_dtypes(exclude=np.bool_) - e = df[["test1", "test2", "test3", "test5", "test6"]] - df_equals(r, e) - - try: - pd.DataFrame().select_dtypes() - assert False - except ValueError: - assert True - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) - def test_tail(self, data, n): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - df_equals(modin_df.tail(n), pandas_df.tail(n)) - df_equals(modin_df.tail(len(modin_df)), pandas_df.tail(len(pandas_df))) - - def test_xs(self): - d = { - "num_legs": [4, 4, 2, 2], - "num_wings": [0, 0, 2, 2], - "class": ["mammal", "mammal", "mammal", "bird"], - "animal": ["cat", "dog", "bat", "penguin"], - "locomotion": ["walks", "walks", "flies", "walks"], - 
} - df = pd.DataFrame(data=d) - df = df.set_index(["class", "animal", "locomotion"]) - with pytest.warns(UserWarning): - df.xs("mammal") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___getitem__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - key = modin_df.columns[0] - modin_col = modin_df.__getitem__(key) - assert isinstance(modin_col, pd.Series) - - pd_col = pandas_df[key] - df_equals(pd_col, modin_col) - - slices = [ - (None, -1), - (-1, None), - (1, 2), - (1, None), - (None, 1), - (1, -1), - (-3, -1), - (1, -1, 2), - ] - - # slice test - for slice_param in slices: - s = slice(*slice_param) - df_equals(modin_df[s], pandas_df[s]) - - # Test empty - df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10]) - - def test_getitem_empty_mask(self): - # modin-project/modin#517 - modin_frames = [] - pandas_frames = [] - data1 = np.random.randint(0, 100, size=(100, 4)) - mdf1 = pd.DataFrame(data1, columns=list("ABCD")) - pdf1 = pandas.DataFrame(data1, columns=list("ABCD")) - modin_frames.append(mdf1) - pandas_frames.append(pdf1) - - data2 = np.random.randint(0, 100, size=(100, 4)) - mdf2 = pd.DataFrame(data2, columns=list("ABCD")) - pdf2 = pandas.DataFrame(data2, columns=list("ABCD")) - modin_frames.append(mdf2) - pandas_frames.append(pdf2) - - data3 = np.random.randint(0, 100, size=(100, 4)) - mdf3 = pd.DataFrame(data3, columns=list("ABCD")) - pdf3 = pandas.DataFrame(data3, columns=list("ABCD")) - modin_frames.append(mdf3) - pandas_frames.append(pdf3) - - modin_data = pd.concat(modin_frames) - pandas_data = pandas.concat(pandas_frames) - df_equals( - modin_data[[False for _ in modin_data.index]], - pandas_data[[False for _ in modin_data.index]], - ) - - def test_getitem_datetime_slice(self): - data = {"data": range(1000)} - index = pd.date_range("2017/1/4", periods=1000) - modin_df = pd.DataFrame(data=data, index=index) - pandas_df = pandas.DataFrame(data=data, index=index) - - s = 
slice("2017-01-06", "2017-01-09") - df_equals(modin_df[s], pandas_df[s]) - - def test_getitem_same_name(self): - data = [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - [17, 18, 19, 20], - ] - columns = ["c1", "c2", "c1", "c3"] - modin_df = pd.DataFrame(data, columns=columns) - pandas_df = pandas.DataFrame(data, columns=columns) - df_equals(modin_df["c1"], pandas_df["c1"]) - df_equals(modin_df["c2"], pandas_df["c2"]) - df_equals(modin_df[["c1", "c2"]], pandas_df[["c1", "c2"]]) - df_equals(modin_df["c3"], pandas_df["c3"]) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___getattr__(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) # noqa F841 - - if "empty_data" not in request.node.name: - key = modin_df.columns[0] - col = modin_df.__getattr__(key) - - col = modin_df.__getattr__("col1") - assert isinstance(col, pd.Series) - - col = getattr(modin_df, "col1") - assert isinstance(col, pd.Series) - - # Check that lookup in column doesn't override other attributes - df2 = modin_df.rename(index=str, columns={key: "columns"}) - assert isinstance(df2.columns, pandas.Index) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___setitem__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_df.__setitem__(modin_df.columns[-1], 1) - pandas_df.__setitem__(pandas_df.columns[-1], 1) - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_df[modin_df.columns[-1]] = pd.DataFrame(modin_df[modin_df.columns[0]]) - pandas_df[pandas_df.columns[-1]] = pandas.DataFrame( - pandas_df[pandas_df.columns[0]] - ) - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - rows = len(modin_df) - arr = np.arange(rows * 2).reshape(-1, 2) - modin_df[modin_df.columns[-1]] = arr - pandas_df[pandas_df.columns[-1]] = arr - 
df_equals(pandas_df, modin_df) - - with pytest.raises(ValueError, match=r"Wrong number of items passed"): - modin_df["___NON EXISTENT COLUMN"] = arr - - modin_df[modin_df.columns[0]] = np.arange(len(modin_df)) - pandas_df[pandas_df.columns[0]] = np.arange(len(pandas_df)) - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(columns=modin_df.columns) - pandas_df = pandas.DataFrame(columns=pandas_df.columns) - - for col in modin_df.columns: - modin_df[col] = np.arange(1000) - - for col in pandas_df.columns: - pandas_df[col] = np.arange(1000) - - df_equals(modin_df, pandas_df) - - # Test series assignment to column - modin_df = pd.DataFrame(columns=modin_df.columns) - pandas_df = pandas.DataFrame(columns=pandas_df.columns) - modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]] - pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]] - df_equals(modin_df, pandas_df) - - if not sys.version_info.major == 3 and sys.version_info.minor > 6: - # This test doesn't work correctly on Python 3.6 - # Test 2d ndarray assignment to column - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values - pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values - df_equals(modin_df, pandas_df) - assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0])) - - # Transpose test - modin_df = pd.DataFrame(data).T - pandas_df = pandas.DataFrame(data).T - - # We default to pandas on non-string column names - if not all(isinstance(c, str) for c in modin_df.columns): - with pytest.warns(UserWarning): - modin_df[modin_df.columns[0]] = 0 - else: - modin_df[modin_df.columns[0]] = 0 - - pandas_df[pandas_df.columns[0]] = 0 - - df_equals(modin_df, pandas_df) - - modin_df.columns = [str(i) for i in modin_df.columns] - pandas_df.columns = [str(i) for i in pandas_df.columns] - - modin_df[modin_df.columns[0]] = 0 - pandas_df[pandas_df.columns[0]] = 0 - - df_equals(modin_df, pandas_df) - 
- modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345 - pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345 - - df_equals(modin_df, pandas_df) - - def test___setitem__mask(self): - # DataFrame mask: - data = test_data["int_data"] - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - mean = int((RAND_HIGH + RAND_LOW) / 2) - pandas_df[pandas_df > mean] = -50 - modin_df[modin_df > mean] = -50 - - df_equals(modin_df, pandas_df) - - # Array mask: - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - array = (pandas_df > mean).to_numpy() - - modin_df[array] = -50 - pandas_df[array] = -50 - - df_equals(modin_df, pandas_df) - - # Array mask of wrong size: - with pytest.raises(ValueError): - array = np.array([[1, 2], [3, 4]]) - modin_df[array] = 20 - - @pytest.mark.parametrize( - "data", - [ - {}, - pytest.param( - {"id": [], "max_speed": [], "health": []}, - marks=pytest.mark.xfail( - reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now" - ), - ), - ], - ids=["empty", "empty_columns"], - ) - @pytest.mark.parametrize( - "value", - [np.array(["one", "two"]), [11, 22]], - ids=["ndarray", "list"], - ) - @pytest.mark.parametrize("convert_to_series", [False, True]) - @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) - def test_setitem_on_empty_df(self, data, value, convert_to_series, new_col_id): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value - modin_df[new_col_id] = pd.Series(value) if convert_to_series else value - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___len__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - assert len(modin_df) == len(pandas_df) - - def test_index_order(self): - # see #1708 and 
#1869 for details - df_modin, df_pandas = ( - pd.DataFrame(test_data["float_nan_data"]), - pandas.DataFrame(test_data["float_nan_data"]), - ) - rows_number = len(df_modin.index) - level_0 = np.random.choice([x for x in range(10)], rows_number) - level_1 = np.random.choice([x for x in range(10)], rows_number) - index = pandas.MultiIndex.from_arrays([level_0, level_1]) - - df_modin.index = index - df_pandas.index = index - - for func in ["all", "any", "mad", "count"]: - df_equals( - getattr(df_modin, func)(level=0).index, - getattr(df_pandas, func)(level=0).index, - ) - - -class TestDataFrameIter: - def test_items(self): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_items = modin_df.items() - pandas_items = pandas_df.items() - for modin_item, pandas_item in zip(modin_items, pandas_items): - modin_index, modin_series = modin_item - pandas_index, pandas_series = pandas_item - df_equals(pandas_series, modin_series) - assert pandas_index == modin_index - - def test_iteritems(self): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_items = modin_df.iteritems() - pandas_items = pandas_df.iteritems() - for modin_item, pandas_item in zip(modin_items, pandas_items): - modin_index, modin_series = modin_item - pandas_index, pandas_series = pandas_item - df_equals(pandas_series, modin_series) - assert pandas_index == modin_index - - def test_iterrows(self): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_iterrows = modin_df.iterrows() - pandas_iterrows = pandas_df.iterrows() - for modin_row, pandas_row in zip(modin_iterrows, pandas_iterrows): - modin_index, modin_series = modin_row - pandas_index, pandas_series = pandas_row - df_equals(pandas_series, modin_series) - assert pandas_index == modin_index - - @pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) - 
@pytest.mark.parametrize("index", [True, False]) - def test_itertuples(self, name, index): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) - for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): - np.testing.assert_equal(modin_row, pandas_row) - - mi_index_modin = pd.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(modin_df.columns))] - ) - mi_index_pandas = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(pandas_df.columns))] - ) - modin_df.columns = mi_index_modin - pandas_df.columns = mi_index_pandas - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) - for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): - np.testing.assert_equal(modin_row, pandas_row) - - def test___iter__(self): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_iterator = modin_df.__iter__() - - # Check that modin_iterator implements the iterator interface - assert hasattr(modin_iterator, "__iter__") - assert hasattr(modin_iterator, "next") or hasattr(modin_iterator, "__next__") - - pd_iterator = pandas_df.__iter__() - assert list(modin_iterator) == list(pd_iterator) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___contains__(self, request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - result = False - key = "Not Exist" - assert result == modin_df.__contains__(key) - assert result == (key in modin_df) - - if "empty_data" not in request.node.name: - result = True - key = pandas_df.columns[0] - assert result == modin_df.__contains__(key) - assert result == (key in modin_df) - - def test__options_display(self): - frame_data = 
random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - pandas.options.display.max_rows = 10 - pandas.options.display.max_columns = 10 - x = repr(pandas_df) - pd.options.display.max_rows = 5 - pd.options.display.max_columns = 5 - y = repr(modin_df) - assert x != y - pd.options.display.max_rows = 10 - pd.options.display.max_columns = 10 - y = repr(modin_df) - assert x == y - - # test for old fixed max values - pandas.options.display.max_rows = 75 - pandas.options.display.max_columns = 75 - x = repr(pandas_df) - pd.options.display.max_rows = 75 - pd.options.display.max_columns = 75 - y = repr(modin_df) - assert x == y - - def test___finalize__(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).__finalize__(None) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___copy__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_df_copy, pandas_df_copy = modin_df.__copy__(), pandas_df.__copy__() - df_equals(modin_df_copy, pandas_df_copy) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test___deepcopy__(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - modin_df_copy, pandas_df_copy = ( - modin_df.__deepcopy__(), - pandas_df.__deepcopy__(), - ) - df_equals(modin_df_copy, pandas_df_copy) - - def test___repr__(self): - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - assert repr(pandas_df) == repr(modin_df) - - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - assert repr(pandas_df) == repr(modin_df) - - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101)) - pandas_df = 
pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - assert repr(pandas_df) == repr(modin_df) - - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - assert repr(pandas_df) == repr(modin_df) - - # ___repr___ method has a different code path depending on - # whether the number of rows is >60; and a different code path - # depending on the number of columns is >20. - # Previous test cases already check the case when cols>20 - # and rows>60. The cases that follow exercise the other three - # combinations. - # rows <= 60, cols > 20 - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - assert repr(pandas_df) == repr(modin_df) - - # rows <= 60, cols <= 20 - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - assert repr(pandas_df) == repr(modin_df) - - # rows > 60, cols <= 20 - frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10)) - pandas_df = pandas.DataFrame(frame_data) - modin_df = pd.DataFrame(frame_data) - - assert repr(pandas_df) == repr(modin_df) - - # Empty - pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)]) - modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)]) - - assert repr(pandas_df) == repr(modin_df) - - # From Issue #1705 - string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf" -"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5 -"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6 -"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 
-"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 -"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 -"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" - pandas_df = pandas.read_csv(io.StringIO(string_data)) - modin_df = pd.read_csv(io.StringIO(string_data)) - assert repr(pandas_df) == repr(modin_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_reset_index_with_multi_index(self, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if len(modin_df.columns) > len(pandas_df.columns): - col0 = modin_df.columns[0] - col1 = modin_df.columns[1] - modin_cols = modin_df.groupby([col0, col1]).count().reset_index().columns - pandas_cols = pandas_df.groupby([col0, col1]).count().reset_index().columns - - assert modin_cols.equals(pandas_cols) - - def test_reset_index_with_named_index(self): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_df.index.name = pandas_df.index.name = "NAME_OF_INDEX" - df_equals(modin_df, pandas_df) - df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False)) - - modin_df.reset_index(drop=True, inplace=True) - pandas_df.reset_index(drop=True, inplace=True) - df_equals(modin_df, pandas_df) - - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - modin_df.index.name = pandas_df.index.name = "NEW_NAME" - df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_inplace_series_ops(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - if len(modin_df.columns) > len(pandas_df.columns): - col0 = modin_df.columns[0] - col1 = modin_df.columns[1] - pandas_df[col1].dropna(inplace=True) - 
modin_df[col1].dropna(inplace=True) - df_equals(modin_df, pandas_df) - - pandas_df[col0].fillna(0, inplace=True) - modin_df[col0].fillna(0, inplace=True) - df_equals(modin_df, pandas_df) - - def test___setattr__( - self, - ): - pandas_df = pandas.DataFrame([1, 2, 3]) - modin_df = pd.DataFrame([1, 2, 3]) - - pandas_df.new_col = [4, 5, 6] - modin_df.new_col = [4, 5, 6] - - df_equals(modin_df, pandas_df) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_isin(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - val = [1, 2, 3, 4] - pandas_result = pandas_df.isin(val) - modin_result = modin_df.isin(val) - - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_constructor(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - df_equals(pandas_df, modin_df) - - pandas_df = pandas.DataFrame({k: pandas.Series(v) for k, v in data.items()}) - modin_df = pd.DataFrame({k: pd.Series(v) for k, v in data.items()}) - df_equals(pandas_df, modin_df) - - @pytest.mark.parametrize( - "data", - [ - np.arange(1, 10000, dtype=np.float32), - [ - pd.Series([1, 2, 3], dtype="int32"), - pandas.Series([4, 5, 6], dtype="int64"), - np.array([7, 8, 9], dtype=np.float32), - ], - pandas.Categorical([1, 2, 3, 4, 5]), - ], - ) - def test_constructor_dtypes(self, data): - md_df, pd_df = create_test_dfs(data) - df_equals(md_df, pd_df) - - def test_constructor_columns_and_index(self): - modin_df = pd.DataFrame( - [[1, 1, 10], [2, 4, 20], [3, 7, 30]], - index=[1, 2, 3], - columns=["id", "max_speed", "health"], - ) - pandas_df = pandas.DataFrame( - [[1, 1, 10], [2, 4, 20], [3, 7, 30]], - index=[1, 2, 3], - columns=["id", "max_speed", "health"], - ) - df_equals(modin_df, pandas_df) - df_equals(pd.DataFrame(modin_df), pandas.DataFrame(pandas_df)) - df_equals( - pd.DataFrame(modin_df, columns=["max_speed", "health"]), - 
pandas.DataFrame(pandas_df, columns=["max_speed", "health"]), - ) - df_equals( - pd.DataFrame(modin_df, index=[1, 2]), - pandas.DataFrame(pandas_df, index=[1, 2]), - ) - df_equals( - pd.DataFrame(modin_df, index=[1, 2], columns=["health"]), - pandas.DataFrame(pandas_df, index=[1, 2], columns=["health"]), - ) - df_equals( - pd.DataFrame(modin_df.iloc[:, 0], index=[1, 2, 3]), - pandas.DataFrame(pandas_df.iloc[:, 0], index=[1, 2, 3]), - ) - df_equals( - pd.DataFrame(modin_df.iloc[:, 0], columns=["NO_EXIST"]), - pandas.DataFrame(pandas_df.iloc[:, 0], columns=["NO_EXIST"]), - ) - with pytest.raises(NotImplementedError): - pd.DataFrame(modin_df, index=[1, 2, 99999]) - with pytest.raises(NotImplementedError): - pd.DataFrame(modin_df, columns=["NO_EXIST"]) - - -class TestDataFrameJoinSort: - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - def test_combine(self, data): - pandas_df = pandas.DataFrame(data) - modin_df = pd.DataFrame(data) - - modin_df.combine( - modin_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 - ) - pandas_df.combine( - pandas_df + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 - ) - - @pytest.mark.parametrize( - "test_data, test_data2", - [ - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - ), - ], - ) - def test_join(self, test_data, test_data2): - modin_df = pd.DataFrame( - test_data, - columns=["col{}".format(i) for i in range(test_data.shape[1])], - index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), - ) - pandas_df = pandas.DataFrame( - test_data, - columns=["col{}".format(i) for i in 
range(test_data.shape[1])], - index=pandas.Index( - [i for i in range(1, test_data.shape[0] + 1)], name="key" - ), - ) - modin_df2 = pd.DataFrame( - test_data2, - columns=["col{}".format(i) for i in range(test_data2.shape[1])], - index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), - ) - pandas_df2 = pandas.DataFrame( - test_data2, - columns=["col{}".format(i) for i in range(test_data2.shape[1])], - index=pandas.Index( - [i for i in range(1, test_data2.shape[0] + 1)], name="key" - ), - ) - - hows = ["inner", "left", "right", "outer"] - ons = ["col33", "col34"] - sorts = [False, True] - for i in range(4): - for j in range(2): - modin_result = modin_df.join( - modin_df2, - how=hows[i], - on=ons[j], - sort=sorts[j], - lsuffix="_caller", - rsuffix="_other", - ) - pandas_result = pandas_df.join( - pandas_df2, - how=hows[i], - on=ons[j], - sort=sorts[j], - lsuffix="_caller", - rsuffix="_other", - ) - df_equals(modin_result, pandas_result) - - frame_data = { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6], - } - - modin_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - frame_data2 = {"col5": [0], "col6": [1]} - modin_df2 = pd.DataFrame(frame_data2) - pandas_df2 = pandas.DataFrame(frame_data2) - - join_types = ["left", "right", "outer", "inner"] - for how in join_types: - modin_join = modin_df.join(modin_df2, how=how) - pandas_join = pandas_df.join(pandas_df2, how=how) - df_equals(modin_join, pandas_join) - - frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]} - - modin_df3 = pd.DataFrame(frame_data3) - pandas_df3 = pandas.DataFrame(frame_data3) - - join_types = ["left", "outer", "inner"] - for how in join_types: - modin_join = modin_df.join([modin_df2, modin_df3], how=how) - pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) - df_equals(modin_join, pandas_join) - - @pytest.mark.parametrize( - "test_data, test_data2", - [ - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 
** 6)), - np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 7, 2 ** 6)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), - ), - ( - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 7)), - np.random.uniform(0, 100, size=(2 ** 6, 2 ** 6)), - ), - ], - ) - def test_merge(self, test_data, test_data2): - modin_df = pd.DataFrame( - test_data, - columns=["col{}".format(i) for i in range(test_data.shape[1])], - index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), - ) - pandas_df = pandas.DataFrame( - test_data, - columns=["col{}".format(i) for i in range(test_data.shape[1])], - index=pandas.Index( - [i for i in range(1, test_data.shape[0] + 1)], name="key" - ), - ) - modin_df2 = pd.DataFrame( - test_data2, - columns=["col{}".format(i) for i in range(test_data2.shape[1])], - index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), - ) - pandas_df2 = pandas.DataFrame( - test_data2, - columns=["col{}".format(i) for i in range(test_data2.shape[1])], - index=pandas.Index( - [i for i in range(1, test_data2.shape[0] + 1)], name="key" - ), - ) - - hows = ["left", "inner"] - ons = ["col33", ["col33", "col34"]] - sorts = [False, True] - for i in range(2): - for j in range(2): - modin_result = modin_df.merge( - modin_df2, how=hows[i], on=ons[j], sort=sorts[j] - ) - pandas_result = pandas_df.merge( - pandas_df2, how=hows[i], on=ons[j], sort=sorts[j] - ) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.merge( - modin_df2, - how=hows[i], - left_on="key", - right_on="key", - sort=sorts[j], - ) - pandas_result = pandas_df.merge( - pandas_df2, - how=hows[i], - left_on="key", - right_on="key", - sort=sorts[j], - ) - df_equals(modin_result, pandas_result) - - # Test for issue #1771 - modin_df = pd.DataFrame({"name": np.arange(40)}) - modin_df2 = pd.DataFrame({"name": [39], 
"position": [0]}) - pandas_df = pandas.DataFrame({"name": np.arange(40)}) - pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]}) - modin_result = modin_df.merge(modin_df2, on="name", how="inner") - pandas_result = pandas_df.merge(pandas_df2, on="name", how="inner") - df_equals(modin_result, pandas_result) - - frame_data = { - "col1": [0, 1, 2, 3], - "col2": [4, 5, 6, 7], - "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6], - } - - modin_df = pd.DataFrame(frame_data) - pandas_df = pandas.DataFrame(frame_data) - - frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} - modin_df2 = pd.DataFrame(frame_data2) - pandas_df2 = pandas.DataFrame(frame_data2) - - join_types = ["outer", "inner"] - for how in join_types: - # Defaults - modin_result = modin_df.merge(modin_df2, how=how) - pandas_result = pandas_df.merge(pandas_df2, how=how) - df_equals(modin_result, pandas_result) - - # left_on and right_index - modin_result = modin_df.merge( - modin_df2, how=how, left_on="col1", right_index=True - ) - pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on="col1", right_index=True - ) - df_equals(modin_result, pandas_result) - - # left_index and right_on - modin_result = modin_df.merge( - modin_df2, how=how, left_index=True, right_on="col1" - ) - pandas_result = pandas_df.merge( - pandas_df2, how=how, left_index=True, right_on="col1" - ) - df_equals(modin_result, pandas_result) - - # left_on and right_on col1 - modin_result = modin_df.merge( - modin_df2, how=how, left_on="col1", right_on="col1" - ) - pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on="col1", right_on="col1" - ) - df_equals(modin_result, pandas_result) - - # left_on and right_on col2 - modin_result = modin_df.merge( - modin_df2, how=how, left_on="col2", right_on="col2" - ) - pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on="col2", right_on="col2" - ) - df_equals(modin_result, pandas_result) - - # left_index and right_index - modin_result = modin_df.merge( - modin_df2, 
how=how, left_index=True, right_index=True - ) - pandas_result = pandas_df.merge( - pandas_df2, how=how, left_index=True, right_index=True - ) - df_equals(modin_result, pandas_result) - - # Named Series promoted to DF - s = pd.Series(frame_data2.get("col1")) - with pytest.raises(ValueError): - modin_df.merge(s) - - s = pd.Series(frame_data2.get("col1"), name="col1") - df_equals(modin_df.merge(s), modin_df.merge(modin_df2[["col1"]])) - - with pytest.raises(TypeError): - modin_df.merge("Non-valid type") - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) - ) - @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) - @pytest.mark.parametrize( - "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) - ) - def test_sort_index(self, data, axis, ascending, na_position, sort_remaining): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # Change index value so sorting will actually make a difference - if axis == "rows" or axis == 0: - length = len(modin_df.index) - modin_df.index = [(i - length / 2) % length for i in range(length)] - pandas_df.index = [(i - length / 2) % length for i in range(length)] - # Add NaNs to sorted index - if axis == "rows" or axis == 0: - length = len(modin_df.index) - modin_df.index = [ - np.nan if i % 2 == 0 else modin_df.index[i] for i in range(length) - ] - pandas_df.index = [ - np.nan if i % 2 == 0 else pandas_df.index[i] for i in range(length) - ] - else: - length = len(modin_df.columns) - modin_df.columns = [ - np.nan if i % 2 == 0 else modin_df.columns[i] for i in range(length) - ] - pandas_df.columns = [ - np.nan if i % 2 == 0 else pandas_df.columns[i] for i in range(length) - ] - - modin_result = modin_df.sort_index( - axis=axis, ascending=ascending, na_position=na_position, 
inplace=False - ) - pandas_result = pandas_df.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=False - ) - df_equals(modin_result, pandas_result) - - modin_df_cp = modin_df.copy() - pandas_df_cp = pandas_df.copy() - modin_df_cp.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=True - ) - pandas_df_cp.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=True - ) - df_equals(modin_df_cp, pandas_df_cp) - - # MultiIndex - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df.index = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(modin_df))] - ) - pandas_df.index = pandas.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(pandas_df))] - ) - modin_df.columns = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(modin_df.columns))] - ) - pandas_df.columns = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(pandas_df.columns))] - ) - - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(level=0), pandas_df.sort_index(level=0)) - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(axis=0), pandas_df.sort_index(axis=0)) - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(axis=1), pandas_df.sort_index(axis=1)) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) - @pytest.mark.parametrize( - "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) - ) - @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) - def test_sort_values(self, request, data, axis, ascending, na_position): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if "empty_data" not in request.node.name and ( - (axis == 0 or axis == "over rows") - or name_contains(request.node.name, numeric_dfs) - ): - index = ( - modin_df.index if axis == 1 
or axis == "columns" else modin_df.columns - ) - key = index[0] - modin_result = modin_df.sort_values( - key, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=False, - ) - pandas_result = pandas_df.sort_values( - key, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=False, - ) - df_equals(modin_result, pandas_result) - - modin_df_cp = modin_df.copy() - pandas_df_cp = pandas_df.copy() - modin_df_cp.sort_values( - key, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=True, - ) - pandas_df_cp.sort_values( - key, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=True, - ) - df_equals(modin_df_cp, pandas_df_cp) - - keys = [key, index[-1]] - modin_result = modin_df.sort_values( - keys, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=False, - ) - pandas_result = pandas_df.sort_values( - keys, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=False, - ) - df_equals(modin_result, pandas_result) - - modin_df_cp = modin_df.copy() - pandas_df_cp = pandas_df.copy() - modin_df_cp.sort_values( - keys, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=True, - ) - pandas_df_cp.sort_values( - keys, - axis=axis, - ascending=ascending, - na_position=na_position, - inplace=True, - ) - df_equals(modin_df_cp, pandas_df_cp) - - def test_sort_values_with_duplicates(self): - modin_df = pd.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) - pandas_df = pandas.DataFrame({"col": [2, 1, 1]}, index=[1, 1, 0]) - - key = modin_df.columns[0] - modin_result = modin_df.sort_values(key, inplace=False) - pandas_result = pandas_df.sort_values(key, inplace=False) - df_equals(modin_result, pandas_result) - - modin_df.sort_values(key, inplace=True) - pandas_df.sort_values(key, inplace=True) - df_equals(modin_df, pandas_df) - - def test_sort_values_with_string_index(self): - modin_df = pd.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", 
"aaa"]) - pandas_df = pandas.DataFrame({"col": [25, 17, 1]}, index=["ccc", "bbb", "aaa"]) - - key = modin_df.columns[0] - modin_result = modin_df.sort_values(key, inplace=False) - pandas_result = pandas_df.sort_values(key, inplace=False) - df_equals(modin_result, pandas_result) - - modin_df.sort_values(key, inplace=True) - pandas_df.sort_values(key, inplace=True) - df_equals(modin_df, pandas_df) - - def test_where(self): - frame_data = random_state.randn(100, 10) - pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij")) - modin_df = pd.DataFrame(frame_data, columns=list("abcdefghij")) - pandas_cond_df = pandas_df % 5 < 2 - modin_cond_df = modin_df % 5 < 2 - - pandas_result = pandas_df.where(pandas_cond_df, -pandas_df) - modin_result = modin_df.where(modin_cond_df, -modin_df) - assert all((to_pandas(modin_result) == pandas_result).all()) - - other = pandas_df.loc[3] - pandas_result = pandas_df.where(pandas_cond_df, other, axis=1) - modin_result = modin_df.where(modin_cond_df, other, axis=1) - assert all((to_pandas(modin_result) == pandas_result).all()) - - other = pandas_df["e"] - pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) - modin_result = modin_df.where(modin_cond_df, other, axis=0) - assert all((to_pandas(modin_result) == pandas_result).all()) - - pandas_result = pandas_df.where(pandas_df < 2, True) - modin_result = modin_df.where(modin_df < 2, True) - assert all((to_pandas(modin_result) == pandas_result).all()) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_cummax_int_and_float(axis): + data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) From 826699e1c28059e1109d1c62b433b9a45f358f56 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 3 Sep 2020 05:31:00 -0500 Subject: [PATCH 088/120] REFACTOR-#1879: Move logic for `groupby.agg` into query 
compiler (#1885) Signed-off-by: Devin Petersohn --- modin/backends/pandas/query_compiler.py | 25 ++++++++++++++++++++ modin/pandas/groupby.py | 31 ++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index bdf05597049..9bfdfdc165d 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2192,6 +2192,31 @@ def _callable_func(self, func, axis, *args, **kwargs): lambda df, **kwargs: pandas.DataFrame(df.size()), lambda df, **kwargs: df.sum() ) + def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): + """Apply aggregation functions to a grouped dataframe per-column. + + Parameters + ---------- + by : PandasQueryCompiler + The column to group by + func_dict : dict of str, callable/string + The dictionary mapping of column to function + groupby_args : dict + The dictionary of keyword arguments for the group by. + agg_args : dict + The dictionary of keyword arguments for the aggregation functions + drop : bool + Whether or not to drop the column from the data. + + Returns + ------- + PandasQueryCompiler + The result of the per-column aggregations on the grouped dataframe. 
+ """ + return self.default_to_pandas( + lambda df: df.groupby(by=by, **groupby_args).agg(func_dict, **agg_args) + ) + def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): # since we're going to modify `groupby_args` dict in a `groupby_agg_builder`, # we want to copy it to not propagate these changes into source dict, in case diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 4773d8f877a..1a43f458dfc 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -363,19 +363,44 @@ def aggregate(self, func=None, *args, **kwargs): # This is not implemented in pandas, # so we throw a different message raise NotImplementedError("axis other than 0 is not supported") + if isinstance(func, dict) or func is None: + if func is None: + func = {} + else: + if any(i not in self._df.columns for i in func.keys()): + from pandas.core.base import SpecificationError - if func is None or is_list_like(func): + raise SpecificationError("nested renamer is not supported") + if isinstance(self._by, type(self._query_compiler)): + by = list(self._by.columns) + else: + by = self._by + # We convert to the string version of the function for simplicity. 
+ func_dict = { + k: v if not callable(v) or v.__name__ not in dir(self) else v.__name__ + for k, v in func.items() + } + subset_cols = list(func_dict.keys()) + ( + list(self._by.columns) + if isinstance(self._by, type(self._query_compiler)) + and all(c in self._df.columns for c in self._by.columns) + else [] + ) + return type(self._df)( + query_compiler=self._df[subset_cols]._query_compiler.groupby_dict_agg( + by, func_dict, self._kwargs, kwargs, drop=self._drop + ) + ) + if is_list_like(func): return self._default_to_pandas( lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), *args, **kwargs, ) - if isinstance(func, str): agg_func = getattr(self, func, None) if callable(agg_func): return agg_func(*args, **kwargs) - return self._apply_agg_function( lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), drop=self._as_index, From badecb4268bbe80f81c48ccfb07e3ef2fc6f96be Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Thu, 3 Sep 2020 08:56:57 -0500 Subject: [PATCH 089/120] FIX-#1998: Unpin msgpack to satisfy Ray requirements (#1999) Signed-off-by: Gregory Shimansky --- requirements.txt | 2 +- requirements/df_test_requires.txt | 2 +- requirements/env_windows.yml | 2 +- requirements/windows_test_requires.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e1ccda9eb8a..374602b1320 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ openpyxl xlrd matplotlib<=3.2.2 sqlalchemy -msgpack<1.0 +msgpack pandas_gbq cloudpickle rpyc diff --git a/requirements/df_test_requires.txt b/requirements/df_test_requires.txt index 15b759d8145..47c726ecae3 100644 --- a/requirements/df_test_requires.txt +++ b/requirements/df_test_requires.txt @@ -8,5 +8,5 @@ matplotlib xarray scipy Jinja2 -msgpack<1.0 +msgpack pandas_gbq diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index 2203119ee96..c1abadd0b63 100644 --- a/requirements/env_windows.yml +++ 
b/requirements/env_windows.yml @@ -35,6 +35,6 @@ dependencies: - xlrd - matplotlib - sqlalchemy - - msgpack<1.0 + - msgpack - cloudpickle - rpyc diff --git a/requirements/windows_test_requires.txt b/requirements/windows_test_requires.txt index 89cedc905b9..efd109e9612 100644 --- a/requirements/windows_test_requires.txt +++ b/requirements/windows_test_requires.txt @@ -21,4 +21,4 @@ openpyxl xlrd matplotlib<=3.2.2 sqlalchemy -msgpack<1.0 +msgpack From 587a46636331c95afc57e4219c3b873896a8a387 Mon Sep 17 00:00:00 2001 From: YarShev Date: Fri, 4 Sep 2020 00:04:44 +0300 Subject: [PATCH 090/120] FIX-#1953: Fix computing of reduced indices (#1960) for reduction operation Signed-off-by: Igoshev, Yaroslav --- modin/engines/base/frame/data.py | 27 +- modin/pandas/base.py | 193 ---------- modin/pandas/dataframe.py | 333 ++++++++++++++++- modin/pandas/series.py | 340 +++++++++++++++++- modin/pandas/test/dataframe/test_reduction.py | 24 ++ modin/pandas/test/dataframe/test_window.py | 48 +++ modin/pandas/test/test_series.py | 66 ++++ 7 files changed, 800 insertions(+), 231 deletions(-) diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index b6bfdec1801..01ae29917da 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -78,9 +78,9 @@ def __init__( ) self._column_widths_cache = column_widths self._dtypes = dtypes - self._filter_empties() if validate_axes is not False: self._validate_internal_indices(mode=validate_axes) + self._filter_empties() @property def _row_lengths(self): @@ -284,6 +284,11 @@ def _validate_axis_equality(self, axis: int, force: bool = False): is_lenghts_matches = len(self.axes[axis]) == len(internal_axis) if not is_equals: if force: + if not is_lenghts_matches: + if axis: + self._column_widths_cache = None + else: + self._row_lengths_cache = None new_axis = self.axes[axis] if is_lenghts_matches else internal_axis self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches) else: @@ -336,9 
+341,9 @@ def _validate_internal_indices(self, mode=None, **kwargs): args = args_dict.get(mode, args_dict["custom"]) if args.get("validate_index", True): - self._validate_axis_equality(axis=0) + self._validate_axis_equality(axis=0, force=args.get("force")) if args.get("validate_columns", True): - self._validate_axis_equality(axis=1) + self._validate_axis_equality(axis=1, force=args.get("force")) def _apply_index_objs(self, axis=None): """Lazily applies the index object (Index or Columns) to the partitions. @@ -1000,13 +1005,19 @@ def _compute_map_reduce_metadata(self, axis, new_parts): ) def _fold_reduce(self, axis, func): - """Applies map that reduce Manager to series but require knowledge of full axis. + """ + Apply function that reduce Manager to series but require knowledge of full axis. - Args: - func: Function to reduce the Manager by. This function takes in a Manager. - axis: axis to apply the function to. + Parameters + ---------- + axis : 0 or 1 + The axis to apply the function to (0 - index, 1 - columns). + func : callable + The function to reduce the Manager by. This function takes in a Manager. - Return: + Returns + ------- + BasePandasFrame Pandas series containing the reduced data. """ func = self._build_mapreduce_func(axis, func) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 71450b75182..4bf2f4c0c12 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1655,29 +1655,6 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): ) ) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Computes median across the DataFrame. - - Args: - axis (int): The axis to take the median on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The median of the DataFrame. 
(Pandas series) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.median( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - ) - def memory_usage(self, index=True, deep=False): """Returns the memory usage of each column in bytes @@ -1862,52 +1839,6 @@ def pow(self, other, axis="columns", level=None, fill_value=None): "pow", other, axis=axis, level=level, fill_value=fill_value ) - def prod( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """Return the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 0 - - Returns: - prod : Series or DataFrame (if level specified) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) - if min_count > 1: - return data._reduce_dimension( - query_compiler=data._query_compiler.prod_min_count( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - return data._reduce_dimension( - data._query_compiler.prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - - product = prod radd = add def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): @@ -2733,32 +2664,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): else: return self.tshift(periods, freq) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Return unbiased skew over requested axis Normalized by N-1 - - Args: - axis : {index (0), columns 
(1)} - skipna : boolean, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - numeric_only : boolean, default None - - Returns: - skew : Series or DataFrame (if level specified) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.skew( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - ) - def sort_index( self, axis=0, @@ -2842,33 +2747,6 @@ def sort_values( ) return self._create_or_update_from_compiler(result, inplace) - def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes standard deviation across the DataFrame. - - Args: - axis (int): The axis to take the std on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The std of the DataFrame (Pandas Series) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.std( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - ) - def sub(self, other, axis="columns", level=None, fill_value=None): """Subtract a DataFrame/Series/scalar from this DataFrame. @@ -2887,50 +2765,6 @@ def sub(self, other, axis="columns", level=None, fill_value=None): subtract = sub - def sum( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """Perform a sum across the DataFrame. - - Args: - axis (int): The axis to sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The sum of the DataFrame. 
- """ - axis = self._get_axis_number(axis) if axis is not None else 0 - data = self._validate_dtypes_sum_prod_mean( - axis, numeric_only, ignore_axis=False - ) - if min_count > 1: - return data._reduce_dimension( - query_compiler=data._query_compiler.sum_min_count( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - return data._reduce_dimension( - data._query_compiler.sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - def swapaxes(self, axis1, axis2, copy=True): axis1 = self._get_axis_number(axis1) axis2 = self._get_axis_number(axis2) @@ -3333,33 +3167,6 @@ def tz_localize( ) return self.set_axis(labels=new_labels, axis=axis, inplace=not copy) - def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes variance across the DataFrame. - - Args: - axis (int): The axis to take the variance on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The variance of the DataFrame. - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.var( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - ) - def __abs__(self): """Creates a modified DataFrame by taking the absolute value. 
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 9e9bfa384b8..1d71d36003f 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1376,6 +1376,53 @@ def lt(self, other, axis="columns", level=None): "lt", other, axis=axis, level=level, broadcast=isinstance(other, Series) ) + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The median of the values for the requested axis + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + def melt( self, id_vars=None, @@ -1820,6 +1867,32 @@ def prod( min_count=0, **kwargs, ): + """ + Return the product of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The product of the values for the requested axis. + """ axis = self._get_axis_number(axis) axis_to_apply = self.columns if axis else self.index if ( @@ -1831,9 +1904,22 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) + + data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) if min_count > 1: - return self._reduce_dimension( - query_compiler=self._query_compiler.prod_min_count( + return data._reduce_dimension( + data._query_compiler.prod_min_count( axis=axis, skipna=skipna, level=level, @@ -1842,13 +1928,15 @@ def prod( **kwargs, ) ) - return super(DataFrame, self).prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + return data._reduce_dimension( + data._query_compiler.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) product = prod @@ -2167,6 +2255,53 @@ def set_index( if not inplace: return frame + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return unbiased skew over 
requested axis. Normalized by N-1 + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : boolean, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), + count along a particular level, collapsing into a Series. + numeric_only : boolean, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased skew over requested axis. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + @property def sparse(self): return self._default_to_pandas(pandas.DataFrame.sparse) @@ -2182,6 +2317,62 @@ def squeeze(self, axis=None): else: return self.copy() + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + The axis to take the std on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. 
+ ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The sample standard deviation. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def stack(self, level=-1, dropna=True): """ Stack the prescribed level(s) from columns to index. @@ -2244,6 +2435,32 @@ def sum( min_count=0, **kwargs, ): + """ + Return the sum of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + Series or DataFrame (if level specified) + The sum of the values for the requested axis + """ axis = self._get_axis_number(axis) axis_to_apply = self.columns if axis else self.index if ( @@ -2255,13 +2472,41 @@ def sum( return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) - return super(DataFrame, self).sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean( + axis, numeric_only, ignore_axis=False + ) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) def _to_datetime(self, **kwargs): @@ -2488,6 +2733,62 @@ def update( ) self._update_inplace(new_query_compiler=query_compiler) + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + The axis to take the variance on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. 
The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The unbiased variance. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def where( self, cond, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index b9c41f60454..65551ab381c 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -990,6 +990,53 @@ def arg(s): ) ) + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + scalar or Series (if level specified) + The median of the values for the requested axis + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + def memory_usage(self, index=True, deep=False): if index: result = self._reduce_dimension( @@ -1109,6 +1156,109 @@ def unstack(self, level=-1, fill_value=None): return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return unbiased skew over requested axis. Normalized by N-1 + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : boolean, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), + count along a particular level, collapsing into a scalar. + numeric_only : boolean, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Unbiased skew over requested axis. 
+ """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0)} + The axis to take the std on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The sample standard deviation. 
+ """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + @property def plot( self, @@ -1154,17 +1304,69 @@ def prod( min_count=0, **kwargs, ): + """ + Return the product of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The product of the values for the requested axis. 
+ """ axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan - return super(Series, self).prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) product = prod @@ -1458,17 +1660,71 @@ def sum( min_count=0, **kwargs, ): + """ + Return the sum of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + scalar or Series (if level specified) + The sum of the values for the requested axis + """ axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan - return super(Series, self).sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean( + axis, numeric_only, ignore_axis=False + ) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) def swaplevel(self, i=-2, j=-1, copy=True): @@ -1659,6 +1915,62 @@ def value_counts( ) ) + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0)} + The axis to take the variance on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. 
If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The unbiased variance. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def view(self, dtype=None): return self.__constructor__( query_compiler=self._query_compiler.series_view(dtype=dtype) diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py index dc3587a58c0..85d1246fd29 100644 --- a/modin/pandas/test/dataframe/test_reduction.py +++ b/modin/pandas/test/dataframe/test_reduction.py @@ -279,6 +279,18 @@ def test_prod( ), ) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.prod(level=0) + pandas_result = pandas_df.prod(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize( "numeric_only", @@ -315,6 +327,18 @@ def test_sum(data, axis, skipna, is_transposed): ), ) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], 
index=arrays + ) + modin_result = modin_df.sum(level=0) + pandas_result = pandas_df.sum(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize( "numeric_only", diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index 621b776def7..e228606a5ae 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -570,6 +570,18 @@ def test_median(request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.median(level=0) + pandas_result = pandas_df.median(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @@ -782,6 +794,18 @@ def test_skew(request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.skew(level=0) + pandas_result = pandas_df.skew(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @@ -824,6 +848,18 @@ def test_std(request, data, axis, skipna, numeric_only, ddof): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( 
+ [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.std(level=0) + pandas_result = pandas_df.std(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_values(data): @@ -872,3 +908,15 @@ def test_var(request, data, axis, skipna, numeric_only, ddof): modin_result = modin_df.T.var( axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof ) + + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.var(level=0) + pandas_result = pandas_df.var(level=0) + df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 95fa93ec505..8695d7004b3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1971,6 +1971,17 @@ def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.median(level=0) + pandas_result = pandas_series.median(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", [True, False], ids=["True", "False"]) @@ -2200,6 +2211,17 @@ def test_prod(data, 
axis, skipna, numeric_only, min_count, operation): min_count=min_count, ) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.prod(level=0) + pandas_result = pandas_series.prod(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) @@ -2660,6 +2682,17 @@ def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.skew(level=0) + pandas_result = pandas_series.skew(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray"]) @@ -2778,6 +2811,17 @@ def test_std(request, data, skipna, ddof): modin_result = modin_series.std(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.std(level=0) + pandas_result = pandas_series.std(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) def test_sub(data): @@ -2826,6 +2870,17 @@ def test_sum(data, axis, skipna, numeric_only, min_count): min_count=min_count, ) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.sum(level=0) + pandas_result = pandas_series.sum(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1, "columns", "index"]) @@ -3204,6 +3259,17 @@ def test_var(data, skipna, ddof): modin_result = modin_series.var(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.var(level=0) + pandas_result = pandas_series.var(level=0) + df_equals(modin_result, pandas_result) + def test_view(): modin_series = pd.Series([-2, -1, 0, 1, 2], dtype="int8") From 4d75a6d840d2c2c5117628e261d9f9edcc410f8d Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Fri, 4 Sep 2020 11:59:17 +0300 Subject: [PATCH 091/120] FEAT-#1944: Added ability to broadcast multi-partitioned frames (#1945) Signed-off-by: Dmitry Chigarev --- modin/engines/base/frame/partition_manager.py | 55 ++++++------ .../pandas_on_dask/frame/partition_manager.py | 51 +++++------ .../pandas_on_ray/frame/partition_manager.py | 84 +++++++------------ modin/pandas/dataframe.py | 5 -- modin/pandas/test/test_groupby.py | 22 +++++ 5 files changed, 104 insertions(+), 113 deletions(-) diff --git 
a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index f1340fd1e5b..52c75c1ac3b 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -76,25 +76,10 @@ def row_partitions(cls, partitions): @classmethod def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): - by_parts = np.squeeze(by) - if len(by_parts.shape) == 0: - by_parts = np.array([by_parts.item()]) - [obj.drain_call_queue() for obj in by_parts] - new_partitions = np.array( - [ - [ - part.add_to_apply_calls( - map_func, - other=by_parts[col_idx].get() - if axis - else by_parts[row_idx].get(), - ) - for col_idx, part in enumerate(partitions[row_idx]) - ] - for row_idx in range(len(partitions)) - ] + mapped_partitions = cls.broadcast_apply( + axis, map_func, left=partitions, right=by, other_name="other" ) - return cls.map_axis_partitions(axis, new_partitions, reduce_func) + return cls.map_axis_partitions(axis, mapped_partitions, reduce_func) @classmethod def broadcast_apply_select_indices( @@ -167,35 +152,43 @@ def get_partitions(index): return new_partitions @classmethod - def broadcast_apply(cls, axis, apply_func, left, right): + def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"): """Broadcast the right partitions to left and apply a function. Note: This will often be overridden by implementations. It materializes the entire partitions of the right and applies them to the left through `apply`. - Args: + Parameters + ---------- axis: The axis to apply and broadcast over. apply_func: The function to apply. left: The left partitions. right: The right partitions. + other_name: Name of key-value argument for `apply_func` that + obtains `right`. (optional, by default it's `"r"`) - Returns: + Returns + ------- A new `np.array` of partition objects. 
""" - if right.shape == (1, 1): - right_parts = right[0] - else: - right_parts = np.squeeze(right) + [obj.drain_call_queue() for row in right for obj in row] + new_right = np.empty(shape=right.shape[axis], dtype=object) - [obj.drain_call_queue() for obj in right_parts] - return np.array( + if axis: + right = right.T + + for i in range(len(right)): + new_right[i] = pandas.concat( + [right[i][j].get() for j in range(len(right[i]))], axis=axis ^ 1 + ) + right = new_right.T if axis else new_right + + new_partitions = np.array( [ [ part.add_to_apply_calls( apply_func, - r=right_parts[col_idx].get() - if axis - else right_parts[row_idx].get(), + **{other_name: right[col_idx] if axis else right[row_idx]}, ) for col_idx, part in enumerate(left[row_idx]) ] @@ -203,6 +196,8 @@ def broadcast_apply(cls, axis, apply_func, left, right): ] ) + return new_partitions + @classmethod def map_partitions(cls, partitions, map_func): """Applies `map_func` to every partition. diff --git a/modin/engines/dask/pandas_on_dask/frame/partition_manager.py b/modin/engines/dask/pandas_on_dask/frame/partition_manager.py index 115bbe818f7..afd58db5d41 100644 --- a/modin/engines/dask/pandas_on_dask/frame/partition_manager.py +++ b/modin/engines/dask/pandas_on_dask/frame/partition_manager.py @@ -20,12 +20,13 @@ ) from .partition import PandasOnDaskFramePartition from modin.error_message import ErrorMessage +import pandas from distributed.client import _get_global_client import cloudpickle as pkl -def deploy_func(df, other, apply_func, call_queue_df=None, call_queue_other=None): +def deploy_func(df, apply_func, call_queue_df=None, call_queues_other=None, *others): if call_queue_df is not None and len(call_queue_df) > 0: for call, kwargs in call_queue_df: if isinstance(call, bytes): @@ -33,16 +34,20 @@ def deploy_func(df, other, apply_func, call_queue_df=None, call_queue_other=None if isinstance(kwargs, bytes): kwargs = pkl.loads(kwargs) df = call(df, **kwargs) - if call_queue_other is not None and 
len(call_queue_other) > 0: - for call, kwargs in call_queue_other: - if isinstance(call, bytes): - call = pkl.loads(call) - if isinstance(kwargs, bytes): - kwargs = pkl.loads(kwargs) - other = call(other, **kwargs) + new_others = np.empty(shape=len(others), dtype=object) + for i, call_queue_other in enumerate(call_queues_other): + other = others[i] + if call_queue_other is not None and len(call_queue_other) > 0: + for call, kwargs in call_queue_other: + if isinstance(call, bytes): + call = pkl.loads(call) + if isinstance(kwargs, bytes): + kwargs = pkl.loads(kwargs) + other = call(other, **kwargs) + new_others[i] = other if isinstance(apply_func, bytes): apply_func = pkl.loads(apply_func) - return apply_func(df, other) + return apply_func(df, new_others) class DaskFrameManager(BaseFrameManager): @@ -98,16 +103,12 @@ def get_indices(cls, axis, partitions, index_func): return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx @classmethod - def broadcast_apply(cls, axis, apply_func, left, right): + def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"): + def mapper(df, others): + other = pandas.concat(others, axis=axis ^ 1) + return apply_func(df, **{other_name: other}) + client = _get_global_client() - right_parts = np.squeeze(right) - if len(right_parts.shape) == 0: - right_parts = np.array([right_parts.item()]) - assert ( - len(right_parts.shape) == 1 - ), "Invalid broadcast partitions shape {}\n{}".format( - right_parts.shape, [[i.get() for i in j] for j in right_parts] - ) return np.array( [ [ @@ -115,14 +116,16 @@ def broadcast_apply(cls, axis, apply_func, left, right): client.submit( deploy_func, part.future, - right_parts[col_idx].future - if axis - else right_parts[row_idx].future, - apply_func, + mapper, part.call_queue, - right_parts[col_idx].call_queue + [obj[col_idx].call_queue for obj in right] if axis - else right_parts[row_idx].call_queue, + else [obj.call_queue for obj in right[row_idx]], + *( + [obj[col_idx].future for obj 
in right] + if axis + else [obj.future for obj in right[row_idx]] + ), pure=False, ) ) diff --git a/modin/engines/ray/pandas_on_ray/frame/partition_manager.py b/modin/engines/ray/pandas_on_ray/frame/partition_manager.py index 814e94d8633..9ca7ea7a74b 100644 --- a/modin/engines/ray/pandas_on_ray/frame/partition_manager.py +++ b/modin/engines/ray/pandas_on_ray/frame/partition_manager.py @@ -20,12 +20,13 @@ ) from .partition import PandasOnRayFramePartition from modin.error_message import ErrorMessage +import pandas import ray @ray.remote -def func(df, other, apply_func, call_queue_df=None, call_queue_other=None): +def func(df, apply_func, call_queue_df=None, call_queues_other=None, *others): if call_queue_df is not None and len(call_queue_df) > 0: for call, kwargs in call_queue_df: if isinstance(call, ray.ObjectID): @@ -33,14 +34,18 @@ def func(df, other, apply_func, call_queue_df=None, call_queue_other=None): if isinstance(kwargs, ray.ObjectID): kwargs = ray.get(kwargs) df = call(df, **kwargs) - if call_queue_other is not None and len(call_queue_other) > 0: - for call, kwargs in call_queue_other: - if isinstance(call, ray.ObjectID): - call = ray.get(call) - if isinstance(kwargs, ray.ObjectID): - kwargs = ray.get(kwargs) - other = call(other, **kwargs) - return apply_func(df, other) + new_others = np.empty(shape=len(others), dtype=object) + for i, call_queue_other in enumerate(call_queues_other): + other = others[i] + if call_queue_other is not None and len(call_queue_other) > 0: + for call, kwargs in call_queue_other: + if isinstance(call, ray.ObjectID): + call = ray.get(call) + if isinstance(kwargs, ray.ObjectID): + kwargs = ray.get(kwargs) + other = call(other, **kwargs) + new_others[i] = other + return apply_func(df, new_others) class PandasOnRayFrameManager(RayFrameManager): @@ -95,59 +100,28 @@ def get_indices(cls, axis, partitions, index_func=None): return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx @classmethod - def groupby_reduce( - cls, 
axis, partitions, by, map_func, reduce_func - ): # pragma: no cover - map_func = ray.put(map_func) - by_parts = np.squeeze(by) - if len(by_parts.shape) == 0: - by_parts = np.array([by_parts.item()]) - new_partitions = np.array( - [ - [ - PandasOnRayFramePartition( - func.remote( - part.oid, - by_parts[col_idx].oid if axis else by_parts[row_idx].oid, - map_func, - part.call_queue, - by_parts[col_idx].call_queue - if axis - else by_parts[row_idx].call_queue, - ) - ) - for col_idx, part in enumerate(partitions[row_idx]) - ] - for row_idx in range(len(partitions)) - ] - ) - return cls.map_axis_partitions(axis, new_partitions, reduce_func) + def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"): + def mapper(df, others): + other = pandas.concat(others, axis=axis ^ 1) + return apply_func(df, **{other_name: other}) - @classmethod - def broadcast_apply(cls, axis, apply_func, left, right): - map_func = ray.put(apply_func) - right_parts = np.squeeze(right) - if len(right_parts.shape) == 0: - right_parts = np.array([right_parts.item()]) - assert ( - len(right_parts.shape) == 1 - ), "Invalid broadcast partitions shape {}\n{}".format( - right_parts.shape, [[i.get() for i in j] for j in right_parts] - ) - return np.array( + mapper = ray.put(mapper) + new_partitions = np.array( [ [ PandasOnRayFramePartition( func.remote( part.oid, - right_parts[col_idx].oid - if axis - else right_parts[row_idx].oid, - map_func, + mapper, part.call_queue, - right_parts[col_idx].call_queue + [obj[col_idx].call_queue for obj in right] if axis - else right_parts[row_idx].call_queue, + else [obj.call_queue for obj in right[row_idx]], + *( + [obj[col_idx].oid for obj in right] + if axis + else [obj.oid for obj in right[row_idx]] + ), ) ) for col_idx, part in enumerate(left[row_idx]) @@ -155,3 +129,5 @@ def broadcast_apply(cls, axis, apply_func, left, right): for row_idx in range(len(left)) ] ) + + return new_partitions diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py 
index 1d71d36003f..c62ec8d6048 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -460,11 +460,6 @@ def groupby( # We can just revert Series back to names because the parent is # this dataframe: by = [o.name if isinstance(o, Series) else o for o in by] - - warnings.warn( - "Multi-column groupby is a new feature. " - "Please report any bugs/issues to bug_reports@modin.org." - ) by = self.__getitem__(by)._query_compiler drop = True else: diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index f0d79b9f6e8..329e1258e2b 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -22,6 +22,7 @@ create_test_dfs, eval_general, df_categories_equals, + test_data_values, ) pd.DEFAULT_NPARTITIONS = 4 @@ -1284,3 +1285,24 @@ def get_columns(df): pandas_df.groupby(by=get_columns(pandas_df)) with pytest.raises(KeyError): modin_df.groupby(by=get_columns(modin_df)) + + +@pytest.mark.parametrize( + "func_to_apply", + [ + lambda df: df.sum(), + lambda df: df.count(), + lambda df: df.size(), + lambda df: df.mean(), + lambda df: df.quantile(), + ], +) +def test_multi_column_groupby_different_partitions(func_to_apply): + data = test_data_values[0] + md_df, pd_df = create_test_dfs(data) + + # columns that will be located in different partitions + by = [pd_df.columns[0], pd_df.columns[-1]] + + md_grp, pd_grp = md_df.groupby(by), pd_df.groupby(by) + eval_general(md_grp, pd_grp, func_to_apply) From 1cf36e3b2a54eb5926ef4f5b4c32f319a00e31ba Mon Sep 17 00:00:00 2001 From: ienkovich Date: Fri, 4 Sep 2020 16:40:50 +0300 Subject: [PATCH 092/120] REFACTOR-#2009: avoid index access in is_scalar calls (#2010) Signed-off-by: ienkovich --- modin/pandas/base.py | 3 +-- modin/pandas/indexing.py | 3 ++- modin/pandas/series.py | 3 +-- modin/pandas/utils.py | 27 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 
4bf2f4c0c12..4124b1e1190 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -15,7 +15,6 @@ import numpy as np from numpy import nan import pandas -from pandas.api.types import is_scalar from pandas.compat import numpy as numpy_compat from pandas.core.common import count_not_none, pipe from pandas.core.dtypes.common import ( @@ -33,7 +32,7 @@ import pickle as pkl from modin.error_message import ErrorMessage -from modin.pandas.utils import try_cast_to_pandas +from modin.pandas.utils import try_cast_to_pandas, is_scalar # Similar to pandas, sentinel value to use as kwarg in place of None when None has # special meaning and needs to be distinguished from a user explicitly passing None. diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 84d32115622..802acdced5d 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -13,12 +13,13 @@ import numpy as np import pandas -from pandas.api.types import is_scalar, is_list_like, is_bool +from pandas.api.types import is_list_like, is_bool from pandas.core.dtypes.common import is_integer from pandas.core.indexing import IndexingError from .dataframe import DataFrame from .series import Series +from .utils import is_scalar """Indexing Helper Class works as follows: diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 65551ab381c..fa0237430cc 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, - is_scalar, ) import sys import warnings @@ -28,7 +27,7 @@ from .base import BasePandasDataset from .iterator import PartitionIterator from .utils import _inherit_docstrings -from .utils import from_pandas, to_pandas +from .utils import from_pandas, to_pandas, is_scalar if sys.version_info[0] == 3 and sys.version_info[1] >= 7: # Python >= 3.7 diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index d2c346d05ff..e21c7617dd5 100644 --- a/modin/pandas/utils.py +++ 
b/modin/pandas/utils.py @@ -155,3 +155,30 @@ def hashable(obj): except TypeError: return False return True + + +def is_scalar(obj): + """ + Return True if given object is scalar. + + This method works the same as is_scalar method from Pandas but + it is optimized for Modin frames. For BasePandasDataset objects + Pandas version of is_scalar tries to access missing attribute + causing index scan. This triggers execution for lazy frames and + we avoid it by handling BasePandasDataset objects separately. + + Parameters + ---------- + obj : object + Object to check. + + Returns + ------- + bool + True if given object is scalar and False otherwise. + """ + + from pandas.api.types import is_scalar as pandas_is_scalar + from .base import BasePandasDataset + + return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj) From ac85a49b6954e0a70c5c60a2cd7a3a542e501d06 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Fri, 4 Sep 2020 19:32:43 +0300 Subject: [PATCH 093/120] FIX-#1284: Series.searchsorted (#1668) Signed-off-by: Alexander Myskov --- docs/supported_apis/series_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 ++ modin/backends/pandas/query_compiler.py | 89 ++++++++++++++++++++++++ modin/pandas/series.py | 55 ++++++++++++++- modin/pandas/test/test_series.py | 71 +++++++++++++++++-- 5 files changed, 212 insertions(+), 9 deletions(-) diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index 3c89d23ade6..de87ea616d6 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -360,7 +360,7 @@ the related section on `Defaulting to pandas`_. 
+-----------------------------+---------------------------------+ | ``sample`` | Y | +-----------------------------+---------------------------------+ -| ``searchsorted`` | D | +| ``searchsorted`` | Y | +-----------------------------+---------------------------------+ | ``sem`` | D | +-----------------------------+---------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index fea95f65fd8..06a513b5854 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -537,6 +537,10 @@ def to_numeric(self, arg, **kwargs): def unique(self, **kwargs): pass + @abc.abstractmethod + def searchsorted(self, **kwargs): + pass + # END Abstract map partitions operations def value_counts(self, **kwargs): diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 9bfdfdc165d..d0c85507c7e 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -17,6 +17,7 @@ is_list_like, is_numeric_dtype, is_datetime_or_timedelta_dtype, + is_scalar, ) from pandas.core.base import DataError @@ -1267,6 +1268,94 @@ def unique(self): ) return self.__constructor__(new_modin_frame) + def searchsorted(self, **kwargs): + """ + Return a QueryCompiler with the indices at which the value/values should + be inserted to maintain order of the passed Series. 
+ + Returns + ------- + PandasQueryCompiler + """ + + def map_func(part, *args, **kwargs): + + elements_number = len(part.index) + assert elements_number > 0, "Wrong mapping behaviour of MapReduce" + + # unify value type + value = kwargs.pop("value") + value = np.array([value]) if is_scalar(value) else value + + if elements_number == 1: + part = part[part.columns[0]] + else: + part = part.squeeze() + + part_index_start = part.index.start + part_index_stop = part.index.stop + + result = part.searchsorted(value=value, *args, **kwargs) + + processed_results = {} + value_number = 0 + for value_result in result: + value_result += part_index_start + + if value_result > part_index_start and value_result < part_index_stop: + processed_results[f"value{value_number}"] = { + "relative_location": "current_partition", + "index": value_result, + } + elif value_result <= part_index_start: + processed_results[f"value{value_number}"] = { + "relative_location": "previoius_partitions", + "index": part_index_start, + } + else: + processed_results[f"value{value_number}"] = { + "relative_location": "next_partitions", + "index": part_index_stop, + } + + value_number += 1 + + return pandas.DataFrame(processed_results) + + def reduce_func(map_results, *args, **kwargs): + def get_value_index(value_result): + value_result_grouped = value_result.groupby(level=0) + rel_location = value_result_grouped.get_group("relative_location") + ind = value_result_grouped.get_group("index") + # executes if result is inside of the mapped part + if "current_partition" in rel_location.values: + assert ( + rel_location[rel_location == "current_partition"].count() == 1 + ), "Each value should have single result" + return ind[rel_location.values == "current_partition"] + # executes if result is between mapped parts + elif rel_location.nunique(dropna=False) > 1: + return ind[rel_location.values == "previoius_partitions"][0] + # executes if result is outside of the mapped part + else: + if "next_partitions" in 
rel_location.values: + return ind[-1] + else: + return ind[0] + + map_results_parsed = map_results.apply( + lambda ser: get_value_index(ser) + ).squeeze() + + if isinstance(map_results_parsed, pandas.Series): + map_results_parsed = map_results_parsed.to_list() + + return pandas.Series(map_results_parsed) + + return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)( + self, **kwargs + ) + # Dt map partitions operations dt_date = MapFunction.register(_dt_prop_map("date")) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index fa0237430cc..382f759c732 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1595,9 +1595,58 @@ def replace( return self._create_or_update_from_compiler(new_query_compiler, inplace) def searchsorted(self, value, side="left", sorter=None): - return self._default_to_pandas( - pandas.Series.searchsorted, value, side=side, sorter=sorter - ) + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Series self such that, if the + corresponding elements in value were inserted before the indices, + the order of self would be preserved. + + Parameters + ---------- + value: array_like + Values to insert into self. + side: {"left", "right"}, optional + If "left", the index of the first suitable location found is + given. If "right", return the last such index. If there is no + suitable index, return either 0 or N (where N is the length of self). + sorter: 1-D array_like, optional + Optional array of integer indices that sort self into ascending order. + They are typically the result of np.argsort. + + Returns + ------- + int or array of int + A scalar or array of insertion points with the same shape as value. 
+ """ + searchsorted_qc = self._query_compiler + if sorter is not None: + # `iloc` method works slowly (https://github.com/modin-project/modin/issues/1903), + # so _default_to_pandas is used for now + # searchsorted_qc = self.iloc[sorter].reset_index(drop=True)._query_compiler + # sorter = None + return self._default_to_pandas( + pandas.Series.searchsorted, value, side=side, sorter=sorter + ) + # searchsorted should return item number irrespective of Series index, so + # Series.index is always set to pandas.RangeIndex, which can be easily processed + # on the query_compiler level + if not isinstance(searchsorted_qc.index, pandas.RangeIndex): + searchsorted_qc = searchsorted_qc.reset_index(drop=True) + + result = self.__constructor__( + query_compiler=searchsorted_qc.searchsorted( + value=value, side=side, sorter=sorter + ) + ).squeeze() + + # matching Pandas output + if not is_scalar(value) and not is_list_like(result): + result = np.array([result]) + elif isinstance(result, type(self)): + result = result.to_numpy() + + return result def sort_values( self, diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 8695d7004b3..468da3c8925 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -172,13 +172,16 @@ def inter_df_math_helper_one_side(modin_series, pandas_series, op): pass -def create_test_series(vals): +def create_test_series(vals, sort=False): if isinstance(vals, dict): modin_series = pd.Series(vals[next(iter(vals.keys()))]) pandas_series = pandas.Series(vals[next(iter(vals.keys()))]) else: modin_series = pd.Series(vals) pandas_series = pandas.Series(vals) + if sort: + modin_series = modin_series.sort_values().reset_index(drop=True) + pandas_series = pandas_series.sort_values().reset_index(drop=True) return modin_series, pandas_series @@ -2633,11 +2636,69 @@ def test_sample(data): modin_series.sample(n=-3) +@pytest.mark.parametrize("single_value_data", [True, False]) 
+@pytest.mark.parametrize("use_multiindex", [True, False]) +@pytest.mark.parametrize("sorter", [True, None]) +@pytest.mark.parametrize("values_number", [1, 2, 5]) +@pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_searchsorted(data): - modin_series, pandas_series = create_test_series(data) - with pytest.warns(UserWarning): - modin_series.searchsorted(3) +def test_searchsorted( + data, side, values_number, sorter, use_multiindex, single_value_data +): + data = data if not single_value_data else data[next(iter(data.keys()))][0] + if not sorter: + modin_series, pandas_series = create_test_series(vals=data, sort=True) + else: + modin_series, pandas_series = create_test_series(vals=data) + sorter = np.argsort(list(modin_series)) + + if use_multiindex: + rows_number = len(modin_series.index) + level_0_series = random_state.choice([0, 1], rows_number) + level_1_series = random_state.choice([2, 3], rows_number) + index_series = pd.MultiIndex.from_arrays( + [level_0_series, level_1_series], names=["first", "second"] + ) + modin_series.index = index_series + pandas_series.index = index_series + + min_sample = modin_series.min(skipna=True) + max_sample = modin_series.max(skipna=True) + + if single_value_data: + values = [data] + else: + values = [] + values.append(pandas_series.sample(n=values_number, random_state=random_state)) + values.append( + random_state.uniform(low=min_sample, high=max_sample, size=values_number) + ) + values.append( + random_state.uniform( + low=max_sample, high=2 * max_sample, size=values_number + ) + ) + values.append( + random_state.uniform( + low=min_sample - max_sample, high=min_sample, size=values_number + ) + ) + pure_float = random_state.uniform(float(min_sample), float(max_sample)) + pure_int = int(pure_float) + values.append(pure_float) + values.append(pure_int) + + test_cases = [ + modin_series.searchsorted(value=value, side=side, sorter=sorter) + == 
pandas_series.searchsorted(value=value, side=side, sorter=sorter) + for value in values + ] + test_cases = [ + case.all() if not isinstance(case, bool) else case for case in test_cases + ] + + for case in test_cases: + assert case @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 0768600f993afec67ed0bd2251d54abdf357b425 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Fri, 4 Sep 2020 12:35:38 -0400 Subject: [PATCH 094/120] FEAT-#1222: Implement DataFrame.asof() without Pandas fallback (#1989) Signed-off-by: Itamar Turner-Trauring --- docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/pandas/base.py | 24 +++++- modin/pandas/test/dataframe/test_default.py | 17 ---- modin/pandas/test/dataframe/test_indexing.py | 88 ++++++++++++++++++++ modin/pandas/test/test_series.py | 63 ++++++++++---- 6 files changed, 162 insertions(+), 34 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index fc364e540bf..133f3527764 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -47,7 +47,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``asfreq`` | `asfreq`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``asof`` | `asof`_ | D | | +| ``asof`` | `asof`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``assign`` | `assign`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index de87ea616d6..b8996ad74e3 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -54,7 +54,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``asobject`` | D | +-----------------------------+---------------------------------+ -| ``asof`` | D | +| ``asof`` | Y | +-----------------------------+---------------------------------+ | ``astype`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 4124b1e1190..e3c2ad0b3b1 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -625,7 +625,29 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): ) def asof(self, where, subset=None): - return self._default_to_pandas("asof", where, subset=subset) + scalar = not is_list_like(where) + if isinstance(where, pandas.Index): + # Prevent accidental mutation of original: + where = where.copy() + else: + if scalar: + where = [where] + where = pandas.Index(where) + + if subset is None: + data = self + else: + # Only relevant for DataFrames: + data = self[subset] + no_na_index = 
data.dropna().index + new_index = pandas.Index([no_na_index.asof(i) for i in where]) + result = self.reindex(new_index) + result.index = where + + if scalar: + # Need to return a Series: + result = result.squeeze() + return result def astype(self, dtype, copy=True, errors="raise"): col_dtypes = {} diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index bd486163653..83f36d23ae2 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -72,23 +72,6 @@ def test_asfreq(): df.asfreq(freq="30S") -def test_asof(): - df = pd.DataFrame( - {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}, - index=pd.DatetimeIndex( - [ - "2018-02-27 09:01:00", - "2018-02-27 09:02:00", - "2018-02-27 09:03:00", - "2018-02-27 09:04:00", - "2018-02-27 09:05:00", - ] - ), - ) - with pytest.warns(UserWarning): - df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) - - def test_assign(): data = test_data_values[0] modin_df = pd.DataFrame(data) diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 4a669949ddc..a89c0d538f9 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -20,6 +20,7 @@ import sys from modin.pandas.test.utils import ( + NROWS, RAND_LOW, RAND_HIGH, df_equals, @@ -41,6 +42,93 @@ matplotlib.use("Agg") +@pytest.mark.parametrize( + "dates", + [ + ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], + ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], + ], +) +@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) +def test_asof_with_nan(dates, subset): + data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]} + index = pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ) + modin_where = pd.DatetimeIndex(dates) + 
pandas_where = pandas.DatetimeIndex(dates) + compare_asof(data, index, modin_where, pandas_where, subset) + + +@pytest.mark.parametrize( + "dates", + [ + ["2018-02-27 09:03:30", "2018-02-27 09:04:30"], + ["2018-02-27 09:03:00", "2018-02-27 09:05:00"], + ], +) +@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None]) +def test_asof_without_nan(dates, subset): + data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]} + index = pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ) + modin_where = pd.DatetimeIndex(dates) + pandas_where = pandas.DatetimeIndex(dates) + compare_asof(data, index, modin_where, pandas_where, subset) + + +@pytest.mark.parametrize( + "lookup", + [ + [60, 70, 90], + [60.5, 70.5, 100], + ], +) +@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None]) +def test_asof_large(lookup, subset): + data = test_data["float_nan_data"] + index = list(range(NROWS)) + modin_where = pd.Index(lookup) + pandas_where = pandas.Index(lookup) + compare_asof(data, index, modin_where, pandas_where, subset) + + +def compare_asof( + data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset +): + modin_df = pd.DataFrame(data, index=index) + pandas_df = pandas.DataFrame(data, index=index) + df_equals( + modin_df.asof(modin_where, subset=subset), + pandas_df.asof(pandas_where, subset=subset), + ) + df_equals( + modin_df.asof(modin_where.values, subset=subset), + pandas_df.asof(pandas_where.values, subset=subset), + ) + df_equals( + modin_df.asof(list(modin_where.values), subset=subset), + pandas_df.asof(list(pandas_where.values), subset=subset), + ) + df_equals( + modin_df.asof(modin_where.values[0], subset=subset), + pandas_df.asof(pandas_where.values[0], subset=subset), + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_first_valid_index(data): modin_df = pd.DataFrame(data) diff --git 
a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 468da3c8925..e6fc6daf369 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -845,21 +845,56 @@ def test_asfreq(): series.asfreq(freq="30S") -def test_asof(): - series = pd.Series( - [10, 20, 30, 40, 50], - index=pd.DatetimeIndex( - [ - "2018-02-27 09:01:00", - "2018-02-27 09:02:00", - "2018-02-27 09:03:00", - "2018-02-27 09:04:00", - "2018-02-27 09:05:00", - ] - ), +@pytest.mark.parametrize( + "where", + [ + 20, + 30, + [10, 40], + [20, 30], + [20], + 25, + [25, 45], + [25, 30], + pandas.Index([20, 30]), + pandas.Index([10]), + ], +) +def test_asof(where): + # With NaN: + values = [1, 2, np.nan, 4] + index = [10, 20, 30, 40] + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index ) - with pytest.warns(UserWarning): - series.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) + df_equals(modin_series.asof(where), pandas_series.asof(where)) + + # No NaN: + values = [1, 2, 7, 4] + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index + ) + df_equals(modin_series.asof(where), pandas_series.asof(where)) + + +@pytest.mark.parametrize( + "where", + [ + 20, + 30, + [10.5, 40.5], + [10], + pandas.Index([20, 30]), + pandas.Index([10.5]), + ], +) +def test_asof_large(where): + values = test_data["float_nan_data"]["col1"] + index = list(range(len(values))) + modin_series, pandas_series = pd.Series(values, index=index), pandas.Series( + values, index=index + ) + df_equals(modin_series.asof(where), pandas_series.asof(where)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From c7e947be7c5e2ef9093f6d35bec84b7fbfcba9ad Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Fri, 4 Sep 2020 20:53:26 +0300 Subject: [PATCH 095/120] FEAT-#1523: 'nrows' support to 'read_csv' added (#1894) Signed-off-by: 
Dmitry Chigarev --- modin/engines/base/io/text/csv_reader.py | 43 +-- modin/engines/base/io/text/fwf_reader.py | 43 +-- modin/engines/base/io/text/json_reader.py | 18 +- .../engines/base/io/text/text_file_reader.py | 252 +++++++++++++++--- modin/pandas/test/data/newlines.csv | 41 +++ modin/pandas/test/test_io.py | 114 +++++--- 6 files changed, 400 insertions(+), 111 deletions(-) diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py index f1c71d9a6ce..0c443a35828 100644 --- a/modin/engines/base/io/text/csv_reader.py +++ b/modin/engines/base/io/text/csv_reader.py @@ -15,6 +15,7 @@ from modin.data_management.utils import compute_chunksize from pandas.io.parsers import _validate_usecols_arg import pandas +import csv import sys @@ -54,9 +55,7 @@ def _read(cls, filepath_or_buffer, **kwargs): skiprows = kwargs.get("skiprows") if skiprows is not None and not isinstance(skiprows, int): return cls.single_worker_read(filepath_or_buffer, **kwargs) - # TODO: replace this by reading lines from file. - if kwargs.get("nrows") is not None: - return cls.single_worker_read(filepath_or_buffer, **kwargs) + nrows = kwargs.pop("nrows", None) names = kwargs.get("names", None) index_col = kwargs.get("index_col", None) if names is None: @@ -97,6 +96,7 @@ def _read(cls, filepath_or_buffer, **kwargs): quotechar = kwargs.get("quotechar", '"').encode( encoding if encoding is not None else "UTF-8" ) + is_quoting = kwargs.get("quoting", "") != csv.QUOTE_NONE with cls.file_open(filepath_or_buffer, "rb", compression_type) as f: # Skip the header since we already have the header information and skip the # rows we are told to skip. 
@@ -110,24 +110,24 @@ def _read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - for _ in range(skiprows): - f.readline() + cls.offset( + f, + nrows=skiprows, + quotechar=quotechar, + is_quoting=is_quoting, + ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions partition_ids = [] index_ids = [] dtypes_ids = [] - total_bytes = cls.file_size(f) # Max number of partitions available from modin.pandas import DEFAULT_NPARTITIONS num_partitions = DEFAULT_NPARTITIONS # This is the number of splits for the columns num_splits = min(len(column_names), num_partitions) - # This is the chunksize each partition will read - chunk_size = max(1, (total_bytes - f.tell()) // num_partitions) - # Metadata column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1) if column_chunksize > len(column_names): @@ -145,15 +145,22 @@ def _read(cls, filepath_or_buffer, **kwargs): for i in range(num_splits) ] - while f.tell() < total_bytes: - args = { - "fname": filepath_or_buffer, - "num_splits": num_splits, - **partition_kwargs, - } - partition_id = cls.call_deploy( - f, chunk_size, num_splits + 2, args, quotechar=quotechar - ) + args = { + "fname": filepath_or_buffer, + "num_splits": num_splits, + **partition_kwargs, + } + + splits = cls.partitioned_file( + f, + nrows=nrows, + num_partitions=num_partitions, + quotechar=quotechar, + is_quoting=is_quoting, + ) + for start, end in splits: + args.update({"start": start, "end": end}) + partition_id = cls.deploy(cls.parse, num_splits + 2, args) partition_ids.append(partition_id[:-2]) index_ids.append(partition_id[-2]) dtypes_ids.append(partition_id[-1]) diff --git a/modin/engines/base/io/text/fwf_reader.py b/modin/engines/base/io/text/fwf_reader.py index 72b15941269..7506ce448c2 100644 --- a/modin/engines/base/io/text/fwf_reader.py +++ 
b/modin/engines/base/io/text/fwf_reader.py @@ -15,6 +15,7 @@ from modin.data_management.utils import compute_chunksize from pandas.io.parsers import _validate_usecols_arg import pandas +from csv import QUOTE_NONE import sys @@ -60,9 +61,7 @@ def read(cls, filepath_or_buffer, **kwargs): skiprows = kwargs.get("skiprows") if skiprows is not None and not isinstance(skiprows, int): return cls.single_worker_read(filepath_or_buffer, **kwargs) - # TODO: replace this by reading lines from file. - if kwargs.get("nrows") is not None: - return cls.single_worker_read(filepath_or_buffer, **kwargs) + nrows = kwargs.pop("nrows", None) names = kwargs.get("names", None) index_col = kwargs.get("index_col", None) if names is None: @@ -103,6 +102,7 @@ def read(cls, filepath_or_buffer, **kwargs): quotechar = kwargs.get("quotechar", '"').encode( encoding if encoding is not None else "UTF-8" ) + is_quoting = kwargs.get("quoting", "") != QUOTE_NONE with cls.file_open(filepath_or_buffer, "rb", compression_type) as f: # Skip the header since we already have the header information and skip the # rows we are told to skip. 
@@ -116,24 +116,24 @@ def read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - for _ in range(skiprows): - f.readline() + cls.offset( + f, + nrows=skiprows, + quotechar=quotechar, + is_quoting=is_quoting, + ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions partition_ids = [] index_ids = [] dtypes_ids = [] - total_bytes = cls.file_size(f) # Max number of partitions available from modin.pandas import DEFAULT_NPARTITIONS num_partitions = DEFAULT_NPARTITIONS # This is the number of splits for the columns num_splits = min(len(column_names), num_partitions) - # This is the chunksize each partition will read - chunk_size = max(1, (total_bytes - f.tell()) // num_partitions) - # Metadata column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1) if column_chunksize > len(column_names): @@ -151,15 +151,22 @@ def read(cls, filepath_or_buffer, **kwargs): for i in range(num_splits) ] - while f.tell() < total_bytes: - args = { - "fname": filepath_or_buffer, - "num_splits": num_splits, - **partition_kwargs, - } - partition_id = cls.call_deploy( - f, chunk_size, num_splits + 2, args, quotechar=quotechar - ) + args = { + "fname": filepath_or_buffer, + "num_splits": num_splits, + **partition_kwargs, + } + + splits = cls.partitioned_file( + f, + nrows=nrows, + num_partitions=num_partitions, + quotechar=quotechar, + is_quoting=is_quoting, + ) + for start, end in splits: + args.update({"start": start, "end": end}) + partition_id = cls.deploy(cls.parse, num_splits + 2, args) partition_ids.append(partition_id[:-2]) index_ids.append(partition_id[-2]) dtypes_ids.append(partition_id[-1]) diff --git a/modin/engines/base/io/text/json_reader.py b/modin/engines/base/io/text/json_reader.py index f7a98f875e8..7c2465d75c7 100644 --- a/modin/engines/base/io/text/json_reader.py +++ 
b/modin/engines/base/io/text/json_reader.py @@ -16,6 +16,7 @@ from io import BytesIO import pandas import numpy as np +from csv import QUOTE_NONE class JSONReader(TextFileReader): @@ -36,12 +37,10 @@ def _read(cls, path_or_buf, **kwargs): empty_pd_df = pandas.DataFrame(columns=columns) with cls.file_open(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: - total_bytes = cls.file_size(f) from modin.pandas import DEFAULT_NPARTITIONS num_partitions = DEFAULT_NPARTITIONS num_splits = min(len(columns), num_partitions) - chunk_size = max(1, (total_bytes - f.tell()) // num_partitions) partition_ids = [] index_ids = [] @@ -59,11 +58,16 @@ def _read(cls, path_or_buf, **kwargs): for i in range(num_splits) ] - while f.tell() < total_bytes: - start = f.tell() - args = {"fname": path_or_buf, "num_splits": num_splits, "start": start} - args.update(kwargs) - partition_id = cls.call_deploy(f, chunk_size, num_splits + 3, args) + args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs} + + splits = cls.partitioned_file( + f, + num_partitions=num_partitions, + is_quoting=(args.get("quoting", "") != QUOTE_NONE), + ) + for start, end in splits: + args.update({"start": start, "end": end}) + partition_id = cls.deploy(cls.parse, num_splits + 3, args) partition_ids.append(partition_id[:-3]) index_ids.append(partition_id[-3]) dtypes_ids.append(partition_id[-2]) diff --git a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py index cd31fe248f0..3856148f19a 100644 --- a/modin/engines/base/io/text/text_file_reader.py +++ b/modin/engines/base/io/text/text_file_reader.py @@ -12,42 +12,12 @@ # governing permissions and limitations under the License. 
from modin.engines.base.io.file_reader import FileReader -import re import numpy as np import warnings -import csv +import os class TextFileReader(FileReader): - @classmethod - def call_deploy(cls, f, chunk_size, num_return_vals, args, quotechar=b'"'): - args["start"] = f.tell() - chunk = f.read(chunk_size) - line = f.readline() # Ensure we read up to a newline - # We need to ensure that one row isn't being split across different partitions - - if args.get("quoting", "") != csv.QUOTE_NONE: - quote_count = ( - re.subn(quotechar, b"", chunk)[1] + re.subn(quotechar, b"", line)[1] - ) - while quote_count % 2 != 0: - line = f.readline() - quote_count += re.subn(quotechar, b"", line)[1] - if not line: - break - - if quote_count % 2 != 0: - warnings.warn("File has mismatched quotes") - - # The workers return multiple objects for each part of the file read: - # - The first n - 2 objects are partitions of data - # - The n - 1 object is the length of the partition or the index if - # `index_col` is specified. We compute the index below. - # - The nth object is the dtypes of the partition. We combine these to - # form the final dtypes below. - args["end"] = f.tell() - return cls.deploy(cls.parse, num_return_vals, args) - @classmethod def build_partition(cls, partition_ids, row_lengths, column_widths): return np.array( @@ -81,3 +51,223 @@ def pathlib_or_pypath(cls, filepath_or_buffer): except ImportError: # pragma: no cover pass return False + + @classmethod + def offset( + cls, + f, + nrows=None, + skiprows=None, + chunk_size_bytes=None, + quotechar=b'"', + is_quoting=True, + ): + """ + Moves the file offset at the specified amount of bytes/rows. + + Parameters + ---------- + f: file object + nrows: int, number of rows to read. Optional, if not specified will only + consider `chunk_size_bytes` parameter. + chunk_size_bytes: int, Will read new rows while file pointer + is less than `chunk_size_bytes`. Optional, if not specified will only + consider `nrows` parameter. 
+ skiprows: array or callable (optional), specifies rows to skip + quotechar: char that indicates quote in a file + (optional, by default it's '\"') + is_quoting: bool, Whether or not to consider quotes + (optional, by default it's `True`) + + Returns + ------- + bool: If file pointer reached the end of the file, but did not find + closing quote returns `False`. `True` in any other case. + """ + assert ( + nrows is not None or chunk_size_bytes is not None + ), "`nrows` and `chunk_size_bytes` can't be None at the same time" + + if nrows is not None or skiprows is not None: + return cls._read_rows( + f, + nrows=nrows, + skiprows=skiprows, + quotechar=quotechar, + is_quoting=is_quoting, + max_bytes=chunk_size_bytes, + )[0] + + outside_quotes = True + + chunk = f.read(chunk_size_bytes) + line = f.readline() # Ensure we read up to a newline + # We need to ensure that one row isn't being split across different partitions + + if is_quoting: + outside_quotes = not ((chunk.count(quotechar) + line.count(quotechar)) % 2) + while not outside_quotes: + line = f.readline() + outside_quotes = line.count(quotechar) % 2 + if not line: + break + + return outside_quotes + + @classmethod + def partitioned_file( + cls, + f, + nrows=None, + skiprows=None, + num_partitions=None, + quotechar=b'"', + is_quoting=True, + from_begin=False, + ): + """Computes chunk sizes in bytes for every partition. + + Parameters + ---------- + f: file to be partitioned + nrows: int (optional), number of rows of file to read + skiprows: array or callable (optional), specifies rows to skip + num_partitions: int, for what number of partitions split a file. 
+ Optional, if not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` + quotechar: char that indicates quote in a file + (optional, by default it's '\"') + is_quoting: bool, Whether or not to consider quotes + (optional, by default it's `True`) + from_begin: bool, Whether or not to set the file pointer to the begining of the file + (optional, by default it's `False`) + + Returns + ------- + An array, where each element of array is a tuple of two ints: + beginning and the end offsets of the current chunk. + """ + if num_partitions is None: + from modin.pandas import DEFAULT_NPARTITIONS + + num_partitions = DEFAULT_NPARTITIONS + + result = [] + + old_position = f.tell() + if from_begin: + f.seek(0, os.SEEK_SET) + + current_start = f.tell() + total_bytes = cls.file_size(f) + + # if `nrows` are specified we want to use rows as a part measure + if nrows is not None: + chunk_size_bytes = None + rows_per_part = max(1, num_partitions, nrows // num_partitions) + else: + chunk_size_bytes = max(1, num_partitions, total_bytes // num_partitions) + rows_per_part = None + nrows = float("inf") + + rows_readed = 0 + while f.tell() < total_bytes and rows_readed < nrows: + if rows_per_part is not None and rows_readed + rows_per_part > nrows: + rows_per_part = nrows - rows_readed + + outside_quotes = cls.offset( + f, + nrows=rows_per_part, + skiprows=skiprows, + chunk_size_bytes=chunk_size_bytes, + quotechar=quotechar, + is_quoting=is_quoting, + ) + + result.append((current_start, f.tell())) + current_start = f.tell() + if rows_per_part is not None: + rows_readed += rows_per_part + + if is_quoting and not outside_quotes: + warnings.warn("File has mismatched quotes") + + f.seek(old_position, os.SEEK_SET) + + return result + + @classmethod + def _read_rows( + cls, + f, + nrows=None, + skiprows=None, + quotechar=b'"', + is_quoting=True, + max_bytes=None, + ): + """ + Moves the file offset at the specified amount of rows + Note: the difference between `offset` is that 
`_read_rows` is more + specific version of `offset` which is focused of reading **rows**. + In common case it's better to use `offset`. + + Parameters + ---------- + f: file object + nrows: int, number of rows to read. Optional, if not specified will only + consider `max_bytes` parameter. + skiprows: int, array or callable (optional), specifies rows to skip + quotechar: char that indicates quote in a file + (optional, by default it's '\"') + is_quoting: bool, Whether or not to consider quotes + (optional, by default it's `True`) + max_bytes: int, Will read new rows while file pointer + is less than `max_bytes`. Optional, if not specified will only + consider `nrows` parameter, if both not specified will read till + the end of the file. + + Returns + ------- + tuple of bool and int, + bool: If file pointer reached the end of the file, but did not find + closing quote returns `False`. `True` in any other case. + int: Number of rows that was readed. + """ + assert skiprows is None or isinstance( + skiprows, int + ), f"Skiprows as a {type(skiprows)} is not supported yet." 
+ + if nrows is None and max_bytes is None: + max_bytes = float("inf") + + if nrows is not None and nrows <= 0: + return True, 0 + + # we need this condition to avoid unnecessary checks in `stop_condition` + # which executes in a huge for loop + if nrows is not None and max_bytes is None: + stop_condition = lambda rows_readed: rows_readed >= nrows # noqa (E731) + elif nrows is not None and max_bytes is not None: + stop_condition = ( + lambda rows_readed: f.tell() >= max_bytes or rows_readed >= nrows + ) # noqa (E731) + else: + stop_condition = lambda rows_readed: f.tell() >= max_bytes # noqa (E731) + + if max_bytes is not None: + max_bytes = max_bytes + f.tell() + + rows_readed = 0 + outside_quotes = True + for line in f: + if is_quoting and line.count(quotechar) % 2: + outside_quotes = not outside_quotes + if outside_quotes: + rows_readed += 1 + if stop_condition(rows_readed): + break + + if not outside_quotes: + rows_readed += 1 + + return outside_quotes, rows_readed diff --git a/modin/pandas/test/data/newlines.csv b/modin/pandas/test/data/newlines.csv index 4dffea96b57..2e16ed2eaa8 100644 --- a/modin/pandas/test/data/newlines.csv +++ b/modin/pandas/test/data/newlines.csv @@ -11,12 +11,53 @@ reproduce the issue",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 +"And there is another +string with several +newline characters +that will probably cause some +problem for Modin +and I suspect that +we +will hopefully +reproduce the issue",2,3,4 +"I",2,3,4 +"J",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 +"And there is another +string with several +newline characters +that will probably cause some +problem for Modin +and I suspect that +we +will hopefully +reproduce the issue",2,"And +there is another +string with several +newline characters +that will probably cause some +problem for Modin +and I suspect that +we +will hopefully +reproduce the issue",4 +"I",2,3,4 "J",2,3,4 "H",2,3,4 "I",2,3,4 "J",2,3,4 +"H",2,3,4 +"I",2,3,4 +"And there is another +string with several 
+newline characters +that will probably cause some +problem for Modin +and I suspect that +we +will hopefully +reproduce the issue",2,3,4 diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index ec307a5a911..5bc1207b733 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -31,6 +31,7 @@ json_short_bytes, json_long_string, json_long_bytes, + eval_general, ) from modin import execution_engine @@ -59,6 +60,26 @@ SMALL_ROW_SIZE = 2000 +def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs): + def applyier(module, *args, **kwargs): + result = getattr(module, fn_name)(*args, **kwargs) + # There could be some missmatches in dtypes, so we're + # casting the whole frame to `str` before comparison. + # See issue #1931 for details. + if cast_to_str: + result = result.astype(str) + return result + + eval_general( + pd, + pandas, + applyier, + path=path, + *args, + **kwargs, + ) + + @pytest.fixture def make_parquet_file(): """Pytest fixture factory that makes a parquet file/dir for testing. 
@@ -728,31 +749,34 @@ def test_from_sas(): df_equals(modin_df, pandas_df) -def test_from_csv(make_csv_file): +@pytest.mark.parametrize("nrows", [123, None]) +def test_from_csv(make_csv_file, nrows): make_csv_file() - pandas_df = pandas.read_csv(TEST_CSV_FILENAME) - modin_df = pd.read_csv(TEST_CSV_FILENAME) + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, nrows=nrows) + modin_df = pd.read_csv(TEST_CSV_FILENAME, nrows=nrows) df_equals(modin_df, pandas_df) - pandas_df = pandas.read_csv(Path(TEST_CSV_FILENAME)) - modin_df = pd.read_csv(Path(TEST_CSV_FILENAME)) + pandas_df = pandas.read_csv(Path(TEST_CSV_FILENAME), nrows=nrows) + modin_df = pd.read_csv(Path(TEST_CSV_FILENAME), nrows=nrows) df_equals(modin_df, pandas_df) -def test_from_csv_sep_none(make_csv_file): +@pytest.mark.parametrize("nrows", [123, None]) +def test_from_csv_sep_none(make_csv_file, nrows): make_csv_file() with pytest.warns(ParserWarning): - pandas_df = pandas.read_csv(TEST_CSV_FILENAME, sep=None) + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, sep=None, nrows=nrows) with pytest.warns(ParserWarning): - modin_df = pd.read_csv(TEST_CSV_FILENAME, sep=None) + modin_df = pd.read_csv(TEST_CSV_FILENAME, sep=None, nrows=nrows) df_equals(modin_df, pandas_df) -def test_from_csv_bad_quotes(): +@pytest.mark.parametrize("nrows", [2, None]) +def test_from_csv_bad_quotes(nrows): csv_bad_quotes = """1, 2, 3, 4 one, two, three, four five, "six", seven, "eight @@ -761,13 +785,14 @@ def test_from_csv_bad_quotes(): with open(TEST_CSV_FILENAME, "w") as f: f.write(csv_bad_quotes) - pandas_df = pandas.read_csv(TEST_CSV_FILENAME) - modin_df = pd.read_csv(TEST_CSV_FILENAME) + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, nrows=nrows) + modin_df = pd.read_csv(TEST_CSV_FILENAME, nrows=nrows) df_equals(modin_df, pandas_df) -def test_from_csv_quote_none(): +@pytest.mark.parametrize("nrows", [2, None]) +def test_from_csv_quote_none(nrows): csv_bad_quotes = """1, 2, 3, 4 one, two, three, four five, "six", seven, "eight @@ -775,8 
+800,8 @@ def test_from_csv_quote_none(): with open(TEST_CSV_FILENAME, "w") as f: f.write(csv_bad_quotes) - pandas_df = pandas.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE) - modin_df = pd.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE) + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE, nrows=nrows) + modin_df = pd.read_csv(TEST_CSV_FILENAME, quoting=csv.QUOTE_NONE, nrows=nrows) df_equals(modin_df, pandas_df) @@ -1058,26 +1083,33 @@ def test_from_csv_chunksize(make_csv_file): df_equals(modin_df, pd_df) -def test_from_csv_skiprows(make_csv_file): +@pytest.mark.parametrize("nrows", [123, None]) +def test_from_csv_skiprows(make_csv_file, nrows): make_csv_file() - pandas_df = pandas.read_csv(TEST_CSV_FILENAME, skiprows=2) - modin_df = pd.read_csv(TEST_CSV_FILENAME, skiprows=2) + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, skiprows=2, nrows=nrows) + modin_df = pd.read_csv(TEST_CSV_FILENAME, skiprows=2, nrows=nrows) df_equals(modin_df, pandas_df) pandas_df = pandas.read_csv( - TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2 + TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2, nrows=nrows ) modin_df = pd.read_csv( - TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2 + TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=2, nrows=nrows ) df_equals(modin_df, pandas_df) pandas_df = pandas.read_csv( - TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=lambda x: x % 2 + TEST_CSV_FILENAME, + names=["c1", "c2", "c3", "c4"], + skiprows=lambda x: x % 2, + nrows=nrows, ) modin_df = pd.read_csv( - TEST_CSV_FILENAME, names=["c1", "c2", "c3", "c4"], skiprows=lambda x: x % 2 + TEST_CSV_FILENAME, + names=["c1", "c2", "c3", "c4"], + skiprows=lambda x: x % 2, + nrows=nrows, ) df_equals(modin_df, pandas_df) @@ -1097,10 +1129,6 @@ def test_from_csv_encoding(make_csv_file, encoding): def test_from_csv_default_to_pandas_behavior(make_csv_file): make_csv_file() - with pytest.warns(UserWarning): - # Test 
nrows - pd.read_csv(TEST_CSV_FILENAME, nrows=10) - with pytest.warns(UserWarning): # This tests that we default to pandas on a buffer from io import StringIO @@ -1111,11 +1139,12 @@ def test_from_csv_default_to_pandas_behavior(make_csv_file): pd.read_csv(TEST_CSV_FILENAME, skiprows=lambda x: x in [0, 2]) -def test_from_csv_index_col(make_csv_file): +@pytest.mark.parametrize("nrows", [123, None]) +def test_from_csv_index_col(make_csv_file, nrows): make_csv_file() - pandas_df = pandas.read_csv(TEST_CSV_FILENAME, index_col="col1") - modin_df = pd.read_csv(TEST_CSV_FILENAME, index_col="col1") + pandas_df = pandas.read_csv(TEST_CSV_FILENAME, index_col="col1", nrows=nrows) + modin_df = pd.read_csv(TEST_CSV_FILENAME, index_col="col1", nrows=nrows) df_equals(modin_df, pandas_df) @@ -1142,10 +1171,16 @@ def test_from_csv_parse_dates(make_csv_file): df_equals(modin_df, pandas_df) -def test_from_csv_newlines_in_quotes(): - pandas_df = pandas.read_csv("modin/pandas/test/data/newlines.csv") - modin_df = pd.read_csv("modin/pandas/test/data/newlines.csv") - df_equals(modin_df, pandas_df) +@pytest.mark.parametrize("nrows", [21, 5, None]) +@pytest.mark.parametrize("skiprows", [4, 1, 500, None]) +def test_from_csv_newlines_in_quotes(nrows, skiprows): + eval_io( + path="modin/pandas/test/data/newlines.csv", + fn_name="read_csv", + nrows=nrows, + skiprows=skiprows, + cast_to_str=True, + ) @pytest.mark.skip(reason="No clipboard on Travis") @@ -1537,15 +1572,20 @@ def test_fwf_file_chunksize(): df_equals(modin_df, pd_df) -def test_fwf_file_skiprows(): +@pytest.mark.parametrize("nrows", [13, None]) +def test_fwf_file_skiprows(nrows): setup_fwf_file(overwrite=True) - pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skiprows=2) - modin_df = pd.read_fwf(TEST_FWF_FILENAME, skiprows=2) + pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skiprows=2, nrows=nrows) + modin_df = pd.read_fwf(TEST_FWF_FILENAME, skiprows=2, nrows=nrows) df_equals(modin_df, pandas_df) - pandas_df = 
pandas.read_fwf(TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5]) - modin_df = pd.read_fwf(TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5]) + pandas_df = pandas.read_fwf( + TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5], nrows=nrows + ) + modin_df = pd.read_fwf( + TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5], nrows=nrows + ) df_equals(modin_df, pandas_df) From 333e32c80e41307093e18451328d77c8525e3e6b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 13:14:14 +0300 Subject: [PATCH 096/120] DOCS-#2015: update supported docs (#2016) Signed-off-by: Anatoly Myachev --- docs/supported_apis/dataframe_supported.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 133f3527764..ae4f6dc43f4 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -147,11 +147,11 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``floordiv`` | `floordiv`_ | Y | See ``add`` | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``from_dict`` | `from_dict`_ | Y | | +| ``from_dict`` | `from_dict`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``from_items`` | `from_items`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``from_records`` | `from_records`_ | Y | | +| ``from_records`` | `from_records`_ | D | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``ftypes`` | `ftypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ From 2f658e0c9292f6e26606466f0f1db2e7fac235e0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 14:18:07 +0300 Subject: [PATCH 097/120] TEST-2022: speed up prepare-cache job (#2023) Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b534b62bef..e0a1aa5e837 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,19 +70,11 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: actions/setup-python@v2 with: python-version: ${{matrix.python-version}} - channel-priority: strict - 
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - auto-update-conda: true - - shell: bash -l {0} - run: pip install ray==0.8.7 - - name: Conda environment - shell: bash -l {0} - run: | - conda info - conda list + architecture: "x64" + - run: pip install ray==0.8.7 test-api: needs: prepare-cache From 51edadb3fc61d7b53d008b2e0dc1cfa5d9a85c4f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 14:45:25 +0300 Subject: [PATCH 098/120] TEST-#2024: remove test_dataframe.py (#2025) Signed-off-by: Anatoly Myachev --- modin/pandas/test/test_dataframe.py | 36 ----------------------------- 1 file changed, 36 deletions(-) delete mode 100644 modin/pandas/test/test_dataframe.py diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py deleted file mode 100644 index 1b5eb93b842..00000000000 --- a/modin/pandas/test/test_dataframe.py +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. 
- -import pytest -import pandas -import matplotlib -import modin.pandas as pd - -from modin.pandas.test.utils import ( - df_equals, - axis_values, - axis_keys, -) - -pd.DEFAULT_NPARTITIONS = 4 - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use("Agg") - - -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -def test_cummax_int_and_float(axis): - data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) From 7d6eef9b49f4d6b7d9f6875a94d91b8f9470b0c6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 14:51:24 +0300 Subject: [PATCH 099/120] TEST-#2020: decrease parallel tests on Ubuntu (#2021) Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0a1aa5e837..2379f311250 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -172,11 +172,10 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 - name: test (${{matrix.engine}}, part ${{matrix.part}}, python ${{matrix.python-version}}) + name: test-ubuntu (engine ${{matrix.engine}}, python ${{matrix.python-version}}) steps: - uses: actions/checkout@v2 with: @@ -199,35 +198,41 @@ jobs: conda info conda list - name: Install HDF5 - if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - shell: bash -l {0} - run: pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py - if: matrix.part != 3 + run: pytest modin/pandas/test/dataframe/test_binary.py + - shell: bash -l {0} + 
run: pytest modin/pandas/test/dataframe/test_default.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_indexing.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_iter.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_join_sort.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_map_metadata.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_reduction.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_udf.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_window.py - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_rolling.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_concat.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_groupby.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_reshape.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_general.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_io.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/experimental/pandas/test/test_io_exp.py - if: matrix.part == 3 - shell: bash -l {0} run: bash <(curl -s https://codecov.io/bash) From 03fe0d6afa492a4a89bb9116ac61d713864c1c46 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 15:34:59 +0300 Subject: [PATCH 100/120] TEST-#2030: speed up cache; decrease parallel jobs in push.yml (#2031) Signed-off-by: Anatoly Myachev --- .github/workflows/push.yml | 45 ++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 
527358d18d3..0152eaf391d 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -23,19 +23,11 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: actions/setup-python@v2 with: python-version: ${{matrix.python-version}} - channel-priority: strict - use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - auto-update-conda: true - - shell: bash -l {0} - run: pip install ray==0.8.7 - - name: Conda environment - shell: bash -l {0} - run: | - conda info - conda list + architecture: "x64" + - run: pip install ray==0.8.7 test-all: needs: prepare-cache @@ -44,11 +36,10 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["python", "ray", "dask"] - part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 - name: test (${{matrix.engine}}, part ${{matrix.part}}, python ${{matrix.python-version}}) + name: test-ubuntu (engine ${{matrix.engine}}, python ${{matrix.python-version}}) steps: - uses: actions/checkout@v2 with: @@ -71,35 +62,41 @@ jobs: conda info conda list - name: Install HDF5 - if: matrix.part == 3 run: sudo apt update && sudo apt install -y libhdf5-dev - shell: bash -l {0} - run: pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py - if: matrix.part != 3 + run: pytest modin/pandas/test/dataframe/test_binary.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_default.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_indexing.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_iter.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_join_sort.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_map_metadata.py + - 
shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_reduction.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_udf.py + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_window.py - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_rolling.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_concat.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_groupby.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_reshape.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_general.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_io.py - if: matrix.part == 3 - shell: bash -l {0} run: python -m pytest modin/experimental/pandas/test/test_io_exp.py - if: matrix.part == 3 - shell: bash -l {0} run: bash <(curl -s https://codecov.io/bash) From 3a6dc67dae1d8c4ab0bee36afe6cbd2d574f7075 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 16:44:04 +0300 Subject: [PATCH 101/120] TEST-#2028: speed up window tests (#2029) Signed-off-by: Anatoly Myachev --- modin/pandas/test/dataframe/test_window.py | 463 +++++---------------- 1 file changed, 100 insertions(+), 363 deletions(-) diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index e228606a5ae..a8e5b967517 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -33,6 +33,10 @@ bool_arg_values, int_arg_keys, int_arg_values, + test_data, + eval_general, + create_test_dfs, + test_data_diff_dtype, ) pd.DEFAULT_NPARTITIONS = 4 @@ -41,174 +45,51 @@ matplotlib.use("Agg") -@pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) -) -def test_cummax(request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.cummax(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cummax(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cummax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.cummax(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cummax(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cummax(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -def test_cummax_int_and_float(axis): - data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.cummax(axis=axis), pandas_df.cummax(axis=axis)) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) -def test_cummin(request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) +@pytest.mark.parametrize("method", ["cumprod", "cummin", "cummax", "cumsum"]) +def test_cumprod_cummin_cummax_cumsum(axis, skipna, method): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, method)(axis=axis, skipna=skipna), + ) - try: - pandas_result = pandas_df.cummin(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - 
modin_df.cummin(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cummin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - try: - pandas_result = pandas_df.T.cummin(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cummin(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cummin(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize("method", ["cumprod", "cummin", "cummax", "cumsum"]) +def test_cumprod_cummin_cummax_cumsum_transposed(axis, method): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: getattr(df.T, method)(axis=axis), + ) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -def test_cummin_int_and_float(axis): +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_cummax_int_and_float(axis, method): data = {"col1": list(range(1000)), "col2": [i * 0.1 for i in range(1000)]} - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - df_equals(modin_df.cummin(axis=axis), pandas_df.cummin(axis=axis)) + eval_general(*create_test_dfs(data), lambda df: getattr(df, method)(axis=axis)) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) -) -def test_cumprod(request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.cumprod(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cumprod(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cumprod(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.cumprod(axis=axis, skipna=skipna) - except 
Exception as e: - with pytest.raises(type(e)): - modin_df.T.cumprod(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cumprod(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) -) -def test_cumsum(request, data, axis, skipna): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - # pandas exhibits weird behavior for this case - # Remove this case when we can pull the error messages from backend - if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( - axis == 0 or axis == "rows" - ): - with pytest.raises(TypeError): - modin_df.cumsum(axis=axis, skipna=skipna) - else: - try: - pandas_result = pandas_df.cumsum(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.cumsum(axis=axis, skipna=skipna) - else: - modin_result = modin_df.cumsum(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( - axis == 0 or axis == "rows" - ): - with pytest.raises(TypeError): - modin_df.T.cumsum(axis=axis, skipna=skipna) - else: - try: - pandas_result = pandas_df.T.cumsum(axis=axis, skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.cumsum(axis=axis, skipna=skipna) - else: - modin_result = modin_df.T.cumsum(axis=axis, skipna=skipna) - df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys) ) -def test_diff(request, data, axis, periods): - modin_df = pd.DataFrame(data) - pandas_df = 
pandas.DataFrame(data) +def test_diff(axis, periods): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: df.diff(axis=axis, periods=periods), + ) - try: - pandas_result = pandas_df.diff(axis=axis, periods=periods) - except Exception as e: - with pytest.raises(type(e)): - modin_df.diff(axis=axis, periods=periods) - else: - modin_result = modin_df.diff(axis=axis, periods=periods) - df_equals(modin_result, pandas_result) - try: - pandas_result = pandas_df.T.diff(axis=axis, periods=periods) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.diff(axis=axis, periods=periods) - else: - modin_result = modin_df.T.diff(axis=axis, periods=periods) - df_equals(modin_result, pandas_result) +@pytest.mark.parametrize("axis", ["rows", "columns"]) +def test_diff_transposed(axis): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: df.T.diff(axis=axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -532,55 +413,62 @@ def test_fillna_datetime_columns(): df_equals(modin_df.fillna("?"), df.fillna("?")) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) +@pytest.mark.parametrize("method", ["median", "skew"]) +def test_median_skew(axis, skipna, method): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, method)(axis=axis, skipna=skipna), + ) + + +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize("method", ["median", "skew"]) +def test_median_skew_transposed(axis, method): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: getattr(df.T, method)(axis=axis), + ) + + @pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) + "numeric_only", + [ + 
pytest.param( + True, + marks=pytest.mark.xfail( + reason="Internal and external indices do not match." + ), + ), + False, + pytest.param( + None, + marks=pytest.mark.xfail( + reason="Internal and external indices do not match." + ), + ), + ], ) -def test_median(request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) +@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank"]) +def test_median_skew_std_var_rank_specific(numeric_only, method): + eval_general( + *create_test_dfs(test_data_diff_dtype), + lambda df: getattr(df, method)(numeric_only=numeric_only), + ) - try: - pandas_result = pandas_df.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.median(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - try: - pandas_result = pandas_df.T.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.median(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.median( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - # test for issue #1953 +@pytest.mark.parametrize("method", ["median", "skew", "std", "var"]) +def test_median_skew_std_var_1953(method): + # See #1953 for details arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] - modin_df = pd.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - pandas_df = pandas.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - modin_result = modin_df.median(level=0) - pandas_result = pandas_df.median(level=0) - df_equals(modin_result, pandas_result) + data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 
2, 3, 4]] + modin_df = pd.DataFrame(data, index=arrays) + pandas_df = pandas.DataFrame(data, index=arrays) + + eval_general(modin_df, pandas_df, lambda df: getattr(df, method)(level=0)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -730,135 +618,37 @@ def test_quantile(request, data, q): modin_df.T.quantile(q) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) -) +@pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize( "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"] ) -def test_rank(data, axis, numeric_only, na_option): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.rank( - axis=axis, numeric_only=numeric_only, na_option=na_option - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.rank(axis=axis, numeric_only=numeric_only, na_option=na_option) - else: - modin_result = modin_df.rank( - axis=axis, numeric_only=numeric_only, na_option=na_option - ) - df_equals(modin_result, pandas_result) +def test_rank_transposed(axis, na_option): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: df.T.rank(axis=axis, na_option=na_option), + ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) -@pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) -) -def test_skew(request, data, axis, skipna, numeric_only): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.skew( - axis=axis, skipna=skipna, 
numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) - else: - modin_result = modin_df.T.skew( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - df_equals(modin_result, pandas_result) - - # test for issue #1953 - arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] - modin_df = pd.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - pandas_df = pandas.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays +@pytest.mark.parametrize("method", ["std", "var", "rank"]) +def test_std_var_rank(axis, skipna, method): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, method)(axis=axis, skipna=skipna), ) - modin_result = modin_df.skew(level=0) - pandas_result = pandas_df.skew(level=0) - df_equals(modin_result, pandas_result) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) -) -@pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) -) +@pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) -def test_std(request, data, axis, skipna, numeric_only, ddof): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.std( - axis=axis, skipna=skipna, 
numeric_only=numeric_only, ddof=ddof - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.std(axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof) - else: - modin_result = modin_df.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception as e: - with pytest.raises(type(e)): - modin_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.T.std( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - # test for issue #1953 - arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] - modin_df = pd.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays +@pytest.mark.parametrize("method", ["std", "var"]) +def test_std_var_transposed(axis, ddof, method): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: getattr(df.T, method)(axis=axis, ddof=ddof), ) - pandas_df = pandas.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - modin_result = modin_df.std(level=0) - pandas_result = pandas_df.std(level=0) - df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -867,56 +657,3 @@ def test_values(data): pandas_df = pandas.DataFrame(data) np.testing.assert_equal(modin_df.values, pandas_df.values) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize( - "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) -) -@pytest.mark.parametrize( - "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys) -) -@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", 
int_arg_keys)) -def test_var(request, data, axis, skipna, numeric_only, ddof): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception: - with pytest.raises(TypeError): - modin_df.var(axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof) - else: - modin_result = modin_df.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - df_equals(modin_result, pandas_result) - - try: - pandas_result = pandas_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - except Exception: - with pytest.raises(TypeError): - modin_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - else: - modin_result = modin_df.T.var( - axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof - ) - - # test for issue #1953 - arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] - modin_df = pd.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - pandas_df = pandas.DataFrame( - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays - ) - modin_result = modin_df.var(level=0) - pandas_result = pandas_df.var(level=0) - df_equals(modin_result, pandas_result) From 3ea0edb5f612d6d15a566e4dd1d8a246348865c2 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 18:09:34 +0300 Subject: [PATCH 102/120] TEST-#2026: speed up test_join_sort.py (#2027) * TEST-#2026: speed up test_join_sort.py Signed-off-by: Anatoly Myachev * TEST-#2026: Update modin/pandas/test/dataframe/test_join_sort.py Co-authored-by: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Signed-off-by: Anatoly Myachev --- modin/pandas/test/dataframe/test_join_sort.py | 116 ++++++++---------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/modin/pandas/test/dataframe/test_join_sort.py 
b/modin/pandas/test/dataframe/test_join_sort.py index b7232de38bb..7e1edf022dc 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -30,6 +30,9 @@ axis_values, bool_arg_keys, bool_arg_values, + test_data, + generate_multiindex, + eval_general, ) pd.DEFAULT_NPARTITIONS = 4 @@ -306,82 +309,67 @@ def test_merge(test_data, test_data2): modin_df.merge("Non-valid type") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) ) @pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) -@pytest.mark.parametrize( - "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) -) -def test_sort_index(data, axis, ascending, na_position, sort_remaining): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) +def test_sort_index(axis, ascending, na_position): + data = test_data["float_nan_data"] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) # Change index value so sorting will actually make a difference - if axis == "rows" or axis == 0: + if axis == 0: length = len(modin_df.index) - modin_df.index = [(i - length / 2) % length for i in range(length)] - pandas_df.index = [(i - length / 2) % length for i in range(length)] + for df in [modin_df, pandas_df]: + df.index = [(i - length / 2) % length for i in range(length)] + # Add NaNs to sorted index - if axis == "rows" or axis == 0: - length = len(modin_df.index) - modin_df.index = [ - np.nan if i % 2 == 0 else modin_df.index[i] for i in range(length) - ] - pandas_df.index = [ - np.nan if i % 2 == 0 else pandas_df.index[i] for i in range(length) - ] - else: - length = len(modin_df.columns) - modin_df.columns = [ - np.nan if i % 2 == 0 else modin_df.columns[i] for i in range(length) 
- ] - pandas_df.columns = [ - np.nan if i % 2 == 0 else pandas_df.columns[i] for i in range(length) - ] - - modin_result = modin_df.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=False - ) - pandas_result = pandas_df.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=False - ) - df_equals(modin_result, pandas_result) + for df in [modin_df, pandas_df]: + sort_index = df.axes[axis] + df.set_axis( + [np.nan if i % 2 == 0 else sort_index[i] for i in range(len(sort_index))], + axis=axis, + inplace=True, + ) - modin_df_cp = modin_df.copy() - pandas_df_cp = pandas_df.copy() - modin_df_cp.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=True - ) - pandas_df_cp.sort_index( - axis=axis, ascending=ascending, na_position=na_position, inplace=True + eval_general( + modin_df, + pandas_df, + lambda df: df.sort_index( + axis=axis, ascending=ascending, na_position=na_position + ), ) - df_equals(modin_df_cp, pandas_df_cp) - # MultiIndex - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - modin_df.index = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(modin_df))] - ) - pandas_df.index = pandas.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(pandas_df))] - ) - modin_df.columns = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(modin_df.columns))] - ) - pandas_df.columns = pd.MultiIndex.from_tuples( - [(i // 10, i // 5, i) for i in range(len(pandas_df.columns))] - ) - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(level=0), pandas_df.sort_index(level=0)) - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(axis=0), pandas_df.sort_index(axis=0)) - with pytest.warns(UserWarning): - df_equals(modin_df.sort_index(axis=1), pandas_df.sort_index(axis=1)) +@pytest.mark.parametrize("axis", ["rows", "columns"]) +def test_sort_index_inplace(axis): + data = test_data["int_data"] + modin_df, 
pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + for df in [modin_df, pandas_df]: + df.sort_index(axis=axis, inplace=True) + df_equals(modin_df, pandas_df) + + +@pytest.mark.parametrize( + "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) +) +def test_sort_multiindex(sort_remaining): + data = test_data["int_data"] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + for index in ["index", "columns"]: + new_index = generate_multiindex(len(getattr(modin_df, index))) + for df in [modin_df, pandas_df]: + setattr(df, index, new_index) + + for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: + with pytest.warns(UserWarning): + df_equals( + modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), + pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 8265b719e01ee482c269f44d60f8fd3aa590c21f Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Mon, 7 Sep 2020 18:13:29 +0300 Subject: [PATCH 103/120] FIX-#1959 #1987: Fix `duplicated` and `drop_duplicates` functions (#1994) Signed-off-by: Alexey Prutskov --- modin/pandas/dataframe.py | 6 +++++- modin/pandas/test/dataframe/test_window.py | 6 +++++- modin/pandas/test/test_series.py | 10 ++++++++-- modin/pandas/test/utils.py | 20 +++++++++++++------- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index c62ec8d6048..0d31ba9f820 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -282,11 +282,15 @@ def duplicated(self, subset=None, keep="first"): Returns: Series """ + import hashlib + df = self[subset] if subset is not None else self # if the number of columns we are checking for duplicates is larger than 1, we must # hash them to generate a single value that can be compared across rows. 
if len(df.columns) > 1: - hashed = df.apply(lambda s: hash(tuple(s)), axis=1).to_frame() + hashed = df.apply( + lambda s: hashlib.new("md5", str(tuple(s)).encode()).hexdigest(), axis=1 + ).to_frame() else: hashed = df duplicates = hashed.apply(lambda s: s.duplicated(keep=keep)).squeeze(axis=1) diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index a8e5b967517..ce2bc0e2ba2 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -24,6 +24,8 @@ name_contains, test_data_values, test_data_keys, + test_data_with_duplicates_values, + test_data_with_duplicates_keys, no_numeric_dfs, quantiles_keys, quantiles_values, @@ -92,7 +94,9 @@ def test_diff_transposed(axis): ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys +) @pytest.mark.parametrize( "keep", ["last", "first", False], ids=["last", "first", "False"] ) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index e6fc6daf369..13bb7450711 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -31,6 +31,8 @@ test_data, test_data_values, test_data_keys, + test_data_with_duplicates_values, + test_data_with_duplicates_keys, test_string_data_values, test_string_data_keys, test_string_list_data_values, @@ -1391,7 +1393,9 @@ def test_drop(): modin_series.drop(None, None, None, None) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys +) @pytest.mark.parametrize( "keep", ["last", "first", False], ids=["last", "first", "False"] ) @@ -1527,7 +1531,9 @@ def test_dt(): df_equals(modin_series.dt.to_timestamp(), pandas_series.dt.to_timestamp()) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
+@pytest.mark.parametrize( + "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys +) @pytest.mark.parametrize( "keep", ["last", "first", False], ids=["last", "first", "False"] ) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index a412a730704..7e2c57064d1 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -101,6 +101,11 @@ "col{}".format(int(NCOLS / 2)) ) +for col in test_data["float_nan_data"]: + for row in range(NROWS // 2): + if row % 16 == 0: + test_data["float_nan_data"][col][row] = np.NaN + test_data_values = list(test_data.values()) test_data_keys = list(test_data.keys()) @@ -128,21 +133,22 @@ ] for i in range(NCOLS) }, - "subset_duplicates": { - "col{}".format(i): [ - i if j % 7 == 0 and i in [1, 3, 7] else x - for j, x in enumerate(range(NROWS)) - ] - for i in range(NCOLS) - }, "has_name_column": { "name": ["one", "two", "two", "three"], "col1": [1, 2, 2, 3], "col3": [10, 20, 20, 3], "col7": [100, 201, 200, 300], }, + "str_columns": { + "col_str{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [ + "s" + str(x % 5) for x in range(NROWS) + ] + for i in range(NCOLS) + }, } +test_data_with_duplicates["float_nan"] = test_data["float_nan_data"] + test_data_small = { "small": { "col0": [1, 2, 3, 4], From a9c6e500e986ae94de6ab56ef39e8ff38f516965 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 19:50:23 +0300 Subject: [PATCH 104/120] TEST-#2037: speed up test_binary with refactor dataframe.py (#2038) Signed-off-by: Anatoly Myachev --- modin/pandas/dataframe.py | 105 ++---- modin/pandas/test/dataframe/test_binary.py | 381 +++++---------------- 2 files changed, 116 insertions(+), 370 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 0d31ba9f820..bae377e57c9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -3086,88 +3086,29 @@ def __delitem__(self, key): raise KeyError(key) 
self._update_inplace(new_query_compiler=self._query_compiler.delitem(key)) - def __add__(self, other, axis=None, level=None, fill_value=None): - return self.add(other, axis=axis, level=level, fill_value=fill_value) - - def __iadd__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.add(other, axis=axis, level=level, fill_value=fill_value) - - def __radd__(self, other, axis=None, level=None, fill_value=None): - return self.radd(other, axis=axis, level=level, fill_value=fill_value) - - def __mul__(self, other, axis=None, level=None, fill_value=None): - return self.mul(other, axis=axis, level=level, fill_value=fill_value) - - def __imul__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.mul(other, axis=axis, level=level, fill_value=fill_value) - - def __rmul__(self, other, axis=None, level=None, fill_value=None): - return self.rmul(other, axis=axis, level=level, fill_value=fill_value) - - def __pow__(self, other, axis=None, level=None, fill_value=None): - return self.pow(other, axis=axis, level=level, fill_value=fill_value) - - def __ipow__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.pow(other, axis=axis, level=level, fill_value=fill_value) - - def __rpow__(self, other, axis=None, level=None, fill_value=None): - return self.rpow(other, axis=axis, level=level, fill_value=fill_value) - - def __sub__(self, other, axis=None, level=None, fill_value=None): - return self.sub(other, axis=axis, level=level, fill_value=fill_value) - - def __isub__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.sub(other, axis=axis, level=level, fill_value=fill_value) - - def __rsub__(self, other, axis=None, level=None, fill_value=None): - return self.rsub(other, axis=axis, level=level, fill_value=fill_value) - - def __floordiv__(self, other, axis=None, level=None, fill_value=None): - return self.floordiv(other, 
axis=axis, level=level, fill_value=fill_value) - - def __ifloordiv__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.floordiv(other, axis=axis, level=level, fill_value=fill_value) - - def __rfloordiv__(self, other, axis=None, level=None, fill_value=None): - return self.rfloordiv(other, axis=axis, level=level, fill_value=fill_value) - - def __truediv__(self, other, axis=None, level=None, fill_value=None): - return self.truediv(other, axis=axis, level=level, fill_value=fill_value) - - def __itruediv__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.truediv(other, axis=axis, level=level, fill_value=fill_value) - - def __rtruediv__(self, other, axis=None, level=None, fill_value=None): - return self.rtruediv(other, axis=axis, level=level, fill_value=fill_value) - - def __mod__(self, other, axis=None, level=None, fill_value=None): - return self.mod(other, axis=axis, level=level, fill_value=fill_value) - - def __imod__( - self, other, axis=None, level=None, fill_value=None - ): # pragma: no cover - return self.mod(other, axis=axis, level=level, fill_value=fill_value) - - def __rmod__(self, other, axis=None, level=None, fill_value=None): - return self.rmod(other, axis=axis, level=level, fill_value=fill_value) - - def __div__(self, other, axis=None, level=None, fill_value=None): - return self.div(other, axis=axis, level=level, fill_value=fill_value) - - def __rdiv__(self, other, axis=None, level=None, fill_value=None): - return self.rdiv(other, axis=axis, level=level, fill_value=fill_value) + __add__ = add + __iadd__ = add # pragma: no cover + __radd__ = radd + __mul__ = mul + __imul__ = mul # pragma: no cover + __rmul__ = rmul + __pow__ = pow + __ipow__ = pow # pragma: no cover + __rpow__ = rpow + __sub__ = sub + __isub__ = sub # pragma: no cover + __rsub__ = rsub + __floordiv__ = floordiv + __ifloordiv__ = floordiv # pragma: no cover + __rfloordiv__ = rfloordiv + __truediv__ = 
truediv + __itruediv__ = truediv # pragma: no cover + __rtruediv__ = rtruediv + __mod__ = mod + __imod__ = mod # pragma: no cover + __rmod__ = rmod + __div__ = div + __rdiv__ = rdiv @property def attrs(self): diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index c24164dcc65..68028fbf4c6 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -17,13 +17,12 @@ import modin.pandas as pd from modin.pandas.test.utils import ( - random_state, - RAND_LOW, - RAND_HIGH, df_equals, test_data_values, test_data_keys, eval_general, + test_data, + create_test_dfs, ) pd.DEFAULT_NPARTITIONS = 4 @@ -32,204 +31,108 @@ matplotlib.use("Agg") -def inter_df_math_helper(modin_df, pandas_df, op): - # Test dataframe to dataframe - try: - pandas_result = getattr(pandas_df, op)(pandas_df) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df) - else: - modin_result = getattr(modin_df, op)(modin_df) - df_equals(modin_result, pandas_result) - - # Test dataframe to int - try: - pandas_result = getattr(pandas_df, op)(4) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4) - else: - modin_result = getattr(modin_df, op)(4) - df_equals(modin_result, pandas_result) - - # Test dataframe to float - try: - pandas_result = getattr(pandas_df, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4.0) - else: - modin_result = getattr(modin_df, op)(4.0) - df_equals(modin_result, pandas_result) - - # Test transposed dataframes to float - try: - pandas_result = getattr(pandas_df.T, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df.T, op)(4.0) - else: - modin_result = getattr(modin_df.T, op)(4.0) - df_equals(modin_result, pandas_result) - - frame_data = { - "{}_other".format(modin_df.columns[0]): [0, 2], - modin_df.columns[0]: [0, 19], - modin_df.columns[1]: [1, 1], - } - 
modin_df2 = pd.DataFrame(frame_data) - pandas_df2 = pandas.DataFrame(frame_data) - - # Test dataframe to different dataframe shape - try: - pandas_result = getattr(pandas_df, op)(pandas_df2) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df2) - else: - modin_result = getattr(modin_df, op)(modin_df2) - df_equals(modin_result, pandas_result) - - # Test dataframe fill value - try: - pandas_result = getattr(pandas_df, op)(pandas_df2, fill_value=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(modin_df2, fill_value=0) - else: - modin_result = getattr(modin_df, op)(modin_df2, fill_value=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to list - list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[1])) - try: - pandas_result = getattr(pandas_df, op)(list_test, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(list_test, axis=1) - else: - modin_result = getattr(modin_df, op)(list_test, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to series axis=0 - series_test_modin = modin_df[modin_df.columns[0]] - series_test_pandas = pandas_df[pandas_df.columns[0]] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to series axis=1 - series_test_modin = modin_df.iloc[0] - series_test_pandas = pandas_df.iloc[0] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to list axis=1 - 
series_test_modin = series_test_pandas = list(pandas_df.iloc[0]) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to list axis=0 - series_test_modin = series_test_pandas = list(pandas_df[pandas_df.columns[0]]) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Test dataframe to series missing values - series_test_modin = modin_df.iloc[0, :-2] - series_test_pandas = pandas_df.iloc[0, :-2] - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=1) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=1) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=1) - df_equals(modin_result, pandas_result) - - # Test dataframe to series with different index - series_test_modin = modin_df[modin_df.columns[0]].reset_index(drop=True) - series_test_pandas = pandas_df[pandas_df.columns[0]].reset_index(drop=True) - try: - pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(series_test_modin, axis=0) - else: - modin_result = getattr(modin_df, op)(series_test_modin, axis=0) - df_equals(modin_result, pandas_result) - - # Level test - new_idx = pandas.MultiIndex.from_tuples( +@pytest.mark.parametrize( + "other", + [ + lambda df: 4, + lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]), + ], + ids=["scalar", "series_or_list"], +) +@pytest.mark.parametrize("axis", 
["rows", "columns"]) +@pytest.mark.parametrize( + "op", + [ + *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), + *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), + ], +) +def test_math_functions(other, axis, op): + data = test_data["float_nan_data"] + if (op == "floordiv" or op == "rfloordiv") and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behaviour") + + if op == "rmod" and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behaviour") + + eval_general( + *create_test_dfs(data), lambda df: getattr(df, op)(other(df, axis), axis=axis) + ) + + +@pytest.mark.parametrize( + "other", + [lambda df: df[: -(2 ** 4)], lambda df: df[df.columns[0]].reset_index(drop=True)], + ids=["check_missing_value", "check_different_index"], +) +@pytest.mark.parametrize("fill_value", [None, 3.0]) +@pytest.mark.parametrize( + "op", + [ + *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), + *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), + ], +) +def test_math_functions_fill_value(other, fill_value, op): + data = test_data["int_data"] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + + eval_general( + modin_df, + pandas_df, + lambda df: getattr(df, op)(other(df), axis=0, fill_value=fill_value), + ) + + +@pytest.mark.parametrize( + "op", + [ + *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), + *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), + ], +) +def test_math_functions_level(op): + modin_df = pd.DataFrame(test_data["int_data"]) + modin_df.index = pandas.MultiIndex.from_tuples( [(i // 4, i // 2, i) for i in modin_df.index] ) - modin_df_multi_level = modin_df.copy() - modin_df_multi_level.index = new_idx + # Defaults to pandas with pytest.warns(UserWarning): # Operation against self for sanity check - getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + getattr(modin_df, op)(modin_df, axis=0, 
level=1) @pytest.mark.parametrize( - "function", + "math_op, alias", [ - "add", - "div", - "divide", - "floordiv", - "mod", - "mul", - "multiply", - "pow", - "sub", - "subtract", - "truediv", - "__div__", - "__add__", - "__radd__", - "__mul__", - "__rmul__", - "__pow__", - "__rpow__", - "__sub__", - "__floordiv__", - "__rfloordiv__", - "__truediv__", - "__rtruediv__", - "__mod__", - "__rmod__", - "__rdiv__", + ("truediv", "divide"), + ("truediv", "div"), + ("rtruediv", "rdiv"), + ("mul", "multiply"), + ("sub", "subtract"), + ("add", "__add__"), + ("radd", "__radd__"), + ("div", "__div__"), + ("rdiv", "__rdiv__"), + ("truediv", "__truediv__"), + ("rtruediv", "__rtruediv__"), + ("floordiv", "__floordiv__"), + ("rfloordiv", "__rfloordiv__"), + ("mod", "__mod__"), + ("rmod", "__rmod__"), + ("mul", "__mul__"), + ("rmul", "__rmul__"), + ("pow", "__pow__"), + ("rpow", "__rpow__"), + ("sub", "__sub__"), + ("rsub", "__rsub__"), ], ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_math_functions(data, function): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_helper(modin_df, pandas_df, function) +def test_math_alias(math_op, alias): + assert getattr(pd.DataFrame, math_op) == getattr(pd.DataFrame, alias) @pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) @@ -264,104 +167,6 @@ def test_multi_level_comparison(data, op): getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) -# Test dataframe right operations -def inter_df_math_right_ops_helper(modin_df, pandas_df, op): - try: - pandas_result = getattr(pandas_df, op)(4) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4) - else: - modin_result = getattr(modin_df, op)(4) - df_equals(modin_result, pandas_result) - - try: - pandas_result = getattr(pandas_df, op)(4.0) - except Exception as e: - with pytest.raises(type(e)): - getattr(modin_df, op)(4.0) - else: - modin_result = getattr(modin_df, op)(4.0) - 
df_equals(modin_result, pandas_result) - - new_idx = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in modin_df.index] - ) - modin_df_multi_level = modin_df.copy() - modin_df_multi_level.index = new_idx - - # Defaults to pandas - with pytest.warns(UserWarning): - # Operation against self for sanity check - getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_radd(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "radd") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rdiv(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rdiv") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rfloordiv(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rfloordiv") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rmod(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rmod") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rmul(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rmul") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rpow(request, data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rpow") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rsub(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, 
pandas_df, "rsub") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_rtruediv(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rtruediv") - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test___rsub__(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "__rsub__") - - -# END test dataframe right operations - - def test_equals(): frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]} modin_df1 = pd.DataFrame(frame_data) From 9325ee556e859a9297ef78e6691709c04977738b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 21:53:09 +0300 Subject: [PATCH 105/120] TEST-#2044: speed up iter tests (#2045) Signed-off-by: Anatoly Myachev --- modin/pandas/test/dataframe/test_iter.py | 71 ++++++++---------------- 1 file changed, 24 insertions(+), 47 deletions(-) diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py index b1372396407..d35687a8123 100644 --- a/modin/pandas/test/dataframe/test_iter.py +++ b/modin/pandas/test/dataframe/test_iter.py @@ -27,6 +27,7 @@ test_data_values, test_data_keys, create_test_dfs, + test_data, ) pd.DEFAULT_NPARTITIONS = 4 @@ -35,66 +36,42 @@ matplotlib.use("Agg") -def test_items(): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) +@pytest.mark.parametrize("method", ["items", "iteritems", "iterrows"]) +def test_items_iteritems_iterrows(method): + data = test_data["float_nan_data"] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - modin_items = modin_df.items() - pandas_items = pandas_df.items() - for modin_item, pandas_item in zip(modin_items, pandas_items): + for modin_item, pandas_item in zip( + getattr(modin_df, method)(), getattr(pandas_df, 
method)() + ): modin_index, modin_series = modin_item pandas_index, pandas_series = pandas_item df_equals(pandas_series, modin_series) assert pandas_index == modin_index -def test_iteritems(): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_items = modin_df.iteritems() - pandas_items = pandas_df.iteritems() - for modin_item, pandas_item in zip(modin_items, pandas_items): - modin_index, modin_series = modin_item - pandas_index, pandas_series = pandas_item - df_equals(pandas_series, modin_series) - assert pandas_index == modin_index - +@pytest.mark.parametrize("name", [None, "NotPandas"]) +def test_itertuples_name(name): + data = test_data["float_nan_data"] + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) -def test_iterrows(): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_iterrows = modin_df.iterrows() - pandas_iterrows = pandas_df.iterrows() - for modin_row, pandas_row in zip(modin_iterrows, pandas_iterrows): - modin_index, modin_series = modin_row - pandas_index, pandas_series = pandas_row - df_equals(pandas_series, modin_series) - assert pandas_index == modin_index - - -@pytest.mark.parametrize("name", [None, "NotPandas", "Pandas"]) -@pytest.mark.parametrize("index", [True, False]) -def test_itertuples(name, index): - modin_df = pd.DataFrame(test_data_values[0]) - pandas_df = pandas.DataFrame(test_data_values[0]) - - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) + modin_it_custom = modin_df.itertuples(name=name) + pandas_it_custom = pandas_df.itertuples(name=name) for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): np.testing.assert_equal(modin_row, pandas_row) - mi_index_modin = pd.MultiIndex.from_tuples( + +def test_itertuples_multiindex(): + data = test_data["int_data"] + modin_df, pandas_df = pd.DataFrame(data), 
pandas.DataFrame(data) + + new_idx = pd.MultiIndex.from_tuples( [(i // 4, i // 2, i) for i in range(len(modin_df.columns))] ) - mi_index_pandas = pandas.MultiIndex.from_tuples( - [(i // 4, i // 2, i) for i in range(len(pandas_df.columns))] - ) - modin_df.columns = mi_index_modin - pandas_df.columns = mi_index_pandas - modin_it_custom = modin_df.itertuples(index=index, name=name) - pandas_it_custom = pandas_df.itertuples(index=index, name=name) + modin_df.columns = new_idx + pandas_df.columns = new_idx + modin_it_custom = modin_df.itertuples() + pandas_it_custom = pandas_df.itertuples() for modin_row, pandas_row in zip(modin_it_custom, pandas_it_custom): np.testing.assert_equal(modin_row, pandas_row) From 0c0b6f64bc9c6afc1267fe9fc703a0e30d783f96 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 7 Sep 2020 21:56:19 +0300 Subject: [PATCH 106/120] TEST-#2042: speed up udf tests (#2043) Signed-off-by: Anatoly Myachev --- modin/pandas/base.py | 5 +- modin/pandas/test/dataframe/test_udf.py | 174 ++++-------------------- 2 files changed, 29 insertions(+), 150 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index e3c2ad0b3b1..6ab559f9212 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -358,9 +358,6 @@ def add(self, other, axis="columns", level=None, fill_value=None): "add", other, axis=axis, level=level, fill_value=fill_value ) - def agg(self, func, axis=0, *args, **kwargs): - return self.aggregate(func, axis=axis, *args, **kwargs) - def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) result = None @@ -375,6 +372,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): return self.apply(func, axis=axis, args=args, **kwargs) return result + agg = aggregate + def _aggregate(self, arg, *args, **kwargs): _axis = kwargs.pop("_axis", 0) kwargs.pop("_level", None) diff --git a/modin/pandas/test/dataframe/test_udf.py 
b/modin/pandas/test/dataframe/test_udf.py index a87243471e5..eef5f259103 100644 --- a/modin/pandas/test/dataframe/test_udf.py +++ b/modin/pandas/test/dataframe/test_udf.py @@ -20,21 +20,17 @@ from modin.pandas.test.utils import ( random_state, df_equals, - name_contains, test_data_values, test_data_keys, - numeric_dfs, query_func_keys, query_func_values, agg_func_keys, agg_func_values, - numeric_agg_funcs, - axis_keys, - axis_values, eval_general, create_test_dfs, udf_func_values, udf_func_keys, + test_data, ) pd.DEFAULT_NPARTITIONS = 4 @@ -43,85 +39,31 @@ matplotlib.use("Agg") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -def test_agg(data, axis, func): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -def test_agg_numeric(request, data, axis, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( - request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("func", 
agg_func_values, ids=agg_func_keys) -def test_aggregate(request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.aggregate(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.aggregate(func, axis) - else: - modin_result = modin_df.aggregate(func, axis) - df_equals(modin_result, pandas_result) +@pytest.mark.parametrize("op", ["agg", "apply"]) +def test_agg_apply(axis, func, op): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, op)(func, axis), + ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -def test_aggregate_numeric(request, data, axis, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( - request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) +@pytest.mark.parametrize("op", ["agg", "apply"]) +def test_agg_apply_axis_names(axis, func, op): + eval_general( + *create_test_dfs(test_data["int_data"]), lambda df: getattr(df, op)(func, axis) + ) - try: - pandas_result = pandas_df.agg(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.agg(func, axis) - else: - modin_result = modin_df.agg(func, axis) - df_equals(modin_result, pandas_result) +def test_aggregate_alias(): + assert pd.DataFrame.agg == pd.DataFrame.aggregate -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_aggregate_error_checking(data): - modin_df = pd.DataFrame(data) - assert modin_df.aggregate("ndim") == 2 +def test_aggregate_error_checking(): + modin_df = pd.DataFrame(test_data["float_nan_data"]) with pytest.warns(UserWarning): modin_df.aggregate({modin_df.columns[0]: "sum", modin_df.columns[1]: "mean"}) @@ -133,25 
+75,12 @@ def test_aggregate_error_checking(data): modin_df.aggregate("NOT_EXISTS") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -def test_apply(request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - +def test_apply_type_error(func): + modin_df = pd.DataFrame(test_data["int_data"]) with pytest.raises(TypeError): modin_df.apply({"row": func}, axis=1) - try: - pandas_result = pandas_df.apply(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.apply(func, axis) - else: - modin_result = modin_df.apply(func, axis) - df_equals(modin_result, pandas_result) - @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("level", [None, -1, 0, 1]) @@ -201,25 +130,19 @@ def test_apply_text_func_with_level(level, data, func, axis): ) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -def test_apply_args(data, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize("args", [(1,), ("_A",)]) +def test_apply_args(axis, args): def apply_func(series, y): try: return series + y except TypeError: return series.map(str) + str(y) - modin_result = modin_df.apply(apply_func, axis=axis, args=(1,)) - pandas_result = pandas_df.apply(apply_func, axis=axis, args=(1,)) - df_equals(modin_result, pandas_result) - - modin_result = modin_df.apply(apply_func, axis=axis, args=("_A",)) - pandas_result = pandas_df.apply(apply_func, axis=axis, args=("_A",)) - df_equals(modin_result, pandas_result) + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: df.apply(apply_func, axis=axis, args=args), + ) def test_apply_metadata(): @@ -240,30 +163,6 @@ def add(a, b, c): 
df_equals(modin_df, pandas_df) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) -def test_apply_numeric(request, data, func, axis): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - if name_contains(request.node.name, numeric_dfs): - try: - pandas_result = pandas_df.apply(func, axis) - except Exception as e: - with pytest.raises(type(e)): - modin_df.apply(func, axis) - else: - modin_result = modin_df.apply(func, axis) - df_equals(modin_result, pandas_result) - - if "empty_data" not in request.node.name: - key = modin_df.columns[0] - modin_result = modin_df.apply(lambda df: df.drop(key), axis=1) - pandas_result = pandas_df.apply(lambda df: df.drop(key), axis=1) - df_equals(modin_result, pandas_result) - - @pytest.mark.parametrize("func", udf_func_values, ids=udf_func_keys) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_apply_udf(data, func): @@ -422,22 +321,3 @@ def test_transform(request, data, func): else: modin_result = modin_df.transform(func) df_equals(modin_result, pandas_result) - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) -def test_transform_numeric(request, data, func): - if name_contains(request.node.name, numeric_agg_funcs) and name_contains( - request.node.name, numeric_dfs - ): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.transform(func) - except Exception as e: - with pytest.raises(type(e)): - modin_df.transform(func) - else: - modin_result = modin_df.transform(func) - df_equals(modin_result, pandas_result) From b370489dd2fffd6a975646c26749b20c9076de89 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 8 Sep 2020 13:14:26 +0300 
Subject: [PATCH 107/120] TEST-#2033: speed up test_series.py (#2034) * TEST-#2033: speed up test_series.py Signed-off-by: Anatoly Myachev * TEST-#2033: fix kurtosis test Signed-off-by: Anatoly Myachev * TEST-#2033: refactor test for #1953 Signed-off-by: Anatoly Myachev --- modin/pandas/test/test_series.py | 168 +++++++++++-------------------- 1 file changed, 59 insertions(+), 109 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 13bb7450711..d21267c8242 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -61,6 +61,8 @@ test_data_small_keys, test_data_categorical_values, test_data_categorical_keys, + generate_multiindex, + test_data_diff_dtype, ) pd.DEFAULT_NPARTITIONS = 4 @@ -1850,10 +1852,22 @@ def test_keys(data): df_equals(modin_series.keys(), pandas_series.keys()) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_kurtosis_alias(): + # It's optimization. If failed, Series.kurt should be tested explicitly + # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`. 
+ assert pd.Series.kurt == pd.Series.kurtosis + + +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", bool_arg_values, ids=bool_arg_keys) -@pytest.mark.parametrize("level", [None, -1, 0, 1]) +def test_kurtosis(axis, skipna): + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda df: df.kurtosis(axis=axis, skipna=skipna), + ) + + +@pytest.mark.parametrize("axis", ["rows", "columns"]) @pytest.mark.parametrize( "numeric_only", [ @@ -1867,20 +1881,26 @@ def test_keys(data): None, ], ) -@pytest.mark.parametrize("method", ["kurtosis", "kurt"]) -def test_kurt_kurtosis(data, axis, skipna, level, numeric_only, method): - func_kwargs = { - "axis": axis, - "skipna": skipna, - "level": level, - "numeric_only": numeric_only, - } - modin_series, pandas_series = create_test_series(data) +def test_kurtosis_numeric_only(axis, numeric_only): + eval_general( + *create_test_series(test_data_diff_dtype), + lambda df: df.kurtosis(axis=axis, numeric_only=numeric_only), + ) + + +@pytest.mark.parametrize("level", [-1, 0, 1]) +def test_kurtosis_level(level): + data = test_data["int_data"] + modin_s, pandas_s = create_test_series(data) + + index = generate_multiindex(len(data.keys())) + modin_s.columns = index + pandas_s.columns = index eval_general( - modin_series, - pandas_series, - lambda df: df.kurtosis(**func_kwargs), + modin_s, + pandas_s, + lambda s: s.kurtosis(axis=1, level=level), ) @@ -2015,16 +2035,18 @@ def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) - # test for issue #1953 + +@pytest.mark.parametrize("method", ["median", "skew", "std", "sum", "var", "prod"]) +def test_median_skew_std_sum_var_prod_1953(method): + # See #1953 for details + data = [3, 3, 3, 3, 3, 3, 3, 3, 3] arrays = [ ["1", "1", "1", "2", "2", "2", "3", "3", "3"], ["1", "2", "3", "4", "5", "6", "7", "8", "9"], ] - modin_series = pd.Series([3, 3, 3, 
3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.median(level=0) - pandas_result = pandas_series.median(level=0) - df_equals(modin_result, pandas_result) + modin_s = pd.Series(data, index=arrays) + pandas_s = pandas.Series(data, index=arrays) + eval_general(modin_s, pandas_s, lambda s: getattr(s, method)(level=0)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2206,66 +2228,38 @@ def test_pow(data): inter_df_math_helper(modin_series, pandas_series, "pow") -@pytest.mark.parametrize( - "data", - test_data_values + test_data_small_values, - ids=test_data_keys + test_data_small_keys, -) -@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +def test_product_alias(): + assert pd.Series.prod == pd.Series.product + + +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) +def test_prod(axis, skipna): + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda s: s.prod(axis=axis, skipna=skipna), + ) + + @pytest.mark.parametrize( "numeric_only", [ None, False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="numeric_only not implemented for pandas.Series" - ), - ), + pytest.param(True, marks=pytest.mark.xfail(reason="didn't raise Exception")), ], ) @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) -@pytest.mark.parametrize( - "operation", - [ - "prod", - pytest.param( - "product", - marks=pytest.mark.skipif( - pandas.Series.product == pandas.Series.prod - and pd.Series.product == pd.Series.prod, - reason="That operation was already tested.", - ), - ), - ], -) -def test_prod(data, axis, skipna, numeric_only, min_count, operation): +def test_prod_specific(min_count, numeric_only): eval_general( - *create_test_series(data), - lambda df, *args, **kwargs: type(df)([getattr(df, operation)(*args, 
**kwargs)]), - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, + *create_test_series(test_data_diff_dtype), + lambda df: df.prod(min_count=min_count, numeric_only=numeric_only), ) - # test for issue #1953 - arrays = [ - ["1", "1", "1", "2", "2", "2", "3", "3", "3"], - ["1", "2", "3", "4", "5", "6", "7", "8", "9"], - ] - modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.prod(level=0) - pandas_result = pandas_series.prod(level=0) - df_equals(modin_result, pandas_result) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) @@ -2784,17 +2778,6 @@ def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) - # test for issue #1953 - arrays = [ - ["1", "1", "1", "2", "2", "2", "3", "3", "3"], - ["1", "2", "3", "4", "5", "6", "7", "8", "9"], - ] - modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.skew(level=0) - pandas_result = pandas_series.skew(level=0) - df_equals(modin_result, pandas_result) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray"]) @@ -2913,17 +2896,6 @@ def test_std(request, data, skipna, ddof): modin_result = modin_series.std(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) - # test for issue #1953 - arrays = [ - ["1", "1", "1", "2", "2", "2", "3", "3", "3"], - ["1", "2", "3", "4", "5", "6", "7", "8", "9"], - ] - modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.std(level=0) - 
pandas_result = pandas_series.std(level=0) - df_equals(modin_result, pandas_result) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sub(data): @@ -2972,17 +2944,6 @@ def test_sum(data, axis, skipna, numeric_only, min_count): min_count=min_count, ) - # test for issue #1953 - arrays = [ - ["1", "1", "1", "2", "2", "2", "3", "3", "3"], - ["1", "2", "3", "4", "5", "6", "7", "8", "9"], - ] - modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.sum(level=0) - pandas_result = pandas_series.sum(level=0) - df_equals(modin_result, pandas_result) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1, "columns", "index"]) @@ -3361,17 +3322,6 @@ def test_var(data, skipna, ddof): modin_result = modin_series.var(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) - # test for issue #1953 - arrays = [ - ["1", "1", "1", "2", "2", "2", "3", "3", "3"], - ["1", "2", "3", "4", "5", "6", "7", "8", "9"], - ] - modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) - modin_result = modin_series.var(level=0) - pandas_result = pandas_series.var(level=0) - df_equals(modin_result, pandas_result) - def test_view(): modin_series = pd.Series([-2, -1, 0, 1, 2], dtype="int8") From 06cc5a40dee55cc2e43e2b1bcbc7097d927e26c7 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 8 Sep 2020 18:04:42 +0300 Subject: [PATCH 108/120] TEST-#2039: speed up default tests (#2040) * TEST-#2039: speed up default tests Signed-off-by: Anatoly Myachev * TEST-#2039: refactor some resample tests Signed-off-by: Anatoly Myachev * TEST-#2039: fix test_resampler Signed-off-by: Anatoly Myachev --- modin/pandas/test/dataframe/test_default.py | 479 
++++++++------------ modin/pandas/test/utils.py | 7 +- 2 files changed, 187 insertions(+), 299 deletions(-) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index 83f36d23ae2..c07849f2231 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -34,6 +34,9 @@ eval_general, create_test_dfs, generate_multiindex, + test_data_resample, + test_data, + test_data_diff_dtype, ) pd.DEFAULT_NPARTITIONS = 4 @@ -42,17 +45,66 @@ matplotlib.use("Agg") -def test_align(): +@pytest.mark.parametrize( + "op, make_args", + [ + ("align", lambda df: {"other": df}), + ("corr", None), + ("expanding", None), + ("corrwith", lambda df: {"other": df}), + ("explode", lambda df: {"column": df.columns[0]}), + ("ewm", lambda df: {"com": 0.5}), + ("from_dict", lambda df: {"data": None}), + ("from_records", lambda df: {"data": to_pandas(df)}), + ("hist", lambda df: {"column": "int_col"}), + ("infer_objects", None), + ("interpolate", None), + ("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}), + ("mask", lambda df: {"cond": df != 0}), + ("pct_change", None), + ("sem", None), + ("__getstate__", None), + ("to_xarray", None), + ("pivot_table", lambda df: {"values": "int_col", "index": ["float_col"]}), + ], +) +def test_ops_defaulting_to_pandas(op, make_args): + modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) + with pytest.warns(UserWarning): + operation = getattr(modin_df, op) + if make_args is not None: + operation(**make_args(modin_df)) + else: + operation() + + +def test_style(): data = test_data_values[0] with pytest.warns(UserWarning): - pd.DataFrame(data).align(pd.DataFrame(data)) + pd.DataFrame(data).style + + +def test___setstate__(): + data = test_data_values[0] + with pytest.warns(UserWarning): + try: + pd.DataFrame(data).__setstate__(None) + except TypeError: + pass + + +def test_to_timestamp(): + idx = pd.date_range("1/1/2012", 
periods=5, freq="M") + df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) + + with pytest.warns(UserWarning): + df.to_period().to_timestamp() @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_numpy(data): - modin_frame = pd.DataFrame(data) - pandas_frame = pandas.DataFrame(data) - assert_array_equal(modin_frame.values, pandas_frame.values) + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + assert_array_equal(modin_df.values, pandas_df.values) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -165,18 +217,6 @@ def test_combine_first(): df_equals(modin_df1.combine_first(modin_df2), pandas_df1.combine_first(pandas_df2)) -def test_corr(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).corr() - - -def test_corrwith(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).corrwith(pd.DataFrame(data)) - - def test_cov(): data = test_data_values[0] modin_result = pd.DataFrame(data).cov() @@ -272,25 +312,6 @@ def test_matmul(data): modin_result = modin_df @ pd.Series(np.arange(col_len)) -def test_ewm(): - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - with pytest.warns(UserWarning): - df.ewm(com=0.5).mean() - - -def test_expanding(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).expanding() - - -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_explode(data): - modin_df = pd.DataFrame(data) - with pytest.warns(UserWarning): - modin_df.explode(modin_df.columns[0]) - - def test_first(): i = pd.date_range("2010-04-09", periods=400, freq="2D") modin_df = pd.DataFrame({"A": list(range(400)), "B": list(range(400))}, index=i) @@ -301,48 +322,6 @@ def test_first(): df_equals(modin_df.first("20D"), pandas_df.first("20D")) -@pytest.mark.skip(reason="Defaulting to Pandas") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def 
test_from_dict(data): - modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_dict(None) - - -@pytest.mark.skip(reason="Defaulting to Pandas") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_from_items(data): - modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_items(None) - - -@pytest.mark.skip(reason="Defaulting to Pandas") -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_from_records(data): - modin_df = pd.DataFrame(data) # noqa F841 - pandas_df = pandas.DataFrame(data) # noqa F841 - - with pytest.raises(NotImplementedError): - pd.DataFrame.from_records(None) - - -def test_hist(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).hist(None) - - -def test_infer_objects(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).infer_objects() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_info_default_param(data): with io.StringIO() as first, io.StringIO() as second: @@ -389,46 +368,43 @@ def test_info(verbose, max_cols, memory_usage, null_counts): assert modin_info[1:] == pandas_info[1:] -def test_interpolate(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).interpolate() - - -def test_kurt_kurtosis_equals(): - # It's optimization. If failed, df.kurt should be tested explicitly - # in tests: `test_kurt_kurtosis`, `test_kurt_kurtosis_level`. 
- data = test_data_values[0] - df_modin = pd.DataFrame(data) - assert df_modin.kurt == df_modin.kurtosis - - @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("skipna", bool_arg_values, ids=bool_arg_keys) @pytest.mark.parametrize("numeric_only", bool_arg_values, ids=bool_arg_keys) -def test_kurt_kurtosis(axis, skipna, numeric_only): - data = test_data_values[0] - df_modin = pd.DataFrame(data) - df_pandas = pandas.DataFrame(data) +@pytest.mark.parametrize( + "method", + [ + "kurtosis", + pytest.param( + "kurt", + marks=pytest.mark.skipif( + pandas.DataFrame.kurt == pandas.DataFrame.kurtosis + and pd.DataFrame.kurt == pd.DataFrame.kurtosis, + reason="That method was already tested.", + ), + ), + ], +) +def test_kurt_kurtosis(axis, skipna, numeric_only, method): + data = test_data["float_nan_data"] eval_general( - df_modin, - df_pandas, - lambda df: df.kurtosis( - axis=axis, skipna=skipna, level=None, numeric_only=numeric_only + *create_test_dfs(data), + lambda df: getattr(df, method)( + axis=axis, skipna=skipna, numeric_only=numeric_only ), ) @pytest.mark.parametrize("level", [-1, 0, 1]) def test_kurt_kurtosis_level(level): - data = test_data_values[0] - df_modin = pd.DataFrame(data) - df_pandas = pandas.DataFrame(data) + data = test_data["int_data"] + df_modin, df_pandas = pd.DataFrame(data), pandas.DataFrame(data) index = generate_multiindex(len(data.keys())) df_modin.columns = index df_pandas.columns = index + eval_general( df_modin, df_pandas, @@ -449,12 +425,6 @@ def test_last(): df_equals(modin_df.last("20D"), pandas_df.last("20D")) -def test_lookup(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).lookup([0, 1], ["col1", "col2"]) - - @pytest.mark.parametrize("data", test_data_values) @pytest.mark.parametrize("axis", [None, 0, 1]) @pytest.mark.parametrize("skipna", [None, True, False]) @@ -481,16 +451,6 @@ def test_mad_level(level): ) -def test_mask(): - df = 
pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) - m = df % 3 == 0 - with pytest.warns(UserWarning): - try: - df.mask(~m, -df) - except ValueError: - pass - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "id_vars", [lambda df: df.columns[0], lambda df: df.columns[:4], None] @@ -509,12 +469,6 @@ def test_melt(data, id_vars, value_vars): ) -def test_pct_change(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).pct_change() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "index", [lambda df: df.columns[0], lambda df: df[df.columns[0]].values, None] @@ -534,30 +488,6 @@ def test_pivot(data, index, columns, values): ) -def test_pivot_table(): - df = pd.DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) - with pytest.warns(UserWarning): - df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc=np.sum) - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_plot(request, data): modin_df = pd.DataFrame(data) @@ -614,118 +544,127 @@ def test_replace(): @pytest.mark.parametrize("rule", ["5T", pandas.offsets.Hour()]) -@pytest.mark.parametrize("axis", [0, "columns"]) -@pytest.mark.parametrize("closed", ["left", "right"]) -@pytest.mark.parametrize("label", ["right", "left"]) -@pytest.mark.parametrize("on", [None, "DateColumn"]) -@pytest.mark.parametrize("level", [None, 1]) -def test_resample(rule, axis, closed, label, on, level): - freq = "H" - base = 2 - index = pandas.date_range("31/12/2000", periods=12, freq=freq) - data = {"A": range(12), "B": range(12)} +@pytest.mark.parametrize("axis", 
[0]) +def test_resampler(rule, axis): + data, index, = ( + test_data_resample["data"], + test_data_resample["index"], + ) + modin_resampler = pd.DataFrame(data, index=index).resample(rule, axis=axis, base=2) + pandas_resampler = pandas.DataFrame(data, index=index).resample( + rule, axis=axis, base=2 + ) + assert pandas_resampler.indices == modin_resampler.indices + assert pandas_resampler.groups == modin_resampler.groups + + df_equals( + modin_resampler.get_group(name=list(modin_resampler.groups)[0]), + pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]), + ) + + +@pytest.mark.parametrize("rule", ["5T"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +@pytest.mark.parametrize( + "method", + [ + *("count", "sum", "std", "sem", "size", "prod", "ohlc", "quantile"), + *("min", "median", "mean", "max", "last", "first", "nunique", "var"), + *("interpolate", "asfreq", "pad", "nearest", "bfill", "backfill", "ffill"), + ], +) +def test_resampler_functions(rule, axis, method): + data, index, = ( + test_data_resample["data"], + test_data_resample["index"], + ) + modin_df = pd.DataFrame(data, index=index) pandas_df = pandas.DataFrame(data, index=index) + + eval_general( + modin_df, + pandas_df, + lambda df: getattr(df.resample(rule, axis=axis, base=2), method)(), + ) + + +@pytest.mark.parametrize("rule", ["5T"]) +@pytest.mark.parametrize("axis", ["index", "columns"]) +@pytest.mark.parametrize( + "method_arg", + [ + ("pipe", lambda x: x.max() - x.min()), + ("transform", lambda x: (x - x.mean()) / x.std()), + ("apply", ["sum", "mean", "max"]), + ("aggregate", ["sum", "mean", "max"]), + ], +) +def test_resampler_functions_with_arg(rule, axis, method_arg): + data, index, = ( + test_data_resample["data"], + test_data_resample["index"], + ) modin_df = pd.DataFrame(data, index=index) + pandas_df = pandas.DataFrame(data, index=index) + + method, arg = method_arg[0], method_arg[1] + + eval_general( + modin_df, + pandas_df, + lambda df: getattr(df.resample(rule, 
axis=axis, base=2), method)(arg), + ) - if on is not None and axis == 0: - pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") - modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") - else: - on = None - if axis == "columns": - pandas_df = pandas_df.T - modin_df = modin_df.T +@pytest.mark.parametrize("rule", ["5T"]) +@pytest.mark.parametrize("closed", ["left", "right"]) +@pytest.mark.parametrize("label", ["right", "left"]) +@pytest.mark.parametrize("on", [None, "DateColumn"]) +@pytest.mark.parametrize("level", [None, 1]) +def test_resample_specific(rule, closed, label, on, level): + data, index, = ( + test_data_resample["data"], + test_data_resample["index"], + ) + modin_df = pd.DataFrame(data, index=index) + pandas_df = pandas.DataFrame(data, index=index) - if level is not None and axis == 0 and on is None: + if on is None and level is not None: index = pandas.MultiIndex.from_product( - [["a", "b", "c"], pandas.date_range("31/12/2000", periods=4, freq=freq)] + [["a", "b", "c"], pandas.date_range("31/12/2000", periods=4, freq="H")] ) pandas_df.index = index modin_df.index = index else: level = None + if on is not None: + pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") + modin_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T") + pandas_resampler = pandas_df.resample( - rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level + rule, + closed=closed, + label=label, + on=on, + level=level, ) modin_resampler = modin_df.resample( - rule, axis=axis, closed=closed, label=label, base=base, on=on, level=level + rule, + closed=closed, + label=label, + on=on, + level=level, ) - - df_equals(modin_resampler.count(), pandas_resampler.count()) df_equals(modin_resampler.var(0), pandas_resampler.var(0)) - df_equals(modin_resampler.sum(), pandas_resampler.sum()) - df_equals(modin_resampler.std(), pandas_resampler.std()) - df_equals(modin_resampler.sem(), pandas_resampler.sem()) - 
df_equals(modin_resampler.size(), pandas_resampler.size()) - df_equals(modin_resampler.prod(), pandas_resampler.prod()) - if on is None: - df_equals(modin_resampler.ohlc(), pandas_resampler.ohlc()) - df_equals(modin_resampler.min(), pandas_resampler.min()) - df_equals(modin_resampler.median(), pandas_resampler.median()) - df_equals(modin_resampler.mean(), pandas_resampler.mean()) - df_equals(modin_resampler.max(), pandas_resampler.max()) - df_equals(modin_resampler.last(), pandas_resampler.last()) - df_equals(modin_resampler.first(), pandas_resampler.first()) - df_equals(modin_resampler.nunique(), pandas_resampler.nunique()) - df_equals( - modin_resampler.pipe(lambda x: x.max() - x.min()), - pandas_resampler.pipe(lambda x: x.max() - x.min()), - ) - df_equals( - modin_resampler.transform(lambda x: (x - x.mean()) / x.std()), - pandas_resampler.transform(lambda x: (x - x.mean()) / x.std()), - ) - df_equals( - pandas_resampler.aggregate("max"), - modin_resampler.aggregate("max"), - ) - df_equals( - modin_resampler.apply("sum"), - pandas_resampler.apply("sum"), - ) - df_equals( - modin_resampler.get_group(name=list(modin_resampler.groups)[0]), - pandas_resampler.get_group(name=list(pandas_resampler.groups)[0]), - ) - assert pandas_resampler.indices == modin_resampler.indices - assert pandas_resampler.groups == modin_resampler.groups - df_equals(modin_resampler.quantile(), pandas_resampler.quantile()) - if axis == 0: - # Upsampling from level= or on= selection is not supported - if on is None and level is None: - df_equals( - modin_resampler.interpolate(), - pandas_resampler.interpolate(), - ) - df_equals(modin_resampler.asfreq(), pandas_resampler.asfreq()) - df_equals( - modin_resampler.fillna(method="nearest"), - pandas_resampler.fillna(method="nearest"), - ) - df_equals(modin_resampler.pad(), pandas_resampler.pad()) - df_equals(modin_resampler.nearest(), pandas_resampler.nearest()) - df_equals(modin_resampler.bfill(), pandas_resampler.bfill()) - 
df_equals(modin_resampler.backfill(), pandas_resampler.backfill()) - df_equals(modin_resampler.ffill(), pandas_resampler.ffill()) - df_equals( - pandas_resampler.apply(["sum", "mean", "max"]), - modin_resampler.apply(["sum", "mean", "max"]), - ) + if on is None and level is None: df_equals( - modin_resampler.aggregate(["sum", "mean", "max"]), - pandas_resampler.aggregate(["sum", "mean", "max"]), + modin_resampler.fillna(method="nearest"), + pandas_resampler.fillna(method="nearest"), ) -def test_sem(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).sem() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray"]) @pytest.mark.parametrize("axis", [0, 1]) @@ -832,12 +771,6 @@ def test_stack(data, is_multi_idx, is_multi_col): df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])) -def test_style(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).style - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1]) @pytest.mark.parametrize("axis2", [0, 1]) @@ -932,20 +865,6 @@ def test_to_string(data): ) -def test_to_timestamp(): - idx = pd.date_range("1/1/2012", periods=5, freq="M") - df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) - - with pytest.warns(UserWarning): - df.to_period().to_timestamp() - - -def test_to_xarray(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).to_xarray() - - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_truncate(data): modin_df = pd.DataFrame(data) @@ -1108,51 +1027,15 @@ def test_unstack(data, is_multi_idx, is_multi_col): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___array__(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - + modin_df, pandas_df = 
pd.DataFrame(data), pandas.DataFrame(data) assert_array_equal(modin_df.__array__(), pandas_df.__array__()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___bool__(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - - try: - pandas_result = pandas_df.__bool__() - except Exception as e: - with pytest.raises(type(e)): - modin_df.__bool__() - else: - modin_result = modin_df.__bool__() - df_equals(modin_result, pandas_result) - - -def test___getstate__(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).__getstate__() - - -def test___setstate__(): - data = test_data_values[0] - with pytest.warns(UserWarning): - try: - pd.DataFrame(data).__setstate__(None) - except TypeError: - pass + eval_general(*create_test_dfs(data), lambda df: df.__bool__()) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_hasattr_sparse(data): - modin_df = pd.DataFrame(data) - pandas_df = pandas.DataFrame(data) - try: - pandas_result = hasattr(pandas_df, "sparse") - except Exception as e: - with pytest.raises(type(e)): - hasattr(modin_df, "sparse") - else: - modin_result = hasattr(modin_df, "sparse") - assert modin_result == pandas_result + eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse")) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 7e2c57064d1..65224e51661 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -116,6 +116,11 @@ for i in range(NCOLS) } +test_data_resample = { + "data": {"A": range(12), "B": range(12)}, + "index": pandas.date_range("31/12/2000", periods=12, freq="H"), +} + test_data_with_duplicates = { "no_duplicates": { "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): range(NROWS) @@ -550,7 +555,7 @@ def arg_keys(arg_name, keys): Returns: List of strings with arg_name append to front of keys. 
""" - return ["{0} {1}".format(arg_name, key) for key in keys] + return ["{0}_{1}".format(arg_name, key) for key in keys] def name_contains(test_name, vals): From 6489e7a2ca40e6eaa0732ae06e6d61b4a75adc40 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 8 Sep 2020 19:27:19 +0300 Subject: [PATCH 109/120] TEST-#2050: decrease number of parallel jobs on windows Ci (#2051) Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 30 +++++++++++++++++++++++++++--- .github/workflows/push.yml | 32 ++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2379f311250..42b5ba135ad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -281,7 +281,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] + part: ["DataFrame", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -309,8 +309,32 @@ jobs: conda info conda list - shell: bash -l {0} - run: python -m pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py - if: matrix.part != 3 + run: pytest modin/pandas/test/dataframe/test_binary.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_default.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_indexing.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_iter.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_join_sort.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_map_metadata.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest 
modin/pandas/test/dataframe/test_reduction.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_udf.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_window.py + if: matrix.part == 'DataFrame' - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 0152eaf391d..73b2ce52ffc 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -107,7 +107,7 @@ jobs: matrix: python-version: ["3.6", "3.7", "3.8"] engine: ["ray", "dask"] - part: ["reduction", "binary", "map_metadata", "udf", "default", "window", "indexing", "iter", "join_sort", 3] + part: ["DataFrame", 3] env: MODIN_ENGINE: ${{matrix.engine}} MODIN_MEMORY: 1000000000 @@ -128,15 +128,39 @@ jobs: python-version: ${{matrix.python-version}} channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
- auto-update-conda: true + auto-update-conda: true # this enable `use-only-tar-bz2` feature on Windows - name: Conda environment shell: bash -l {0} run: | conda info conda list - shell: bash -l {0} - run: python -m pytest modin/pandas/test/dataframe/test_${{matrix.part}}.py - if: matrix.part != 3 + run: pytest modin/pandas/test/dataframe/test_binary.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_default.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_indexing.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_iter.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_join_sort.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_map_metadata.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_reduction.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_udf.py + if: matrix.part == 'DataFrame' + - shell: bash -l {0} + run: pytest modin/pandas/test/dataframe/test_window.py + if: matrix.part == 'DataFrame' - shell: bash -l {0} run: python -m pytest modin/pandas/test/test_series.py if: matrix.part == 3 From 110873007e8cb7421cc7bede9f5315fb3ae5e856 Mon Sep 17 00:00:00 2001 From: Anton Date: Tue, 8 Sep 2020 19:29:51 +0300 Subject: [PATCH 110/120] Conda recipe for Modin (#1986) Signed-off-by: Anton Malakhov --- MANIFEST.in | 1 + modin/data_management/test/__init__.py | 12 +++++ modin/experimental/backends/__init__.py | 12 +++++ .../engines/omnisci_on_ray/__init__.py | 12 +++++ .../engines/omnisci_on_ray/frame/__init__.py | 12 +++++ .../engines/omnisci_on_ray/test/__init__.py | 12 +++++ modin/experimental/pandas/test/__init__.py | 12 +++++ modin/pandas/test/data/__init__.py | 12 +++++ 
scripts/conda-recipe/conda_build_config.yaml | 7 +++ scripts/conda-recipe/meta.yaml | 46 +++++++++++++++++++ scripts/conda-recipe/run_test.py | 9 ++++ 11 files changed, 147 insertions(+) create mode 100644 modin/data_management/test/__init__.py create mode 100644 modin/experimental/backends/__init__.py create mode 100644 modin/experimental/engines/omnisci_on_ray/__init__.py create mode 100644 modin/experimental/engines/omnisci_on_ray/frame/__init__.py create mode 100644 modin/experimental/engines/omnisci_on_ray/test/__init__.py create mode 100644 modin/experimental/pandas/test/__init__.py create mode 100644 modin/pandas/test/data/__init__.py create mode 100644 scripts/conda-recipe/conda_build_config.yaml create mode 100644 scripts/conda-recipe/meta.yaml create mode 100644 scripts/conda-recipe/run_test.py diff --git a/MANIFEST.in b/MANIFEST.in index 2121afd5956..6943eb5e5a3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include versioneer.py include modin/_version.py include modin/experimental/cloud/ray-autoscaler.yml +include modin/pandas/test/data/*.csv diff --git a/modin/data_management/test/__init__.py b/modin/data_management/test/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/data_management/test/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/backends/__init__.py b/modin/experimental/backends/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/backends/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/engines/omnisci_on_ray/__init__.py b/modin/experimental/engines/omnisci_on_ray/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/engines/omnisci_on_ray/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/engines/omnisci_on_ray/frame/__init__.py b/modin/experimental/engines/omnisci_on_ray/frame/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/engines/omnisci_on_ray/frame/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/engines/omnisci_on_ray/test/__init__.py b/modin/experimental/engines/omnisci_on_ray/test/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/engines/omnisci_on_ray/test/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/experimental/pandas/test/__init__.py b/modin/experimental/pandas/test/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/pandas/test/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/pandas/test/data/__init__.py b/modin/pandas/test/data/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/pandas/test/data/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. 
+# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/scripts/conda-recipe/conda_build_config.yaml b/scripts/conda-recipe/conda_build_config.yaml new file mode 100644 index 00000000000..05183f3286f --- /dev/null +++ b/scripts/conda-recipe/conda_build_config.yaml @@ -0,0 +1,7 @@ +pin_run_as_build: + python: + min_pin: x.x + max_pin: x.x +python: +- 3.7 +- 3.8 diff --git a/scripts/conda-recipe/meta.yaml b/scripts/conda-recipe/meta.yaml new file mode 100644 index 00000000000..9a00ce0ad2e --- /dev/null +++ b/scripts/conda-recipe/meta.yaml @@ -0,0 +1,46 @@ +{% set version = environ.get('GIT_DESCRIBE_TAG', "0.dirty").replace('-', '.') %} + +package: + name: modin + version: "{{ version }}" + +source: + path: ../../ + +build: + number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }} + script: '{{ PYTHON }} -m pip install . 
--no-deps --ignore-installed -vv ' + noarch: python + +requirements: + host: + - pip + - python + run: + - ray ==0.8.6 + - omniscidbe4py + - python + - pandas ==1.0.5 + - dask >=2.1.0 + - distributed >=2.3.2 + +test: + import: + - modin + - modin.experimental.pandas + commands: + - python run_test.py + +about: + home: https://github.com/modin-project/modin + license: Apache-2.0 + license_family: Apache + license_file: ../../LICENSE + summary: Speed up your Pandas workflows by changing a single line of code + doc_url: https://modin.readthedocs.io/ + dev_url: https://github.com/modin-project/modin + +extra: + recipe-maintainers: + - devin-petersohn + - h-vetinari diff --git a/scripts/conda-recipe/run_test.py b/scripts/conda-recipe/run_test.py new file mode 100644 index 00000000000..dcd6db6e9a0 --- /dev/null +++ b/scripts/conda-recipe/run_test.py @@ -0,0 +1,9 @@ +import modin +import modin.data_management.functions + + +if __name__ == "__main__": + import modin.pandas as pd + + print(pd.__version__) + print(pd.DataFrame([1,2,3])) From 9c03512adb4ee650fffd574d900bd0d22c1ebf5c Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 8 Sep 2020 20:42:47 +0300 Subject: [PATCH 111/120] REFACTOR-#2011: move default_to_pandas in groupby to backend (#2041) Signed-off-by: ienkovich --- .../data_management/functions/groupby_function.py | 11 ++++++++--- modin/pandas/groupby.py | 14 +++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/modin/data_management/functions/groupby_function.py b/modin/data_management/functions/groupby_function.py index e62b47d8b10..6312e8f56c3 100644 --- a/modin/data_management/functions/groupby_function.py +++ b/modin/data_management/functions/groupby_function.py @@ -14,6 +14,7 @@ import pandas from .mapreducefunction import MapReduceFunction +from modin.pandas.utils import try_cast_to_pandas class GroupbyReduceFunction(MapReduceFunction): @@ -29,9 +30,13 @@ def caller( numeric_only=True, drop=False, ): - assert isinstance( - by, 
type(query_compiler) - ), "Can only use groupby reduce with another Query Compiler" + if not isinstance(by, type(query_compiler)): + by = try_cast_to_pandas(by) + return query_compiler.default_to_pandas( + lambda df: map_func( + df.groupby(by=by, axis=axis, **groupby_args), **map_args + ) + ) assert axis == 0, "Can only groupby reduce with axis=0" if numeric_only: diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 1a43f458dfc..565fd2b3929 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -455,17 +455,21 @@ def all(self, **kwargs): ) def size(self): - if is_list_like(self._by) and any(isinstance(o, Series) for o in self._by): - # We don't have good way to handle this right now, fall back to Pandas. - return self._default_to_pandas(lambda df: df.size()) if self._axis == 0: # Size always works in as_index=True mode so it is necessary to make a # copy of _kwargs and change as_index in it kwargs = self._kwargs.copy() kwargs["as_index"] = True + # Series objects in 'by' mean we couldn't handle the case and transform + # 'by' to a query compiler. In this case we replace column names with + # actual columns to be able to apply groupby to a Series. + if is_list_like(self._by) and any(isinstance(o, Series) for o in self._by): + by = [self._df[o] if isinstance(o, str) else o for o in self._by] + else: + by = self._by work_object = SeriesGroupBy( self._df[self._df.columns[0]], - self._by, + by, self._axis, drop=False, idx_name=None, @@ -653,7 +657,7 @@ def _wrap_aggregation( DataFrame or Series Returns the same type as `self._df`. """ - if not isinstance(self._by, type(self._query_compiler)) or self._axis != 0: + if self._axis != 0: return self._default_to_pandas(default_func, **kwargs) # For aggregations, pandas behavior does this for the result. 
# For other operations it does not, so we wait until there is an aggregation to From dd27013733d93f527d0826e5cc6cd6d0f7c7a167 Mon Sep 17 00:00:00 2001 From: YarShev Date: Wed, 9 Sep 2020 13:04:02 +0300 Subject: [PATCH 112/120] FEAT-#1285: Add `sem` implementation for `Series` and `DataFrame` (#2048) Signed-off-by: Igoshev, Yaroslav --- docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/backends/base/query_compiler.py | 12 +++++ modin/backends/pandas/query_compiler.py | 1 + modin/pandas/base.py | 13 ----- modin/pandas/dataframe.py | 53 +++++++++++++++++++++ modin/pandas/series.py | 53 +++++++++++++++++++++ modin/pandas/test/dataframe/test_default.py | 1 - modin/pandas/test/dataframe/test_window.py | 30 ++++++++++-- modin/pandas/test/test_series.py | 28 ++++++++--- 10 files changed, 167 insertions(+), 28 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index ae4f6dc43f4..352f50f13d4 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -328,7 +328,7 @@ default to pandas. 
+----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``select_dtypes`` | `select_dtypes`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``sem`` | `sem`_ | D | | +| ``sem`` | `sem`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``set_axis`` | `set_axis`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index b8996ad74e3..283eaceafbb 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -362,7 +362,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``searchsorted`` | Y | +-----------------------------+---------------------------------+ -| ``sem`` | D | +| ``sem`` | Y | +-----------------------------+---------------------------------+ | ``set_axis`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 06a513b5854..adc00814ac7 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -670,6 +670,18 @@ def skew(self, **kwargs): """ pass + @abc.abstractmethod + def sem(self, **kwargs): + """ + Returns standard deviation of the mean over requested axis. + + Returns + ------- + BaseQueryCompiler + QueryCompiler containing the standard deviation of the mean over requested axis. + """ + pass + @abc.abstractmethod def std(self, **kwargs): """Returns standard deviation of each column or row. 
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index d0c85507c7e..b6af408951a 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -740,6 +740,7 @@ def sort_index_for_equal_values(result, ascending): nunique = ReductionFunction.register(pandas.DataFrame.nunique) skew = ReductionFunction.register(pandas.DataFrame.skew) kurt = ReductionFunction.register(pandas.DataFrame.kurt) + sem = ReductionFunction.register(pandas.DataFrame.sem) std = ReductionFunction.register(pandas.DataFrame.std) var = ReductionFunction.register(pandas.DataFrame.var) sum_min_count = ReductionFunction.register(pandas.DataFrame.sum) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 6ab559f9212..542545fad5d 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2544,19 +2544,6 @@ def sample( query_compiler = self._query_compiler.getitem_row_array(samples) return self.__constructor__(query_compiler=query_compiler) - def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._default_to_pandas( - "sem", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def set_axis(self, labels, axis=0, inplace=False): """Assign desired index to given axis. diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index bae377e57c9..1b4655ad285 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2180,6 +2180,59 @@ def is_dtype_instance_mapper(column, dtype): ] return self.drop(columns=self.columns[indicate], inplace=False) + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. 
This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + + Returns + ------- + Series or DataFrame (if level specified) + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def set_index( self, keys, drop=True, append=False, inplace=False, verify_integrity=False ): diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 382f759c732..556d26cab53 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1092,6 +1092,59 @@ def nsmallest(self, n=5, keep="first"): """ return Series(query_compiler=self._query_compiler.nsmallest(n=n, keep=keep)) + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. 
This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + + Returns + ------- + Scalar or Series (if level specified) + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.sem( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def slice_shift(self, periods=1, axis=0): """ Equivalent to `shift` without copying data. 
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index c07849f2231..ddb6bc080d1 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -62,7 +62,6 @@ ("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}), ("mask", lambda df: {"cond": df != 0}), ("pct_change", None), - ("sem", None), ("__getstate__", None), ("to_xarray", None), ("pivot_table", lambda df: {"values": "int_col", "index": ["float_col"]}), diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index ce2bc0e2ba2..ff8ce9c5117 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -456,16 +456,16 @@ def test_median_skew_transposed(axis, method): ), ], ) -@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank"]) -def test_median_skew_std_var_rank_specific(numeric_only, method): +@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "rank", "sem"]) +def test_median_skew_std_var_rank_sem_specific(numeric_only, method): eval_general( *create_test_dfs(test_data_diff_dtype), lambda df: getattr(df, method)(numeric_only=numeric_only), ) -@pytest.mark.parametrize("method", ["median", "skew", "std", "var"]) -def test_median_skew_std_var_1953(method): +@pytest.mark.parametrize("method", ["median", "skew", "std", "var", "sem"]) +def test_median_skew_std_var_sem_1953(method): # See #1953 for details arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] @@ -633,11 +633,31 @@ def test_rank_transposed(axis, na_option): ) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_float_nan_only(skipna, ddof): + eval_general( + *create_test_dfs(test_data["float_nan_data"]), 
+ lambda df: df.sem(skipna=skipna, ddof=ddof), + ) + + +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_int_only(axis, ddof): + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: df.sem(axis=axis, ddof=ddof), + ) + + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) -@pytest.mark.parametrize("method", ["str", "var", "rank"]) +@pytest.mark.parametrize("method", ["std", "var", "rank"]) def test_std_var_rank(axis, skipna, method): eval_general( *create_test_dfs(test_data["float_nan_data"]), diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index d21267c8242..5546367b56f 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2036,8 +2036,10 @@ def test_median(data, skipna): df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) -@pytest.mark.parametrize("method", ["median", "skew", "std", "sum", "var", "prod"]) -def test_median_skew_std_sum_var_prod_1953(method): +@pytest.mark.parametrize( + "method", ["median", "skew", "std", "sum", "var", "prod", "sem"] +) +def test_median_skew_std_sum_var_prod_sem_1953(method): # See #1953 for details data = [3, 3, 3, 3, 3, 3, 3, 3, 3] arrays = [ @@ -2736,11 +2738,23 @@ def test_searchsorted( assert case -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_sem(data): - modin_series, _ = create_test_series(data) # noqa: F841 - with pytest.warns(UserWarning): - modin_series.sem() +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_float_nan_only(skipna, ddof): + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda df: df.sem(skipna=skipna, 
ddof=ddof), + ) + + +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_sem_int_only(ddof): + eval_general( + *create_test_series(test_data["int_data"]), + lambda df: df.sem(ddof=ddof), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 6140c81496d6d45239240ccc8efe23ed3ff4a3de Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 9 Sep 2020 19:47:24 +0300 Subject: [PATCH 113/120] FIX-#2054: Moved non-dependent on modin.DataFrame utils to modin/utils.py (#2055) Signed-off-by: Dmitry Chigarev --- modin/backends/pandas/query_compiler.py | 2 +- .../functions/groupby_function.py | 2 +- modin/pandas/base.py | 3 +- modin/pandas/dataframe.py | 4 +- modin/pandas/general.py | 2 +- modin/pandas/groupby.py | 3 +- modin/pandas/io.py | 4 +- modin/pandas/plotting.py | 2 +- modin/pandas/reshape.py | 2 +- modin/pandas/series.py | 4 +- modin/pandas/test/dataframe/test_default.py | 2 +- modin/pandas/test/dataframe/test_join_sort.py | 2 +- modin/pandas/test/test_groupby.py | 3 +- modin/pandas/test/test_io.py | 3 +- modin/pandas/test/test_series.py | 2 +- modin/pandas/test/utils.py | 2 +- modin/pandas/utils.py | 101 ---------------- modin/utils.py | 113 ++++++++++++++++++ 18 files changed, 134 insertions(+), 122 deletions(-) create mode 100644 modin/utils.py diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index b6af408951a..c948401c1bb 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -23,7 +23,7 @@ from modin.backends.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage -from modin.pandas.utils import try_cast_to_pandas, wrap_udf_function +from modin.utils import try_cast_to_pandas, wrap_udf_function from modin.data_management.functions import ( FoldFunction, MapFunction, diff --git 
a/modin/data_management/functions/groupby_function.py b/modin/data_management/functions/groupby_function.py index 6312e8f56c3..c645c000c27 100644 --- a/modin/data_management/functions/groupby_function.py +++ b/modin/data_management/functions/groupby_function.py @@ -14,7 +14,7 @@ import pandas from .mapreducefunction import MapReduceFunction -from modin.pandas.utils import try_cast_to_pandas +from modin.utils import try_cast_to_pandas class GroupbyReduceFunction(MapReduceFunction): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 542545fad5d..225636552fd 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -31,8 +31,9 @@ import warnings import pickle as pkl +from modin.utils import try_cast_to_pandas from modin.error_message import ErrorMessage -from modin.pandas.utils import try_cast_to_pandas, is_scalar +from modin.pandas.utils import is_scalar # Similar to pandas, sentinel value to use as kwarg in place of None when None has # special meaning and needs to be distinguished from a user explicitly passing None. 
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 1b4655ad285..cc87678e467 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -33,12 +33,10 @@ import warnings from modin.error_message import ErrorMessage +from modin.utils import _inherit_docstrings, to_pandas, hashable from .utils import ( from_pandas, from_non_pandas, - to_pandas, - _inherit_docstrings, - hashable, ) from .iterator import PartitionIterator from .series import Series diff --git a/modin/pandas/general.py b/modin/pandas/general.py index c8b635ef4f2..18675307201 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -17,7 +17,7 @@ from .base import BasePandasDataset from .dataframe import DataFrame from .series import Series -from .utils import to_pandas +from modin.utils import to_pandas def isna(obj): diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 565fd2b3929..e5fca25ec71 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -18,8 +18,7 @@ import pandas.core.common as com from modin.error_message import ErrorMessage - -from .utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas +from modin.utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas from .series import Series diff --git a/modin/pandas/io.py b/modin/pandas/io.py index f8dceb5db8c..85b644e8119 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -485,7 +485,7 @@ def return_handler(*args, **kwargs): A Modin DataFrame in place of a pandas DataFrame, or the same return type as pandas.ExcelFile. """ - from .utils import to_pandas + from modin.utils import to_pandas # We don't want to constantly be giving this error message for # internal methods. @@ -554,7 +554,7 @@ def return_handler(*args, **kwargs): A Modin DataFrame in place of a pandas DataFrame, or the same return type as pandas.HDFStore. 
""" - from .utils import to_pandas + from modin.utils import to_pandas # We don't want to constantly be giving this error message for # internal methods. diff --git a/modin/pandas/plotting.py b/modin/pandas/plotting.py index 3ba9af5d7d4..36102b13c67 100644 --- a/modin/pandas/plotting.py +++ b/modin/pandas/plotting.py @@ -13,7 +13,7 @@ from pandas import plotting as pdplot -from .utils import to_pandas +from modin.utils import to_pandas from .dataframe import DataFrame diff --git a/modin/pandas/reshape.py b/modin/pandas/reshape.py index 5423f4dc9b7..99b9005df4e 100644 --- a/modin/pandas/reshape.py +++ b/modin/pandas/reshape.py @@ -15,7 +15,7 @@ from .dataframe import DataFrame from .series import Series -from .utils import to_pandas +from modin.utils import to_pandas from modin.error_message import ErrorMessage diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 556d26cab53..2901b8af1c7 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -24,10 +24,10 @@ import sys import warnings +from modin.utils import _inherit_docstrings, to_pandas from .base import BasePandasDataset from .iterator import PartitionIterator -from .utils import _inherit_docstrings -from .utils import from_pandas, to_pandas, is_scalar +from .utils import from_pandas, is_scalar if sys.version_info[0] == 3 and sys.version_info[1] >= 7: # Python >= 3.7 diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index ddb6bc080d1..ca715883b76 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -17,7 +17,7 @@ import os import matplotlib import modin.pandas as pd -from modin.pandas.utils import to_pandas +from modin.utils import to_pandas from numpy.testing import assert_array_equal import io diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index 7e1edf022dc..c281f2b9db9 100644 --- 
a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -16,7 +16,7 @@ import pandas import matplotlib import modin.pandas as pd -from modin.pandas.utils import to_pandas +from modin.utils import to_pandas from modin.pandas.test.utils import ( random_state, diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 329e1258e2b..c179a49c456 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -15,7 +15,8 @@ import pandas import numpy as np import modin.pandas as pd -from modin.pandas.utils import from_pandas, to_pandas, try_cast_to_pandas +from modin.utils import try_cast_to_pandas, to_pandas +from modin.pandas.utils import from_pandas from .utils import ( df_equals, check_df_columns_have_nans, diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 5bc1207b733..ca66d8a6ba5 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -16,7 +16,8 @@ import pandas from pandas.errors import ParserWarning from collections import OrderedDict -from modin.pandas.utils import to_pandas, from_arrow +from modin.utils import to_pandas +from modin.pandas.utils import from_arrow from pathlib import Path import pyarrow as pa import pyarrow.parquet as pq diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 5546367b56f..8c1b4d4517d 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -20,7 +20,7 @@ from numpy.testing import assert_array_equal import sys -from modin.pandas.utils import to_pandas +from modin.utils import to_pandas from .utils import ( random_state, RAND_LOW, diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 65224e51661..80f03da9c02 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -20,7 +20,7 @@ assert_categorical_equal, ) import modin.pandas as pd -from modin.pandas.utils import to_pandas +from 
modin.utils import to_pandas from io import BytesIO random_state = np.random.RandomState(seed=42) diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index e21c7617dd5..25461db481e 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -11,8 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import pandas - def from_non_pandas(df, index, columns, dtype): from modin.data_management.dispatcher import EngineDispatcher @@ -58,105 +56,6 @@ def from_arrow(at): return DataFrame(query_compiler=EngineDispatcher.from_arrow(at)) -def to_pandas(modin_obj): - """Converts a Modin DataFrame/Series to a pandas DataFrame/Series. - - Args: - obj {modin.DataFrame, modin.Series}: The Modin DataFrame/Series to convert. - - Returns: - A new pandas DataFrame or Series. - """ - return modin_obj._to_pandas() - - -def _inherit_docstrings(parent, excluded=[]): - """Creates a decorator which overwrites a decorated class' __doc__ - attribute with parent's __doc__ attribute. Also overwrites __doc__ of - methods and properties defined in the class with the __doc__ of matching - methods and properties in parent. - - Args: - parent (object): Class from which the decorated class inherits __doc__. - excluded (list): List of parent objects from which the class does not - inherit docstrings. - - Returns: - function: decorator which replaces the decorated class' documentation - parent's documentation. 
- """ - - def decorator(cls): - if parent not in excluded: - cls.__doc__ = parent.__doc__ - for attr, obj in cls.__dict__.items(): - parent_obj = getattr(parent, attr, None) - if parent_obj in excluded or ( - not callable(parent_obj) and not isinstance(parent_obj, property) - ): - continue - if callable(obj): - obj.__doc__ = parent_obj.__doc__ - elif isinstance(obj, property) and obj.fget is not None: - p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__) - setattr(cls, attr, p) - return cls - - return decorator - - -def try_cast_to_pandas(obj): - """ - Converts obj and all nested objects from modin to pandas if it is possible, - otherwise returns obj - - Parameters - ---------- - obj : object, - object to convert from modin to pandas - - Returns - ------- - Converted object - """ - if hasattr(obj, "_to_pandas"): - return obj._to_pandas() - if isinstance(obj, (list, tuple)): - return type(obj)([try_cast_to_pandas(o) for o in obj]) - if isinstance(obj, dict): - return {k: try_cast_to_pandas(v) for k, v in obj.items()} - if callable(obj): - module_hierarchy = getattr(obj, "__module__", "").split(".") - fn_name = getattr(obj, "__name__", None) - if fn_name and module_hierarchy[0] == "modin": - return ( - getattr(pandas.DataFrame, fn_name, obj) - if module_hierarchy[-1] == "dataframe" - else getattr(pandas.Series, fn_name, obj) - ) - return obj - - -def wrap_udf_function(func): - def wrapper(*args, **kwargs): - result = func(*args, **kwargs) - # if user accidently returns modin DataFrame or Series - # casting it back to pandas to properly process - return try_cast_to_pandas(result) - - wrapper.__name__ = func.__name__ - return wrapper - - -def hashable(obj): - """Return whether the object is hashable.""" - try: - hash(obj) - except TypeError: - return False - return True - - def is_scalar(obj): """ Return True if given object is scalar. 
diff --git a/modin/utils.py b/modin/utils.py new file mode 100644 index 00000000000..ae87964ec37 --- /dev/null +++ b/modin/utils.py @@ -0,0 +1,113 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pandas + + +def _inherit_docstrings(parent, excluded=[]): + """Creates a decorator which overwrites a decorated class' __doc__ + attribute with parent's __doc__ attribute. Also overwrites __doc__ of + methods and properties defined in the class with the __doc__ of matching + methods and properties in parent. + + Args: + parent (object): Class from which the decorated class inherits __doc__. + excluded (list): List of parent objects from which the class does not + inherit docstrings. + + Returns: + function: decorator which replaces the decorated class' documentation + parent's documentation. 
+ """ + + def decorator(cls): + if parent not in excluded: + cls.__doc__ = parent.__doc__ + for attr, obj in cls.__dict__.items(): + parent_obj = getattr(parent, attr, None) + if parent_obj in excluded or ( + not callable(parent_obj) and not isinstance(parent_obj, property) + ): + continue + if callable(obj): + obj.__doc__ = parent_obj.__doc__ + elif isinstance(obj, property) and obj.fget is not None: + p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__) + setattr(cls, attr, p) + return cls + + return decorator + + +def to_pandas(modin_obj): + """Converts a Modin DataFrame/Series to a pandas DataFrame/Series. + + Args: + obj {modin.DataFrame, modin.Series}: The Modin DataFrame/Series to convert. + + Returns: + A new pandas DataFrame or Series. + """ + return modin_obj._to_pandas() + + +def hashable(obj): + """Return whether the object is hashable.""" + try: + hash(obj) + except TypeError: + return False + return True + + +def try_cast_to_pandas(obj): + """ + Converts obj and all nested objects from modin to pandas if it is possible, + otherwise returns obj + + Parameters + ---------- + obj : object, + object to convert from modin to pandas + + Returns + ------- + Converted object + """ + if hasattr(obj, "_to_pandas"): + return obj._to_pandas() + if isinstance(obj, (list, tuple)): + return type(obj)([try_cast_to_pandas(o) for o in obj]) + if isinstance(obj, dict): + return {k: try_cast_to_pandas(v) for k, v in obj.items()} + if callable(obj): + module_hierarchy = getattr(obj, "__module__", "").split(".") + fn_name = getattr(obj, "__name__", None) + if fn_name and module_hierarchy[0] == "modin": + return ( + getattr(pandas.DataFrame, fn_name, obj) + if module_hierarchy[-1] == "dataframe" + else getattr(pandas.Series, fn_name, obj) + ) + return obj + + +def wrap_udf_function(func): + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + # if user accidently returns modin DataFrame or Series + # casting it back to pandas to properly process + 
return try_cast_to_pandas(result) + + wrapper.__name__ = func.__name__ + return wrapper From 847d4a35eab2c07bac992bcb0eb4f40a9ce9c55b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 9 Sep 2020 23:34:06 +0300 Subject: [PATCH 114/120] TEST-#1891: use conda instead of pip (#2056) * TEST-#1891: use conda instead of pip Signed-off-by: Anatoly Myachev * TEST-#1891: remove source activate modin Signed-off-by: Anatoly Myachev * TEST-#1891: pin some dev tools Signed-off-by: Anatoly Myachev --- ci/teamcity/Dockerfile.teamcity-ci | 17 +++++++++++------ environment.yml | 7 ++++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ci/teamcity/Dockerfile.teamcity-ci b/ci/teamcity/Dockerfile.teamcity-ci index 8051a9406e9..4ad18615602 100644 --- a/ci/teamcity/Dockerfile.teamcity-ci +++ b/ci/teamcity/Dockerfile.teamcity-ci @@ -1,12 +1,17 @@ FROM modin-project/modin-base -ARG REQUIREMENTS=requirements.txt +ARG ENVIRONMENT=environment.yml ADD modin.tar /modin ADD git-rev /modin/git-rev + WORKDIR /modin -RUN pip install -U pip setuptools -RUN pip install -U -r ${REQUIREMENTS} -RUN pip install -U pytest-remotedata -RUN pip install -e .[all] -RUN pip install PyGithub + +RUN conda env create -f environment.yml + +# Make RUN commands use the new environment: +SHELL ["conda", "run", "-n", "modin", "/bin/bash", "-c"] + +RUN conda list + +ENV PATH /opt/conda/envs/env/bin:$PATH diff --git a/environment.yml b/environment.yml index b98335b021b..7b6f3f9d158 100644 --- a/environment.yml +++ b/environment.yml @@ -23,10 +23,11 @@ dependencies: - pytables - msgpack-python - psutil - - pytest - - pytest-cov - - pytest-xdist + - pytest>=6.0.1 + - pytest-cov>=2.10.1 + - pytest-xdist>=2.1.0 - coverage<5.0 + - pygithub==1.53 - pip: - ray==0.8.7 - rpyc From 0f54983a7e94024ea3134c071ed1a295e78f795a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 10 Sep 2020 11:38:51 +0300 Subject: 
[PATCH 115/120] FIX-#2052: fix spawning of remote cluster (#2053) * Fix spawning of remote cluster for Ray 0.8.7 * Update dependencies * Change Teamcity Dockerfile to fully use conda for setting up environment Signed-off-by: Anatoly Myachev --- ci/teamcity/Dockerfile.teamcity-ci | 2 +- environment.yml | 4 ++-- modin/experimental/cloud/ray-autoscaler.yml | 3 +++ modin/experimental/cloud/rayscale.py | 7 +++++++ requirements.txt | 4 ++-- requirements/env_windows.yml | 2 +- requirements/windows_test_requires.txt | 2 +- setup.py | 2 +- 8 files changed, 18 insertions(+), 8 deletions(-) diff --git a/ci/teamcity/Dockerfile.teamcity-ci b/ci/teamcity/Dockerfile.teamcity-ci index 4ad18615602..6efde7af080 100644 --- a/ci/teamcity/Dockerfile.teamcity-ci +++ b/ci/teamcity/Dockerfile.teamcity-ci @@ -7,7 +7,7 @@ ADD git-rev /modin/git-rev WORKDIR /modin -RUN conda env create -f environment.yml +RUN conda env create -f ${ENVIRONMENT} # Make RUN commands use the new environment: SHELL ["conda", "run", "-n", "modin", "/bin/bash", "-c"] diff --git a/environment.yml b/environment.yml index 7b6f3f9d158..7fcab27bd52 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - pathlib - scipy - pip - - s3fs + - s3fs>=0.4.2 - feather-format - lxml - openpyxl @@ -30,4 +30,4 @@ dependencies: - pygithub==1.53 - pip: - ray==0.8.7 - - rpyc + - rpyc==4.1.5 diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml index 9051337128c..5c941e75970 100644 --- a/modin/experimental/cloud/ray-autoscaler.yml +++ b/modin/experimental/cloud/ray-autoscaler.yml @@ -124,6 +124,9 @@ setup_commands: conda install python==3.7.6 pip install modin "ray==0.8.7" cloudpickle + # ray now executes "ray stop" which expects "ray" to be in $PATH + # so place a symlink to current "ray" binary to /usr/local/bin + sudo ln -s `which ray` /usr/local/bin/ray echo 'export MODIN_RAY_CLUSTER=True' >> ~/.bashrc # Consider uncommenting these if you also want to run 
apt-get commands during setup diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index 9c990185204..fd7ecd1e64c 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -156,6 +156,10 @@ def __do_spawn(self): restart_only=False, yes=True, override_cluster_name=None, + no_config_cache=False, + log_old_style=False, + log_color="auto", + verbose=1, ) # need to re-load the config, as create_or_update_cluster() modifies it with open(self.config_file) as inp: @@ -176,6 +180,9 @@ def __do_destroy(self): workers_only=False, override_cluster_name=None, keep_min_workers=0, + log_old_style=False, + log_color="auto", + verbose=1, ) self.ready = False self.config = None diff --git a/requirements.txt b/requirements.txt index 374602b1320..5298ed4b635 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ Jinja2 pathlib tables scipy -s3fs +s3fs>=0.4.2 pytest coverage<5.0 pytest-cov @@ -24,4 +24,4 @@ sqlalchemy msgpack pandas_gbq cloudpickle -rpyc +rpyc==4.1.5 diff --git a/requirements/env_windows.yml b/requirements/env_windows.yml index c1abadd0b63..63838734a07 100644 --- a/requirements/env_windows.yml +++ b/requirements/env_windows.yml @@ -23,7 +23,7 @@ dependencies: - pathlib - tables - scipy - - s3fs + - s3fs>=0.4.2 - pytest - coverage<5.0 - pytest-cov diff --git a/requirements/windows_test_requires.txt b/requirements/windows_test_requires.txt index efd109e9612..5c2009eab01 100644 --- a/requirements/windows_test_requires.txt +++ b/requirements/windows_test_requires.txt @@ -9,7 +9,7 @@ Jinja2 pathlib tables scipy -s3fs +s3fs>=0.4.2 pytest coverage<5.0 pytest-cov diff --git a/setup.py b/setup.py index 52fd07f5df3..658d070302f 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ def is_pure(self): dask_deps = ["dask>=2.12.0", "distributed>=2.12.0"] ray_deps = ["ray==0.8.7", "pyarrow<0.17"] -remote_deps = ["rpyc", "cloudpickle", "boto3"] +remote_deps = ["rpyc==4.1.5", "cloudpickle", 
"boto3==1.4.8"] all_deps = dask_deps + ray_deps + remote_deps From 2ca3f34b2c3c22bf285c0349d35a5099de3a2e78 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Thu, 10 Sep 2020 16:38:21 +0300 Subject: [PATCH 116/120] FEAT-#2058: Improve how remote factories are defined (#2060) Signed-off-by: Vasilij Litvinov --- modin/data_management/__init__.py | 17 +++++++++ modin/data_management/factories.py | 51 ++++++++++++++++++++++--- modin/experimental/cloud/meta_magic.py | 3 +- modin/experimental/pandas/numpy_wrap.py | 3 +- modin/pandas/__init__.py | 4 +- 5 files changed, 67 insertions(+), 11 deletions(-) diff --git a/modin/data_management/__init__.py b/modin/data_management/__init__.py index cae6413e559..b6fdf0cbcc6 100644 --- a/modin/data_management/__init__.py +++ b/modin/data_management/__init__.py @@ -10,3 +10,20 @@ # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. + +from . import factories + + +def _get_remote_engines(): + for name in dir(factories): + obj = getattr(factories, name) + if isinstance(obj, type) and issubclass( + obj, factories.ExperimentalRemoteFactory + ): + try: + yield obj.get_info().engine + except factories.NotRealFactory: + pass + + +REMOTE_ENGINES = set(_get_remote_engines()) diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index 8509b902911..f959ca9dd87 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -12,6 +12,8 @@ # governing permissions and limitations under the License. 
import warnings +import typing +import re from modin import execution_engine from modin.engines.base.io import BaseIO @@ -21,6 +23,16 @@ types_dictionary = {"pandas": {"category": pandas.CategoricalDtype}} +class FactoryInfo(typing.NamedTuple): + engine: str + partition: str + experimental: bool + + +class NotRealFactory(Exception): + pass + + class BaseFactory(object): """ Abstract factory which allows to override the io module easily. @@ -28,6 +40,25 @@ class BaseFactory(object): io_cls: BaseIO = None # The module where the I/O functionality exists. + @classmethod + def get_info(cls) -> FactoryInfo: + """ + This gets the information about the factory: its execution engine, + partitioning format and whether it's experimental-only. + + Note that it parses factory name, so it must be conformant with how + ExecutionEngine class constructs factory names. + """ + try: + experimental, partition, engine = re.match( + r"^(Experimental)?(.*)On(.*)Factory$", cls.__name__ + ).groups() + except AttributeError: + raise NotRealFactory() + return FactoryInfo( + engine=engine, partition=partition, experimental=bool(experimental) + ) + @classmethod def prepare(cls): """ @@ -216,7 +247,9 @@ def prepare(cls): cls.io_cls = PyarrowOnRayIO -class ExperimentalPandasOnCloudrayFactory(ExperimentalBaseFactory): +class ExperimentalRemoteFactory(ExperimentalBaseFactory): + wrapped_factory = BaseFactory + @classmethod def prepare(cls): # query_compiler import is needed so remote PandasQueryCompiler @@ -231,11 +264,13 @@ def prepare(cls): import modin.experimental.pandas.numpy_wrap # noqa: F401 class WrappedIO: - def __init__(self, conn): + def __init__(self, conn, factory): self.__conn = conn - self.__io_cls = conn.modules[ - "modin.engines.ray.pandas_on_ray.io" - ].PandasOnRayIO + remote_factory = getattr( + conn.modules[factory.__module__], factory.__name__ + ) + remote_factory.prepare() + self.__io_cls = remote_factory.io_cls self.__reads = { name for name in BaseIO.__dict__ if 
name.startswith("read_") } @@ -256,4 +291,8 @@ def wrap(*a, _original=getattr(self.__io_cls, name), **kw): wrap = getattr(self.__io_cls, name) return wrap - cls.io_cls = WrappedIO(get_connection()) + cls.io_cls = WrappedIO(get_connection(), cls.wrapped_factory) + + +class ExperimentalPandasOnCloudrayFactory(ExperimentalRemoteFactory): + wrapped_factory = PandasOnRayFactory diff --git a/modin/experimental/cloud/meta_magic.py b/modin/experimental/cloud/meta_magic.py index 6849063a132..fba82613840 100644 --- a/modin/experimental/cloud/meta_magic.py +++ b/modin/experimental/cloud/meta_magic.py @@ -16,6 +16,7 @@ import types from modin import execution_engine +from modin.data_management import REMOTE_ENGINES # the attributes that must be alwasy taken from a local part of dual-nature class, # never going to remote end @@ -153,7 +154,7 @@ def __new__(cls, *a, **kw): _KNOWN_DUALS[local_cls] = result def update_class(_): - if execution_engine.get() == "Cloudray": + if execution_engine.get() in REMOTE_ENGINES: from . 
import rpyc_proxy result.__real_cls__ = getattr(rpyc_proxy, rpyc_wrapper_name)(result) diff --git a/modin/experimental/pandas/numpy_wrap.py b/modin/experimental/pandas/numpy_wrap.py index d7bd78af8bc..680b12a95b9 100644 --- a/modin/experimental/pandas/numpy_wrap.py +++ b/modin/experimental/pandas/numpy_wrap.py @@ -27,6 +27,7 @@ import types import copyreg from modin import execution_engine + from modin.data_management import REMOTE_ENGINES import modin import pandas import os @@ -78,7 +79,7 @@ def __swap_numpy(self, other_numpy=None): self.__has_to_warn = False def __update_engine(self, _): - if execution_engine.get() == "Cloudray": + if execution_engine.get() in REMOTE_ENGINES: from modin.experimental.cloud import get_connection self.__swap_numpy(get_connection().modules["numpy"]) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 9e4419989f2..be987fd90ce 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -134,9 +134,8 @@ def _update_engine(publisher: Publisher): elif publisher.get() == "Cloudray": from modin.experimental.cloud import get_connection - import rpyc - conn: rpyc.ClassicService = get_connection() + conn = get_connection() remote_ray = conn.modules["ray"] if _is_first_update.get("Cloudray", True): @@ -159,7 +158,6 @@ def init_remote_ray(): import modin.data_management.dispatcher # noqa: F401 num_cpus = remote_ray.cluster_resources()["CPU"] - elif publisher.get() not in _NOINIT_ENGINES: raise ImportError("Unrecognized execution engine: {}.".format(publisher.get())) From df725d2810504ee9ea836ccc7ddb43c6d4a01fe7 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Mon, 14 Sep 2020 17:26:10 +0300 Subject: [PATCH 117/120] FIX-#1918: fix core dumped issue (#2000) Signed-off-by: Alexander Myskov --- modin/engines/ray/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modin/engines/ray/utils.py b/modin/engines/ray/utils.py index 70d4bc3fe1f..82241c30a7d 
100644 --- a/modin/engines/ray/utils.py +++ b/modin/engines/ray/utils.py @@ -111,11 +111,13 @@ def initialize_ray( elif cluster == "": num_cpus = os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count() object_store_memory = os.environ.get("MODIN_MEMORY", None) - plasma_directory = None + plasma_directory = os.environ.get("MODIN_ON_RAY_PLASMA_DIR", None) if os.environ.get("MODIN_OUT_OF_CORE", "False").title() == "True": - from tempfile import gettempdir - plasma_directory = gettempdir() + if plasma_directory is None: + from tempfile import gettempdir + + plasma_directory = gettempdir() # We may have already set the memory from the environment variable, we don't # want to overwrite that value if we have. if object_store_memory is None: From d308c58b675e4af733ffa37e4119829c83992345 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Tue, 15 Sep 2020 14:59:26 +0300 Subject: [PATCH 118/120] FIX-#1386: Fix `read_csv` for incorrect csv data (#2076) Signed-off-by: Alexey Prutskov --- modin/engines/base/io/text/csv_reader.py | 2 +- modin/pandas/test/test_io.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py index 0c443a35828..0671e8a4d46 100644 --- a/modin/engines/base/io/text/csv_reader.py +++ b/modin/engines/base/io/text/csv_reader.py @@ -185,7 +185,7 @@ def _read(cls, filepath_or_buffer, **kwargs): # reported dtypes from differing rows can be different based on the inference in # the limited data seen by each worker. We use pandas to compute the exact dtype # over the whole column for each column. The index is set below. 
- dtypes = cls.get_dtypes(dtypes_ids) + dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths) # If parse_dates is present, the column names that we have might not be diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index ca66d8a6ba5..a2dede18cbe 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -1184,6 +1184,13 @@ def test_from_csv_newlines_in_quotes(nrows, skiprows): ) +def test_read_csv_incorrect_data(): + name = "modin/pandas/test/data/test_categories.json" + pandas_df, modin_df = pandas.read_csv(name), pd.read_csv(name) + + df_equals(pandas_df, modin_df) + + @pytest.mark.skip(reason="No clipboard on Travis") def test_to_clipboard(): modin_df = create_test_modin_dataframe() From 51ed0ae345cb19b1bb5ea23f73ce9b9ef9fb392f Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 15 Sep 2020 16:49:04 +0300 Subject: [PATCH 119/120] REFACTOR-#2035: move getitem_array to the backend (#2036) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 17 ++++++++ modin/backends/pandas/query_compiler.py | 53 +++++++++++++++++++++++++ modin/pandas/dataframe.py | 50 ++++------------------- 3 files changed, 77 insertions(+), 43 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index adc00814ac7..c5800831f89 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -833,6 +833,23 @@ def quantile_for_list_of_values(self, **kwargs): # END Abstract map across rows/columns # Abstract __getitem__ methods + @abc.abstractmethod + def getitem_array(self, key): + """ + Get column or row data specified by key. + + Parameters + ---------- + key : BaseQueryCompiler, numpy.ndarray, pandas.Index or list + Target numeric indices or labels by which to retrieve data. + + Returns + ------- + BaseQueryCompiler + A new Query Compiler. 
+ """ + pass + @abc.abstractmethod def getitem_column_array(self, key): """Get column data for target labels. diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index c948401c1bb..b4312f726fa 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -13,6 +13,8 @@ import numpy as np import pandas +from pandas.core.common import is_bool_indexer +from pandas.core.indexing import check_bool_indexer from pandas.core.dtypes.common import ( is_list_like, is_numeric_dtype, @@ -20,6 +22,7 @@ is_scalar, ) from pandas.core.base import DataError +import warnings from modin.backends.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage @@ -1918,6 +1921,56 @@ def applyier(df, internal_indices, other=[], internal_other_indices=[]): # END Map across rows/columns # __getitem__ methods + def getitem_array(self, key): + """ + Get column or row data specified by key. + + Parameters + ---------- + key : PandasQueryCompiler, numpy.ndarray, pandas.Index or list + Target numeric indices or labels by which to retrieve data. + + Returns + ------- + PandasQueryCompiler + A new Query Compiler. + """ + # TODO: dont convert to pandas for array indexing + if isinstance(key, type(self)): + key = key.to_pandas().squeeze(axis=1) + if is_bool_indexer(key): + if isinstance(key, pandas.Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + PendingDeprecationWarning, + stacklevel=3, + ) + elif len(key) != len(self.index): + raise ValueError( + "Item wrong length {} instead of {}.".format( + len(key), len(self.index) + ) + ) + key = check_bool_indexer(self.index, key) + # We convert to a RangeIndex because getitem_row_array is expecting a list + # of indices, and RangeIndex will give us the exact indices of each boolean + # requested. 
+ key = pandas.RangeIndex(len(self.index))[key] + if len(key): + return self.getitem_row_array(key) + else: + return self.from_pandas( + pandas.DataFrame(columns=self.columns), type(self._modin_frame) + ) + else: + if any(k not in self.columns for k in key): + raise KeyError( + "{} not index".format( + str([k for k in key if k not in self.columns]).replace(",", "") + ) + ) + return self.getitem_column_array(key) + def getitem_column_array(self, key, numeric=False): """Get column data for target labels. diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index cc87678e467..e22a9c4c5f1 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -12,7 +12,7 @@ # governing permissions and limitations under the License. import pandas -from pandas.core.common import apply_if_callable, is_bool_indexer +from pandas.core.common import apply_if_callable from pandas.core.dtypes.common import ( infer_dtype_from_object, is_dict_like, @@ -20,7 +20,6 @@ is_numeric_dtype, ) from pandas.core.indexes.api import ensure_index_from_sequences -from pandas.core.indexing import check_bool_indexer from pandas.util._validators import validate_bool_kwarg from pandas.io.formats.printing import pprint_thing @@ -2927,8 +2926,12 @@ def _getitem(self, key): return self._getitem_column(key) except (KeyError, ValueError, TypeError): pass - if isinstance(key, (Series, np.ndarray, pandas.Index, list)): - return self._getitem_array(key) + if isinstance(key, Series): + return DataFrame( + query_compiler=self._query_compiler.getitem_array(key._query_compiler) + ) + elif isinstance(key, (np.ndarray, pandas.Index, list)): + return DataFrame(query_compiler=self._query_compiler.getitem_array(key)) elif isinstance(key, DataFrame): return self.where(key) elif is_mi_columns: @@ -2948,45 +2951,6 @@ def _getitem_column(self, key): s._parent_axis = 1 return s - def _getitem_array(self, key): - # TODO: dont convert to pandas for array indexing - if isinstance(key, Series): - key = 
key._to_pandas() - if is_bool_indexer(key): - if isinstance(key, pandas.Series) and not key.index.equals(self.index): - warnings.warn( - "Boolean Series key will be reindexed to match DataFrame index.", - PendingDeprecationWarning, - stacklevel=3, - ) - elif len(key) != len(self.index): - raise ValueError( - "Item wrong length {} instead of {}.".format( - len(key), len(self.index) - ) - ) - key = check_bool_indexer(self.index, key) - # We convert to a RangeIndex because getitem_row_array is expecting a list - # of indices, and RangeIndex will give us the exact indices of each boolean - # requested. - key = pandas.RangeIndex(len(self.index))[key] - if len(key): - return DataFrame( - query_compiler=self._query_compiler.getitem_row_array(key) - ) - else: - return DataFrame(columns=self.columns) - else: - if any(k not in self.columns for k in key): - raise KeyError( - "{} not index".format( - str([k for k in key if k not in self.columns]).replace(",", "") - ) - ) - return DataFrame( - query_compiler=self._query_compiler.getitem_column_array(key) - ) - def __getattr__(self, key): """After regular attribute access, looks up the name in the columns From c955d506c3599dd324a5649b69fc50554b78243d Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Tue, 15 Sep 2020 21:47:51 -0500 Subject: [PATCH 120/120] REFACTOR-#2083: Rename LISCENSE_HEADER to LICENSE_HEADER. 
(#2082) Signed-off-by: Michael L Heuer --- LISCENSE_HEADER => LICENSE_HEADER | 0 modin/apply_license_header.py | 2 +- modin/test/test_headers.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename LISCENSE_HEADER => LICENSE_HEADER (100%) diff --git a/LISCENSE_HEADER b/LICENSE_HEADER similarity index 100% rename from LISCENSE_HEADER rename to LICENSE_HEADER diff --git a/modin/apply_license_header.py b/modin/apply_license_header.py index 7f9f94d1988..997ee9331d0 100644 --- a/modin/apply_license_header.py +++ b/modin/apply_license_header.py @@ -19,7 +19,7 @@ rootdir = dirname(abspath(__file__)) exclude_files = ["_version.py"] -with open("{}{}".format(dirname(rootdir), "/LISCENSE_HEADER"), "r") as f: +with open("{}{}".format(dirname(rootdir), "/LICENSE_HEADER"), "r") as f: # Lines to check each line individually header_lines = f.readlines() diff --git a/modin/test/test_headers.py b/modin/test/test_headers.py index e771b3bebe5..a508fb3709e 100644 --- a/modin/test/test_headers.py +++ b/modin/test/test_headers.py @@ -21,7 +21,7 @@ def test_headers(): - with open("{}{}".format(dirname(rootdir), "/LISCENSE_HEADER"), "r") as f: + with open("{}{}".format(dirname(rootdir), "/LICENSE_HEADER"), "r") as f: # Lines to check each line individually header_lines = f.readlines()