From b31375a8ad53264441587d38357dba135e1fc249 Mon Sep 17 00:00:00 2001
From: "Igoshev, Yaroslav"
Date: Wed, 3 Jun 2020 23:11:16 +0300
Subject: [PATCH 1/4] Add `value_counts` implementation both as a `Series`
 method and as a free function

Signed-off-by: Yaroslav Igoshev
---
 docs/supported_apis/series_supported.rst    |  2 +-
 docs/supported_apis/utilities_supported.rst |  2 +-
 modin/backends/base/query_compiler.py       |  3 ++
 modin/backends/pandas/query_compiler.py     | 13 +++++++
 modin/pandas/__init__.py                    |  4 +-
 modin/pandas/general.py                     | 31 ++++++++++++++++
 modin/pandas/series.py                      | 41 +++++++++++++++++++++++----
 modin/pandas/test/test_general.py           | 15 ++++++++
 modin/pandas/test/test_series.py            | 18 ++++++++-
 9 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst
index d443830845f..400d30a1a4a 100644
--- a/docs/supported_apis/series_supported.rst
+++ b/docs/supported_apis/series_supported.rst
@@ -468,7 +468,7 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``valid``                   | D                               |
 +-----------------------------+---------------------------------+
-| ``value_counts``            | D                               |
+| ``value_counts``            | Y                               |
 +-----------------------------+---------------------------------+
 | ``values``                  | Y                               |
 +-----------------------------+---------------------------------+
diff --git a/docs/supported_apis/utilities_supported.rst b/docs/supported_apis/utilities_supported.rst
index 419b9ec8006..8d3d4da1708 100644
--- a/docs/supported_apis/utilities_supported.rst
+++ b/docs/supported_apis/utilities_supported.rst
@@ -21,7 +21,7 @@ default to pandas.
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.unique`_              | D                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
-| ``pd.value_counts``       | D                               |                                                    |
+| ``pd.value_counts``       | Y                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.cut`_                 | D                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py
index 00d7a72d945..152b92aba89 100644
--- a/modin/backends/base/query_compiler.py
+++ b/modin/backends/base/query_compiler.py
@@ -432,6 +432,9 @@ def unique(self, **kwargs):
 
     # END Abstract map partitions operations
 
+    def value_counts(self, **kwargs):
+        pass
+
     # Abstract map partitions across select indices
     @abc.abstractmethod
     def astype(self, col_dtypes, **kwargs):
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index ee947829660..8359e5df255 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -480,6 +480,19 @@ def transpose(self, *args, **kwargs):
 
     # END String map partitions operations
 
+    def value_counts(self, **kwargs):
+        """
+        Return a QueryCompiler of Series containing counts of unique values.
+
+        Returns
+        -------
+        PandasQueryCompiler
+        """
+        new_modin_frame = self._modin_frame._apply_full_axis(
+            0, lambda x: x.squeeze().value_counts(**kwargs)
+        )
+        return self.__constructor__(new_modin_frame)
+
     def unique(self):
         """Return unique values of Series object.
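The backend method added above reduces to a plain-pandas operation: each partition arrives as a one-column DataFrame, is squeezed into a Series, and counted with the ordinary pandas value_counts. A minimal standalone sketch of that kernel, assuming a hypothetical one-column frame `part` standing in for a Modin partition (the frame and its column name are illustrative, not part of the patch):

    import pandas

    # Hypothetical stand-in for the one-column frame a partition holds.
    part = pandas.DataFrame({"col": [3, 1, 2, 3, 4, None]})

    # squeeze() collapses the single-column frame into a Series;
    # value_counts then builds the histogram (NaN excluded by default).
    counts = part.squeeze().value_counts()
    print(counts)  # 3.0 appears twice, every other value once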
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index ab61e2a5821..0f41c10daca 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -28,7 +28,6 @@
 from pandas import (
     eval,
     unique,
-    value_counts,
     cut,
     to_numeric,
     factorize,
@@ -132,6 +131,7 @@
     notnull,
     notna,
     pivot,
+    value_counts,
 )
 from .plotting import Plotting as plotting
 from .. import __execution_engine__ as execution_engine
@@ -284,7 +284,6 @@ def import_pandas(*args):
     "concat",
     "eval",
     "unique",
-    "value_counts",
     "cut",
     "to_numeric",
     "factorize",
@@ -363,6 +362,7 @@ def import_pandas(*args):
     "notnull",
     "notna",
     "pivot",
+    "value_counts",
     "datetime",
     "NamedAgg",
     "DEFAULT_NPARTITIONS",
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 636d98f80c2..49742d09565 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -16,6 +16,7 @@
 from modin.error_message import ErrorMessage
 from .base import BasePandasDataset
 from .dataframe import DataFrame
+from .series import Series
 from .utils import to_pandas
 
 
@@ -217,3 +218,33 @@ def pivot(data, index=None, columns=None, values=None):
     if not isinstance(data, DataFrame):
         raise ValueError("can not pivot with instance of type {}".format(type(data)))
     return data.pivot(index=index, columns=columns, values=values)
+
+
+def value_counts(
+    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True,
+):
+    """
+    Compute a histogram of the counts of non-null values.
+
+    Parameters
+    ----------
+    values : ndarray (1-d)
+    sort : bool, default True
+        Sort by values
+    ascending : bool, default False
+        Sort in ascending order
+    normalize : bool, default False
+        If True then compute a relative histogram
+    bins : integer, optional
+        Rather than count values, group them into half-open bins,
+        convenience for pd.cut, only works with numeric data
+    dropna : bool, default True
+        Don't include counts of NaN
+
+    Returns
+    -------
+    Series
+    """
+    return Series(values).value_counts(
+        sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna,
+    )
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index d022020df35..85a0d70f19d 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -1256,13 +1256,40 @@ def update(self, other):
     def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
-        return self._default_to_pandas(
-            pandas.Series.value_counts,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            bins=bins,
-            dropna=dropna,
+        """
+        Return a Series containing counts of unique values.
+
+        The resulting object will be in descending order so that the
+        first element is the most frequently-occurring element.
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        normalize : bool, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        bins : int, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for ``pd.cut``, only works with numeric data.
+        dropna : bool, default True
+            Don't include counts of NaN.
+
+        Returns
+        -------
+        Series
+        """
+        return self.__constructor__(
+            query_compiler=self._query_compiler.value_counts(
+                normalize=normalize,
+                sort=sort,
+                ascending=ascending,
+                bins=bins,
+                dropna=dropna,
+            )
         )
 
     def view(self, dtype=None):
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index c5d93e88a92..3fabb0086c2 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -260,6 +260,21 @@ def test_pivot_table():
     )
 
 
+def test_value_counts():
+    values = np.array([3, 1, 2, 3, 4, np.nan])
+    modin_result = pd.value_counts(values, normalize=True)
+    pandas_result = pandas.value_counts(values, normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, bins=3)
+    pandas_result = pandas.value_counts(values, bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, dropna=False)
+    pandas_result = pandas.value_counts(values, dropna=False)
+    df_equals(modin_result, pandas_result)
+
+
 def test_to_datetime():
     # DataFrame input for to_datetime
     modin_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index f588c32cb52..f4dbb0d320c 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2748,9 +2748,23 @@ def test_update(data):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_value_counts(data):
     modin_series, pandas_series = create_test_series(data)
+    modin_result = modin_series.value_counts()
+    pandas_result = pandas_series.value_counts()
+    df_equals(modin_result, pandas_result)
 
-    with pytest.warns(UserWarning):
-        modin_series.value_counts()
+    modin_series = pd.Series([3, 1, 2, 3, 4, np.nan])
+    pandas_series = pandas.Series([3, 1, 2, 3, 4, np.nan])
+    modin_result = modin_series.value_counts(normalize=True)
+    pandas_result = pandas_series.value_counts(normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(bins=3)
+    pandas_result = pandas_series.value_counts(bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(dropna=False)
+    pandas_result = pandas_series.value_counts(dropna=False)
+    df_equals(modin_result, pandas_result)
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)

From 9efe11c752e7716884e3ffc3e92dc72382215c66 Mon Sep 17 00:00:00 2001
From: Devin Petersohn
Date: Thu, 18 Jun 2020 11:46:25 -0700
Subject: [PATCH 2/4] Implementation for value_counts

Signed-off-by: Devin Petersohn
---
 modin/backends/pandas/query_compiler.py | 24 ++++++++++++++++++-
 .../functions/mapreducefunction.py       |  1 +
 modin/engines/base/frame/data.py         | 17 +++++++++++--
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index 6f75beb3d8f..d15c54b493a 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -61,6 +61,25 @@ def str_op_builder(df, *args, **kwargs):
     return str_op_builder
 
 
+def map_func(df, *args, **kwargs):
+    return df.squeeze().value_counts(**kwargs)
+
+
+def reduce_func(df, *args, **kwargs):
+    sort = kwargs.get("sort", False)
+    by = df.index
+    dropna = kwargs.get("dropna", True)
+    normalize = kwargs.get("normalize", False)
+    result = df.squeeze().groupby(by, sort=False).sum()
+    if not dropna and np.nan in df.index:
+        result = df.loc[[np.nan]].sum().append(result)
+    if normalize:
+        result / df.squeeze(axis=1).sum()
+    return (
+        result.sort_values(ascending=kwargs.get("ascending", False)) if sort else result
+    )
+
+
 def _dt_prop_map(property_name):
     """
     Create a function that call property of property `dt` of the series.
@@ -506,7 +525,10 @@ def is_monotonic_decreasing(self):
         lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
         / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
         axis=kwargs.get("axis", 0),
-        ),
+        )
+    )
+    value_counts = MapReduceFunction.register(
+        map_func, reduce_func, preserve_index=False
     )
 
     # END MapReduce operations
diff --git a/modin/data_management/functions/mapreducefunction.py b/modin/data_management/functions/mapreducefunction.py
index d4a99e846e3..2b83083b14b 100644
--- a/modin/data_management/functions/mapreducefunction.py
+++ b/modin/data_management/functions/mapreducefunction.py
@@ -25,6 +25,7 @@ def caller(query_compiler, *args, **kwargs):
                     else kwargs.get("axis"),
                     lambda x: map_function(x, *args, **kwargs),
                     lambda y: reduce_function(y, *args, **kwargs),
+                    **call_kwds
                 )
             )
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index 347d43ccc47..1b18bdffd3a 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -780,7 +780,7 @@ def _fold_reduce(self, axis, func):
         )
         return self._compute_map_reduce_metadata(axis, new_parts)
 
-    def _map_reduce(self, axis, map_func, reduce_func=None):
+    def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
         """Apply function that will reduce the data to a Pandas Series.
 
         Args:
             axis: 0 for columns and 1 for rows. Default is 0.
             map_func: Callable function to map the dataframe.
             reduce_func: Callable function to reduce the dataframe. If none,
                 then apply map_func twice.
@@ -802,7 +802,20 @@ def _map_reduce(self, axis, map_func, reduce_func=None):
         reduce_parts = self._frame_mgr_cls.map_axis_partitions(
             axis, map_parts, reduce_func
         )
-        return self._compute_map_reduce_metadata(axis, reduce_parts)
+        if preserve_index:
+            return self._compute_map_reduce_metadata(axis, reduce_parts)
+        else:
+            if axis == 0:
+                new_index = ["__reduced__"]
+                new_columns = self._frame_mgr_cls.get_indices(
+                    0, reduce_parts, lambda df: df.index
+                )
+            else:
+                new_index = self._frame_mgr_cls.get_indices(
+                    0, reduce_parts, lambda df: df.index
+                )
+                new_columns = ["__reduced__"]
+            return self.__constructor__(reduce_parts, new_index, new_columns)
 
     def _map(self, func, dtypes=None):
         """Perform a function that maps across the entire dataset.
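Patch 2 above recasts value_counts as a map/reduce pair: the map step counts values inside each partition, and the reduce step groups the partial counts by index value and sums them. A standalone sketch of that split on plain pandas objects (the partition boundaries are arbitrary and purely illustrative; this is not Modin's internal partition API). Sorting and normalization can only happen at the reduce stage, after all partial counts are merged; patch 3 below reworks exactly that part of the reducer, writing the normalize step with an assignment (result = result / df.squeeze(axis=1).sum()).

    import pandas

    data = pandas.Series([3, 1, 2, 3, 4, 3])
    partitions = [data.iloc[:3], data.iloc[3:]]  # arbitrary split for illustration

    # Map: count values independently inside each partition.
    partials = [part.value_counts() for part in partitions]

    # Reduce: stack the partial histograms, then sum counts that landed
    # in different partitions under the same index value (here, 3).
    combined = pandas.concat(partials)
    result = combined.groupby(combined.index, sort=False).sum()
    print(result.sort_values(ascending=False))  # 3 -> 3; 1, 2, 4 -> 1 each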
From d47b1d6dd9df2a4aef5ac2b3c4759cc37452bf2d Mon Sep 17 00:00:00 2001
From: "Igoshev, Yaroslav"
Date: Sun, 21 Jun 2020 00:47:37 +0300
Subject: [PATCH 3/4] Fix value_counts

---
 modin/backends/pandas/query_compiler.py | 114 ++++++++++++------
 .../functions/mapreducefunction.py       |   3 +-
 modin/engines/base/frame/data.py         |  26 ++--
 modin/pandas/general.py                  |   8 +-
 modin/pandas/series.py                   |   6 +
 modin/pandas/test/test_general.py        |  49 ++++++--
 modin/pandas/test/test_series.py         |  58 ++++++---
 7 files changed, 196 insertions(+), 68 deletions(-)

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index d15c54b493a..1cfc8ead2e2 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -61,25 +61,6 @@ def str_op_builder(df, *args, **kwargs):
     return str_op_builder
 
 
-def map_func(df, *args, **kwargs):
-    return df.squeeze().value_counts(**kwargs)
-
-
-def reduce_func(df, *args, **kwargs):
-    sort = kwargs.get("sort", False)
-    by = df.index
-    dropna = kwargs.get("dropna", True)
-    normalize = kwargs.get("normalize", False)
-    result = df.squeeze().groupby(by, sort=False).sum()
-    if not dropna and np.nan in df.index:
-        result = df.loc[[np.nan]].sum().append(result)
-    if normalize:
-        result / df.squeeze(axis=1).sum()
-    return (
-        result.sort_values(ascending=kwargs.get("ascending", False)) if sort else result
-    )
-
-
 def _dt_prop_map(property_name):
     """
     Create a function that call property of property `dt` of the series.
@@ -525,12 +506,86 @@ def is_monotonic_decreasing(self):
         lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
         / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
         axis=kwargs.get("axis", 0),
-        )
-    )
-    value_counts = MapReduceFunction.register(
-        map_func, reduce_func, preserve_index=False
+        ),
     )
 
+    def value_counts(self, **kwargs):
+        """
+        Return a QueryCompiler of Series containing counts of unique values.
+
+        Returns
+        -------
+        PandasQueryCompiler
+        """
+        if kwargs.get("bins", None) is not None:
+            new_modin_frame = self._modin_frame._apply_full_axis(
+                0, lambda df: df.squeeze(axis=1).value_counts(**kwargs)
+            )
+            return self.__constructor__(new_modin_frame)
+
+        def map_func(df, *args, **kwargs):
+            return df.squeeze(axis=1).value_counts(**kwargs)
+
+        def reduce_func(df, *args, **kwargs):
+            normalize = kwargs.get("normalize", False)
+            sort = kwargs.get("sort", True)
+            ascending = kwargs.get("ascending", False)
+            dropna = kwargs.get("dropna", True)
+
+            try:
+                result = df.squeeze(axis=1).groupby(df.index, sort=False).sum()
+            except (ValueError):
+                result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum()
+
+            if not dropna and np.nan in df.index:
+                result = result.append(
+                    pandas.Series(
+                        [df.squeeze(axis=1).loc[[np.nan]].sum()], index=[np.nan]
+                    )
+                )
+            if normalize:
+                result = result / df.squeeze(axis=1).sum()
+
+            result = result.sort_values(ascending=ascending) if sort else result
+
+            def sort_index_for_identical_values(result, ascending):
+                is_range = False
+                is_end = False
+                i = 0
+                new_index = np.array([], dtype=type(result.index))
+                while i < len(result):
+                    j = i
+                    if i < len(result) - 1:
+                        while result[result.index[i]] == result[result.index[i + 1]]:
+                            i += 1
+                            if is_range is False:
+                                is_range = True
+                            if i == len(result) - 1:
+                                is_end = True
+                                break
+                    if is_range:
+                        new_index = np.concatenate(
+                            (
+                                new_index,
+                                sorted(result.index[j : i + 1], reverse=not ascending),
+                            )
+                        )
+                        if is_end:
+                            break
+                        is_range = False
+                    else:
+                        new_index = np.concatenate(
+                            (new_index, np.array([result.index[j]]))
+                        )
+                    i += 1
+                return pandas.DataFrame(result, index=new_index)
+
+            return sort_index_for_identical_values(result, ascending)
+
+        return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
+            self, **kwargs
+        )
+
     # END MapReduce operations
 
     # Reduction operations
@@ -629,19 +684,6 @@ def is_monotonic_decreasing(self):
 
     # END String map partitions operations
 
-    def value_counts(self, **kwargs):
-        """
-        Return a QueryCompiler of Series containing counts of unique values.
-
-        Returns
-        -------
-        PandasQueryCompiler
-        """
-        new_modin_frame = self._modin_frame._apply_full_axis(
-            0, lambda x: x.squeeze().value_counts(**kwargs)
-        )
-        return self.__constructor__(new_modin_frame)
-
     def unique(self):
         """Return unique values of Series object.
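The sort_index_for_identical_values helper above exists because pandas makes no ordering promise among index values whose counts are tied, so a distributed result and a single-threaded result can legally disagree on row order; the reducer therefore canonicalizes ties by sorting their indices. A small plain-pandas illustration of the problem, plus one equivalent way to reach the same canonical order (a stable value sort after an index sort, not the loop the patch actually uses):

    import pandas

    s = pandas.Series([1, 1, 2, 2, 3, 3])
    counts = s.value_counts()  # all counts tie at 2; tie order is unspecified

    # Canonical form for ascending=False: ties ordered by descending index.
    # mergesort is stable, so the index order established first survives.
    canonical = counts.sort_index(ascending=False).sort_values(
        ascending=False, kind="mergesort"
    )
    print(canonical)  # index 3, 2, 1 - each with count 2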
diff --git a/modin/data_management/functions/mapreducefunction.py b/modin/data_management/functions/mapreducefunction.py
index 2b83083b14b..aace46679ea 100644
--- a/modin/data_management/functions/mapreducefunction.py
+++ b/modin/data_management/functions/mapreducefunction.py
@@ -18,6 +18,7 @@ class MapReduceFunction(Function):
     @classmethod
     def call(cls, map_function, reduce_function, **call_kwds):
         def caller(query_compiler, *args, **kwargs):
+            preserve_index = call_kwds.pop("preserve_index", True)
             return query_compiler.__constructor__(
                 query_compiler._modin_frame._map_reduce(
                     call_kwds.get("axis")
@@ -25,7 +26,7 @@ def caller(query_compiler, *args, **kwargs):
                     else kwargs.get("axis"),
                     lambda x: map_function(x, *args, **kwargs),
                     lambda y: reduce_function(y, *args, **kwargs),
-                    **call_kwds
+                    preserve_index=preserve_index,
                 )
             )
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index 1b18bdffd3a..59a210aee56 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -781,15 +781,25 @@ def _fold_reduce(self, axis, func):
         return self._compute_map_reduce_metadata(axis, new_parts)
 
     def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
-        """Apply function that will reduce the data to a Pandas Series.
+        """
+        Apply function that will reduce the data to a Pandas Series.
 
-        Args:
-            axis: 0 for columns and 1 for rows. Default is 0.
-            map_func: Callable function to map the dataframe.
-            reduce_func: Callable function to reduce the dataframe. If none,
-                then apply map_func twice.
+        Parameters
+        ----------
+        axis : 0 or 1
+            0 for columns and 1 for rows.
+        map_func : callable
+            Callable function to map the dataframe.
+        reduce_func : callable
+            Callable function to reduce the dataframe.
+            If none, then apply map_func twice. Default is None.
+        preserve_index : boolean
+            Whether to preserve the index after the map and reduce
+            operations. Default is True.
 
-        Return:
+        Returns
+        -------
+        BasePandasFrame
             A new dataframe.
         """
         map_func = self._build_mapreduce_func(axis, map_func)
@@ -808,7 +818,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
         if axis == 0:
             new_index = ["__reduced__"]
             new_columns = self._frame_mgr_cls.get_indices(
-                0, reduce_parts, lambda df: df.index
+                1, reduce_parts, lambda df: df.columns
             )
         else:
             new_index = self._frame_mgr_cls.get_indices(
                 0, reduce_parts, lambda df: df.index
             )
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 1a961701f90..d65b8e9ac35 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -291,7 +291,7 @@ def unique(values):
 
 
 def value_counts(
-    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True,
+    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
 ):
     """
     Compute a histogram of the counts of non-null values.
@@ -314,6 +314,12 @@ def value_counts(
     Returns
     -------
     Series
+
+    Notes
+    -----
+    The indices of the resulting object will be in descending
+    (ascending, if ascending=True) order for equal values.
+    This differs slightly from pandas, where indices for equal values are unordered.
""" return Series(values).value_counts( sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d0caf2d2e28..02a5e896377 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1384,6 +1384,12 @@ def value_counts( Returns ------- Series + + Notes + ----- + The indices of resulting object will be in descending + (ascending, if ascending=True) order for equal values. + It slightly differ from pandas where indices are located in random order. """ return self.__constructor__( query_compiler=self._query_compiler.value_counts( diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index bc4ffb2b61f..94473468794 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -319,18 +319,53 @@ def test_unique(): assert_array_equal(modin_result, pandas_result) -def test_value_counts(): +@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)]) +def test_value_counts(normalize, bins, dropna): + def sort_index_for_identical_values(result, ascending): + is_range = False + is_end = False + i = 0 + new_index = np.array([], dtype=type(result.index)) + while i < len(result): + j = i + if i < len(result) - 1: + while result[result.index[i]] == result[result.index[i + 1]]: + i += 1 + if is_range is False: + is_range = True + if i == len(result) - 1: + is_end = True + break + if is_range: + new_index = np.concatenate( + (new_index, sorted(result.index[j : i + 1], reverse=not ascending)) + ) + if is_end: + break + is_range = False + else: + new_index = np.concatenate((new_index, np.array([result.index[j]]))) + i += 1 + return pandas.Series(result, index=new_index) + + # We sort indices for pandas result because of issue #1650 values = np.array([3, 1, 2, 3, 4, np.nan]) - modin_result = pd.value_counts(values, normalize=True) - pandas_result = pandas.value_counts(values, normalize=True) + modin_result = pd.value_counts(values, normalize=normalize, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, normalize=normalize, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = pd.value_counts(values, bins=3) - pandas_result = pandas.value_counts(values, bins=3) + modin_result = pd.value_counts(values, bins=bins, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, bins=bins, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = pd.value_counts(values, dropna=False) - pandas_result = pandas.value_counts(values, dropna=False) + modin_result = pd.value_counts(values, dropna=dropna, ascending=True) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, dropna=dropna, ascending=True), True + ) df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index be2bf5a9559..5237de73d5c 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2870,25 +2870,53 @@ def test_update(data, other_data): df_equals(modin_series, pandas_series) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_value_counts(data): - modin_series, pandas_series = create_test_series(data) - modin_result = modin_series.value_counts() - pandas_result = pandas_series.value_counts() - df_equals(modin_result, pandas_result) - - modin_series = pd.Series([3, 1, 2, 3, 4, np.nan]) - pandas_series = 
pandas.Series([3, 1, 2, 3, 4, np.nan]) - modin_result = modin_series.value_counts(normalize=True) - pandas_result = pandas_series.value_counts(normalize=True) +@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)]) +def test_value_counts(normalize, bins, dropna): + def sort_index_for_identical_values(result, ascending): + is_range = False + is_end = False + i = 0 + new_index = np.array([], dtype=type(result.index)) + while i < len(result): + j = i + if i < len(result) - 1: + while result[result.index[i]] == result[result.index[i + 1]]: + i += 1 + if is_range is False: + is_range = True + if i == len(result) - 1: + is_end = True + break + if is_range: + new_index = np.concatenate( + (new_index, sorted(result.index[j : i + 1], reverse=not ascending)) + ) + if is_end: + break + is_range = False + else: + new_index = np.concatenate((new_index, np.array([result.index[j]]))) + i += 1 + return pandas.Series(result, index=new_index) + + # We sort indices for pandas result because of issue #1650 + modin_series, pandas_series = create_test_series(test_data_values[0]) + modin_result = modin_series.value_counts(normalize=normalize, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(normalize=normalize, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = modin_series.value_counts(bins=3) - pandas_result = pandas_series.value_counts(bins=3) + modin_result = modin_series.value_counts(bins=bins, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(bins=bins, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = modin_series.value_counts(dropna=False) - pandas_result = pandas_series.value_counts(dropna=False) + modin_result = modin_series.value_counts(dropna=dropna, ascending=True) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(dropna=dropna, ascending=True), True + ) df_equals(modin_result, pandas_result) From 28daea3d2b518cc332c40aedf792b6cfc8049cdd Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Wed, 24 Jun 2020 00:04:07 +0300 Subject: [PATCH 4/4] apply comments --- modin/backends/pandas/query_compiler.py | 41 +++++++++++++++++-------- modin/pandas/test/test_general.py | 19 ++++++------ modin/pandas/test/test_series.py | 19 ++++++------ 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 1cfc8ead2e2..f4d1e4f20e8 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -534,6 +534,8 @@ def reduce_func(df, *args, **kwargs): try: result = df.squeeze(axis=1).groupby(df.index, sort=False).sum() + # This will happen with Arrow buffer read-only errors. We don't want to copy + # all the time, so this will try to fast-path the code first. except (ValueError): result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum() @@ -548,11 +550,28 @@ def reduce_func(df, *args, **kwargs): result = result.sort_values(ascending=ascending) if sort else result - def sort_index_for_identical_values(result, ascending): + # We want to sort both values and indices of the result object. + # This function will sort indices for equal values. + def sort_index_for_equal_values(result, ascending): + """ + Sort indices for equal values of result object. 
+
+                Parameters
+                ----------
+                result : pandas.Series or pandas.DataFrame with one column
+                    The object whose indices need sorting for equal values.
+                ascending : boolean
+                    Sort in ascending (True) or descending (False) order.
+
+                Returns
+                -------
+                pandas.DataFrame
+                    A new DataFrame with sorted indices.
+                """
                 is_range = False
                 is_end = False
                 i = 0
-                new_index = np.array([], dtype=type(result.index))
+                new_index = np.empty(len(result), dtype=type(result.index))
                 while i < len(result):
                     j = i
                     if i < len(result) - 1:
@@ -564,23 +583,21 @@ def sort_index_for_identical_values(result, ascending):
                         is_end = True
                         break
                     if is_range:
-                        new_index = np.concatenate(
-                            (
-                                new_index,
-                                sorted(result.index[j : i + 1], reverse=not ascending),
-                            )
-                        )
+                        k = j
+                        for val in sorted(
+                            result.index[j : i + 1], reverse=not ascending
+                        ):
+                            new_index[k] = val
+                            k += 1
                         if is_end:
                             break
                         is_range = False
                     else:
-                        new_index = np.concatenate(
-                            (new_index, np.array([result.index[j]]))
-                        )
+                        new_index[j] = result.index[j]
                     i += 1
                 return pandas.DataFrame(result, index=new_index)
 
-            return sort_index_for_identical_values(result, ascending)
+            return sort_index_for_equal_values(result, ascending)
 
         return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
             self, **kwargs
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index 94473468794..3ce93dfd119 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -321,11 +321,11 @@ def test_unique():
 
 @pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
 def test_value_counts(normalize, bins, dropna):
-    def sort_index_for_identical_values(result, ascending):
+    def sort_index_for_equal_values(result, ascending):
         is_range = False
         is_end = False
         i = 0
-        new_index = np.array([], dtype=type(result.index))
+        new_index = np.empty(len(result), dtype=type(result.index))
         while i < len(result):
             j = i
             if i < len(result) - 1:
@@ -337,33 +337,34 @@ def sort_index_for_identical_values(result, ascending):
                 is_end = True
                 break
             if is_range:
-                new_index = np.concatenate(
-                    (new_index, sorted(result.index[j : i + 1], reverse=not ascending))
-                )
+                k = j
+                for val in sorted(result.index[j : i + 1], reverse=not ascending):
+                    new_index[k] = val
+                    k += 1
                 if is_end:
                     break
                 is_range = False
             else:
-                new_index = np.concatenate((new_index, np.array([result.index[j]])))
+                new_index[j] = result.index[j]
             i += 1
         return pandas.Series(result, index=new_index)
 
     # We sort indices for pandas result because of issue #1650
     values = np.array([3, 1, 2, 3, 4, np.nan])
     modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, normalize=normalize, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = pd.value_counts(values, bins=bins, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, bins=bins, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, dropna=dropna, ascending=True), True
     )
     df_equals(modin_result, pandas_result)
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 5237de73d5c..2eac7bdb2f8 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2872,11 +2872,11 @@ def test_update(data, other_data):
 
 @pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
 def test_value_counts(normalize, bins, dropna):
-    def sort_index_for_identical_values(result, ascending):
+    def sort_index_for_equal_values(result, ascending):
         is_range = False
         is_end = False
         i = 0
-        new_index = np.array([], dtype=type(result.index))
+        new_index = np.empty(len(result), dtype=type(result.index))
         while i < len(result):
             j = i
             if i < len(result) - 1:
@@ -2888,33 +2888,34 @@ def sort_index_for_identical_values(result, ascending):
                 is_end = True
                 break
             if is_range:
-                new_index = np.concatenate(
-                    (new_index, sorted(result.index[j : i + 1], reverse=not ascending))
-                )
+                k = j
+                for val in sorted(result.index[j : i + 1], reverse=not ascending):
+                    new_index[k] = val
+                    k += 1
                 if is_end:
                     break
                 is_range = False
             else:
-                new_index = np.concatenate((new_index, np.array([result.index[j]])))
+                new_index[j] = result.index[j]
             i += 1
         return pandas.Series(result, index=new_index)
 
     # We sort indices for pandas result because of issue #1650
     modin_series, pandas_series = create_test_series(test_data_values[0])
     modin_result = modin_series.value_counts(normalize=normalize, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(normalize=normalize, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = modin_series.value_counts(bins=bins, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(bins=bins, ascending=False), False
    )
     df_equals(modin_result, pandas_result)
 
     modin_result = modin_series.value_counts(dropna=dropna, ascending=True)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(dropna=dropna, ascending=True), True
     )
     df_equals(modin_result, pandas_result)
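Taken together, the four patches make the calls below run through the new MapReduce path instead of defaulting to pandas. An end-to-end usage sketch mirroring the tests, assuming a working Modin installation with a configured engine (e.g. pip install modin[ray]):

    import numpy as np
    import modin.pandas as pd

    values = np.array([3, 1, 2, 3, 4, np.nan])

    print(pd.value_counts(values))                       # plain counts, NaN dropped
    print(pd.value_counts(values, normalize=True))       # relative frequencies
    print(pd.value_counts(values, bins=3))               # counts over 3 half-open bins
    print(pd.Series(values).value_counts(dropna=False))  # keep the NaN bucket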