From b31375a8ad53264441587d38357dba135e1fc249 Mon Sep 17 00:00:00 2001
From: "Igoshev, Yaroslav"
Date: Wed, 3 Jun 2020 23:11:16 +0300
Subject: [PATCH 1/4] Add `value_counts` implementation both as a `Series`
 method and as a free function

Signed-off-by: Yaroslav Igoshev
---
 docs/supported_apis/series_supported.rst    |  2 +-
 docs/supported_apis/utilities_supported.rst |  2 +-
 modin/backends/base/query_compiler.py       |  3 ++
 modin/backends/pandas/query_compiler.py     | 13 +++++++
 modin/pandas/__init__.py                    |  4 +-
 modin/pandas/general.py                     | 31 ++++++++++++++++
 modin/pandas/series.py                      | 41 +++++++++++++++++++++++----
 modin/pandas/test/test_general.py           | 15 ++++++++
 modin/pandas/test/test_series.py            | 18 ++++++++-
 9 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst
index d443830845f..400d30a1a4a 100644
--- a/docs/supported_apis/series_supported.rst
+++ b/docs/supported_apis/series_supported.rst
@@ -468,7 +468,7 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``valid``                   | D                               |
 +-----------------------------+---------------------------------+
-| ``value_counts``            | D                               |
+| ``value_counts``            | Y                               |
 +-----------------------------+---------------------------------+
 | ``values``                  | Y                               |
 +-----------------------------+---------------------------------+
diff --git a/docs/supported_apis/utilities_supported.rst b/docs/supported_apis/utilities_supported.rst
index 419b9ec8006..8d3d4da1708 100644
--- a/docs/supported_apis/utilities_supported.rst
+++ b/docs/supported_apis/utilities_supported.rst
@@ -21,7 +21,7 @@ default to pandas.
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.unique`_              | D                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
-| ``pd.value_counts``       | D                               |                                                    |
+| ``pd.value_counts``       | Y                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.cut`_                 | D                               |                                                    |
 +---------------------------+---------------------------------+----------------------------------------------------+
diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py
index 00d7a72d945..152b92aba89 100644
--- a/modin/backends/base/query_compiler.py
+++ b/modin/backends/base/query_compiler.py
@@ -432,6 +432,9 @@ def unique(self, **kwargs):
 
     # END Abstract map partitions operations
 
+    def value_counts(self, **kwargs):
+        pass
+
     # Abstract map partitions across select indices
     @abc.abstractmethod
     def astype(self, col_dtypes, **kwargs):
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index ee947829660..8359e5df255 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -480,6 +480,19 @@ def transpose(self, *args, **kwargs):
 
     # END String map partitions operations
 
+    def value_counts(self, **kwargs):
+        """
+        Return a QueryCompiler of Series containing counts of unique values.
+
+        Returns
+        -------
+        PandasQueryCompiler
+        """
+        new_modin_frame = self._modin_frame._apply_full_axis(
+            0, lambda x: x.squeeze().value_counts(**kwargs)
+        )
+        return self.__constructor__(new_modin_frame)
+
     def unique(self):
         """Return unique values of Series object.
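The backend method added above reduces to a plain-pandas operation: each partition arrives as a one-column DataFrame, is squeezed into a Series, and counted with the ordinary pandas value_counts. A minimal standalone sketch of that kernel, assuming a hypothetical one-column frame `part` standing in for a Modin partition (the frame and its column name are illustrative, not part of the patch):

    import pandas

    # Hypothetical stand-in for the one-column frame a partition holds.
    part = pandas.DataFrame({"col": [3, 1, 2, 3, 4, None]})

    # squeeze() collapses the single-column frame into a Series;
    # value_counts then builds the histogram (NaN excluded by default).
    counts = part.squeeze().value_counts()
    print(counts)  # 3.0 appears twice, every other value once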
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index ab61e2a5821..0f41c10daca 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -28,7 +28,6 @@
 from pandas import (
     eval,
     unique,
-    value_counts,
     cut,
     to_numeric,
     factorize,
@@ -132,6 +131,7 @@
     notnull,
     notna,
     pivot,
+    value_counts,
 )
 from .plotting import Plotting as plotting
 from .. import __execution_engine__ as execution_engine
@@ -284,7 +284,6 @@ def import_pandas(*args):
     "concat",
     "eval",
     "unique",
-    "value_counts",
     "cut",
     "to_numeric",
     "factorize",
@@ -363,6 +362,7 @@ def import_pandas(*args):
     "notnull",
     "notna",
     "pivot",
+    "value_counts",
     "datetime",
     "NamedAgg",
     "DEFAULT_NPARTITIONS",
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 636d98f80c2..49742d09565 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -16,6 +16,7 @@
 from modin.error_message import ErrorMessage
 from .base import BasePandasDataset
 from .dataframe import DataFrame
+from .series import Series
 from .utils import to_pandas
 
 
@@ -217,3 +218,33 @@ def pivot(data, index=None, columns=None, values=None):
     if not isinstance(data, DataFrame):
         raise ValueError("can not pivot with instance of type {}".format(type(data)))
     return data.pivot(index=index, columns=columns, values=values)
+
+
+def value_counts(
+    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True,
+):
+    """
+    Compute a histogram of the counts of non-null values.
+
+    Parameters
+    ----------
+    values : ndarray (1-d)
+    sort : bool, default True
+        Sort by values
+    ascending : bool, default False
+        Sort in ascending order
+    normalize : bool, default False
+        If True then compute a relative histogram
+    bins : integer, optional
+        Rather than count values, group them into half-open bins,
+        convenience for pd.cut, only works with numeric data
+    dropna : bool, default True
+        Don't include counts of NaN
+
+    Returns
+    -------
+    Series
+    """
+    return Series(values).value_counts(
+        sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna,
+    )
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index d022020df35..85a0d70f19d 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -1256,13 +1256,40 @@ def update(self, other):
     def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
-        return self._default_to_pandas(
-            pandas.Series.value_counts,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            bins=bins,
-            dropna=dropna,
+        """
+        Return a Series containing counts of unique values.
+
+        The resulting object will be in descending order so that the
+        first element is the most frequently-occurring element.
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        normalize : bool, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        bins : int, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for ``pd.cut``, only works with numeric data.
+        dropna : bool, default True
+            Don't include counts of NaN.
+
+        Returns
+        -------
+        Series
+        """
+        return self.__constructor__(
+            query_compiler=self._query_compiler.value_counts(
+                normalize=normalize,
+                sort=sort,
+                ascending=ascending,
+                bins=bins,
+                dropna=dropna,
+            )
         )
 
     def view(self, dtype=None):
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index c5d93e88a92..3fabb0086c2 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -260,6 +260,21 @@ def test_pivot_table():
     )
 
 
+def test_value_counts():
+    values = np.array([3, 1, 2, 3, 4, np.nan])
+    modin_result = pd.value_counts(values, normalize=True)
+    pandas_result = pandas.value_counts(values, normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, bins=3)
+    pandas_result = pandas.value_counts(values, bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, dropna=False)
+    pandas_result = pandas.value_counts(values, dropna=False)
+    df_equals(modin_result, pandas_result)
+
+
 def test_to_datetime():
     # DataFrame input for to_datetime
     modin_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index f588c32cb52..f4dbb0d320c 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2748,9 +2748,23 @@ def test_update(data):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_value_counts(data):
     modin_series, pandas_series = create_test_series(data)
+    modin_result = modin_series.value_counts()
+    pandas_result = pandas_series.value_counts()
+    df_equals(modin_result, pandas_result)
 
-    with pytest.warns(UserWarning):
-        modin_series.value_counts()
+    modin_series = pd.Series([3, 1, 2, 3, 4, np.nan])
+    pandas_series = pandas.Series([3, 1, 2, 3, 4, np.nan])
+    modin_result = modin_series.value_counts(normalize=True)
+    pandas_result = pandas_series.value_counts(normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(bins=3)
+    pandas_result = pandas_series.value_counts(bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(dropna=False)
+    pandas_result = pandas_series.value_counts(dropna=False)
+    df_equals(modin_result, pandas_result)
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)

From 9efe11c752e7716884e3ffc3e92dc72382215c66 Mon Sep 17 00:00:00 2001
From: Devin Petersohn
Date: Thu, 18 Jun 2020 11:46:25 -0700
Subject: [PATCH 2/4] Implementation for value_counts

Signed-off-by: Devin Petersohn
---
 modin/backends/pandas/query_compiler.py | 24 ++++++++++++++++++-
 .../functions/mapreducefunction.py       |  1 +
 modin/engines/base/frame/data.py         | 17 +++++++++++--
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index 6f75beb3d8f..d15c54b493a 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -61,6 +61,25 @@ def str_op_builder(df, *args, **kwargs):
     return str_op_builder
 
 
+def map_func(df, *args, **kwargs):
+    return df.squeeze().value_counts(**kwargs)
+
+
+def reduce_func(df, *args, **kwargs):
+    sort = kwargs.get("sort", False)
+    by = df.index
+    dropna = kwargs.get("dropna", True)
+    normalize = kwargs.get("normalize", False)
+    result = df.squeeze().groupby(by, sort=False).sum()
+    if not dropna and np.nan in df.index:
+        result = df.loc[[np.nan]].sum().append(result)
+    if normalize:
+        result / df.squeeze(axis=1).sum()
+    return (
+        result.sort_values(ascending=kwargs.get("ascending", False)) if sort else result
+    )
+
+
 def _dt_prop_map(property_name):
     """
     Create a function that call property of property `dt` of the series.
@@ -506,7 +525,10 @@ def is_monotonic_decreasing(self):
         lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
         / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
         axis=kwargs.get("axis", 0),
-        ),
+        )
+    )
+    value_counts = MapReduceFunction.register(
+        map_func, reduce_func, preserve_index=False
     )
 
     # END MapReduce operations
diff --git a/modin/data_management/functions/mapreducefunction.py b/modin/data_management/functions/mapreducefunction.py
index d4a99e846e3..2b83083b14b 100644
--- a/modin/data_management/functions/mapreducefunction.py
+++ b/modin/data_management/functions/mapreducefunction.py
@@ -25,6 +25,7 @@ def caller(query_compiler, *args, **kwargs):
                     else kwargs.get("axis"),
                     lambda x: map_function(x, *args, **kwargs),
                     lambda y: reduce_function(y, *args, **kwargs),
+                    **call_kwds
                 )
             )
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index 347d43ccc47..1b18bdffd3a 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -780,7 +780,7 @@ def _fold_reduce(self, axis, func):
         )
         return self._compute_map_reduce_metadata(axis, new_parts)
 
-    def _map_reduce(self, axis, map_func, reduce_func=None):
+    def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
         """Apply function that will reduce the data to a Pandas Series.
 
         Args:
             axis: 0 for columns and 1 for rows. Default is 0.
             map_func: Callable function to map the dataframe.
             reduce_func: Callable function to reduce the dataframe. If none,
                 then apply map_func twice.
@@ -802,7 +802,20 @@ def _map_reduce(self, axis, map_func, reduce_func=None):
         reduce_parts = self._frame_mgr_cls.map_axis_partitions(
             axis, map_parts, reduce_func
         )
-        return self._compute_map_reduce_metadata(axis, reduce_parts)
+        if preserve_index:
+            return self._compute_map_reduce_metadata(axis, reduce_parts)
+        else:
+            if axis == 0:
+                new_index = ["__reduced__"]
+                new_columns = self._frame_mgr_cls.get_indices(
+                    0, reduce_parts, lambda df: df.index
+                )
+            else:
+                new_index = self._frame_mgr_cls.get_indices(
+                    0, reduce_parts, lambda df: df.index
+                )
+                new_columns = ["__reduced__"]
+            return self.__constructor__(reduce_parts, new_index, new_columns)
 
     def _map(self, func, dtypes=None):
         """Perform a function that maps across the entire dataset.
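Patch 2 above recasts value_counts as a map/reduce pair: the map step counts values inside each partition, and the reduce step groups the partial counts by index value and sums them. A standalone sketch of that split on plain pandas objects (the partition boundaries are arbitrary and purely illustrative; this is not Modin's internal partition API). Sorting and normalization can only happen at the reduce stage, after all partial counts are merged; patch 3 below reworks exactly that part of the reducer, writing the normalize step with an assignment (result = result / df.squeeze(axis=1).sum()).

    import pandas

    data = pandas.Series([3, 1, 2, 3, 4, 3])
    partitions = [data.iloc[:3], data.iloc[3:]]  # arbitrary split for illustration

    # Map: count values independently inside each partition.
    partials = [part.value_counts() for part in partitions]

    # Reduce: stack the partial histograms, then sum counts that landed
    # in different partitions under the same index value (here, 3).
    combined = pandas.concat(partials)
    result = combined.groupby(combined.index, sort=False).sum()
    print(result.sort_values(ascending=False))  # 3 -> 3; 1, 2, 4 -> 1 each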
From d47b1d6dd9df2a4aef5ac2b3c4759cc37452bf2d Mon Sep 17 00:00:00 2001
From: "Igoshev, Yaroslav"
Date: Sun, 21 Jun 2020 00:47:37 +0300
Subject: [PATCH 3/4] Fix value_counts

---
 modin/backends/pandas/query_compiler.py | 114 ++++++++++++------
 .../functions/mapreducefunction.py       |   3 +-
 modin/engines/base/frame/data.py         |  26 ++--
 modin/pandas/general.py                  |   8 +-
 modin/pandas/series.py                   |   6 +
 modin/pandas/test/test_general.py        |  49 ++++++--
 modin/pandas/test/test_series.py         |  58 ++++++---
 7 files changed, 196 insertions(+), 68 deletions(-)

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index d15c54b493a..1cfc8ead2e2 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -61,25 +61,6 @@ def str_op_builder(df, *args, **kwargs):
     return str_op_builder
 
 
-def map_func(df, *args, **kwargs):
-    return df.squeeze().value_counts(**kwargs)
-
-
-def reduce_func(df, *args, **kwargs):
-    sort = kwargs.get("sort", False)
-    by = df.index
-    dropna = kwargs.get("dropna", True)
-    normalize = kwargs.get("normalize", False)
-    result = df.squeeze().groupby(by, sort=False).sum()
-    if not dropna and np.nan in df.index:
-        result = df.loc[[np.nan]].sum().append(result)
-    if normalize:
-        result / df.squeeze(axis=1).sum()
-    return (
-        result.sort_values(ascending=kwargs.get("ascending", False)) if sort else result
-    )
-
-
 def _dt_prop_map(property_name):
     """
     Create a function that call property of property `dt` of the series.
@@ -525,12 +506,86 @@ def is_monotonic_decreasing(self):
         lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
         / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
         axis=kwargs.get("axis", 0),
-        )
-    )
-    value_counts = MapReduceFunction.register(
-        map_func, reduce_func, preserve_index=False
+        ),
     )
 
+    def value_counts(self, **kwargs):
+        """
+        Return a QueryCompiler of Series containing counts of unique values.
+
+        Returns
+        -------
+        PandasQueryCompiler
+        """
+        if kwargs.get("bins", None) is not None:
+            new_modin_frame = self._modin_frame._apply_full_axis(
+                0, lambda df: df.squeeze(axis=1).value_counts(**kwargs)
+            )
+            return self.__constructor__(new_modin_frame)
+
+        def map_func(df, *args, **kwargs):
+            return df.squeeze(axis=1).value_counts(**kwargs)
+
+        def reduce_func(df, *args, **kwargs):
+            normalize = kwargs.get("normalize", False)
+            sort = kwargs.get("sort", True)
+            ascending = kwargs.get("ascending", False)
+            dropna = kwargs.get("dropna", True)
+
+            try:
+                result = df.squeeze(axis=1).groupby(df.index, sort=False).sum()
+            except (ValueError):
+                result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum()
+
+            if not dropna and np.nan in df.index:
+                result = result.append(
+                    pandas.Series(
+                        [df.squeeze(axis=1).loc[[np.nan]].sum()], index=[np.nan]
+                    )
+                )
+            if normalize:
+                result = result / df.squeeze(axis=1).sum()
+
+            result = result.sort_values(ascending=ascending) if sort else result
+
+            def sort_index_for_identical_values(result, ascending):
+                is_range = False
+                is_end = False
+                i = 0
+                new_index = np.array([], dtype=type(result.index))
+                while i < len(result):
+                    j = i
+                    if i < len(result) - 1:
+                        while result[result.index[i]] == result[result.index[i + 1]]:
+                            i += 1
+                            if is_range is False:
+                                is_range = True
+                            if i == len(result) - 1:
+                                is_end = True
+                                break
+                    if is_range:
+                        new_index = np.concatenate(
+                            (
+                                new_index,
+                                sorted(result.index[j : i + 1], reverse=not ascending),
+                            )
+                        )
+                        if is_end:
+                            break
+                        is_range = False
+                    else:
+                        new_index = np.concatenate(
+                            (new_index, np.array([result.index[j]]))
+                        )
+                    i += 1
+                return pandas.DataFrame(result, index=new_index)
+
+            return sort_index_for_identical_values(result, ascending)
+
+        return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
+            self, **kwargs
+        )
+
     # END MapReduce operations
 
     # Reduction operations
@@ -629,19 +684,6 @@ def is_monotonic_decreasing(self):
 
     # END String map partitions operations
 
-    def value_counts(self, **kwargs):
-        """
-        Return a QueryCompiler of Series containing counts of unique values.
-
-        Returns
-        -------
-        PandasQueryCompiler
-        """
-        new_modin_frame = self._modin_frame._apply_full_axis(
-            0, lambda x: x.squeeze().value_counts(**kwargs)
-        )
-        return self.__constructor__(new_modin_frame)
-
     def unique(self):
         """Return unique values of Series object.
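The sort_index_for_identical_values helper above exists because pandas makes no ordering promise among index values whose counts are tied, so a distributed result and a single-threaded result can legally disagree on row order; the reducer therefore canonicalizes ties by sorting their indices. A small plain-pandas illustration of the problem, plus one equivalent way to reach the same canonical order (a stable value sort after an index sort, not the loop the patch actually uses):

    import pandas

    s = pandas.Series([1, 1, 2, 2, 3, 3])
    counts = s.value_counts()  # all counts tie at 2; tie order is unspecified

    # Canonical form for ascending=False: ties ordered by descending index.
    # mergesort is stable, so the index order established first survives.
    canonical = counts.sort_index(ascending=False).sort_values(
        ascending=False, kind="mergesort"
    )
    print(canonical)  # index 3, 2, 1 - each with count 2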
diff --git a/modin/data_management/functions/mapreducefunction.py b/modin/data_management/functions/mapreducefunction.py
index 2b83083b14b..aace46679ea 100644
--- a/modin/data_management/functions/mapreducefunction.py
+++ b/modin/data_management/functions/mapreducefunction.py
@@ -18,6 +18,7 @@ class MapReduceFunction(Function):
     @classmethod
     def call(cls, map_function, reduce_function, **call_kwds):
         def caller(query_compiler, *args, **kwargs):
+            preserve_index = call_kwds.pop("preserve_index", True)
             return query_compiler.__constructor__(
                 query_compiler._modin_frame._map_reduce(
                     call_kwds.get("axis")
@@ -25,7 +26,7 @@ def caller(query_compiler, *args, **kwargs):
                     else kwargs.get("axis"),
                     lambda x: map_function(x, *args, **kwargs),
                     lambda y: reduce_function(y, *args, **kwargs),
-                    **call_kwds
+                    preserve_index=preserve_index,
                 )
             )
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index 1b18bdffd3a..59a210aee56 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -781,15 +781,25 @@ def _fold_reduce(self, axis, func):
         return self._compute_map_reduce_metadata(axis, new_parts)
 
     def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
-        """Apply function that will reduce the data to a Pandas Series.
+        """
+        Apply function that will reduce the data to a Pandas Series.
 
-        Args:
-            axis: 0 for columns and 1 for rows. Default is 0.
-            map_func: Callable function to map the dataframe.
-            reduce_func: Callable function to reduce the dataframe. If none,
-                then apply map_func twice.
+        Parameters
+        ----------
+        axis : 0 or 1
+            0 for columns and 1 for rows.
+        map_func : callable
+            Callable function to map the dataframe.
+        reduce_func : callable
+            Callable function to reduce the dataframe.
+            If none, then apply map_func twice. Default is None.
+        preserve_index : boolean
+            Whether to preserve the index after the map and reduce
+            operations. Default is True.
 
-        Return:
+        Returns
+        -------
+        BasePandasFrame
             A new dataframe.
         """
         map_func = self._build_mapreduce_func(axis, map_func)
@@ -808,7 +818,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
         if axis == 0:
             new_index = ["__reduced__"]
             new_columns = self._frame_mgr_cls.get_indices(
-                0, reduce_parts, lambda df: df.index
+                1, reduce_parts, lambda df: df.columns
             )
         else:
             new_index = self._frame_mgr_cls.get_indices(
                 0, reduce_parts, lambda df: df.index
             )
diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 1a961701f90..d65b8e9ac35 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -291,7 +291,7 @@ def unique(values):
 
 
 def value_counts(
-    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True,
+    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
 ):
     """
     Compute a histogram of the counts of non-null values.
@@ -314,6 +314,12 @@ def value_counts(
     Returns
     -------
     Series
+
+    Notes
+    -----
+    The indices of the resulting object will be in descending
+    (ascending, if ascending=True) order for equal values.
+    This differs slightly from pandas, where indices for equal values are unordered.
""" return Series(values).value_counts( sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d0caf2d2e28..02a5e896377 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1384,6 +1384,12 @@ def value_counts( Returns ------- Series + + Notes + ----- + The indices of resulting object will be in descending + (ascending, if ascending=True) order for equal values. + It slightly differ from pandas where indices are located in random order. """ return self.__constructor__( query_compiler=self._query_compiler.value_counts( diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index bc4ffb2b61f..94473468794 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -319,18 +319,53 @@ def test_unique(): assert_array_equal(modin_result, pandas_result) -def test_value_counts(): +@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)]) +def test_value_counts(normalize, bins, dropna): + def sort_index_for_identical_values(result, ascending): + is_range = False + is_end = False + i = 0 + new_index = np.array([], dtype=type(result.index)) + while i < len(result): + j = i + if i < len(result) - 1: + while result[result.index[i]] == result[result.index[i + 1]]: + i += 1 + if is_range is False: + is_range = True + if i == len(result) - 1: + is_end = True + break + if is_range: + new_index = np.concatenate( + (new_index, sorted(result.index[j : i + 1], reverse=not ascending)) + ) + if is_end: + break + is_range = False + else: + new_index = np.concatenate((new_index, np.array([result.index[j]]))) + i += 1 + return pandas.Series(result, index=new_index) + + # We sort indices for pandas result because of issue #1650 values = np.array([3, 1, 2, 3, 4, np.nan]) - modin_result = pd.value_counts(values, normalize=True) - pandas_result = pandas.value_counts(values, normalize=True) + modin_result = pd.value_counts(values, normalize=normalize, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, normalize=normalize, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = pd.value_counts(values, bins=3) - pandas_result = pandas.value_counts(values, bins=3) + modin_result = pd.value_counts(values, bins=bins, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, bins=bins, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = pd.value_counts(values, dropna=False) - pandas_result = pandas.value_counts(values, dropna=False) + modin_result = pd.value_counts(values, dropna=dropna, ascending=True) + pandas_result = sort_index_for_identical_values( + pandas.value_counts(values, dropna=dropna, ascending=True), True + ) df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index be2bf5a9559..5237de73d5c 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2870,25 +2870,53 @@ def test_update(data, other_data): df_equals(modin_series, pandas_series) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_value_counts(data): - modin_series, pandas_series = create_test_series(data) - modin_result = modin_series.value_counts() - pandas_result = pandas_series.value_counts() - df_equals(modin_result, pandas_result) - - modin_series = pd.Series([3, 1, 2, 3, 4, np.nan]) - pandas_series = 
pandas.Series([3, 1, 2, 3, 4, np.nan]) - modin_result = modin_series.value_counts(normalize=True) - pandas_result = pandas_series.value_counts(normalize=True) +@pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)]) +def test_value_counts(normalize, bins, dropna): + def sort_index_for_identical_values(result, ascending): + is_range = False + is_end = False + i = 0 + new_index = np.array([], dtype=type(result.index)) + while i < len(result): + j = i + if i < len(result) - 1: + while result[result.index[i]] == result[result.index[i + 1]]: + i += 1 + if is_range is False: + is_range = True + if i == len(result) - 1: + is_end = True + break + if is_range: + new_index = np.concatenate( + (new_index, sorted(result.index[j : i + 1], reverse=not ascending)) + ) + if is_end: + break + is_range = False + else: + new_index = np.concatenate((new_index, np.array([result.index[j]]))) + i += 1 + return pandas.Series(result, index=new_index) + + # We sort indices for pandas result because of issue #1650 + modin_series, pandas_series = create_test_series(test_data_values[0]) + modin_result = modin_series.value_counts(normalize=normalize, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(normalize=normalize, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = modin_series.value_counts(bins=3) - pandas_result = pandas_series.value_counts(bins=3) + modin_result = modin_series.value_counts(bins=bins, ascending=False) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(bins=bins, ascending=False), False + ) df_equals(modin_result, pandas_result) - modin_result = modin_series.value_counts(dropna=False) - pandas_result = pandas_series.value_counts(dropna=False) + modin_result = modin_series.value_counts(dropna=dropna, ascending=True) + pandas_result = sort_index_for_identical_values( + pandas_series.value_counts(dropna=dropna, ascending=True), True + ) df_equals(modin_result, pandas_result) From 28daea3d2b518cc332c40aedf792b6cfc8049cdd Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Wed, 24 Jun 2020 00:04:07 +0300 Subject: [PATCH 4/4] apply comments --- modin/backends/pandas/query_compiler.py | 41 +++++++++++++++++-------- modin/pandas/test/test_general.py | 19 ++++++------ modin/pandas/test/test_series.py | 19 ++++++------ 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 1cfc8ead2e2..f4d1e4f20e8 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -534,6 +534,8 @@ def reduce_func(df, *args, **kwargs): try: result = df.squeeze(axis=1).groupby(df.index, sort=False).sum() + # This will happen with Arrow buffer read-only errors. We don't want to copy + # all the time, so this will try to fast-path the code first. except (ValueError): result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum() @@ -548,11 +550,28 @@ def reduce_func(df, *args, **kwargs): result = result.sort_values(ascending=ascending) if sort else result - def sort_index_for_identical_values(result, ascending): + # We want to sort both values and indices of the result object. + # This function will sort indices for equal values. + def sort_index_for_equal_values(result, ascending): + """ + Sort indices for equal values of result object. 
+
+                Parameters
+                ----------
+                result : pandas.Series or pandas.DataFrame with one column
+                    The object whose indices need sorting for equal values.
+                ascending : boolean
+                    Sort in ascending (True) or descending (False) order.
+
+                Returns
+                -------
+                pandas.DataFrame
+                    A new DataFrame with sorted indices.
+                """
                 is_range = False
                 is_end = False
                 i = 0
-                new_index = np.array([], dtype=type(result.index))
+                new_index = np.empty(len(result), dtype=type(result.index))
                 while i < len(result):
                     j = i
                     if i < len(result) - 1:
@@ -564,23 +583,21 @@ def sort_index_for_identical_values(result, ascending):
                         is_end = True
                         break
                     if is_range:
-                        new_index = np.concatenate(
-                            (
-                                new_index,
-                                sorted(result.index[j : i + 1], reverse=not ascending),
-                            )
-                        )
+                        k = j
+                        for val in sorted(
+                            result.index[j : i + 1], reverse=not ascending
+                        ):
+                            new_index[k] = val
+                            k += 1
                         if is_end:
                             break
                         is_range = False
                     else:
-                        new_index = np.concatenate(
-                            (new_index, np.array([result.index[j]]))
-                        )
+                        new_index[j] = result.index[j]
                     i += 1
                 return pandas.DataFrame(result, index=new_index)
 
-            return sort_index_for_identical_values(result, ascending)
+            return sort_index_for_equal_values(result, ascending)
 
         return MapReduceFunction.register(map_func, reduce_func, preserve_index=False)(
             self, **kwargs
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index 94473468794..3ce93dfd119 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -321,11 +321,11 @@ def test_unique():
 
 @pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
 def test_value_counts(normalize, bins, dropna):
-    def sort_index_for_identical_values(result, ascending):
+    def sort_index_for_equal_values(result, ascending):
         is_range = False
         is_end = False
         i = 0
-        new_index = np.array([], dtype=type(result.index))
+        new_index = np.empty(len(result), dtype=type(result.index))
         while i < len(result):
             j = i
             if i < len(result) - 1:
@@ -337,33 +337,34 @@ def sort_index_for_identical_values(result, ascending):
                 is_end = True
                 break
             if is_range:
-                new_index = np.concatenate(
-                    (new_index, sorted(result.index[j : i + 1], reverse=not ascending))
-                )
+                k = j
+                for val in sorted(result.index[j : i + 1], reverse=not ascending):
+                    new_index[k] = val
+                    k += 1
                 if is_end:
                     break
                 is_range = False
             else:
-                new_index = np.concatenate((new_index, np.array([result.index[j]])))
+                new_index[j] = result.index[j]
             i += 1
         return pandas.Series(result, index=new_index)
 
     # We sort indices for pandas result because of issue #1650
     values = np.array([3, 1, 2, 3, 4, np.nan])
     modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, normalize=normalize, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = pd.value_counts(values, bins=bins, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, bins=bins, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas.value_counts(values, dropna=dropna, ascending=True), True
     )
     df_equals(modin_result, pandas_result)
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 5237de73d5c..2eac7bdb2f8 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -2872,11 +2872,11 @@ def test_update(data, other_data):
 
 @pytest.mark.parametrize("normalize, bins, dropna", [(True, 3, False)])
 def test_value_counts(normalize, bins, dropna):
-    def sort_index_for_identical_values(result, ascending):
+    def sort_index_for_equal_values(result, ascending):
         is_range = False
         is_end = False
         i = 0
-        new_index = np.array([], dtype=type(result.index))
+        new_index = np.empty(len(result), dtype=type(result.index))
         while i < len(result):
             j = i
             if i < len(result) - 1:
@@ -2888,33 +2888,34 @@ def sort_index_for_identical_values(result, ascending):
                 is_end = True
                 break
             if is_range:
-                new_index = np.concatenate(
-                    (new_index, sorted(result.index[j : i + 1], reverse=not ascending))
-                )
+                k = j
+                for val in sorted(result.index[j : i + 1], reverse=not ascending):
+                    new_index[k] = val
+                    k += 1
                 if is_end:
                     break
                 is_range = False
             else:
-                new_index = np.concatenate((new_index, np.array([result.index[j]])))
+                new_index[j] = result.index[j]
             i += 1
         return pandas.Series(result, index=new_index)
 
     # We sort indices for pandas result because of issue #1650
     modin_series, pandas_series = create_test_series(test_data_values[0])
     modin_result = modin_series.value_counts(normalize=normalize, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(normalize=normalize, ascending=False), False
     )
     df_equals(modin_result, pandas_result)
 
     modin_result = modin_series.value_counts(bins=bins, ascending=False)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(bins=bins, ascending=False), False
    )
     df_equals(modin_result, pandas_result)
 
     modin_result = modin_series.value_counts(dropna=dropna, ascending=True)
-    pandas_result = sort_index_for_identical_values(
+    pandas_result = sort_index_for_equal_values(
         pandas_series.value_counts(dropna=dropna, ascending=True), True
     )
     df_equals(modin_result, pandas_result)
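Taken together, the four patches make the calls below run through the new MapReduce path instead of defaulting to pandas. An end-to-end usage sketch mirroring the tests, assuming a working Modin installation with a configured engine (e.g. pip install modin[ray]):

    import numpy as np
    import modin.pandas as pd

    values = np.array([3, 1, 2, 3, 4, np.nan])

    print(pd.value_counts(values))                       # plain counts, NaN dropped
    print(pd.value_counts(values, normalize=True))       # relative frequencies
    print(pd.value_counts(values, bins=3))               # counts over 3 half-open bins
    print(pd.Series(values).value_counts(dropna=False))  # keep the NaN bucket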