Skip to content

Commit

Permalink
FIX-#2453: Remove sorting indices for equal values
Browse files Browse the repository at this point in the history
in `Series.value_counts`

Signed-off-by: Igoshev, Yaroslav <yaroslav.igoshev@intel.com>
  • Loading branch information
YarShev committed Nov 26, 2020
1 parent 5d3f693 commit 3b54f3f
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 87 deletions.
6 changes: 2 additions & 4 deletions docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -474,10 +474,8 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``valid`` | D | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``value_counts`` | Y | The indices of resulting object will be in |
| | | descending (ascending, if ascending=True) order for|
| | | equal values. |
| | | In pandas indices are located in random order. |
| ``value_counts`` | Y | The order of indices in the resulting object may |
| | | differ from that of pandas. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``values`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
6 changes: 2 additions & 4 deletions docs/supported_apis/utilities_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ default to pandas.
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.unique`_ | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``pd.value_counts`` | Y | The indices of resulting object will be in |
| | | descending (ascending, if ascending=True) order for|
| | | equal values. |
| | | In pandas indices are located in random order. |
| ``pd.value_counts`` | Y | The order of indices in the resulting object may |
| | | differ from that of pandas. |
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.cut`_ | D | |
+---------------------------+---------------------------------+----------------------------------------------------+
Expand Down
53 changes: 1 addition & 52 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,58 +750,7 @@ def reduce_func(df, *args, **kwargs):
if normalize:
result = result / df.squeeze(axis=1).sum()

result = result.sort_values(ascending=ascending) if sort else result

# We want to sort both values and indices of the result object.
# This function will sort indices for equal values.
def sort_index_for_equal_values(result, ascending):
    """
    Sort the index labels inside every run of equal adjacent values.

    The values themselves are left in their current order; only the labels
    that belong to a run of identical neighbouring values are rearranged,
    so that ties come out in a deterministic order.

    Parameters
    ----------
    result : pandas.Series or pandas.DataFrame with one column
        The object whose index labels have to be sorted for equal values.
    ascending : boolean
        Sort the labels of each run in ascending (if it is True) or
        descending (if it is False) order.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the single column "__reduced__" whose index
        labels are sorted inside every run of equal values.
    """
    labels = result.index
    # Positional view of the values: works for both a Series and a
    # single-column DataFrame and avoids a label lookup per element.
    values = np.asarray(result).ravel()
    size = len(values)
    # Labels may be of any type (numbers, strings, intervals, ...), so the
    # buffer is explicitly an object array.
    new_index = np.empty(size, dtype=object)

    start = 0
    while start < size:
        # Extend the run while the next value equals the first one of the run.
        stop = start + 1
        while stop < size and values[stop] == values[start]:
            stop += 1
        # A run of length one is handled by the same path: sorting a single
        # label returns it unchanged. Labels are assigned one by one so that
        # sequence-like labels are never broadcast by NumPy.
        ordered = sorted(labels[start:stop], reverse=not ascending)
        for position, label in enumerate(ordered, start):
            new_index[position] = label
        start = stop

    # NOTE(review): presumably `result` is named "__reduced__" by the caller;
    # the constructor aligns `result` to the new labels and selects that column.
    return pandas.DataFrame(result, index=new_index, columns=["__reduced__"])

return sort_index_for_equal_values(result, ascending)
return result.sort_values(ascending=ascending) if sort else result

return MapReduceFunction.register(
map_func, reduce_func, axis=0, preserve_index=False
Expand Down
16 changes: 11 additions & 5 deletions modin/pandas/test/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,23 +352,29 @@ def sort_index_for_equal_values(result, ascending):
else:
new_index[j] = result.index[j]
i += 1
return pandas.Series(result, index=new_index)
return type(result)(result, index=new_index)

# We sort indices for pandas result because of issue #1650
# We sort indices for Modin and pandas result because of issue #1650
values = np.array([3, 1, 2, 3, 4, np.nan])
modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, normalize=normalize, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = pd.value_counts(values, bins=bins, ascending=False)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, bins=bins, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, dropna=dropna, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, dropna=dropna, ascending=True), True
)
Expand Down
39 changes: 17 additions & 22 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pandas.core.base import SpecificationError
import sys

from modin.utils import to_pandas, get_current_backend
from modin.utils import to_pandas
from .utils import (
random_state,
RAND_LOW,
Expand Down Expand Up @@ -3376,33 +3376,28 @@ def sort_index_for_equal_values(result, ascending):
i += 1
return type(result)(result, index=new_index)

# We sort indices for pandas result because of issue #1650
# We sort indices for Modin and pandas result because of issue #1650
modin_series, pandas_series = create_test_series(test_data_values[0])
modin_result = modin_series.value_counts(normalize=normalize, ascending=False)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(normalize=normalize, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(bins=bins, ascending=False)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(bins=bins, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(dropna=dropna, ascending=True)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=True)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=dropna, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=dropna, ascending=True), True
)
Expand All @@ -3412,20 +3407,20 @@ def sort_index_for_equal_values(result, ascending):
arr = np.random.rand(2 ** 6)
arr[::10] = np.nan
modin_series, pandas_series = create_test_series(arr)
modin_result = modin_series.value_counts(dropna=False, ascending=True)
modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=False, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=False, ascending=True), True
)
if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=True)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(dropna=False, ascending=False)
modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=False, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=False, ascending=False), False
)
if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)
df_equals(modin_result, pandas_result)


Expand Down

0 comments on commit 3b54f3f

Please sign in to comment.