Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#2453: Remove sorting indices for equal values in Series.value_counts #2454

Merged
merged 1 commit into from
Dec 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -474,10 +474,8 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``valid`` | D | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``value_counts`` | Y | The indices of resulting object will be in |
| | | descending (ascending, if ascending=True) order for|
| | | equal values. |
| | | In pandas indices are located in random order. |
| ``value_counts`` | Y | The order of indices in the resulting object may |
| | | differ from pandas. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``values`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
6 changes: 2 additions & 4 deletions docs/supported_apis/utilities_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ default to pandas.
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.unique`_ | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``pd.value_counts`` | Y | The indices of resulting object will be in |
| | | descending (ascending, if ascending=True) order for|
| | | equal values. |
| | | In pandas indices are located in random order. |
| ``pd.value_counts`` | Y | The order of indices in the resulting object may |
| | | differ from pandas. |
+---------------------------+---------------------------------+----------------------------------------------------+
| `pd.cut`_ | D | |
+---------------------------+---------------------------------+----------------------------------------------------+
Expand Down
53 changes: 1 addition & 52 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,58 +750,7 @@ def reduce_func(df, *args, **kwargs):
if normalize:
result = result / df.squeeze(axis=1).sum()

result = result.sort_values(ascending=ascending) if sort else result

# We want to sort both values and indices of the result object.
# This function will sort indices for equal values.
def sort_index_for_equal_values(result, ascending):
    """
    Sort the index labels within each run of adjacent equal values.

    Walks `result` positionally, groups neighbouring entries whose values
    compare equal, and writes the labels of every such group into a new
    index in sorted order. Labels of entries that are not part of a run
    keep their position. Only *adjacent* equal values are grouped.

    Parameters
    ----------
    result : pandas.Series or pandas.DataFrame with one column
        The object whose index labels should be sorted for equal values.
        NOTE(review): values are read via ``result[label]``, which looks
        like it expects label-based scalar lookup -- confirm for the
        DataFrame case.
    ascending : boolean
        Sort the labels of each run in ascending order if True,
        in descending order if False.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame built from `result` with the rearranged index
        and a single column named ``"__reduced__"``.
    """
    # True while the current position belongs to a run of equal values.
    is_range = False
    # True once a run has been found to extend to the last element.
    is_end = False
    i = 0
    # NOTE(review): the index *class* is passed as dtype; presumably this
    # yields an object array -- confirm.
    new_index = np.empty(len(result), dtype=type(result.index))
    while i < len(result):
        # `j` remembers where the (potential) run starts.
        j = i
        if i < len(result) - 1:
            # Advance `i` to the last position of the run of equal values.
            while result[result.index[i]] == result[result.index[i + 1]]:
                i += 1
                if is_range is False:
                    is_range = True
                if i == len(result) - 1:
                    # Stop before `i + 1` would run past the last element.
                    is_end = True
                    break
        if is_range:
            # Write the labels of positions j..i back in sorted order.
            k = j
            for val in sorted(
                result.index[j : i + 1], reverse=not ascending
            ):
                new_index[k] = val
                k += 1
            if is_end:
                break
            is_range = False
        else:
            # Single (non-repeated) value: keep its label where it is.
            new_index[j] = result.index[j]
        i += 1
    return pandas.DataFrame(
        result, index=new_index, columns=["__reduced__"]
    )

return sort_index_for_equal_values(result, ascending)
return result.sort_values(ascending=ascending) if sort else result

return MapReduceFunction.register(
map_func, reduce_func, axis=0, preserve_index=False
Expand Down
16 changes: 11 additions & 5 deletions modin/pandas/test/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,23 +352,29 @@ def sort_index_for_equal_values(result, ascending):
else:
new_index[j] = result.index[j]
i += 1
return pandas.Series(result, index=new_index)
return type(result)(result, index=new_index)

# We sort indices for pandas result because of issue #1650
# We sort indices for Modin and pandas result because of issue #1650
values = np.array([3, 1, 2, 3, 4, np.nan])
modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, normalize=normalize, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = pd.value_counts(values, bins=bins, ascending=False)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, bins=bins, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
modin_result = sort_index_for_equal_values(
pd.value_counts(values, dropna=dropna, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas.value_counts(values, dropna=dropna, ascending=True), True
)
Expand Down
39 changes: 17 additions & 22 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pandas.core.base import SpecificationError
import sys

from modin.utils import to_pandas, get_current_backend
from modin.utils import to_pandas
from .utils import (
random_state,
RAND_LOW,
Expand Down Expand Up @@ -3376,33 +3376,28 @@ def sort_index_for_equal_values(result, ascending):
i += 1
return type(result)(result, index=new_index)

# We sort indices for pandas result because of issue #1650
# We sort indices for Modin and pandas result because of issue #1650
modin_series, pandas_series = create_test_series(test_data_values[0])
modin_result = modin_series.value_counts(normalize=normalize, ascending=False)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(normalize=normalize, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(normalize=normalize, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(bins=bins, ascending=False)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(bins=bins, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(bins=bins, ascending=False), False
)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(dropna=dropna, ascending=True)

if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=True)

modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=dropna, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=dropna, ascending=True), True
)
Expand All @@ -3412,20 +3407,20 @@ def sort_index_for_equal_values(result, ascending):
arr = np.random.rand(2 ** 6)
arr[::10] = np.nan
modin_series, pandas_series = create_test_series(arr)
modin_result = modin_series.value_counts(dropna=False, ascending=True)
modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=False, ascending=True), True
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=False, ascending=True), True
)
if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=True)
df_equals(modin_result, pandas_result)

modin_result = modin_series.value_counts(dropna=False, ascending=False)
modin_result = sort_index_for_equal_values(
modin_series.value_counts(dropna=False, ascending=False), False
)
pandas_result = sort_index_for_equal_values(
pandas_series.value_counts(dropna=False, ascending=False), False
)
if get_current_backend() == "BaseOnPython":
modin_result = sort_index_for_equal_values(modin_result, ascending=False)
df_equals(modin_result, pandas_result)


Expand Down