Add value_counts implementation for Series and as free function #1535

Merged

2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
@@ -468,7 +468,7 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``valid`` | D |
 +-----------------------------+---------------------------------+
-| ``value_counts`` | D |
+| ``value_counts`` | Y |
 +-----------------------------+---------------------------------+
 | ``values`` | Y |
 +-----------------------------+---------------------------------+
2 changes: 1 addition & 1 deletion docs/supported_apis/utilities_supported.rst
@@ -21,7 +21,7 @@ default to pandas.
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.unique`_ | D | |
 +---------------------------+---------------------------------+----------------------------------------------------+
-| ``pd.value_counts`` | D | |
+| ``pd.value_counts`` | Y | |
 +---------------------------+---------------------------------+----------------------------------------------------+
 | `pd.cut`_ | D | |
 +---------------------------+---------------------------------+----------------------------------------------------+
3 changes: 3 additions & 0 deletions modin/backends/base/query_compiler.py
@@ -432,6 +432,9 @@ def unique(self, **kwargs):
 
     # END Abstract map partitions operations
 
+    def value_counts(self, **kwargs):
+        pass
+
     # Abstract map partitions across select indices
     @abc.abstractmethod
     def astype(self, col_dtypes, **kwargs):
13 changes: 13 additions & 0 deletions modin/backends/pandas/query_compiler.py
@@ -480,6 +480,19 @@ def transpose(self, *args, **kwargs):
 
     # END String map partitions operations
 
+    def value_counts(self, **kwargs):
+        """
+        Return a QueryCompiler of Series containing counts of unique values.
+
+        Returns
+        -------
+        PandasQueryCompiler
+        """
+        new_modin_frame = self._modin_frame._apply_full_axis(
+            0, lambda x: x.squeeze().value_counts(**kwargs)
+        )
+        return self.__constructor__(new_modin_frame)
+
     def unique(self):
         """Return unique values of Series object.
 
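A note on the query-compiler method above: `_apply_full_axis(0, ...)` applies the callable over the full length of axis 0, so the lambda receives the whole single-column frame, squeezes it down to a pandas Series, and defers to pandas' own `value_counts`. A minimal sketch of just that lambda, run here on a plain pandas frame rather than a Modin partition (the column name is arbitrary and purely illustrative):

```python
import numpy as np
import pandas as pd

# A one-column frame standing in for the frame the query compiler holds for a Series.
frame = pd.DataFrame({"col": [3, 1, 2, 3, 4, np.nan]})

# squeeze() collapses the single column into a Series; value_counts then counts the
# unique values, which is what the lambda passed to _apply_full_axis does per column.
counts = frame.squeeze().value_counts()
print(counts)
```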
4 changes: 2 additions & 2 deletions modin/pandas/__init__.py
@@ -28,7 +28,6 @@
 from pandas import (
     eval,
     unique,
-    value_counts,
     cut,
     to_numeric,
     factorize,
@@ -132,6 +131,7 @@
     notnull,
     notna,
     pivot,
+    value_counts,
 )
 from .plotting import Plotting as plotting
 from .. import __execution_engine__ as execution_engine
@@ -284,7 +284,6 @@ def import_pandas(*args):
     "concat",
     "eval",
     "unique",
-    "value_counts",
     "cut",
     "to_numeric",
     "factorize",
@@ -363,6 +362,7 @@ def import_pandas(*args):
     "notnull",
     "notna",
     "pivot",
+    "value_counts",
     "datetime",
     "NamedAgg",
     "DEFAULT_NPARTITIONS",
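The net effect of these import changes is that the top-level `modin.pandas.value_counts` now binds to the new Modin implementation (presumably re-exported alongside `notnull`, `notna`, and `pivot` from the block importing `.general`, where the function is defined below) rather than being a plain re-export of `pandas.value_counts`. A hedged sanity check on this branch:

```python
# Sketch of a sanity check; assumes this branch of Modin is installed with a
# working execution engine.
import pandas
import modin.pandas as pd

print(pd.value_counts is pandas.value_counts)  # expected False after this change
print(pd.value_counts.__module__)              # expected to point at modin.pandas.general
```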
31 changes: 31 additions & 0 deletions modin/pandas/general.py
@@ -16,6 +16,7 @@
 from modin.error_message import ErrorMessage
 from .base import BasePandasDataset
 from .dataframe import DataFrame
+from .series import Series
 from .utils import to_pandas
 
 
@@ -217,3 +218,33 @@ def pivot(data, index=None, columns=None, values=None):
     if not isinstance(data, DataFrame):
         raise ValueError("can not pivot with instance of type {}".format(type(data)))
     return data.pivot(index=index, columns=columns, values=values)
+
+
+def value_counts(
+    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True,
+):
+    """
+    Compute a histogram of the counts of non-null values.
+
+    Parameters
+    ----------
+    values : ndarray (1-d)
+    sort : bool, default True
+        Sort by values.
+    ascending : bool, default False
+        Sort in ascending order.
+    normalize : bool, default False
+        If True then compute a relative histogram.
+    bins : integer, optional
+        Rather than count values, group them into half-open bins,
+        a convenience for pd.cut, only works with numeric data.
+    dropna : bool, default True
+        Don't include counts of NaN.
+
+    Returns
+    -------
+    Series
+    """
+    return Series(values).value_counts(
+        sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna,
+    )
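A brief usage sketch of the new free function (it mirrors `pandas.value_counts` and simply wraps the input in a Modin `Series`; assumes this branch of Modin with a working execution engine):

```python
import numpy as np
import modin.pandas as pd

values = np.array([3, 1, 2, 3, 4, np.nan])
print(pd.value_counts(values))          # counts per unique value, NaN excluded by default
print(pd.value_counts(values, bins=3))  # group numeric values into 3 bins, then count
```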
41 changes: 34 additions & 7 deletions modin/pandas/series.py
@@ -1256,13 +1256,40 @@ def update(self, other):
     def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
-        return self._default_to_pandas(
-            pandas.Series.value_counts,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            bins=bins,
-            dropna=dropna,
+        """
+        Return a Series containing counts of unique values.
+
+        The resulting object will be in descending order so that the
+        first element is the most frequently-occurring element.
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        normalize : bool, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : bool, default True
+            Sort by frequencies.
+        ascending : bool, default False
+            Sort in ascending order.
+        bins : int, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for ``pd.cut``, only works with numeric data.
+        dropna : bool, default True
+            Don't include counts of NaN.
+
+        Returns
+        -------
+        Series
+        """
+        return self.__constructor__(
+            query_compiler=self._query_compiler.value_counts(
+                normalize=normalize,
+                sort=sort,
+                ascending=ascending,
+                bins=bins,
+                dropna=dropna,
+            )
         )
 
     def view(self, dtype=None):
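A short usage sketch of the reworked `Series.value_counts` (the same cases the tests below exercise; assumes this branch of Modin with a working execution engine):

```python
import numpy as np
import modin.pandas as pd

s = pd.Series([3, 1, 2, 3, 4, np.nan])
print(s.value_counts())                # descending counts, NaN excluded
print(s.value_counts(normalize=True))  # relative frequencies instead of raw counts
print(s.value_counts(dropna=False))    # include a count for NaN
```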
15 changes: 15 additions & 0 deletions modin/pandas/test/test_general.py
@@ -260,6 +260,21 @@ def test_pivot_table():
     )
 
 
+def test_value_counts():
+    values = np.array([3, 1, 2, 3, 4, np.nan])
+    modin_result = pd.value_counts(values, normalize=True)
+    pandas_result = pandas.value_counts(values, normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, bins=3)
+    pandas_result = pandas.value_counts(values, bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = pd.value_counts(values, dropna=False)
+    pandas_result = pandas.value_counts(values, dropna=False)
+    df_equals(modin_result, pandas_result)
+
+
 def test_to_datetime():
     # DataFrame input for to_datetime
     modin_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
18 changes: 16 additions & 2 deletions modin/pandas/test/test_series.py
@@ -2748,9 +2748,23 @@ def test_update(data):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_value_counts(data):
     modin_series, pandas_series = create_test_series(data)
+    modin_result = modin_series.value_counts()
+    pandas_result = pandas_series.value_counts()
+    df_equals(modin_result, pandas_result)
 
-    with pytest.warns(UserWarning):
-        modin_series.value_counts()
+    modin_series = pd.Series([3, 1, 2, 3, 4, np.nan])
+    pandas_series = pandas.Series([3, 1, 2, 3, 4, np.nan])
+    modin_result = modin_series.value_counts(normalize=True)
+    pandas_result = pandas_series.value_counts(normalize=True)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(bins=3)
+    pandas_result = pandas_series.value_counts(bins=3)
+    df_equals(modin_result, pandas_result)
+
+    modin_result = modin_series.value_counts(dropna=False)
+    pandas_result = pandas_series.value_counts(dropna=False)
+    df_equals(modin_result, pandas_result)
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)