modin-project · ienkovich · Oct 23, 2020 · Oct 22, 2020
@@ -311,6 +311,22 @@ def _agg(self, agg, axis=0, level=None, **kwargs):
         )
         return self.__constructor__(new_frame, shape_hint="row")
 
+    def value_counts(self, **kwargs):
+        """
+        Count occurrences of unique rows using the lazy frame.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Recognized keys and the defaults used when a key is absent:
+            ``subset`` (None), ``normalize`` (False), ``sort`` (True),
+            ``ascending`` (False), ``bins`` (False), ``dropna`` (True).
+
+        Returns
+        -------
+        Query compiler
+            Object built via ``self.__constructor__`` with
+            ``shape_hint="column"``, or the parent class result when
+            `bins` or `normalize` is truthy.
+        """
+        subset = kwargs.get("subset", None)
+        normalize = kwargs.get("normalize", False)
+        sort = kwargs.get("sort", True)
+        ascending = kwargs.get("ascending", False)
+        bins = kwargs.get("bins", False)
+        dropna = kwargs.get("dropna", True)
+
+        # Binning and normalization are not handled by the frame-level
+        # implementation below, so defer to the parent class for them.
+        if bins or normalize:
+            return super().value_counts(**kwargs)
+
+        new_frame = self._modin_frame.value_counts(
+            columns=subset, dropna=dropna, sort=sort, ascending=ascending
+        )
+        return self.__constructor__(new_frame, shape_hint="column")
+
     def _get_index(self):
         if self._modin_frame._has_unsupported_data:
             return default_axis_getter(0)(self)

@@ -333,6 +333,69 @@ def agg(self, agg):
             force_execution_mode=self._force_execution_mode,
         )
 
+    def value_counts(self, dropna, columns, sort, ascending):
+        """
+        Build a lazy frame that counts occurrences of unique rows.
+
+        Parameters
+        ----------
+        dropna : bool
+            When truthy, rows holding a null in any grouped column are
+            filtered out before counting.
+        columns : collection of column labels or None
+            Columns to group by; None selects every column of this frame.
+        sort : bool
+            When truthy, the result is ordered by the count column.
+        ascending : bool
+            Sort direction; only consulted when `sort` is truthy.
+
+        Returns
+        -------
+        Frame
+            New frame built via ``self.__constructor__``. The grouped columns
+            become index columns and the count is stored in a column named
+            ``""``; for a single grouped column the index is exposed as
+            ``"__index__"`` and the count column takes the grouped column's
+            name.
+
+        Raises
+        ------
+        ValueError
+            If no column of this frame is selected by `columns`.
+        """
+        by = [col for col in self.columns if columns is None or col in columns]
+
+        if not by:
+            raise ValueError("invalid columns subset is specified")
+
+        base = self
+        if dropna:
+            # Keep only rows where every grouped column is non-null. The
+            # per-column checks are chained with binary ANDs so that each
+            # OpExpr receives a flat list of expression operands.
+            checks = [base.ref(col).is_not_null() for col in by]
+            condition = checks[0]
+            for check in checks[1:]:
+                condition = OpExpr("AND", [condition, check], np.dtype("bool"))
+            base = self.__constructor__(
+                columns=Index.__new__(Index, data=by, dtype="O"),
+                dtypes=base._dtypes[by],
+                op=FilterNode(base, condition),
+                index_cols=None,
+                force_execution_mode=base._force_execution_mode,
+            )
+
+        # Group by the selected columns and compute the size of each group.
+        agg_exprs = OrderedDict()
+        agg_exprs[""] = AggregateExpr("size", None)
+        dtypes = base._dtypes[by].tolist()
+        dtypes.append(np.dtype("int64"))
+
+        new_columns = Index.__new__(Index, data=[""], dtype="O")
+
+        res = self.__constructor__(
+            columns=new_columns,
+            dtypes=dtypes,
+            op=GroupbyAggNode(base, by, agg_exprs, {"sort": False}),
+            index_cols=by.copy(),
+            force_execution_mode=base._force_execution_mode,
+        )
+
+        # `ascending` is only a sort direction; with sort disabled the
+        # result is left in group order regardless of its value.
+        if sort:
+            res = self.__constructor__(
+                columns=res.columns,
+                dtypes=res._dtypes,
+                op=SortNode(res, [""], [ascending], "last"),
+                index_cols=res._index_cols,
+                force_execution_mode=res._force_execution_mode,
+            )
+
+        # If a single column is used then it keeps its name.
+        # TODO: move it to upper levels when index renaming is in place.
+        if len(by) == 1:
+            exprs = OrderedDict()
+            exprs["__index__"] = res.ref(by[0])
+            exprs[by[0]] = res.ref("")
+
+            res = self.__constructor__(
+                columns=Index.__new__(Index, data=by, dtype="O"),
+                dtypes=self._dtypes_for_exprs(exprs),
+                op=TransformNode(res, exprs),
+                index_cols=["__index__"],
+                force_execution_mode=res._force_execution_mode,
+            )
+
+        return res
+
     def fillna(
         self,
         value=None,

@@ -97,6 +97,10 @@ def is_null(self):
         new_expr = OpExpr("IS NULL", [self], _get_dtype(bool))
         return new_expr
 
+    def is_not_null(self):
+        """
+        Build an ``IS NOT NULL`` check for this expression.
+
+        Returns
+        -------
+        OpExpr
+            Expression with this one as its sole operand and the dtype
+            given by ``_get_dtype(bool)``.
+        """
+        return OpExpr("IS NOT NULL", [self], _get_dtype(bool))
+
     def bin_op(self, other, op_name):
         if op_name not in self.binary_operations:
             raise NotImplementedError(f"unsupported binary operation {op_name}")
@@ -256,7 +260,8 @@ class AggregateExpr(BaseExpr):
     def __init__(self, agg, op, distinct=False, dtype=None):
+        """
+        Store the aggregate name, its operand and the resulting dtype.
+
+        Parameters
+        ----------
+        agg : str
+            Aggregate name, passed to ``_agg_dtype`` when `dtype` is not used.
+        op : expression or None
+            Aggregated operand; stored as the only element of ``operands``.
+        distinct : bool, default: False
+            Stored as ``self.distinct``.
+        dtype : dtype, optional
+            Result dtype; when not provided it is derived via ``_agg_dtype``.
+        """
         self.agg = agg
         self.operands = [op]
-        self._dtype = dtype if dtype else _agg_dtype(agg, op._dtype)
+        # `op` may be None (a "size" aggregate is built without an operand),
+        # so test for None explicitly instead of relying on the operand's
+        # truthiness.
+        # NOTE(review): `dtype if dtype else ...` likewise treats any falsy
+        # dtype object as "not given" - confirm that is intended.
+        op_dtype = op._dtype if op is not None else None
+        self._dtype = dtype if dtype else _agg_dtype(agg, op_dtype)
+        assert self._dtype is not None
         self.distinct = distinct
 
     def copy(self):

@@ -846,10 +846,10 @@ def groupby(df, **kwargs):
 
 class TestAgg:
+    # Shared fixture for the tests of this class. Columns mix nulls with
+    # repeated values - presumably so that value_counts yields groups with
+    # more than one row; confirm before changing the values.
     data = {
-        "a": [1, 2, None, None, 5, None],
-        "b": [10, 20, None, 40, 50, None],
+        "a": [1, 2, None, None, 1, None],
+        "b": [10, 20, None, 20, 10, None],
         "c": [None, 200, None, 400, 500, 600],
-        "d": [11, 22, 33, 44, 55, 66],
+        "d": [11, 22, 33, 22, 33, 22],
     }
 
     @pytest.mark.parametrize("agg", ["count", "max", "min", "sum"])
@@ -859,6 +859,23 @@ def agg(df, agg, **kwargs):
 
         run_and_compare(agg, data=self.data, agg=agg, force_lazy=False)
 
+    @pytest.mark.parametrize("cols", ["a", "d"])
+    @pytest.mark.parametrize("dropna", [True, False])
+    @pytest.mark.parametrize("sort", [True])
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_value_counts(self, cols, dropna, sort, ascending):
+        """Run ``value_counts`` on the selected column via ``run_and_compare``."""
+
+        def count_values(df, cols, dropna, sort, ascending, **kwargs):
+            # Select the column first, then count with the given options.
+            selected = df[cols]
+            return selected.value_counts(
+                dropna=dropna, sort=sort, ascending=ascending
+            )
+
+        params = dict(cols=cols, dropna=dropna, sort=sort, ascending=ascending)
+        run_and_compare(count_values, data=self.data, **params)
+
 
 class TestMerge:
     data = {