From 36fd1523a54fd58aa4657109ac1fe579c2c8259d Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Thu, 3 Nov 2022 09:34:24 -0500 Subject: [PATCH] feat(duckdb): implement `mode` aggregation --- ibis/backends/duckdb/registry.py | 1 + ibis/backends/tests/test_aggregation.py | 43 +++++++++++++++++++++++-- ibis/expr/operations/reductions.py | 6 ++++ ibis/expr/types/generic.py | 4 +++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/ibis/backends/duckdb/registry.py b/ibis/backends/duckdb/registry.py index 747477d88231..a94c4c47435a 100644 --- a/ibis/backends/duckdb/registry.py +++ b/ibis/backends/duckdb/registry.py @@ -222,6 +222,7 @@ def _struct_column(t, op): ), ops.HLLCardinality: reduction(sa.func.approx_count_distinct), ops.ApproxCountDistinct: reduction(sa.func.approx_count_distinct), + ops.Mode: reduction(sa.func.mode), ops.Strftime: _strftime, ops.Arbitrary: _arbitrary, ops.GroupConcat: _string_agg, diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 03997d045ee4..79c9ea995ae8 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -49,6 +49,25 @@ def mean_udf(s): lambda t: t.double_col.max(), id='max', ), + param( + # int_col % 3 so there are no ties for most common value + lambda t: (t.int_col % 3).mode(), + lambda t: (t.int_col % 3).mode().iloc[0], + id='mode', + marks=pytest.mark.notyet( + [ + "clickhouse", + "dask", + "datafusion", + "impala", + "mysql", + "pandas", + "polars", + "pyspark", + "sqlite", + ] + ), + ), param( lambda t: (t.double_col + 5).sum(), lambda t: (t.double_col + 5).sum(), @@ -247,6 +266,27 @@ def mean_and_std(v): lambda t, where: t.double_col[where].max(), id='max', ), + param( + # int_col % 3 so there are no ties for most common value + lambda t, where: (t.int_col % 3).mode(where=where), + lambda t, where: (t.int_col % 3)[where].mode().iloc[0], + id='mode', + marks=pytest.mark.notyet( + [ + "clickhouse", + "dask", + "datafusion", + "impala", + "mysql", + "pandas", + "polars", + "postgres", + "pyspark", + "snowflake", + "sqlite", + ] + ), + ), param( lambda t, where: t.double_col.argmin(t.int_col, where=where), lambda t, where: t.double_col[where].iloc[t.int_col[where].argmin()], @@ -387,9 +427,6 @@ def mean_and_std(v): lambda t, where: t.count(where=where), lambda t, where: len(t[where]), id='count_star', - marks=[ - # pytest.mark.notimpl(["polars"]), - ], ), ], ) diff --git a/ibis/expr/operations/reductions.py b/ibis/expr/operations/reductions.py index d4e705e68e28..f37103b36fde 100644 --- a/ibis/expr/operations/reductions.py +++ b/ibis/expr/operations/reductions.py @@ -177,6 +177,12 @@ class Covariance(Filterable, Reduction): output_dtype = dt.float64 +@public +class Mode(Filterable, Reduction): + arg = rlz.column(rlz.any) + output_dtype = rlz.dtype_like('arg') + + @public class Max(Filterable, Reduction): arg = rlz.column(rlz.any) diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index 654b33a9f362..bb56e69050d8 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -573,6 +573,10 @@ def approx_median( """ return ops.ApproxMedian(self, where).to_expr().name("approx_median") + def mode(self, where: ir.BooleanValue | None = None) -> Scalar: + """Return the mode of a column.""" + return ops.Mode(self, where).to_expr().name("mode") + def max(self, where: ir.BooleanValue | None = None) -> Scalar: """Return the maximum of a column.""" return ops.Max(self, where).to_expr().name("max")