From 36fd1523a54fd58aa4657109ac1fe579c2c8259d Mon Sep 17 00:00:00 2001
From: Jim Crist-Harif <jcristharif@gmail.com>
Date: Thu, 3 Nov 2022 09:34:24 -0500
Subject: [PATCH] feat(duckdb): implement `mode` aggregation

---
 ibis/backends/duckdb/registry.py        |  1 +
 ibis/backends/tests/test_aggregation.py | 43 +++++++++++++++++++++++--
 ibis/expr/operations/reductions.py      |  6 ++++
 ibis/expr/types/generic.py              |  4 +++
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/ibis/backends/duckdb/registry.py b/ibis/backends/duckdb/registry.py
index 747477d88231..a94c4c47435a 100644
--- a/ibis/backends/duckdb/registry.py
+++ b/ibis/backends/duckdb/registry.py
@@ -222,6 +222,7 @@ def _struct_column(t, op):
         ),
         ops.HLLCardinality: reduction(sa.func.approx_count_distinct),
         ops.ApproxCountDistinct: reduction(sa.func.approx_count_distinct),
+        ops.Mode: reduction(sa.func.mode),
         ops.Strftime: _strftime,
         ops.Arbitrary: _arbitrary,
         ops.GroupConcat: _string_agg,
diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py
index 03997d045ee4..79c9ea995ae8 100644
--- a/ibis/backends/tests/test_aggregation.py
+++ b/ibis/backends/tests/test_aggregation.py
@@ -49,6 +49,25 @@ def mean_udf(s):
         lambda t: t.double_col.max(),
         id='max',
     ),
+    param(
+        # use int_col % 3 so that there are no ties for the most common value
+        lambda t: (t.int_col % 3).mode(),
+        lambda t: (t.int_col % 3).mode().iloc[0],
+        id='mode',
+        marks=pytest.mark.notyet(
+            [
+                "clickhouse",
+                "dask",
+                "datafusion",
+                "impala",
+                "mysql",
+                "pandas",
+                "polars",
+                "pyspark",
+                "sqlite",
+            ]
+        ),
+    ),
     param(
         lambda t: (t.double_col + 5).sum(),
         lambda t: (t.double_col + 5).sum(),
@@ -247,6 +266,27 @@ def mean_and_std(v):
             lambda t, where: t.double_col[where].max(),
             id='max',
         ),
+        param(
+            # use int_col % 3 so that there are no ties for the most common value
+            lambda t, where: (t.int_col % 3).mode(where=where),
+            lambda t, where: (t.int_col % 3)[where].mode().iloc[0],
+            id='mode',
+            marks=pytest.mark.notyet(
+                [
+                    "clickhouse",
+                    "dask",
+                    "datafusion",
+                    "impala",
+                    "mysql",
+                    "pandas",
+                    "polars",
+                    "postgres",
+                    "pyspark",
+                    "snowflake",
+                    "sqlite",
+                ]
+            ),
+        ),
         param(
             lambda t, where: t.double_col.argmin(t.int_col, where=where),
             lambda t, where: t.double_col[where].iloc[t.int_col[where].argmin()],
@@ -387,9 +427,6 @@ def mean_and_std(v):
             lambda t, where: t.count(where=where),
             lambda t, where: len(t[where]),
             id='count_star',
-            marks=[
-                # pytest.mark.notimpl(["polars"]),
-            ],
         ),
     ],
 )
diff --git a/ibis/expr/operations/reductions.py b/ibis/expr/operations/reductions.py
index d4e705e68e28..f37103b36fde 100644
--- a/ibis/expr/operations/reductions.py
+++ b/ibis/expr/operations/reductions.py
@@ -177,6 +177,12 @@ class Covariance(Filterable, Reduction):
     output_dtype = dt.float64
 
 
+@public
+class Mode(Filterable, Reduction):  # most frequent value of `arg`
+    arg = rlz.column(rlz.any)
+    output_dtype = rlz.dtype_like('arg')
+
+
 @public
 class Max(Filterable, Reduction):
     arg = rlz.column(rlz.any)
diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py
index 654b33a9f362..bb56e69050d8 100644
--- a/ibis/expr/types/generic.py
+++ b/ibis/expr/types/generic.py
@@ -573,6 +573,10 @@ def approx_median(
         """
         return ops.ApproxMedian(self, where).to_expr().name("approx_median")
 
+    def mode(self, where: ir.BooleanValue | None = None) -> Scalar:
+        """Return the mode (most frequent value) of a column."""
+        return ops.Mode(self, where).to_expr().name("mode")
+
     def max(self, where: ir.BooleanValue | None = None) -> Scalar:
         """Return the maximum of a column."""
         return ops.Max(self, where).to_expr().name("max")