diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index 0d5b269b048c..80c6f9b4ddd0 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -2226,3 +2226,21 @@ def test_null_isin_null_is_null(con):
     t = ibis.memtable({"x": [1]})
     expr = t.x.isin([None])
     assert pd.isna(con.to_pandas(expr).iat[0])
+
+
+def test_value_counts_on_tables(backend, df):  # Table.value_counts() vs. a pandas groupby-count
+    if backend.name() == "dask":
+        pytest.skip(reason="flaky errors about sorting on multi-partition dataframes")
+    from ibis import selectors as s  # local import; s.all() is used below to order by every column
+
+    t = backend.functional_alltypes
+    expr = t[["bigint_col", "int_col"]].value_counts().order_by(s.all())
+    result = expr.execute()
+    expected = (
+        df.groupby(["bigint_col", "int_col"])
+        .string_col.count()  # pandas count() skips nulls; presumably string_col has none (verify)
+        .reset_index()
+        .rename(columns=dict(string_col="bigint_col_int_col_count"))  # "_".join(columns) + "_count"
+    )
+    expected = expected.sort_values(expected.columns.tolist()).reset_index(drop=True)  # sort on every column, like order_by(s.all())
+    backend.assert_frame_equal(result, expected, check_dtype=False)  # dtypes are not compared
diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
index 486ea45e97b2..b9374b2b6a80 100644
--- a/ibis/expr/types/relations.py
+++ b/ibis/expr/types/relations.py
@@ -4516,6 +4516,63 @@ def window_by(self, time_col: ir.Value) -> WindowedTable:
 
         return WindowedTable(self, time_col)
 
+    def value_counts(self) -> ir.Table:
+        """Compute a frequency table of this table's values.
+
+        Returns
+        -------
+        Table
+            All columns of this table plus a `<column names joined by _>_count` column.
+
+        Examples
+        --------
+        >>> from ibis import examples
+        >>> ibis.options.interactive = True
+        >>> t = examples.penguins.fetch()
+        >>> t.head()
+        ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓
+        ┃ species ┃ island    ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃
+        ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩
+        │ string  │ string    │ float64        │ float64       │ int64             │ … │
+        ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤
+        │ Adelie  │ Torgersen │           39.1 │          18.7 │               181 │ … │
+        │ Adelie  │ Torgersen │           39.5 │          17.4 │               186 │ … │
+        │ Adelie  │ Torgersen │           40.3 │          18.0 │               195 │ … │
+        │ Adelie  │ Torgersen │           NULL │          NULL │              NULL │ … │
+        │ Adelie  │ Torgersen │           36.7 │          19.3 │               193 │ … │
+        └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘
+        >>> t.year.value_counts().order_by("year")
+        ┏━━━━━━━┳━━━━━━━━━━━━┓
+        ┃ year  ┃ year_count ┃
+        ┡━━━━━━━╇━━━━━━━━━━━━┩
+        │ int64 │ int64      │
+        ├───────┼────────────┤
+        │  2007 │        110 │
+        │  2008 │        114 │
+        │  2009 │        120 │
+        └───────┴────────────┘
+        >>> t[["year", "island"]].value_counts().order_by("year", "island")
+        ┏━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
+        ┃ year  ┃ island    ┃ year_island_count ┃
+        ┡━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
+        │ int64 │ string    │ int64             │
+        ├───────┼───────────┼───────────────────┤
+        │  2007 │ Biscoe    │                44 │
+        │  2007 │ Dream     │                46 │
+        │  2007 │ Torgersen │                20 │
+        │  2008 │ Biscoe    │                64 │
+        │  2008 │ Dream     │                34 │
+        │  2008 │ Torgersen │                16 │
+        │  2009 │ Biscoe    │                60 │
+        │  2009 │ Dream     │                44 │
+        │  2009 │ Torgersen │                16 │
+        └───────┴───────────┴───────────────────┘
+        """
+        columns = self.columns  # every column of the table becomes a grouping key
+        return self.group_by(columns).agg(
+            lambda t: t.count().name("_".join(columns) + "_count")  # e.g. "year_island_count"
+        )
+
 
 @public
 class CachedTable(Table):