Skip to content

Commit

Permalink
feat(api): add Table.value_counts for easy group by count on multiple fields
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored May 22, 2024
1 parent e3ee67b commit aba913d
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
18 changes: 18 additions & 0 deletions ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2226,3 +2226,21 @@ def test_null_isin_null_is_null(con):
t = ibis.memtable({"x": [1]})
expr = t.x.isin([None])
assert pd.isna(con.to_pandas(expr).iat[0])


def test_value_counts_on_tables(backend, df):
    """Check ``Table.value_counts`` against an equivalent pandas group-by count.

    The ibis result over two key columns is ordered by every output column
    and compared with a pandas frame built from the same keys and sorted the
    same way.
    """
    if backend.name() == "dask":
        pytest.skip(reason="flaky errors about sorting on multi-partition dataframes")
    from ibis import selectors as s

    keys = ["bigint_col", "int_col"]

    # ibis side: frequency table over the key columns, deterministically ordered.
    table = backend.functional_alltypes
    counts = table[keys].value_counts()
    result = counts.order_by(s.all()).execute()

    # pandas side: ``count`` tallies non-null ``string_col`` values per group.
    # NOTE(review): presumably ``string_col`` has no nulls in the fixture, so
    # this equals the per-group row count -- confirm against the test data.
    grouped = df.groupby(keys).string_col.count().reset_index()
    expected = grouped.rename(columns={"string_col": "bigint_col_int_col_count"})
    sort_cols = expected.columns.tolist()
    expected = expected.sort_values(sort_cols).reset_index(drop=True)

    # Only values are compared; dtype differences are ignored via check_dtype.
    backend.assert_frame_equal(result, expected, check_dtype=False)
57 changes: 57 additions & 0 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4516,6 +4516,63 @@ def window_by(self, time_col: ir.Value) -> WindowedTable:

return WindowedTable(self, time_col)

def value_counts(self) -> ir.Table:
    """Compute a frequency table of this table's values.

    The table is grouped by all of its columns and the rows in each group
    are counted. The count column is named by joining every column name
    with ``_`` and appending ``_count``.

    Returns
    -------
    Table
        Frequency table of this table's values.

    Examples
    --------
    >>> import ibis
    >>> from ibis import examples
    >>> ibis.options.interactive = True
    >>> t = examples.penguins.fetch()
    >>> t.head()
    ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓
    ┃ species ┃ island    ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃
    ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩
    │ string  │ string    │ float64        │ float64       │ int64             │ … │
    ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤
    │ Adelie  │ Torgersen │           39.1 │          18.7 │               181 │ … │
    │ Adelie  │ Torgersen │           39.5 │          17.4 │               186 │ … │
    │ Adelie  │ Torgersen │           40.3 │          18.0 │               195 │ … │
    │ Adelie  │ Torgersen │           NULL │          NULL │              NULL │ … │
    │ Adelie  │ Torgersen │           36.7 │          19.3 │               193 │ … │
    └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘
    >>> t.year.value_counts().order_by("year")
    ┏━━━━━━━┳━━━━━━━━━━━━┓
    ┃ year  ┃ year_count ┃
    ┡━━━━━━━╇━━━━━━━━━━━━┩
    │ int64 │ int64      │
    ├───────┼────────────┤
    │  2007 │        110 │
    │  2008 │        114 │
    │  2009 │        120 │
    └───────┴────────────┘
    >>> t[["year", "island"]].value_counts().order_by("year", "island")
    ┏━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
    ┃ year  ┃ island    ┃ year_island_count ┃
    ┡━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
    │ int64 │ string    │ int64             │
    ├───────┼───────────┼───────────────────┤
    │  2007 │ Biscoe    │                44 │
    │  2007 │ Dream     │                46 │
    │  2007 │ Torgersen │                20 │
    │  2008 │ Biscoe    │                64 │
    │  2008 │ Dream     │                34 │
    │  2008 │ Torgersen │                16 │
    │  2009 │ Biscoe    │                60 │
    │  2009 │ Dream     │                44 │
    │  2009 │ Torgersen │                16 │
    └───────┴───────────┴───────────────────┘
    """
    names = self.columns
    # Output column name, e.g. "year_island_count" for columns (year, island).
    count_name = "_".join(names) + "_count"
    grouped = self.group_by(names)
    return grouped.agg(lambda t: t.count().name(count_name))


@public
class CachedTable(Table):
Expand Down

0 comments on commit aba913d

Please sign in to comment.