Skip to content

Commit

Permalink
fix(types): fix histogram bin allocation (#9711)
Browse files Browse the repository at this point in the history
## Description of changes

```
import ibis

ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20

t = ibis.range(1000).unnest().name("index").as_table()
t.select(hist=t["index"].histogram(nbins=10)).value_counts()
```

```
┏━━━━━━━┳━━━━━━━━━━━━┓
┃ hist  ┃ hist_count ┃
┡━━━━━━━╇━━━━━━━━━━━━┩
│ int64 │ int64      │
├───────┼────────────┤
│     5 │        100 │
│     9 │        100 │
│     0 │        100 │
│     3 │        100 │
│     6 │        100 │
│     2 │        100 │
│     7 │        100 │
│     8 │        100 │
│     1 │        100 │
│     4 │        100 │
└───────┴────────────┘
```

## Issues closed

* Resolves #9687.

I had to make a slight change to ``histogram`` to account for an edge case that was tested for Impala. It would fail if ``nbins`` was not passed, which is a rather niche use case because ``np.histogram`` for example requires the number of bins to be passed either explicitly or implicitly. I also found a slight quirk with the current design when fixing this because if a ``base`` is passed that is not the minimum value, it would assign those out-of-bound values smaller than the base a negative bin index. It now clips those out-of-bound values to the bin index of -1 to group them together, rather than potentially having bin indices of -1 and -2 for example, so this now aligns with how ``np.histogram`` assigns a bin index of 0 for out-of-bound values.
  • Loading branch information
christophertitchen authored Jul 30, 2024
1 parent 537fc87 commit 6634864
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 6 deletions.
11 changes: 10 additions & 1 deletion ibis/backends/tests/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1370,7 +1370,7 @@ def test_clip(backend, alltypes, df, ibis_func, pandas_func):
backend.assert_series_equal(result, expected, check_names=False)


@pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError)
@pytest.mark.notimpl(["datafusion", "polars"], raises=com.OperationNotDefinedError)
@pytest.mark.notyet(
["druid"],
raises=PyDruidProgrammingError,
Expand All @@ -1382,6 +1382,15 @@ def test_histogram(con, alltypes):
vc = hist.value_counts().sort_index()
vc_np, _bin_edges = np.histogram(alltypes.int_col.execute(), bins=n)
assert vc.tolist() == vc_np.tolist()
assert (
con.execute(
ibis.memtable({"value": range(100)})
.select(bin=_.value.histogram(10))
.value_counts()
.bin_count.nunique()
)
== 1
)


@pytest.mark.parametrize("const", ["pi", "e"])
Expand Down
13 changes: 8 additions & 5 deletions ibis/expr/types/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,16 +996,19 @@ def histogram(
f"Cannot pass both `nbins` (got {nbins}) and `binwidth` (got {binwidth})"
)

if binwidth is None or base is None:
if base is None:
base = self.min() - eps

if binwidth is None:
if nbins is None:
raise ValueError("`nbins` is required if `binwidth` is not provided")

if base is None:
base = self.min() - eps

binwidth = (self.max() - base) / nbins

return ((self - base) / binwidth).floor()
if nbins is None:
nbins = ((self.max() - base) / binwidth).ceil()

return ((self - base) / binwidth).floor().clip(-1, nbins - 1)


@public
Expand Down

0 comments on commit 6634864

Please sign in to comment.