Skip to content

Commit

Permalink
fix: Support BinaryView in row decoder to prevent a panic in streaming group by (#15117)
Browse files Browse the repository at this point in the history

Co-authored-by: ritchie <ritchie46@gmail.com>
  • Loading branch information
mkysylov and ritchie46 authored Mar 18, 2024
1 parent f3d799a commit c0fada8
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
4 changes: 3 additions & 1 deletion crates/polars-row/src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ unsafe fn decode(rows: &mut [&[u8]], field: &SortField, data_type: &ArrowDataTyp
match data_type {
ArrowDataType::Null => NullArray::new(ArrowDataType::Null, rows.len()).to_boxed(),
ArrowDataType::Boolean => decode_bool(rows, field).to_boxed(),
ArrowDataType::LargeBinary => decode_binview(rows, field).to_boxed(),
ArrowDataType::BinaryView | ArrowDataType::LargeBinary => {
decode_binview(rows, field).to_boxed()
},
ArrowDataType::Utf8View => {
let arr = decode_binview(rows, field);
arr.to_utf8view_unchecked().boxed()
Expand Down
27 changes: 27 additions & 0 deletions py-polars/tests/unit/streaming/test_streaming_group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,3 +453,30 @@ def test_streaming_group_null_count() -> None:
assert df.group_by("g").agg(pl.col("a").count()).collect(streaming=True).to_dict(
as_series=False
) == {"g": [1], "a": [3]}


def test_streaming_groupby_binary_15116() -> None:
    """Group by a Binary-cast key column in streaming mode and check the counts.

    The string column is cast to ``pl.Binary``, grouped, counted with
    ``pl.len()``, sorted by key, and collected with ``streaming=True``.
    The result must contain one row per distinct key with its occurrence count.
    """
    # Keys of several different lengths, with "A" repeated non-contiguously.
    raw_values = [
        "A",
        "A",
        "BB",
        "BB",
        "CCCC",
        "CCCC",
        "DDDDDDDD",
        "DDDDDDDD",
        "EEEEEEEEEEEEEEEE",
        "A",
    ]

    binary_frame = pl.LazyFrame({"str": raw_values}).select(
        [pl.col("str").cast(pl.Binary)]
    )
    grouped = binary_frame.group_by(["str"]).agg([pl.len().alias("count")])

    # Sorting by key makes the otherwise unordered group-by output comparable.
    result = grouped.sort("str").collect(streaming=True).to_dict(as_series=False)

    expected = {
        "str": [b"A", b"BB", b"CCCC", b"DDDDDDDD", b"EEEEEEEEEEEEEEEE"],
        "count": [3, 2, 2, 2, 1],
    }
    assert result == expected

0 comments on commit c0fada8

Please sign in to comment.