From 1287d7fbe597e65f9030914cd11b82882235c0d1 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Mon, 23 Jan 2023 09:28:52 +0100
Subject: [PATCH] fix(rust, python): take offset into account with str.explode

---
 .../polars-core/src/chunked_array/ops/explode.rs | 14 ++++++++------
 .../src/physical_plan/planner/expr.rs            |  7 +++----
 py-polars/tests/unit/test_explode.py             | 16 ++++++++++++++++
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/polars/polars-core/src/chunked_array/ops/explode.rs b/polars/polars-core/src/chunked_array/ops/explode.rs
index 1d1ee1916c95..f67f4b39568a 100644
--- a/polars/polars-core/src/chunked_array/ops/explode.rs
+++ b/polars/polars-core/src/chunked_array/ops/explode.rs
@@ -463,13 +463,14 @@ impl ChunkExplode for Utf8Chunked {
             // capacity estimate
             let capacity = self.get_values_size() + validity.unset_bits();
 
+            let old_offsets = old_offsets.as_slice();
+            let mut old_offset = old_offsets[0];
             let mut new_offsets = Vec::with_capacity(capacity + 1);
-            new_offsets.push(0i64);
+            new_offsets.push(old_offset);
 
             let mut bitmap = MutableBitmap::with_capacity(capacity);
             let values = values.as_slice();
-            let mut old_offset = 0i64;
-            for (&offset, valid) in old_offsets.as_slice()[1..].iter().zip(validity) {
+            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
                 // safety:
                 // new_offsets already has a single value, so -1 is always in bounds
                 let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
@@ -513,12 +514,13 @@ impl ChunkExplode for Utf8Chunked {
             // capacity estimate
             let capacity = self.get_values_size();
 
+            let old_offsets = old_offsets.as_slice();
+            let mut old_offset = old_offsets[0];
             let mut new_offsets = Vec::with_capacity(capacity + 1);
-            new_offsets.push(0i64);
+            new_offsets.push(old_offset);
 
             let values = values.as_slice();
-            let mut old_offset = 0i64;
-            for &offset in &old_offsets.as_slice()[1..] {
+            for &offset in &old_offsets[1..] {
                 // safety:
                 // new_offsets already has a single value, so -1 is always in bounds
                 let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
diff --git a/polars/polars-lazy/src/physical_plan/planner/expr.rs b/polars/polars-lazy/src/physical_plan/planner/expr.rs
index 35b2e912233d..630f874a59e3 100644
--- a/polars/polars-lazy/src/physical_plan/planner/expr.rs
+++ b/polars/polars-lazy/src/physical_plan/planner/expr.rs
@@ -590,10 +590,9 @@ pub(crate) fn create_physical_expr(
         }
         Explode(expr) => {
             let input = create_physical_expr(expr, ctxt, expr_arena, schema)?;
-            let function = SpecialEq::new(Arc::new(move |s: &mut [Series]| {
-                let s = std::mem::take(&mut s[0]);
-                s.explode()
-            }) as Arc<dyn SeriesUdf>);
+            let function = SpecialEq::new(
+                Arc::new(move |s: &mut [Series]| s[0].explode()) as Arc<dyn SeriesUdf>
+            );
             Ok(Arc::new(ApplyExpr::new_minimal(
                 vec![input],
                 function,
diff --git a/py-polars/tests/unit/test_explode.py b/py-polars/tests/unit/test_explode.py
index c21d0a21b7b4..1d1c22f637b0 100644
--- a/py-polars/tests/unit/test_explode.py
+++ b/py-polars/tests/unit/test_explode.py
@@ -224,3 +224,19 @@ def test_explode_inner_lists_3985() -> None:
         .agg(pl.col("categories"))
         .with_column(pl.col("categories").arr.eval(pl.element().arr.explode()))
     ).collect().to_dict(False) == {"id": [1], "categories": [["a", "b", "a", "c"]]}
+
+
+def test_utf8_sliced_explode() -> None:
+    df = pl.DataFrame(
+        {
+            "group": ["a", "b", "b"],
+            "values": ["foo", "bar", "baz"],
+        }
+    )
+
+    assert df.groupby("group", maintain_order=True).agg(
+        pl.col("values").flatten()
+    ).to_dict(False) == {
+        "group": ["a", "b"],
+        "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]],
+    }