diff --git a/arro3-compute/python/arro3/compute/_compute.pyi b/arro3-compute/python/arro3/compute/_compute.pyi index 925d998..1085721 100644 --- a/arro3-compute/python/arro3/compute/_compute.pyi +++ b/arro3-compute/python/arro3/compute/_compute.pyi @@ -47,19 +47,19 @@ def list_flatten( """ @overload -def list_offsets(input: ArrowArrayExportable, *, physical: bool = True) -> Array: ... +def list_offsets(input: ArrowArrayExportable, *, logical: bool = True) -> Array: ... @overload def list_offsets( - input: ArrowStreamExportable, *, physical: bool = True + input: ArrowStreamExportable, *, logical: bool = True ) -> ArrayReader: ... def list_offsets( - input: ArrowArrayExportable | ArrowStreamExportable, *, physical: bool = True + input: ArrowArrayExportable | ArrowStreamExportable, *, logical: bool = True ) -> Array | ArrayReader: """Access the offsets of this ListArray or LargeListArray Args: input: _description_ - physical: If True, return the physical (unsliced) offsets of the provided list array. Slicing offsets (False) is not yet implemented. + physical: If False, return the physical (unsliced) offsets of the provided list array. If True, adjust the list offsets for the current array slicing. Defaults to `True`. Raises: Exception if not a list-typed array. diff --git a/arro3-compute/src/list_offsets.rs b/arro3-compute/src/list_offsets.rs index 5738a6d..a2a5d86 100644 --- a/arro3-compute/src/list_offsets.rs +++ b/arro3-compute/src/list_offsets.rs @@ -1,7 +1,8 @@ use std::sync::Arc; use arrow::array::AsArray; -use arrow_array::{ArrayRef, Int32Array, Int64Array}; +use arrow_array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; use arrow_schema::{ArrowError, DataType, Field}; use pyo3::prelude::*; use pyo3_arrow::error::PyArrowResult; @@ -10,12 +11,12 @@ use pyo3_arrow::input::AnyArray; use pyo3_arrow::{PyArray, PyArrayReader}; #[pyfunction] -#[pyo3(signature = (input, *, physical=true))] -pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResult { +#[pyo3(signature = (input, *, logical=true))] +pub fn list_offsets(py: Python, input: AnyArray, logical: bool) -> PyArrowResult { match input { AnyArray::Array(array) => { let (array, _field) = array.into_inner(); - let offsets = _list_offsets(array, physical)?; + let offsets = _list_offsets(array, logical)?; Ok(PyArray::from_array_ref(offsets).to_arro3(py)?) } AnyArray::Stream(stream) => { @@ -31,7 +32,7 @@ pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResul }; let iter = reader.into_iter().map(move |array| { - let out = _list_offsets(array?, physical)?; + let out = _list_offsets(array?, logical)?; Ok(out) }); Ok( @@ -42,26 +43,51 @@ pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResul } } -fn _list_offsets(array: ArrayRef, physical: bool) -> Result { - if !physical { - return Err(ArrowError::ComputeError( - "Logical list offset slicing not yet implemented".to_string(), - )); - } +fn _list_offsets(array: ArrayRef, logical: bool) -> Result { + let offset = array.offset(); + let length = array.len(); match array.data_type() { DataType::List(_) => { let arr = array.as_list::(); let offsets = arr.offsets(); - Ok(Arc::new(Int32Array::from(offsets.to_vec()))) + let offsets = if logical { + slice_offsets(offsets, offset, length) + } else { + offsets.clone().into_inner() + }; + Ok(Arc::new(Int32Array::new(offsets, None))) } DataType::LargeList(_) => { let arr = array.as_list::(); let offsets = arr.offsets(); - Ok(Arc::new(Int64Array::from(offsets.to_vec()))) + let offsets = if logical { + slice_offsets(offsets, offset, length) + } else { + offsets.clone().into_inner() + }; + Ok(Arc::new(Int64Array::new(offsets, None))) } _ => Err(ArrowError::SchemaError( "Expected list-typed Array".to_string(), )), } } + +fn slice_offsets( + offsets: &OffsetBuffer, + offset: usize, + length: usize, +) -> ScalarBuffer { + let sliced = offsets.slice(offset, length); + let first_offset = sliced.first().copied().unwrap_or(O::zero()); + if first_offset.to_usize().unwrap() == 0 { + sliced.into_inner() + } else { + let mut new_offsets = Vec::with_capacity(sliced.len()); + for value in sliced.iter() { + new_offsets.push(*value - first_offset); + } + ScalarBuffer::from(new_offsets) + } +} diff --git a/tests/compute/test_list_offsets.py b/tests/compute/test_list_offsets.py new file mode 100644 index 0000000..c9452db --- /dev/null +++ b/tests/compute/test_list_offsets.py @@ -0,0 +1,30 @@ +import pyarrow as pa +from arro3.compute import list_offsets + + +def test_list_flatten(): + list_arr = pa.array([[1, 2], [3, 4]]) + out = pa.array(list_offsets(list_arr)) + assert out == list_arr.offsets + + +def test_list_flatten_sliced_end(): + list_arr = pa.array([[1, 2], [3, 4]]) + sliced = list_arr.slice(1, 1) + + out = pa.array(list_offsets(sliced, logical=False)) + assert out == pa.array([2, 4], type=pa.int32()) + + out = pa.array(list_offsets(sliced, logical=True)) + assert out == pa.array([0, 2], type=pa.int32()) + + +def test_list_flatten_sliced_start(): + list_arr = pa.array([[1, 2], [3, 4]]) + sliced = list_arr.slice(0, 1) + + out = pa.array(list_offsets(sliced, logical=False)) + assert out == pa.array([0, 2], type=pa.int32()) + + out = pa.array(list_offsets(sliced, logical=True)) + assert out == pa.array([0, 2], type=pa.int32())