diff --git a/arro3-compute/python/arro3/compute/_compute.pyi b/arro3-compute/python/arro3/compute/_compute.pyi
index 52405bd..925d998 100644
--- a/arro3-compute/python/arro3/compute/_compute.pyi
+++ b/arro3-compute/python/arro3/compute/_compute.pyi
@@ -46,6 +46,28 @@ def list_flatten(
         _description_
     """
 
+@overload
+def list_offsets(input: ArrowArrayExportable, *, physical: bool = True) -> Array: ...
+@overload
+def list_offsets(
+    input: ArrowStreamExportable, *, physical: bool = True
+) -> ArrayReader: ...
+def list_offsets(
+    input: ArrowArrayExportable | ArrowStreamExportable, *, physical: bool = True
+) -> Array | ArrayReader:
+    """Access the offsets of a ListArray or LargeListArray, or of every array in a stream of them.
+
+    Args:
+        input: A list-typed Arrow array (ListArray or LargeListArray), or a stream of such arrays.
+        physical: If True, return the physical (unsliced) offsets of the provided list array. Slicing offsets (False) is not yet implemented.
+
+    Raises:
+        Exception if not a list-typed array, or if `physical` is False (not yet implemented).
+
+    Returns:
+        The offsets as an Int32 array (ListArray input) or Int64 array (LargeListArray input). For stream input, an ArrayReader that yields one offsets array per input array.
+    """
+
 def struct_field(
     values: ArrowArrayExportable,
     /,
diff --git a/arro3-compute/src/lib.rs b/arro3-compute/src/lib.rs
index 63da22f..65951aa 100644
--- a/arro3-compute/src/lib.rs
+++ b/arro3-compute/src/lib.rs
@@ -3,6 +3,7 @@ use pyo3::prelude::*;
 mod cast;
 mod concat;
 mod list_flatten;
+mod list_offsets;
 mod struct_field;
 mod take;
 
@@ -22,6 +23,7 @@ fn _compute(_py: Python, m: &Bound) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(take::take))?;
 
     m.add_wrapped(wrap_pyfunction!(list_flatten::list_flatten))?;
+    m.add_wrapped(wrap_pyfunction!(list_offsets::list_offsets))?;
     m.add_wrapped(wrap_pyfunction!(struct_field::struct_field))?;
 
     Ok(())
diff --git a/arro3-compute/src/list_offsets.rs b/arro3-compute/src/list_offsets.rs
new file mode 100644
index 0000000..5738a6d
--- /dev/null
+++ b/arro3-compute/src/list_offsets.rs
@@ -0,0 +1,67 @@
+use std::sync::Arc;
+
+use arrow::array::AsArray;
+use arrow_array::{ArrayRef, Int32Array, Int64Array};
+use arrow_schema::{ArrowError, DataType, Field};
+use pyo3::prelude::*;
+use
pyo3_arrow::error::PyArrowResult; +use pyo3_arrow::ffi::ArrayIterator; +use pyo3_arrow::input::AnyArray; +use pyo3_arrow::{PyArray, PyArrayReader}; + +#[pyfunction] +#[pyo3(signature = (input, *, physical=true))] +pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResult { + match input { + AnyArray::Array(array) => { + let (array, _field) = array.into_inner(); + let offsets = _list_offsets(array, physical)?; + Ok(PyArray::from_array_ref(offsets).to_arro3(py)?) + } + AnyArray::Stream(stream) => { + let reader = stream.into_reader()?; + let out_field = match reader.field().data_type() { + DataType::List(_) => Field::new("", DataType::Int32, false), + DataType::LargeList(_) => Field::new("", DataType::Int64, false), + _ => { + return Err( + ArrowError::SchemaError("Expected list-typed Array".to_string()).into(), + ); + } + }; + + let iter = reader.into_iter().map(move |array| { + let out = _list_offsets(array?, physical)?; + Ok(out) + }); + Ok( + PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) + .to_arro3(py)?, + ) + } + } +} + +fn _list_offsets(array: ArrayRef, physical: bool) -> Result { + if !physical { + return Err(ArrowError::ComputeError( + "Logical list offset slicing not yet implemented".to_string(), + )); + } + + match array.data_type() { + DataType::List(_) => { + let arr = array.as_list::(); + let offsets = arr.offsets(); + Ok(Arc::new(Int32Array::from(offsets.to_vec()))) + } + DataType::LargeList(_) => { + let arr = array.as_list::(); + let offsets = arr.offsets(); + Ok(Arc::new(Int64Array::from(offsets.to_vec()))) + } + _ => Err(ArrowError::SchemaError( + "Expected list-typed Array".to_string(), + )), + } +}