Skip to content

Commit

Permalink
Implement offsets slicing in list_offsets (#88)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebarron authored Jul 31, 2024
1 parent bd71cf8 commit b6cbaa4
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 17 deletions.
8 changes: 4 additions & 4 deletions arro3-compute/python/arro3/compute/_compute.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,19 @@ def list_flatten(
"""

@overload
def list_offsets(input: ArrowArrayExportable, *, physical: bool = True) -> Array: ...
def list_offsets(input: ArrowArrayExportable, *, logical: bool = True) -> Array: ...
@overload
def list_offsets(
input: ArrowStreamExportable, *, physical: bool = True
input: ArrowStreamExportable, *, logical: bool = True
) -> ArrayReader: ...
def list_offsets(
input: ArrowArrayExportable | ArrowStreamExportable, *, physical: bool = True
input: ArrowArrayExportable | ArrowStreamExportable, *, logical: bool = True
) -> Array | ArrayReader:
"""Access the offsets of this ListArray or LargeListArray
Args:
input: _description_
physical: If True, return the physical (unsliced) offsets of the provided list array. Slicing offsets (False) is not yet implemented.
physical: If False, return the physical (unsliced) offsets of the provided list array. If True, adjust the list offsets for the current array slicing. Defaults to `True`.
Raises:
Exception if not a list-typed array.
Expand Down
52 changes: 39 additions & 13 deletions arro3-compute/src/list_offsets.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use std::sync::Arc;

use arrow::array::AsArray;
use arrow_array::{ArrayRef, Int32Array, Int64Array};
use arrow_array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait};
use arrow_buffer::{OffsetBuffer, ScalarBuffer};
use arrow_schema::{ArrowError, DataType, Field};
use pyo3::prelude::*;
use pyo3_arrow::error::PyArrowResult;
Expand All @@ -10,12 +11,12 @@ use pyo3_arrow::input::AnyArray;
use pyo3_arrow::{PyArray, PyArrayReader};

#[pyfunction]
#[pyo3(signature = (input, *, physical=true))]
pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResult<PyObject> {
#[pyo3(signature = (input, *, logical=true))]
pub fn list_offsets(py: Python, input: AnyArray, logical: bool) -> PyArrowResult<PyObject> {
match input {
AnyArray::Array(array) => {
let (array, _field) = array.into_inner();
let offsets = _list_offsets(array, physical)?;
let offsets = _list_offsets(array, logical)?;
Ok(PyArray::from_array_ref(offsets).to_arro3(py)?)
}
AnyArray::Stream(stream) => {
Expand All @@ -31,7 +32,7 @@ pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResul
};

let iter = reader.into_iter().map(move |array| {
let out = _list_offsets(array?, physical)?;
let out = _list_offsets(array?, logical)?;
Ok(out)
});
Ok(
Expand All @@ -42,26 +43,51 @@ pub fn list_offsets(py: Python, input: AnyArray, physical: bool) -> PyArrowResul
}
}

fn _list_offsets(array: ArrayRef, physical: bool) -> Result<ArrayRef, ArrowError> {
if !physical {
return Err(ArrowError::ComputeError(
"Logical list offset slicing not yet implemented".to_string(),
));
}
fn _list_offsets(array: ArrayRef, logical: bool) -> Result<ArrayRef, ArrowError> {
let offset = array.offset();
let length = array.len();

match array.data_type() {
DataType::List(_) => {
let arr = array.as_list::<i32>();
let offsets = arr.offsets();
Ok(Arc::new(Int32Array::from(offsets.to_vec())))
let offsets = if logical {
slice_offsets(offsets, offset, length)
} else {
offsets.clone().into_inner()
};
Ok(Arc::new(Int32Array::new(offsets, None)))
}
DataType::LargeList(_) => {
let arr = array.as_list::<i64>();
let offsets = arr.offsets();
Ok(Arc::new(Int64Array::from(offsets.to_vec())))
let offsets = if logical {
slice_offsets(offsets, offset, length)
} else {
offsets.clone().into_inner()
};
Ok(Arc::new(Int64Array::new(offsets, None)))
}
_ => Err(ArrowError::SchemaError(
"Expected list-typed Array".to_string(),
)),
}
}

fn slice_offsets<O: OffsetSizeTrait>(
offsets: &OffsetBuffer<O>,
offset: usize,
length: usize,
) -> ScalarBuffer<O> {
let sliced = offsets.slice(offset, length);
let first_offset = sliced.first().copied().unwrap_or(O::zero());
if first_offset.to_usize().unwrap() == 0 {
sliced.into_inner()
} else {
let mut new_offsets = Vec::with_capacity(sliced.len());
for value in sliced.iter() {
new_offsets.push(*value - first_offset);
}
ScalarBuffer::from(new_offsets)
}
}
30 changes: 30 additions & 0 deletions tests/compute/test_list_offsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pyarrow as pa
from arro3.compute import list_offsets


def test_list_flatten():
list_arr = pa.array([[1, 2], [3, 4]])
out = pa.array(list_offsets(list_arr))
assert out == list_arr.offsets


def test_list_flatten_sliced_end():
list_arr = pa.array([[1, 2], [3, 4]])
sliced = list_arr.slice(1, 1)

out = pa.array(list_offsets(sliced, logical=False))
assert out == pa.array([2, 4], type=pa.int32())

out = pa.array(list_offsets(sliced, logical=True))
assert out == pa.array([0, 2], type=pa.int32())


def test_list_flatten_sliced_start():
list_arr = pa.array([[1, 2], [3, 4]])
sliced = list_arr.slice(0, 1)

out = pa.array(list_offsets(sliced, logical=False))
assert out == pa.array([0, 2], type=pa.int32())

out = pa.array(list_offsets(sliced, logical=True))
assert out == pa.array([0, 2], type=pa.int32())

0 comments on commit b6cbaa4

Please sign in to comment.