From 791da7fc01c72eec929a00460d10bdfe482c1fe6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 27 Jul 2024 08:30:16 -0400 Subject: [PATCH] Add additional documentation and examples to ArrayAccessor --- arrow-array/src/array/mod.rs | 79 ++++++++++++++++++++++++++++++++++-- arrow/src/lib.rs | 26 ++++++++---- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b115ff9c14cc..50c5699bac32 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -437,13 +437,84 @@ impl<'a, T: Array> Array for &'a T { /// A generic trait for accessing the values of an [`Array`] /// +/// This trait helps write specialized implementations of algorithms for +/// different array types. Specialized implementations allow the compiler +/// to optimize the code for the specific array type, which can lead to +/// significant performance improvements. +/// +/// # Example +/// For example, to write three different implementations of a string length function +/// for [`StringArray`], [`LargeStringArray`], and [`StringViewArray`], you can write +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayAccessor, ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray}; +/// # use arrow_buffer::ArrowNativeType; +/// # use arrow_array::cast::AsArray; +/// # use arrow_array::iterator::ArrayIter; +/// # use arrow_array::types::{Int32Type, Int64Type}; +/// # use arrow_schema::{ArrowError, DataType}; +/// /// This function takes a dynamically typed `ArrayRef` and calls +/// /// one of three specialized implementations +/// fn character_length(arg: ArrayRef) -> Result<ArrayRef, ArrowError> { +/// match arg.data_type() { +/// DataType::Utf8 => { +/// // downcast the ArrayRef to a StringArray and call the specialized implementation +/// let string_array = arg.as_string::<i32>(); +/// character_length_general::<Int32Type, _>(string_array) +/// } +/// DataType::LargeUtf8 => { +/// 
character_length_general::<Int64Type, _>(arg.as_string::<i64>()) +/// } +/// DataType::Utf8View => { +/// character_length_general::<Int32Type, _>(arg.as_string_view()) +/// } +/// _ => Err(ArrowError::InvalidArgumentError("Unsupported data type".to_string())), +/// } +/// } +/// +/// /// A generic implementation of the character_length function +/// /// This function uses the `ArrayAccessor` trait to access the values of the array +/// /// so the compiler can generate specialized implementations for different array types +/// /// +/// /// Returns a new array with the length of each string in the input array +/// /// * Int32Array for Utf8 and Utf8View arrays (lengths are 32-bit integers) +/// /// * Int64Array for LargeUtf8 arrays (lengths are 64-bit integers) +/// /// +/// /// This is generic on the type of the primitive array (different string arrays have +/// /// different lengths) and the type of the array accessor (different string arrays +/// /// have different ways to access the values) +/// fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>( +/// array: V, +/// ) -> Result<ArrayRef, ArrowError> +/// where +/// T::Native: OffsetSizeTrait, +/// { +/// let iter = ArrayIter::new(array); +/// // Create an Int32Array / Int64Array with the length of each string +/// let result = iter +/// .map(|string| { +/// string.map(|string: &str| { +/// T::Native::from_usize(string.chars().count()) +/// .expect("should not fail as string.chars will always return integer") +/// }) +/// }) +/// .collect::<PrimitiveArray<T>>(); +/// +/// // Return the result as a new ArrayRef (dynamically typed) +/// Ok(Arc::new(result) as ArrayRef) +/// } +/// ``` +/// /// # Validity /// -/// An [`ArrayAccessor`] must always return a well-defined value for an index that is -/// within the bounds `0..Array::len`, including for null indexes where [`Array::is_null`] is true. 
+/// An [`ArrayAccessor`] must always return a well-defined value for an index +/// that is within the bounds `0..Array::len`, including for null indexes where +/// [`Array::is_null`] is true. /// -/// The value at null indexes is unspecified, and implementations must not rely on a specific -/// value such as [`Default::default`] being returned, however, it must not be undefined +/// The value at null indexes is unspecified, and implementations must not rely +/// on a specific value such as [`Default::default`] being returned, however, it +/// must not be undefined pub trait ArrayAccessor: Array { /// The Arrow type of the element being accessed. type Item: Send + Sync; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index ea8dfb36b2aa..8796caf43e3d 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -40,8 +40,10 @@ //! assert_eq!(array.values(), &[1, 0, 3]) //! ``` //! -//! It is also possible to write generic code. For example, the following is generic over -//! all primitively typed arrays +//! It is also possible to write generic code for different concrete types. +//! For example, since the following function is generic over all primitively +//! typed arrays, when invoked the Rust compiler will generate specialized implementations +//! with optimized code for each concrete type. //! //! ```rust //! # use std::iter::Sum; @@ -60,7 +62,10 @@ //! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! -//! And the following is generic over all arrays with comparable values +//! And the following uses [`ArrayAccessor`] to implement a generic function +//! over all arrays with comparable values. +//! +//! [`ArrayAccessor`]: array::ArrayAccessor //! //! ```rust //! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray}; @@ -81,10 +86,11 @@ //! //! # Type Erasure / Trait Objects //! -//! It is often the case that code wishes to handle any type of array, without necessarily knowing -//! its concrete type. 
This use-case is catered for by a combination of [`Array`] -//! and [`DataType`](datatypes::DataType), with the former providing a type-erased container for -//! the array, and the latter identifying the concrete type of array. +//! It is common to write code that handles any type of array, without necessarily +//! knowing its concrete type. This is done using the [`Array`] trait and using +//! [`DataType`] to determine the appropriate `downcast_ref`. +//! +//! [`DataType`]: datatypes::DataType //! //! ```rust //! # use arrow::array::{Array, Float32Array}; @@ -96,14 +102,18 @@ //! //! fn impl_dyn(array: &dyn Array) { //! match array.data_type() { +//! // downcast `dyn Array` to concrete `StringArray` //! DataType::Utf8 => impl_string(array.as_any().downcast_ref().unwrap()), +//! // downcast `dyn Array` to concrete `Float32Array` //! DataType::Float32 => impl_f32(array.as_any().downcast_ref().unwrap()), //! _ => unimplemented!() //! } //! } //! ``` //! -//! To facilitate downcasting, the [`AsArray`](crate::array::AsArray) extension trait can be used +//! You can use the [`AsArray`] extension trait to facilitate downcasting: +//! +//! [`AsArray`]: crate::array::AsArray //! //! ```rust //! # use arrow::array::{Array, Float32Array, AsArray};