diff --git a/src/array/ops/cast.rs b/src/array/ops/cast.rs index 768384b443..fb38555467 100644 --- a/src/array/ops/cast.rs +++ b/src/array/ops/cast.rs @@ -5,7 +5,7 @@ use arrow2::compute::{ use crate::series::IntoSeries; use crate::{ - array::DataArray, + array::{ops::image::ImageArrayVecs, DataArray}, datatypes::logical::{ DateArray, EmbeddingArray, FixedShapeImageArray, ImageArray, LogicalArray, }, @@ -18,7 +18,7 @@ use arrow2::array::Array; use num_traits::NumCast; #[cfg(feature = "python")] -use crate::datatypes::{FixedSizeListArray, ListArray, StructArray}; +use crate::datatypes::{FixedSizeListArray, ListArray}; #[cfg(feature = "python")] use crate::datatypes::{ImageMode, PythonArray}; #[cfg(feature = "python")] @@ -496,14 +496,15 @@ fn extract_python_like_to_list< } #[cfg(feature = "python")] -fn extract_python_like_to_image_struct< +fn extract_python_like_to_image_array< Tgt: numpy::Element + NumCast + ToPrimitive + arrow2::types::NativeType, >( py: Python<'_>, python_objects: &PythonArray, + dtype: &DataType, child_dtype: &DataType, mode_from_dtype: Option, -) -> DaftResult { +) -> DaftResult { // 3 dimensions - height x width x channel. let shape_size = 3; @@ -519,24 +520,8 @@ fn extract_python_like_to_image_struct< let offsets = offsets.expect("Offsets should but non-None for image struct array"); let shapes = shapes.expect("Shapes should be non-None for image struct array"); - let values_array: Box = - Box::new(arrow2::array::PrimitiveArray::from_vec(values_vec)); - - let inner_dtype = child_dtype.to_arrow()?; - - let data_dtype = arrow2::datatypes::DataType::LargeList(Box::new( - arrow2::datatypes::Field::new("data", inner_dtype, true), - )); - let validity = python_objects.as_arrow().validity(); - let data_array = Box::new(arrow2::array::ListArray::new( - data_dtype.clone(), - arrow2::offset::OffsetsBuffer::try_from(offsets)?, - values_array, - validity.cloned(), - )); - let num_rows = shapes.len(); let mut channels = Vec::::with_capacity(num_rows); @@ -591,37 +576,18 @@ fn extract_python_like_to_image_struct< child_dtype, )?) as u8); } - - let channel_array = Box::new(arrow2::array::PrimitiveArray::from_vec(channels)); - let height_array = Box::new(arrow2::array::PrimitiveArray::from_vec(heights)); - let width_array = Box::new(arrow2::array::PrimitiveArray::from_vec(widths)); - let mode_array = Box::new(arrow2::array::PrimitiveArray::from_vec(modes)); - - let struct_dtype = arrow2::datatypes::DataType::Struct(vec![ - arrow2::datatypes::Field::new("data", data_dtype, true), - arrow2::datatypes::Field::new("channel", channel_array.data_type().clone(), true), - arrow2::datatypes::Field::new("height", height_array.data_type().clone(), true), - arrow2::datatypes::Field::new("width", width_array.data_type().clone(), true), - arrow2::datatypes::Field::new("mode", mode_array.data_type().clone(), true), - ]); - - let daft_type = (&struct_dtype).into(); - - let struct_array = arrow2::array::StructArray::new( - struct_dtype, - vec![ - data_array, - channel_array, - height_array, - width_array, - mode_array, - ], - validity.cloned(), - ); - - StructArray::new( - Field::new(python_objects.name(), daft_type).into(), - Box::new(struct_array), + ImageArray::from_vecs( + python_objects.name(), + dtype.clone(), + ImageArrayVecs { + data: values_vec, + channels, + heights, + widths, + modes, + offsets, + validity: validity.cloned(), + }, ) } @@ -707,13 +673,14 @@ impl PythonArray { with_match_numeric_daft_types!(**inner_dtype, |$T| { type Tgt = <$T as DaftNumericType>::Native; pyo3::Python::with_gil(|py| { - let result = extract_python_like_to_image_struct::(py, self, inner_dtype, *mode)?; - Ok( - ImageArray::new( - Field::new(self.name(), dtype.clone()), - result, - ).into_series() - ) + let result = extract_python_like_to_image_array::(py, self, dtype, inner_dtype, *mode)?; + Ok(result.into_series()) + // Ok( + // ImageArray::new( + // Field::new(self.name(), dtype.clone()), + // result, + // ).into_series() + // ) }) }) } diff --git a/src/array/ops/image.rs b/src/array/ops/image.rs index 8a4da08ef6..66c85dbad0 100644 --- a/src/array/ops/image.rs +++ b/src/array/ops/image.rs @@ -3,8 +3,7 @@ use std::vec; use image::{ColorType, DynamicImage, ImageBuffer}; -use crate::datatypes::logical::ImageArray; -use crate::datatypes::{BinaryArray, DataType, Field, ImageMode}; +use crate::datatypes::{logical::ImageArray, BinaryArray, DataType, Field, ImageMode, StructArray}; use crate::error::{DaftError, DaftResult}; use image::{Luma, LumaA, Rgb, Rgba}; @@ -174,6 +173,16 @@ impl<'a> From for DaftImageBuffer<'a> { } } +pub struct ImageArrayVecs { + pub data: Vec, + pub channels: Vec, + pub heights: Vec, + pub widths: Vec, + pub modes: Vec, + pub offsets: Vec, + pub validity: Option, +} + impl ImageArray { fn image_mode(&self) -> &Option { match self.logical_type() { @@ -217,6 +226,77 @@ impl ImageArray { array.as_ref().as_any().downcast_ref().unwrap() } + pub fn from_vecs( + name: &str, + data_type: DataType, + vecs: ImageArrayVecs, + ) -> DaftResult { + if vecs.data.is_empty() { + // Create an all-null array if the data array is empty. + let physical_type = data_type.to_physical(); + let null_struct_array = + arrow2::array::new_null_array(physical_type.to_arrow()?, vecs.channels.len()); + let daft_struct_array = + StructArray::new(Field::new(name, physical_type).into(), null_struct_array)?; + return Ok(ImageArray::new( + Field::new(name, data_type), + daft_struct_array, + )); + } + let offsets = arrow2::offset::OffsetsBuffer::try_from(vecs.offsets)?; + let arrow_dtype: arrow2::datatypes::DataType = T::PRIMITIVE.into(); + if let DataType::Image(inner_dtype, _) = &data_type { + if inner_dtype.to_arrow()? != arrow_dtype { + panic!("Inner value dtype of provided dtype {data_type:?} is inconsistent with inferred value dtype {arrow_dtype:?}"); + } + } + + let list_datatype = arrow2::datatypes::DataType::LargeList(Box::new( + arrow2::datatypes::Field::new("data", arrow_dtype, true), + )); + let data_array = Box::new(arrow2::array::ListArray::::new( + list_datatype, + offsets, + Box::new(arrow2::array::PrimitiveArray::from_vec(vecs.data)), + vecs.validity.clone(), + )); + + let values: Vec> = vec![ + data_array, + Box::new( + arrow2::array::UInt16Array::from_vec(vecs.channels) + .with_validity(vecs.validity.clone()), + ), + Box::new( + arrow2::array::UInt32Array::from_vec(vecs.heights) + .with_validity(vecs.validity.clone()), + ), + Box::new( + arrow2::array::UInt32Array::from_vec(vecs.widths) + .with_validity(vecs.validity.clone()), + ), + Box::new( + arrow2::array::UInt8Array::from_vec(vecs.modes) + .with_validity(vecs.validity.clone()), + ), + ]; + let physical_type = data_type.to_physical(); + let struct_array = Box::new(arrow2::array::StructArray::new( + physical_type.to_arrow()?, + values, + vecs.validity, + )); + + let daft_struct_array = crate::datatypes::StructArray::new( + Field::new(name, physical_type).into(), + struct_array, + )?; + Ok(ImageArray::new( + Field::new(name, data_type), + daft_struct_array, + )) + } + fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { assert!(idx < self.len()); if !self.physical.is_valid(idx) { @@ -311,51 +391,24 @@ impl ImageArray { offsets.push(offsets.last().unwrap() + buffer.len() as i64); } - let collected_data = data_ref.concat(); - let offsets = arrow2::offset::OffsetsBuffer::try_from(offsets)?; - let value_dtype = DataType::UInt8; - let data_type = DataType::Image(Box::new(value_dtype.clone()), *image_mode); - + let data = data_ref.concat(); let validity: Option = match validity.unset_bits() { 0 => None, _ => Some(validity.into()), }; - let arrow_dtype = value_dtype.to_arrow()?; - - let list_datatype = arrow2::datatypes::DataType::LargeList(Box::new( - arrow2::datatypes::Field::new("data", arrow_dtype, true), - )); - let data_array = Box::new(arrow2::array::ListArray::::new( - list_datatype, - offsets, - Box::new(arrow2::array::PrimitiveArray::from_vec(collected_data)), - validity.clone(), - )); - - let values: Vec> = vec![ - data_array, - Box::new( - arrow2::array::UInt16Array::from_vec(channels).with_validity(validity.clone()), - ), - Box::new(arrow2::array::UInt32Array::from_vec(heights).with_validity(validity.clone())), - Box::new(arrow2::array::UInt32Array::from_vec(widths).with_validity(validity.clone())), - Box::new(arrow2::array::UInt8Array::from_vec(modes).with_validity(validity.clone())), - ]; - let physical_type = data_type.to_physical(); - let struct_array = Box::new(arrow2::array::StructArray::new( - physical_type.to_arrow()?, - values, - validity, - )); - - let daft_struct_array = crate::datatypes::StructArray::new( - Field::new(name, physical_type).into(), - struct_array, - )?; - Ok(ImageArray::new( - Field::new(name, data_type), - daft_struct_array, - )) + ImageArray::from_vecs( + name, + DataType::Image(Box::new(DataType::UInt8), *image_mode), + ImageArrayVecs { + data, + channels, + heights, + widths, + modes, + offsets, + validity, + }, + ) } } @@ -386,8 +439,8 @@ impl BinaryArray { } img_bufs.push(img_buf); } - // Series::image_decode() guarantees that we have at least one non-None element in this array. - let cached_dtype = cached_dtype.unwrap(); + // Fall back to UInt8 dtype if series is all nulls. + let cached_dtype = cached_dtype.unwrap_or(DataType::UInt8); match cached_dtype { DataType::UInt8 => Ok(ImageArray::from_daft_image_buffers(self.name(), img_bufs.as_slice(), &None)?), _ => unimplemented!("Decoding images of dtype {cached_dtype:?} is not supported, only uint8 images are supported."), diff --git a/src/series/ops/image.rs b/src/series/ops/image.rs index 0ad00ae899..4b9b4183ee 100644 --- a/src/series/ops/image.rs +++ b/src/series/ops/image.rs @@ -1,6 +1,4 @@ -use arrow2; - -use crate::datatypes::{DataType, ImageType, NullArray}; +use crate::datatypes::{DataType, ImageType}; use crate::{ error::{DaftError, DaftResult}, @@ -10,15 +8,7 @@ use crate::{ impl Series { pub fn image_decode(&self) -> DaftResult { match self.data_type() { - DataType::Binary => { - let binary_array = self.binary()?; - if binary_array.data().null_count() == self.len() { - // All images are None, so return a NullArray. - Ok(NullArray::from(("item", Box::new(arrow2::array::NullArray::new(arrow2::datatypes::DataType::Null, self.len())))).into_series()) - } else { - Ok(self.binary()?.image_decode()?.into_series()) - } - }, + DataType::Binary => Ok(self.binary()?.image_decode()?.into_series()), dtype => Err(DaftError::ValueError(format!( "Decoding in-memory data into images is only supported for binary arrays, but got {}", dtype ))),