From 8cc6e690a4164969d05e4ec789fca92f0bc07ebc Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Mon, 5 Jun 2023 16:57:48 -0700 Subject: [PATCH] [Images] [4/N] Add `Image` series --> Python (NumPy) series casting. (#990) This PR adds an `Image` series --> Python egress path via the casting kernel, where elements in an `ImageArray` series are converted to NumPy ndarrays when hit with a `.cast(DataType::Python)`. This is added for both the fixed-shape image and variable-shape image arrays. Automatic conversion of PIL images on `Series.from_pylist()` and to NumPy ndarrays on `Series.to_pylist()` is saved for a future PR. --- src/array/ops/as_arrow.rs | 9 +++- src/array/ops/broadcast.rs | 12 +++-- src/array/ops/cast.rs | 103 +++++++++++++++++++++++++++++++++---- src/array/ops/image.rs | 14 ++--- tests/series/test_image.py | 94 ++++++++++++++++----------------- 5 files changed, 160 insertions(+), 72 deletions(-) diff --git a/src/array/ops/as_arrow.rs b/src/array/ops/as_arrow.rs index 9fd5dab97e..4dff8185fd 100644 --- a/src/array/ops/as_arrow.rs +++ b/src/array/ops/as_arrow.rs @@ -10,6 +10,11 @@ use crate::{ }, }; +#[cfg(feature = "python")] +use crate::array::pseudo_arrow::PseudoArrowArray; +#[cfg(feature = "python")] +use crate::datatypes::PythonArray; + pub trait AsArrow { type Output; @@ -92,8 +97,8 @@ impl AsArrow for StructArray { } #[cfg(feature = "python")] -impl AsArrow for crate::datatypes::PythonArray { - type Output = crate::array::pseudo_arrow::PseudoArrowArray; +impl AsArrow for PythonArray { + type Output = PseudoArrowArray; // downcasts a DataArray to a PseudoArrowArray of PyObject. fn as_arrow(&self) -> &Self::Output { diff --git a/src/array/ops/broadcast.rs b/src/array/ops/broadcast.rs index 1d9b166e04..0d59839a16 100644 --- a/src/array/ops/broadcast.rs +++ b/src/array/ops/broadcast.rs @@ -9,6 +9,11 @@ use crate::{ use super::as_arrow::AsArrow; +#[cfg(feature = "python")] +use crate::array::pseudo_arrow::PseudoArrowArray; +#[cfg(feature = "python")] +use crate::datatypes::PythonArray; + pub trait Broadcastable { fn broadcast(&self, num: usize) -> DaftResult where @@ -292,9 +297,8 @@ impl Broadcastable for crate::datatypes::PythonArray { } }; - let repeated_values_array: Box = Box::new( - crate::array::pseudo_arrow::PseudoArrowArray::new(repeated_values.into(), validity), - ); - crate::datatypes::PythonArray::new(self.field.clone(), repeated_values_array) + let repeated_values_array: Box = + Box::new(PseudoArrowArray::new(repeated_values.into(), validity)); + PythonArray::new(self.field.clone(), repeated_values_array) } } diff --git a/src/array/ops/cast.rs b/src/array/ops/cast.rs index 6f25b967f4..68848add5f 100644 --- a/src/array/ops/cast.rs +++ b/src/array/ops/cast.rs @@ -5,32 +5,40 @@ use arrow2::compute::{ use crate::series::IntoSeries; use crate::{ - array::{ops::image::ImageArrayVecs, DataArray}, + array::DataArray, datatypes::logical::{ DateArray, EmbeddingArray, FixedShapeImageArray, ImageArray, LogicalArray, }, datatypes::{DaftArrowBackedType, DataType, Field, Utf8Array}, error::{DaftError, DaftResult}, series::Series, - with_match_arrow_daft_types, with_match_daft_logical_types, with_match_numeric_daft_types, + with_match_arrow_daft_types, with_match_daft_logical_types, }; -use arrow2::array::Array; -use num_traits::NumCast; #[cfg(feature = "python")] -use crate::datatypes::{FixedSizeListArray, ListArray}; +use crate::array::{ops::image::ImageArrayVecs, pseudo_arrow::PseudoArrowArray}; +#[cfg(feature = "python")] +use crate::datatypes::{FixedSizeListArray, ImageMode, ListArray, PythonArray}; +#[cfg(feature = "python")] +use crate::ffi; #[cfg(feature = "python")] -use crate::datatypes::{ImageMode, PythonArray}; +use crate::with_match_numeric_daft_types; +#[cfg(feature = "python")] +use arrow2::array::Array; #[cfg(feature = "python")] use log; #[cfg(feature = "python")] -use num_traits::ToPrimitive; +use ndarray::IntoDimension; +#[cfg(feature = "python")] +use num_traits::{NumCast, ToPrimitive}; #[cfg(feature = "python")] -use numpy::PyReadonlyArrayDyn; +use numpy::{PyArray3, PyReadonlyArrayDyn}; #[cfg(feature = "python")] use pyo3::prelude::*; #[cfg(feature = "python")] use std::iter; +#[cfg(feature = "python")] +use std::ops::Deref; use super::as_arrow::AsArrow; use std::sync::Arc; @@ -701,12 +709,87 @@ impl EmbeddingArray { impl ImageArray { pub fn cast(&self, dtype: &DataType) -> DaftResult { - self.physical.cast(dtype) + match dtype { + #[cfg(feature = "python")] + DataType::Python => Python::with_gil(|py| { + let mut ndarrays = Vec::with_capacity(self.len()); + let da = self.data_array(); + let ca = self.channel_array(); + let ha = self.height_array(); + let wa = self.width_array(); + let pyarrow = py.import("pyarrow")?; + for (i, arrow_array) in da.iter().enumerate() { + let shape = ( + ha.value(i) as usize, + wa.value(i) as usize, + ca.value(i) as usize, + ); + let py_array = match arrow_array { + Some(arrow_array) => ffi::to_py_array(arrow_array, py, pyarrow)? + .call_method1(py, pyo3::intern!(py, "to_numpy"), (false,))? + .call_method1(py, pyo3::intern!(py, "reshape"), (shape,))?, + None => PyArray3::::zeros(py, shape.into_dimension(), false) + .deref() + .to_object(py), + }; + ndarrays.push(py_array); + } + let values_array = + PseudoArrowArray::new(ndarrays.into(), self.as_arrow().validity().cloned()); + Ok(PythonArray::new( + Field::new(self.name(), dtype.clone()).into(), + values_array.to_boxed(), + )? + .into_series()) + }), + _ => self.physical.cast(dtype), + } } } impl FixedShapeImageArray { pub fn cast(&self, dtype: &DataType) -> DaftResult { - self.physical.cast(dtype) + match (dtype, self.logical_type()) { + #[cfg(feature = "python")] + (DataType::Python, DataType::FixedShapeImage(_, mode, height, width)) => { + pyo3::Python::with_gil(|py| { + let shape = ( + self.len(), + *height as usize, + *width as usize, + mode.num_channels() as usize, + ); + let pyarrow = py.import("pyarrow")?; + // Only go through FFI layer once instead of for every image. + // We create an (N, H, W, C) ndarray view on the entire image array + // buffer sans the validity mask, and then create a subndarray view + // for each image ndarray in the PythonArray. + let py_array = ffi::to_py_array( + self.as_arrow().values().with_validity(None), + py, + pyarrow, + )? + .call_method1(py, pyo3::intern!(py, "to_numpy"), (false,))? + .call_method1( + py, + pyo3::intern!(py, "reshape"), + (shape,), + )?; + let ndarrays = py_array + .as_ref(py) + .iter()? + .map(|a| a.unwrap().to_object(py)) + .collect::>(); + let values_array = + PseudoArrowArray::new(ndarrays.into(), self.as_arrow().validity().cloned()); + Ok(PythonArray::new( + Field::new(self.name(), dtype.clone()).into(), + values_array.to_boxed(), + )? + .into_series()) + }) + } + (_, _) => self.physical.cast(dtype), + } } } diff --git a/src/array/ops/image.rs b/src/array/ops/image.rs index 66c85dbad0..f6740a9f85 100644 --- a/src/array/ops/image.rs +++ b/src/array/ops/image.rs @@ -184,42 +184,42 @@ pub struct ImageArrayVecs { } impl ImageArray { - fn image_mode(&self) -> &Option { + pub fn image_mode(&self) -> &Option { match self.logical_type() { DataType::Image(_, mode) => mode, _ => panic!("Expected dtype to be Image"), } } - fn data_array(&self) -> &arrow2::array::ListArray { + pub fn data_array(&self) -> &arrow2::array::ListArray { let p = self.physical.as_arrow(); const IMAGE_DATA_IDX: usize = 0; let array = p.values().get(IMAGE_DATA_IDX).unwrap(); array.as_ref().as_any().downcast_ref().unwrap() } - fn channel_array(&self) -> &arrow2::array::UInt16Array { + pub fn channel_array(&self) -> &arrow2::array::UInt16Array { let p = self.physical.as_arrow(); const IMAGE_CHANNEL_IDX: usize = 1; let array = p.values().get(IMAGE_CHANNEL_IDX).unwrap(); array.as_ref().as_any().downcast_ref().unwrap() } - fn height_array(&self) -> &arrow2::array::UInt32Array { + pub fn height_array(&self) -> &arrow2::array::UInt32Array { let p = self.physical.as_arrow(); const IMAGE_HEIGHT_IDX: usize = 2; let array = p.values().get(IMAGE_HEIGHT_IDX).unwrap(); array.as_ref().as_any().downcast_ref().unwrap() } - fn width_array(&self) -> &arrow2::array::UInt32Array { + pub fn width_array(&self) -> &arrow2::array::UInt32Array { let p = self.physical.as_arrow(); const IMAGE_WIDTH_IDX: usize = 3; let array = p.values().get(IMAGE_WIDTH_IDX).unwrap(); array.as_ref().as_any().downcast_ref().unwrap() } - fn mode_array(&self) -> &arrow2::array::UInt8Array { + pub fn mode_array(&self) -> &arrow2::array::UInt8Array { let p = self.physical.as_arrow(); const IMAGE_MODE_IDX: usize = 4; let array = p.values().get(IMAGE_MODE_IDX).unwrap(); @@ -297,7 +297,7 @@ impl ImageArray { )) } - fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { + pub fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { assert!(idx < self.len()); if !self.physical.is_valid(idx) { return None; diff --git a/tests/series/test_image.py b/tests/series/test_image.py index 6b5521667a..f41a76ae9f 100644 --- a/tests/series/test_image.py +++ b/tests/series/test_image.py @@ -48,7 +48,7 @@ } -def test_image_arrow_round_trip(): +def test_image_round_trip(): data = [ np.arange(12, dtype=np.uint8).reshape((2, 2, 3)), np.arange(12, 39, dtype=np.uint8).reshape((3, 3, 3)), @@ -62,6 +62,16 @@ def test_image_arrow_round_trip(): assert t.datatype() == target_dtype + # Test pylist roundtrip. + back_dtype = DataType.python() + back = t.cast(back_dtype) + + assert back.datatype() == back_dtype + + out = back.to_pylist() + np.testing.assert_equal(out, data) + + # Test Arrow roundtrip. arrow_arr = t.to_arrow() assert isinstance(arrow_arr.type, DaftExtension) @@ -94,7 +104,7 @@ def test_image_arrow_round_trip(): ) def test_image_decode_pil(mode, file_format): np_dtype = MODE_TO_NP_DTYPE[mode] - img_mode = ImageMode.from_mode_string(mode) + ImageMode.from_mode_string(mode) num_channels = MODE_TO_NUM_CHANNELS[mode] shape = (4, 4) if num_channels > 1: @@ -109,12 +119,11 @@ def test_image_decode_pil(mode, file_format): t = s.image.decode() # TODO(Clark): Infer type-leve mode if all images are the same mode. assert t.datatype() == DataType.image() - for py_img in t.to_pylist(): - assert py_img["channel"] == num_channels - assert py_img["height"] == shape[0] - assert py_img["width"] == shape[1] - assert py_img["mode"] == img_mode - np.testing.assert_equal(np.array(py_img["data"]).reshape(shape).astype(np_dtype), arr) + out = t.cast(DataType.python()).to_pylist() + expected_arrs = [arr, arr, arr] + if num_channels == 1: + expected_arrs = [np.expand_dims(arr, -1) for arr in expected_arrs] + np.testing.assert_equal(out, expected_arrs) @pytest.mark.parametrize( @@ -146,7 +155,7 @@ def test_image_decode_pil(mode, file_format): ) def test_image_decode_opencv(mode, file_format): np_dtype = MODE_TO_NP_DTYPE[mode] - img_mode = ImageMode.from_mode_string(mode) + ImageMode.from_mode_string(mode) num_channels = MODE_TO_NUM_CHANNELS[mode] shape = (4, 4, num_channels) arr = np.arange(np.prod(shape)).reshape(shape).astype(np_dtype) @@ -163,12 +172,9 @@ def test_image_decode_opencv(mode, file_format): if np_dtype == np.uint8: # TODO(Clark): Infer type-leve mode if all images are the same mode. assert t.datatype() == DataType.image() - for py_img in t.to_pylist(): - assert py_img["channel"] == num_channels - assert py_img["height"] == shape[0] - assert py_img["width"] == shape[1] - assert py_img["mode"] == img_mode - np.testing.assert_equal(np.array(py_img["data"]).reshape(shape).astype(np_dtype), arr) + out = t.cast(DataType.python()).to_pylist() + expected_arrs = [arr, arr, arr] + np.testing.assert_equal(out, expected_arrs) def test_image_resize(): @@ -188,20 +194,15 @@ def test_image_resize(): resized = t.image.resize(5, 5) - as_py = resized.to_pylist() - assert resized.datatype() == target_dtype - first_resized = np.array(as_py[0]["data"]).reshape(5, 5, 3) - assert np.all(first_resized[..., 0] == 1) - assert np.all(first_resized[..., 1] == 2) - assert np.all(first_resized[..., 2] == 3) + out = resized.cast(DataType.python()).to_pylist() - sec_resized = np.array(as_py[1]["data"]).reshape(5, 5, 3) - sec_resized_gt = np.asarray(Image.fromarray(second).resize((5, 5), resample=Image.BILINEAR)) - assert np.all(sec_resized == sec_resized_gt) + def resize(arr): + # Use opencv as a resizing baseline. + return cv2.resize(arr, dsize=(5, 5), interpolation=cv2.INTER_LINEAR_EXACT) - assert as_py[2] == None + np.testing.assert_equal(out, [resize(first), resize(second), None]) def test_image_resize_mixed_modes(): @@ -229,34 +230,19 @@ def test_image_resize_mixed_modes(): resized = t.image.resize(5, 5) - as_py = resized.to_pylist() - - assert resized.datatype() == target_dtype - - first_resized = np.array(as_py[0]["data"]).reshape(5, 5, 3) - assert np.all(first_resized[..., 0] == 1) - assert np.all(first_resized[..., 1] == 2) - assert np.all(first_resized[..., 2] == 3) - - second_resized = np.array(as_py[1]["data"]).reshape(5, 5, 4) - assert np.all(second_resized[..., 0] == 1) - assert np.all(second_resized[..., 1] == 2) - assert np.all(second_resized[..., 2] == 3) - assert np.all(second_resized[..., 3] == 4) - - for i in range(2, 4): - resized_i = np.array(as_py[i]["data"]).reshape(5, 5, -1) - resized_i_gt = np.asarray(Image.fromarray(data[i]).resize((5, 5), resample=Image.BILINEAR)).reshape(5, 5, -1) - assert np.all(resized_i == resized_i_gt), f"{i} does not match" + out = resized.cast(DataType.python()).to_pylist() - # LA sampling doesn't work for some reason in PIL - resized_i = np.array(as_py[4]["data"]).reshape(5, 5, -1) - assert np.all(resized_i == 10) + def resize(arr): + # Use opencv as a resizing baseline. + arr = cv2.resize(arr, dsize=(5, 5), interpolation=cv2.INTER_LINEAR_EXACT) + if arr.ndim == 2: + arr = np.expand_dims(arr, -1) + return arr - assert as_py[-1] == None + np.testing.assert_equal(out, [resize(arr) if arr is not None else None for arr in data]) -def test_fixed_shape_image_arrow_round_trip(): +def test_fixed_shape_image_roundtrip(): height = 2 width = 2 shape = (height, width, 3) @@ -269,6 +255,16 @@ def test_fixed_shape_image_arrow_round_trip(): assert t.datatype() == target_dtype + # Test pylist roundtrip. + back_dtype = DataType.python() + back = t.cast(back_dtype) + + assert back.datatype() == back_dtype + + out = back.to_pylist() + np.testing.assert_equal(out, data) + + # Test Arrow roundtrip. arrow_arr = t.to_arrow() assert isinstance(arrow_arr.type, DaftExtension)