Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-11599: [Rust] Add function to create array with all nulls #9469

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 221 additions & 0 deletions rust/arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use std::{any::Any, convert::TryFrom};
use super::ArrayDataRef;
use super::*;
use crate::array::equal_json::JsonEqual;
use crate::buffer::{Buffer, MutableBuffer};
use crate::error::Result;
use crate::ffi;

Expand Down Expand Up @@ -326,6 +327,170 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
let data = ArrayData::new_empty(data_type);
make_array(Arc::new(data))
}
/// Creates a new array of `data_type` of length `length` filled entirely of `NULL` values
pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
// context: https://github.com/apache/arrow/pull/9469#discussion_r574761687
match data_type {
DataType::Null => Arc::new(NullArray::new(length)),
DataType::Boolean => {
let null_buf: Buffer = MutableBuffer::new_null(length).into();
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(null_buf.clone()),
0,
vec![null_buf],
vec![],
)))
}
DataType::Int8 => new_null_sized_array::<Int8Type>(data_type, length),
DataType::UInt8 => new_null_sized_array::<UInt8Type>(data_type, length),
DataType::Int16 => new_null_sized_array::<Int16Type>(data_type, length),
DataType::UInt16 => new_null_sized_array::<UInt16Type>(data_type, length),
DataType::Float16 => unreachable!(),
DataType::Int32 => new_null_sized_array::<Int32Type>(data_type, length),
DataType::UInt32 => new_null_sized_array::<UInt32Type>(data_type, length),
DataType::Float32 => new_null_sized_array::<Float32Type>(data_type, length),
DataType::Date32 => new_null_sized_array::<Date32Type>(data_type, length),
// expanding this into Date23{unit}Type results in needless branching
DataType::Time32(_) => new_null_sized_array::<Int32Type>(data_type, length),
DataType::Int64 => new_null_sized_array::<Int64Type>(data_type, length),
DataType::UInt64 => new_null_sized_array::<UInt64Type>(data_type, length),
DataType::Float64 => new_null_sized_array::<Float64Type>(data_type, length),
DataType::Date64 => new_null_sized_array::<Date64Type>(data_type, length),
// expanding this into Timestamp{unit}Type results in needless branching
DataType::Timestamp(_, _) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Time64(_) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Duration(_) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Interval(unit) => match unit {
IntervalUnit::YearMonth => {
new_null_sized_array::<IntervalYearMonthType>(data_type, length)
}
IntervalUnit::DayTime => {
new_null_sized_array::<IntervalDayTimeType>(data_type, length)
}
},
DataType::FixedSizeBinary(value_len) => make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(vec![0u8; *value_len as usize * length])],
vec![],
))),
DataType::Binary | DataType::Utf8 => {
new_null_binary_array::<i32>(data_type, length)
}
DataType::LargeBinary | DataType::LargeUtf8 => {
new_null_binary_array::<i64>(data_type, length)
}
DataType::List(field) => {
new_null_list_array::<i32>(data_type, field.data_type(), length)
}
DataType::LargeList(field) => {
new_null_list_array::<i64>(data_type, field.data_type(), length)
}
DataType::FixedSizeList(field, value_len) => {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![],
vec![
new_null_array(field.data_type(), *value_len as usize * length)
.data(),
],
)))
}
DataType::Struct(fields) => make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![],
fields
.iter()
.map(|field| Arc::new(ArrayData::new_empty(field.data_type())))
.collect(),
))),
DataType::Union(_) => {
unimplemented!("Creating null Union array not yet supported")
}
DataType::Dictionary(_, value) => {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![MutableBuffer::new(0).into()], // values are empty
vec![new_empty_array(value.as_ref()).data()],
)))
}
DataType::Decimal(_, _) => {
unimplemented!("Creating null Decimal array not yet supported")
}
}
}

#[inline]
fn new_null_list_array<OffsetSize: OffsetSizeTrait>(
data_type: &DataType,
child_data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(
vec![OffsetSize::zero(); length + 1].to_byte_slice(),
)],
vec![Arc::new(ArrayData::new_empty(child_data_type))],
)))
}

#[inline]
fn new_null_binary_array<OffsetSize: OffsetSizeTrait>(
data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![
Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()),
MutableBuffer::new(0).into(),
],
vec![],
)))
}

#[inline]
fn new_null_sized_array<T: ArrowPrimitiveType>(
data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(vec![0u8; length * T::get_byte_width()])],
vec![],
)))
}

/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface
/// # Safety
Expand Down Expand Up @@ -409,4 +574,60 @@ mod tests {
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
}

#[test]
fn test_null_boolean() {
let array = new_null_array(&DataType::Boolean, 9);
let a = array.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(a.len(), 9);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_primitive() {
let array = new_null_array(&DataType::Int32, 9);
let a = array.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(a.len(), 9);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_variable_sized() {
let array = new_null_array(&DataType::Utf8, 9);
let a = array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(a.len(), 9);
assert_eq!(a.value_offsets()[9], 0i32);
nevi-me marked this conversation as resolved.
Show resolved Hide resolved
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_list_primitive() {
let data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a soundness hole here, that applies more for parquet. If I created this array as non-nullable, there wouldn't have been an error, even though this doesn't comply with the spec.

@alamb @jorgecarleitao @Dandandan should we panic, or should I change the signature to Result<ArrayRef> so I can validate this condition?

I'm leaning to a panic, just like we panic on impl From<ArrayDataRef> for {type}Array. Wrapping a function that doesn't really error out, in a Result feels like assembing the Avengers for a single villain.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vote panic for now if you think it is a logic error that a user should be avoiding anyways.

Or alternately, we can make it panic now and Result later if someone finds a need

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, that hole exists in so many places, unfortunately. Since ArrayData has a null_count, we have been relying on it for this.

Technically, this goes back to the issue that we are not performing any validation whether the datatype is consistent with the array's content. :(

let array = new_null_array(&data_type, 9);
let a = array.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(a.len(), 9);
assert_eq!(a.value_offsets()[9], 0i32);
nevi-me marked this conversation as resolved.
Show resolved Hide resolved
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_dictionary() {
let values = vec![None, None, None, None, None, None, None, None, None]
as Vec<Option<&str>>;

let array: DictionaryArray<Int8Type> = values.into_iter().collect();
let array = Arc::new(array) as ArrayRef;

let null_array = new_null_array(array.data_type(), 9);
assert_eq!(&array, &null_array);
}
nevi-me marked this conversation as resolved.
Show resolved Hide resolved
}
1 change: 1 addition & 0 deletions rust/arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ pub use self::null::NullArray;

pub use self::array::make_array;
pub use self::array::new_empty_array;
pub use self::array::new_null_array;

pub type Int8Array = PrimitiveArray<Int8Type>;
pub type Int16Array = PrimitiveArray<Int16Type>;
Expand Down
20 changes: 5 additions & 15 deletions rust/arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,6 @@ impl NullArray {
let array_data = ArrayData::builder(DataType::Null).len(length).build();
NullArray::from(array_data)
}

/// Create a new null array of the specified length and type
pub fn new_with_type(length: usize, data_type: DataType) -> Self {
let array_data = ArrayData::builder(data_type).len(length).build();
NullArray::from(array_data)
}
}

impl Array for NullArray {
Expand Down Expand Up @@ -104,6 +98,11 @@ impl Array for NullArray {

impl From<ArrayDataRef> for NullArray {
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
data.data_type(),
&DataType::Null,
"NullArray data type should be Null"
);
assert_eq!(
data.buffers().len(),
0,
Expand Down Expand Up @@ -153,15 +152,6 @@ mod tests {
assert_eq!(array2.offset(), 8);
}

#[test]
fn test_null_array_new_with_type() {
let length = 10;
let data_type = DataType::Int8;
let array = NullArray::new_with_type(length, data_type.clone());
assert_eq!(array.len(), length);
assert_eq!(array.data_type(), &data_type);
}

#[test]
fn test_debug_null_array() {
let array = NullArray::new(1024 * 1024);
Expand Down
18 changes: 7 additions & 11 deletions rust/datafusion/src/physical_plan/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ use crate::{
optimizer::utils,
prelude::ExecutionConfig,
};
use arrow::error::{ArrowError, Result as ArrowResult};
use arrow::record_batch::RecordBatch;
use arrow::{
array::new_null_array,
error::{ArrowError, Result as ArrowResult},
};
use arrow::{
array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder},
buffer::MutableBuffer,
Expand Down Expand Up @@ -646,13 +649,6 @@ enum StatisticsType {
Max,
}

fn build_null_array(data_type: &DataType, length: usize) -> ArrayRef {
Arc::new(arrow::array::NullArray::new_with_type(
length,
data_type.clone(),
))
}

fn build_statistics_array(
statistics: &[Option<&ParquetStatistics>],
statistics_type: StatisticsType,
Expand All @@ -665,7 +661,7 @@ fn build_statistics_array(
statistics
} else {
// no row group has statistics defined
return build_null_array(data_type, statistics_count);
return new_null_array(data_type, statistics_count);
};

let (data_size, arrow_type) = match first_group_stats {
Expand All @@ -678,7 +674,7 @@ fn build_statistics_array(
}
_ => {
// type of statistics not supported
return build_null_array(data_type, statistics_count);
return new_null_array(data_type, statistics_count);
}
};

Expand Down Expand Up @@ -735,7 +731,7 @@ fn build_statistics_array(
}
// cast statistics array to required data type
arrow::compute::cast(&statistics_array, data_type)
.unwrap_or_else(|_| build_null_array(data_type, statistics_count))
.unwrap_or_else(|_| new_null_array(data_type, statistics_count))
}

#[async_trait]
Expand Down
Loading