Skip to content

Commit

Permalink
ARROW-11599: [Rust] Add function to create array with all nulls
Browse files Browse the repository at this point in the history
This allows creating an array with n null values

Closes apache#9469 from nevi-me/make-empty-arrays

Lead-authored-by: Neville Dipale <nevilledips@gmail.com>
Co-authored-by: Wakahisa <nevilledips@gmail.com>
Signed-off-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
nevi-me authored and michalursa committed Jun 13, 2021
1 parent d497001 commit aaa67ac
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 83 deletions.
221 changes: 221 additions & 0 deletions rust/arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use std::{any::Any, convert::TryFrom};
use super::ArrayDataRef;
use super::*;
use crate::array::equal_json::JsonEqual;
use crate::buffer::{Buffer, MutableBuffer};
use crate::error::Result;
use crate::ffi;

Expand Down Expand Up @@ -326,6 +327,170 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
let data = ArrayData::new_empty(data_type);
make_array(Arc::new(data))
}
/// Creates a new array of `data_type` of length `length` filled entirely of `NULL` values
pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
// context: https://github.com/apache/arrow/pull/9469#discussion_r574761687
match data_type {
DataType::Null => Arc::new(NullArray::new(length)),
DataType::Boolean => {
let null_buf: Buffer = MutableBuffer::new_null(length).into();
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(null_buf.clone()),
0,
vec![null_buf],
vec![],
)))
}
DataType::Int8 => new_null_sized_array::<Int8Type>(data_type, length),
DataType::UInt8 => new_null_sized_array::<UInt8Type>(data_type, length),
DataType::Int16 => new_null_sized_array::<Int16Type>(data_type, length),
DataType::UInt16 => new_null_sized_array::<UInt16Type>(data_type, length),
DataType::Float16 => unreachable!(),
DataType::Int32 => new_null_sized_array::<Int32Type>(data_type, length),
DataType::UInt32 => new_null_sized_array::<UInt32Type>(data_type, length),
DataType::Float32 => new_null_sized_array::<Float32Type>(data_type, length),
DataType::Date32 => new_null_sized_array::<Date32Type>(data_type, length),
// expanding this into Date23{unit}Type results in needless branching
DataType::Time32(_) => new_null_sized_array::<Int32Type>(data_type, length),
DataType::Int64 => new_null_sized_array::<Int64Type>(data_type, length),
DataType::UInt64 => new_null_sized_array::<UInt64Type>(data_type, length),
DataType::Float64 => new_null_sized_array::<Float64Type>(data_type, length),
DataType::Date64 => new_null_sized_array::<Date64Type>(data_type, length),
// expanding this into Timestamp{unit}Type results in needless branching
DataType::Timestamp(_, _) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Time64(_) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Duration(_) => new_null_sized_array::<Int64Type>(data_type, length),
DataType::Interval(unit) => match unit {
IntervalUnit::YearMonth => {
new_null_sized_array::<IntervalYearMonthType>(data_type, length)
}
IntervalUnit::DayTime => {
new_null_sized_array::<IntervalDayTimeType>(data_type, length)
}
},
DataType::FixedSizeBinary(value_len) => make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(vec![0u8; *value_len as usize * length])],
vec![],
))),
DataType::Binary | DataType::Utf8 => {
new_null_binary_array::<i32>(data_type, length)
}
DataType::LargeBinary | DataType::LargeUtf8 => {
new_null_binary_array::<i64>(data_type, length)
}
DataType::List(field) => {
new_null_list_array::<i32>(data_type, field.data_type(), length)
}
DataType::LargeList(field) => {
new_null_list_array::<i64>(data_type, field.data_type(), length)
}
DataType::FixedSizeList(field, value_len) => {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![],
vec![
new_null_array(field.data_type(), *value_len as usize * length)
.data(),
],
)))
}
DataType::Struct(fields) => make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![],
fields
.iter()
.map(|field| Arc::new(ArrayData::new_empty(field.data_type())))
.collect(),
))),
DataType::Union(_) => {
unimplemented!("Creating null Union array not yet supported")
}
DataType::Dictionary(_, value) => {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![MutableBuffer::new(0).into()], // values are empty
vec![new_empty_array(value.as_ref()).data()],
)))
}
DataType::Decimal(_, _) => {
unimplemented!("Creating null Decimal array not yet supported")
}
}
}

#[inline]
fn new_null_list_array<OffsetSize: OffsetSizeTrait>(
data_type: &DataType,
child_data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(
vec![OffsetSize::zero(); length + 1].to_byte_slice(),
)],
vec![Arc::new(ArrayData::new_empty(child_data_type))],
)))
}

#[inline]
fn new_null_binary_array<OffsetSize: OffsetSizeTrait>(
data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![
Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()),
MutableBuffer::new(0).into(),
],
vec![],
)))
}

#[inline]
fn new_null_sized_array<T: ArrowPrimitiveType>(
data_type: &DataType,
length: usize,
) -> ArrayRef {
make_array(Arc::new(ArrayData::new(
data_type.clone(),
length,
Some(length),
Some(MutableBuffer::new_null(length).into()),
0,
vec![Buffer::from(vec![0u8; length * T::get_byte_width()])],
vec![],
)))
}

/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface
/// # Safety
Expand Down Expand Up @@ -409,4 +574,60 @@ mod tests {
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
}

#[test]
fn test_null_boolean() {
let array = new_null_array(&DataType::Boolean, 9);
let a = array.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(a.len(), 9);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_primitive() {
let array = new_null_array(&DataType::Int32, 9);
let a = array.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(a.len(), 9);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_variable_sized() {
let array = new_null_array(&DataType::Utf8, 9);
let a = array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(a.len(), 9);
assert_eq!(a.value_offsets()[9], 0i32);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_list_primitive() {
let data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let array = new_null_array(&data_type, 9);
let a = array.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(a.len(), 9);
assert_eq!(a.value_offsets()[9], 0i32);
for i in 0..9 {
assert!(a.is_null(i));
}
}

#[test]
fn test_null_dictionary() {
let values = vec![None, None, None, None, None, None, None, None, None]
as Vec<Option<&str>>;

let array: DictionaryArray<Int8Type> = values.into_iter().collect();
let array = Arc::new(array) as ArrayRef;

let null_array = new_null_array(array.data_type(), 9);
assert_eq!(&array, &null_array);
}
}
1 change: 1 addition & 0 deletions rust/arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ pub use self::null::NullArray;

pub use self::array::make_array;
pub use self::array::new_empty_array;
pub use self::array::new_null_array;

pub type Int8Array = PrimitiveArray<Int8Type>;
pub type Int16Array = PrimitiveArray<Int16Type>;
Expand Down
20 changes: 5 additions & 15 deletions rust/arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,6 @@ impl NullArray {
let array_data = ArrayData::builder(DataType::Null).len(length).build();
NullArray::from(array_data)
}

/// Create a new null array of the specified length and type
pub fn new_with_type(length: usize, data_type: DataType) -> Self {
let array_data = ArrayData::builder(data_type).len(length).build();
NullArray::from(array_data)
}
}

impl Array for NullArray {
Expand Down Expand Up @@ -104,6 +98,11 @@ impl Array for NullArray {

impl From<ArrayDataRef> for NullArray {
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
data.data_type(),
&DataType::Null,
"NullArray data type should be Null"
);
assert_eq!(
data.buffers().len(),
0,
Expand Down Expand Up @@ -153,15 +152,6 @@ mod tests {
assert_eq!(array2.offset(), 8);
}

#[test]
fn test_null_array_new_with_type() {
let length = 10;
let data_type = DataType::Int8;
let array = NullArray::new_with_type(length, data_type.clone());
assert_eq!(array.len(), length);
assert_eq!(array.data_type(), &data_type);
}

#[test]
fn test_debug_null_array() {
let array = NullArray::new(1024 * 1024);
Expand Down
18 changes: 7 additions & 11 deletions rust/datafusion/src/physical_plan/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ use crate::{
optimizer::utils,
prelude::ExecutionConfig,
};
use arrow::error::{ArrowError, Result as ArrowResult};
use arrow::record_batch::RecordBatch;
use arrow::{
array::new_null_array,
error::{ArrowError, Result as ArrowResult},
};
use arrow::{
array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder},
buffer::MutableBuffer,
Expand Down Expand Up @@ -646,13 +649,6 @@ enum StatisticsType {
Max,
}

fn build_null_array(data_type: &DataType, length: usize) -> ArrayRef {
Arc::new(arrow::array::NullArray::new_with_type(
length,
data_type.clone(),
))
}

fn build_statistics_array(
statistics: &[Option<&ParquetStatistics>],
statistics_type: StatisticsType,
Expand All @@ -665,7 +661,7 @@ fn build_statistics_array(
statistics
} else {
// no row group has statistics defined
return build_null_array(data_type, statistics_count);
return new_null_array(data_type, statistics_count);
};

let (data_size, arrow_type) = match first_group_stats {
Expand All @@ -678,7 +674,7 @@ fn build_statistics_array(
}
_ => {
// type of statistics not supported
return build_null_array(data_type, statistics_count);
return new_null_array(data_type, statistics_count);
}
};

Expand Down Expand Up @@ -735,7 +731,7 @@ fn build_statistics_array(
}
// cast statistics array to required data type
arrow::compute::cast(&statistics_array, data_type)
.unwrap_or_else(|_| build_null_array(data_type, statistics_count))
.unwrap_or_else(|_| new_null_array(data_type, statistics_count))
}

#[async_trait]
Expand Down
Loading

0 comments on commit aaa67ac

Please sign in to comment.