From fe0363c2a552537896dc3c00a010d784667544be Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 12 Dec 2023 15:55:10 +0100 Subject: [PATCH 01/13] Refactor array_union and array_intersect functions --- .../physical-expr/src/array_expressions.rs | 151 +++++++----------- 1 file changed, 57 insertions(+), 94 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 3ee99d7e8e55..0683312caadf 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1777,14 +1777,24 @@ macro_rules! to_string { }}; } -fn union_generic_lists( +/// general function for array_union and array_intersect +fn general_set_lists( l: &GenericListArray, r: &GenericListArray, - field: &FieldRef, + is_union: bool, ) -> Result> { + let set_op = if is_union { + "array_union" + } else { + "array_intersect" + }; + if l.value_type() != r.value_type() { + return internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); + } + + let dt = l.value_type(); let converter = RowConverter::new(vec![SortField::new(l.value_type())])?; - let nulls = NullBuffer::union(l.nulls(), r.nulls()); let l_values = l.values().clone(); let r_values = r.values().clone(); let l_values = converter.convert_columns(&[l_values])?; @@ -1806,7 +1816,7 @@ fn union_generic_lists( } for i in r_slice { let right_row = r_values.row(i); - if dedup.insert(right_row) { + if dedup.insert(right_row) == is_union { rows.push(right_row); } } @@ -1814,6 +1824,7 @@ fn union_generic_lists( dedup.clear(); } + let field = Arc::new(Field::new("item", dt, true)); let values = converter.convert_rows(rows)?; let offsets = OffsetBuffer::new(offsets.into()); let result = values[0].clone(); @@ -1821,7 +1832,7 @@ fn union_generic_lists( field.clone(), offsets, result, - nulls, + None, )) } @@ -1833,36 +1844,56 @@ pub fn array_union(args: &[ArrayRef]) -> Result { let array1 = &args[0]; let array2 = &args[1]; - fn union_arrays( - array1: &ArrayRef, - array2: &ArrayRef, - l_field_ref: &Arc, - r_field_ref: &Arc, - ) -> Result { - match (l_field_ref.data_type(), r_field_ref.data_type()) { - (DataType::Null, _) => Ok(array2.clone()), - (_, DataType::Null) => Ok(array1.clone()), - (_, _) => { - let list1 = array1.as_list::(); - let list2 = array2.as_list::(); - let result = union_generic_lists::(list1, list2, l_field_ref)?; - Ok(Arc::new(result)) - } + match (array1.data_type(), array2.data_type()) { + (DataType::Null, _) => Ok(array2.clone()), + (_, DataType::Null) => Ok(array1.clone()), + (DataType::List(_), DataType::List(_)) => { + let first_array = as_list_array(&array1)?; + let second_array = as_list_array(&array2)?; + let arr = general_set_lists::(first_array, second_array, true)?; + Ok(Arc::new(arr)) + } + (DataType::LargeList(_), DataType::LargeList(_)) => { + let first_array = as_large_list_array(&array1)?; + let second_array = as_large_list_array(&array2)?; + let arr = general_set_lists::(first_array, second_array, true)?; + Ok(Arc::new(arr)) + } + (data_type1, data_type2) => { + internal_err!( + "array_union does not support types '{data_type1:?}' and '{data_type2:?}'" + ) } } +} + +/// array_intersect SQL function +pub fn array_intersect(args: &[ArrayRef]) -> Result { + if args.len() != 2 { + return exec_err!("array_intersect needs two arguments"); + } + + let array1 = &args[0]; + let array2 = &args[1]; match (array1.data_type(), array2.data_type()) { (DataType::Null, _) => Ok(array2.clone()), (_, DataType::Null) => Ok(array1.clone()), - (DataType::List(l_field_ref), DataType::List(r_field_ref)) => { - union_arrays::(array1, array2, l_field_ref, r_field_ref) + (DataType::List(_), DataType::List(_)) => { + let array1 = as_list_array(&array1)?; + let array2 = as_list_array(&array2)?; + let result = general_set_lists::(array1, array2, false)?; + Ok(Arc::new(result)) } - (DataType::LargeList(l_field_ref), DataType::LargeList(r_field_ref)) => { - union_arrays::(array1, array2, l_field_ref, r_field_ref) + (DataType::LargeList(_), DataType::LargeList(_)) => { + let array1 = as_large_list_array(&array1)?; + let array2 = as_large_list_array(&array2)?; + let result = general_set_lists::(array1, array2, false)?; + Ok(Arc::new(result)) } - _ => { + (data_type1, data_type2) => { internal_err!( - "array_union only support list with offsets of type int32 and int64" + "array_intersect does not support types '{data_type1:?}' and '{data_type2:?}'" ) } } @@ -2359,74 +2390,6 @@ pub fn string_to_array(args: &[ArrayRef]) -> Result Result { - if args.len() != 2 { - return exec_err!("array_intersect needs two arguments"); - } - - let first_array = &args[0]; - let second_array = &args[1]; - - match (first_array.data_type(), second_array.data_type()) { - (DataType::Null, _) => Ok(second_array.clone()), - (_, DataType::Null) => Ok(first_array.clone()), - _ => { - let first_array = as_list_array(&first_array)?; - let second_array = as_list_array(&second_array)?; - - if first_array.value_type() != second_array.value_type() { - return internal_err!("array_intersect is not implemented for '{first_array:?}' and '{second_array:?}'"); - } - - let dt = first_array.value_type(); - - let mut offsets = vec![0]; - let mut new_arrays = vec![]; - - let converter = RowConverter::new(vec![SortField::new(dt.clone())])?; - for (first_arr, second_arr) in first_array.iter().zip(second_array.iter()) { - if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) { - let l_values = converter.convert_columns(&[first_arr])?; - let r_values = converter.convert_columns(&[second_arr])?; - - let values_set: HashSet<_> = l_values.iter().collect(); - let mut rows = Vec::with_capacity(r_values.num_rows()); - for r_val in r_values.iter().sorted().dedup() { - if values_set.contains(&r_val) { - rows.push(r_val); - } - } - - let last_offset: i32 = match offsets.last().copied() { - Some(offset) => offset, - None => return internal_err!("offsets should not be empty"), - }; - offsets.push(last_offset + rows.len() as i32); - let arrays = converter.convert_rows(rows)?; - let array = match arrays.first() { - Some(array) => array.clone(), - None => { - return internal_err!( - "array_intersect: failed to get array from rows" - ) - } - }; - new_arrays.push(array); - } - } - - let field = Arc::new(Field::new("item", dt, true)); - let offsets = OffsetBuffer::new(offsets.into()); - let new_arrays_ref = - new_arrays.iter().map(|v| v.as_ref()).collect::>(); - let values = compute::concat(&new_arrays_ref)?; - let arr = Arc::new(ListArray::try_new(field, offsets, values, None)?); - Ok(arr) - } - } -} - pub fn general_array_distinct( array: &GenericListArray, field: &FieldRef, From b78947c28da8bed710c57447f5db6bc96bf3f259 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 12 Dec 2023 22:09:48 +0100 Subject: [PATCH 02/13] fix cli --- .../physical-expr/src/array_expressions.rs | 155 +++++++++++------- 1 file changed, 99 insertions(+), 56 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 0683312caadf..7f5b60eb7309 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1782,58 +1782,75 @@ fn general_set_lists( l: &GenericListArray, r: &GenericListArray, is_union: bool, -) -> Result> { - let set_op = if is_union { - "array_union" - } else { - "array_intersect" - }; +) -> Result { + if matches!(l.value_type(), DataType::Null) { + let field = Arc::new(Field::new("item", r.value_type(), true)); + return general_array_distinct::(r, &field); + } else if matches!(r.value_type(), DataType::Null) { + let field = Arc::new(Field::new("item", l.value_type(), true)); + return general_array_distinct::(l, &field); + } + if l.value_type() != r.value_type() { - return internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); + let operation = if is_union { + "array_union" + } else { + "array_intersect" + }; + return internal_err!("{operation} is not implemented for '{l:?}' and '{r:?}'"); } let dt = l.value_type(); - let converter = RowConverter::new(vec![SortField::new(l.value_type())])?; - let l_values = l.values().clone(); - let r_values = r.values().clone(); - let l_values = converter.convert_columns(&[l_values])?; - let r_values = converter.convert_columns(&[r_values])?; + let mut offsets = vec![OffsetSize::usize_as(0)]; + let mut new_arrays = vec![]; - // Might be worth adding an upstream OffsetBufferBuilder - let mut offsets = Vec::::with_capacity(l.len() + 1); - offsets.push(OffsetSize::usize_as(0)); - let mut rows = Vec::with_capacity(l_values.num_rows() + r_values.num_rows()); - let mut dedup = HashSet::new(); - for (l_w, r_w) in l.offsets().windows(2).zip(r.offsets().windows(2)) { - let l_slice = l_w[0].as_usize()..l_w[1].as_usize(); - let r_slice = r_w[0].as_usize()..r_w[1].as_usize(); - for i in l_slice { - let left_row = l_values.row(i); - if dedup.insert(left_row) { - rows.push(left_row); - } - } - for i in r_slice { - let right_row = r_values.row(i); - if dedup.insert(right_row) == is_union { - rows.push(right_row); + let converter = RowConverter::new(vec![SortField::new(dt.clone())])?; + for (first_arr, second_arr) in l.iter().zip(r.iter()) { + if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) { + let l_values = converter.convert_columns(&[first_arr])?; + let r_values = converter.convert_columns(&[second_arr])?; + + let l_iter = l_values.iter().sorted().dedup(); + let values_set: HashSet<_> = l_iter.clone().collect(); + let mut rows = if is_union { + l_iter.collect::>() + } else { + vec![] + }; + for r_val in r_values.iter().sorted().dedup() { + if !values_set.contains(&r_val) == is_union { + rows.push(r_val); + } } + + let last_offset = match offsets.last().copied() { + Some(offset) => offset, + None => return internal_err!("offsets should not be empty"), + }; + offsets.push(last_offset + OffsetSize::usize_as(rows.len())); + let arrays = converter.convert_rows(rows)?; + let array = match arrays.first() { + Some(array) => array.clone(), + None => { + let operation = if is_union { + "array_union" + } else { + "array_intersect" + }; + return internal_err!("{operation}: failed to get array from rows"); + } + }; + new_arrays.push(array); } - offsets.push(OffsetSize::usize_as(rows.len())); - dedup.clear(); } let field = Arc::new(Field::new("item", dt, true)); - let values = converter.convert_rows(rows)?; let offsets = OffsetBuffer::new(offsets.into()); - let result = values[0].clone(); - Ok(GenericListArray::::new( - field.clone(), - offsets, - result, - None, - )) + let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::>(); + let values = compute::concat(&new_arrays_ref)?; + let arr = GenericListArray::::try_new(field, offsets, values, None)?; + Ok(Arc::new(arr)) } /// Array_union SQL function @@ -1845,19 +1862,32 @@ pub fn array_union(args: &[ArrayRef]) -> Result { let array2 = &args[1]; match (array1.data_type(), array2.data_type()) { - (DataType::Null, _) => Ok(array2.clone()), - (_, DataType::Null) => Ok(array1.clone()), + (DataType::Null, DataType::List(field)) + | (DataType::List(field), DataType::Null) => { + let array = match array1.data_type() { + DataType::Null => as_list_array(&array2)?, + _ => as_list_array(&array1)?, + }; + general_array_distinct::(array, field) + } + (DataType::Null, DataType::LargeList(field)) + | (DataType::LargeList(field), DataType::Null) => { + let array = match array1.data_type() { + DataType::Null => as_large_list_array(&array2)?, + _ => as_large_list_array(&array1)?, + }; + general_array_distinct::(array, field) + } + (DataType::Null, DataType::Null) => Ok(array1.clone()), (DataType::List(_), DataType::List(_)) => { - let first_array = as_list_array(&array1)?; - let second_array = as_list_array(&array2)?; - let arr = general_set_lists::(first_array, second_array, true)?; - Ok(Arc::new(arr)) + let array1 = as_list_array(&array1)?; + let array2 = as_list_array(&array2)?; + general_set_lists::(array1, array2, true) } (DataType::LargeList(_), DataType::LargeList(_)) => { - let first_array = as_large_list_array(&array1)?; - let second_array = as_large_list_array(&array2)?; - let arr = general_set_lists::(first_array, second_array, true)?; - Ok(Arc::new(arr)) + let array1 = as_large_list_array(&array1)?; + let array2 = as_large_list_array(&array2)?; + general_set_lists::(array1, array2, true) } (data_type1, data_type2) => { internal_err!( @@ -1877,19 +1907,32 @@ pub fn array_intersect(args: &[ArrayRef]) -> Result { let array2 = &args[1]; match (array1.data_type(), array2.data_type()) { - (DataType::Null, _) => Ok(array2.clone()), - (_, DataType::Null) => Ok(array1.clone()), + (DataType::Null, DataType::List(field)) + | (DataType::List(field), DataType::Null) => { + let array = match array1.data_type() { + DataType::Null => as_list_array(&array2)?, + _ => as_list_array(&array1)?, + }; + general_array_distinct::(array, field) + } + (DataType::Null, DataType::LargeList(field)) + | (DataType::LargeList(field), DataType::Null) => { + let array = match array1.data_type() { + DataType::Null => as_large_list_array(&array2)?, + _ => as_large_list_array(&array1)?, + }; + general_array_distinct::(array, field) + } + (DataType::Null, DataType::Null) => Ok(array1.clone()), (DataType::List(_), DataType::List(_)) => { let array1 = as_list_array(&array1)?; let array2 = as_list_array(&array2)?; - let result = general_set_lists::(array1, array2, false)?; - Ok(Arc::new(result)) + general_set_lists::(array1, array2, false) } (DataType::LargeList(_), DataType::LargeList(_)) => { let array1 = as_large_list_array(&array1)?; let array2 = as_large_list_array(&array2)?; - let result = general_set_lists::(array1, array2, false)?; - Ok(Arc::new(result)) + general_set_lists::(array1, array2, false) } (data_type1, data_type2) => { internal_err!( From 87952c343eaf3fa5f4668a32850337dd3e04f65d Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 12 Dec 2023 22:39:07 +0100 Subject: [PATCH 03/13] fix ci --- .../physical-expr/src/array_expressions.rs | 117 ++++++++---------- 1 file changed, 54 insertions(+), 63 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 7f5b60eb7309..f6cd8001c73e 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -19,6 +19,7 @@ use std::any::type_name; use std::collections::HashSet; +use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::*; @@ -1777,11 +1778,25 @@ macro_rules! to_string { }}; } -/// general function for array_union and array_intersect -fn general_set_lists( +#[derive(Debug, PartialEq)] +enum SetOp { + Union, + Intersect, +} + +impl Display for SetOp { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SetOp::Union => write!(f, "array_union"), + SetOp::Intersect => write!(f, "array_intersect"), + } + } +} + +fn generic_set_lists( l: &GenericListArray, r: &GenericListArray, - is_union: bool, + set_op: SetOp, ) -> Result { if matches!(l.value_type(), DataType::Null) { let field = Arc::new(Field::new("item", r.value_type(), true)); @@ -1792,12 +1807,7 @@ fn general_set_lists( } if l.value_type() != r.value_type() { - let operation = if is_union { - "array_union" - } else { - "array_intersect" - }; - return internal_err!("{operation} is not implemented for '{l:?}' and '{r:?}'"); + return internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); } let dt = l.value_type(); @@ -1813,14 +1823,23 @@ fn general_set_lists( let l_iter = l_values.iter().sorted().dedup(); let values_set: HashSet<_> = l_iter.clone().collect(); - let mut rows = if is_union { + let mut rows = if set_op == SetOp::Union { l_iter.collect::>() } else { vec![] }; for r_val in r_values.iter().sorted().dedup() { - if !values_set.contains(&r_val) == is_union { - rows.push(r_val); + match set_op { + SetOp::Union => { + if !values_set.contains(&r_val) { + rows.push(r_val); + } + } + SetOp::Intersect => { + if values_set.contains(&r_val) { + rows.push(r_val); + } + } } } @@ -1833,12 +1852,7 @@ fn general_set_lists( let array = match arrays.first() { Some(array) => array.clone(), None => { - let operation = if is_union { - "array_union" - } else { - "array_intersect" - }; - return internal_err!("{operation}: failed to get array from rows"); + return internal_err!("{set_op}: failed to get array from rows"); } }; new_arrays.push(array); @@ -1853,15 +1867,13 @@ fn general_set_lists( Ok(Arc::new(arr)) } -/// Array_union SQL function -pub fn array_union(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!("array_union needs 2 arguments"); - } - let array1 = &args[0]; - let array2 = &args[1]; - +fn general_set_op( + array1: &ArrayRef, + array2: &ArrayRef, + set_op: SetOp, +) -> Result { match (array1.data_type(), array2.data_type()) { + // Null type (DataType::Null, DataType::List(field)) | (DataType::List(field), DataType::Null) => { let array = match array1.data_type() { @@ -1879,24 +1891,36 @@ pub fn array_union(args: &[ArrayRef]) -> Result { general_array_distinct::(array, field) } (DataType::Null, DataType::Null) => Ok(array1.clone()), + (DataType::List(_), DataType::List(_)) => { let array1 = as_list_array(&array1)?; let array2 = as_list_array(&array2)?; - general_set_lists::(array1, array2, true) + generic_set_lists::(array1, array2, set_op) } (DataType::LargeList(_), DataType::LargeList(_)) => { let array1 = as_large_list_array(&array1)?; let array2 = as_large_list_array(&array2)?; - general_set_lists::(array1, array2, true) + generic_set_lists::(array1, array2, set_op) } (data_type1, data_type2) => { internal_err!( - "array_union does not support types '{data_type1:?}' and '{data_type2:?}'" + "{set_op} does not support types '{data_type1:?}' and '{data_type2:?}'" ) } } } +/// Array_union SQL function +pub fn array_union(args: &[ArrayRef]) -> Result { + if args.len() != 2 { + return exec_err!("array_union needs two arguments"); + } + let array1 = &args[0]; + let array2 = &args[1]; + + general_set_op(array1, array2, SetOp::Union) +} + /// array_intersect SQL function pub fn array_intersect(args: &[ArrayRef]) -> Result { if args.len() != 2 { @@ -1906,40 +1930,7 @@ pub fn array_intersect(args: &[ArrayRef]) -> Result { let array1 = &args[0]; let array2 = &args[1]; - match (array1.data_type(), array2.data_type()) { - (DataType::Null, DataType::List(field)) - | (DataType::List(field), DataType::Null) => { - let array = match array1.data_type() { - DataType::Null => as_list_array(&array2)?, - _ => as_list_array(&array1)?, - }; - general_array_distinct::(array, field) - } - (DataType::Null, DataType::LargeList(field)) - | (DataType::LargeList(field), DataType::Null) => { - let array = match array1.data_type() { - DataType::Null => as_large_list_array(&array2)?, - _ => as_large_list_array(&array1)?, - }; - general_array_distinct::(array, field) - } - (DataType::Null, DataType::Null) => Ok(array1.clone()), - (DataType::List(_), DataType::List(_)) => { - let array1 = as_list_array(&array1)?; - let array2 = as_list_array(&array2)?; - general_set_lists::(array1, array2, false) - } - (DataType::LargeList(_), DataType::LargeList(_)) => { - let array1 = as_large_list_array(&array1)?; - let array2 = as_large_list_array(&array2)?; - general_set_lists::(array1, array2, false) - } - (data_type1, data_type2) => { - internal_err!( - "array_intersect does not support types '{data_type1:?}' and '{data_type2:?}'" - ) - } - } + general_set_op(array1, array2, SetOp::Intersect) } /// Array_to_string SQL function From 30aef30a6728ea58b9c4f92ff3de9e6c65316c4b Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 12 Dec 2023 22:44:53 +0100 Subject: [PATCH 04/13] add tests for null --- datafusion/sqllogictest/test_files/array.slt | 30 +++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 283f2d67b7a0..871dd4938c3d 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2658,23 +2658,29 @@ NULL # array_union scalar function #11 query ? +select array_union([1, 1, 2, 2, 3, 3], null); +---- +[1, 2, 3] + +# array_union scalar function #12 +query ? +select array_union(null, [1, 1, 2, 2, 3, 3]); +---- +[1, 2, 3] + +# array_union scalar function #13 +query ? select array_union([1.2, 3.0], [1.2, 3.0, 5.7]); ---- [1.2, 3.0, 5.7] -# array_union scalar function #12 +# array_union scalar function #14 query ? select array_union(['hello'], ['hello','datafusion']); ---- [hello, datafusion] - - - - - - # list_to_string scalar function #4 (function alias `array_to_string`) query TTT select list_to_string(['h', 'e', 'l', 'l', 'o'], ','), list_to_string([1, 2, 3, 4, 5], '-'), list_to_string([1.0, 2.0, 3.0], '|'); @@ -3601,6 +3607,16 @@ select array_intersect([], []); ---- [] +query ? +select array_intersect([1, 1, 2, 2, 3, 3], null); +---- +[1, 2, 3] + +query ? +select array_intersect(null, [1, 1, 2, 2, 3, 3]); +---- +[1, 2, 3] + query ? select array_intersect([], null); ---- From 38a882c18ce3c8d380576595413fdebf5ef9bf38 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Mon, 18 Dec 2023 10:18:11 +0100 Subject: [PATCH 05/13] modify the return type --- datafusion/expr/src/built_in_function.rs | 13 +++++- .../physical-expr/src/array_expressions.rs | 40 ++++++++++++------- datafusion/sqllogictest/test_files/array.slt | 8 ++-- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 3818e8ee5658..c454a9781eda 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -618,7 +618,18 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayReplaceAll => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArraySlice => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArrayToString => Ok(Utf8), - BuiltinScalarFunction::ArrayUnion | BuiltinScalarFunction::ArrayIntersect => { + BuiltinScalarFunction::ArrayIntersect => { + match (input_expr_types[0].clone(), input_expr_types[1].clone()) { + (DataType::Null, DataType::Null) | (DataType::Null, _) => { + Ok(DataType::Null) + } + (_, DataType::Null) => { + Ok(List(Arc::new(Field::new("item", Null, true)))) + } + (dt, _) => Ok(dt), + } + } + BuiltinScalarFunction::ArrayUnion => { match (input_expr_types[0].clone(), input_expr_types[1].clone()) { (DataType::Null, dt) => Ok(dt), (dt, DataType::Null) => Ok(dt), diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index f6cd8001c73e..c53457c13332 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1873,24 +1873,36 @@ fn general_set_op( set_op: SetOp, ) -> Result { match (array1.data_type(), array2.data_type()) { - // Null type - (DataType::Null, DataType::List(field)) - | (DataType::List(field), DataType::Null) => { - let array = match array1.data_type() { - DataType::Null => as_list_array(&array2)?, - _ => as_list_array(&array1)?, - }; + (DataType::Null, DataType::List(field)) => { + if set_op == SetOp::Intersect { + return make_array(dbg!(&[])); + } + let array = as_list_array(&array2)?; general_array_distinct::(array, field) } - (DataType::Null, DataType::LargeList(field)) - | (DataType::LargeList(field), DataType::Null) => { - let array = match array1.data_type() { - DataType::Null => as_large_list_array(&array2)?, - _ => as_large_list_array(&array1)?, - }; + + (DataType::List(field), DataType::Null) => { + if set_op == SetOp::Intersect { + return make_array(dbg!(&[])); + } + let array = as_list_array(&array1)?; + general_array_distinct::(array, field) + } + (DataType::Null, DataType::LargeList(field)) => { + if set_op == SetOp::Intersect { + return make_array(&[]); + } + let array = as_large_list_array(&array2)?; + general_array_distinct::(array, field) + } + (DataType::LargeList(field), DataType::Null) => { + if set_op == SetOp::Intersect { + return make_array(&[]); + } + let array = as_large_list_array(&array1)?; general_array_distinct::(array, field) } - (DataType::Null, DataType::Null) => Ok(array1.clone()), + (DataType::Null, DataType::Null) => make_array(&[]), (DataType::List(_), DataType::List(_)) => { let array1 = as_list_array(&array1)?; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 871dd4938c3d..8c47ee51b304 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2654,7 +2654,7 @@ select array_union(null, []); query ? select array_union(null, null); ---- -NULL +[] # array_union scalar function #11 query ? @@ -3610,12 +3610,12 @@ select array_intersect([], []); query ? select array_intersect([1, 1, 2, 2, 3, 3], null); ---- -[1, 2, 3] +[] query ? select array_intersect(null, [1, 1, 2, 2, 3, 3]); ---- -[1, 2, 3] +[] query ? select array_intersect([], null); @@ -3630,7 +3630,7 @@ select array_intersect(null, []); query ? select array_intersect(null, null); ---- -NULL +[] query ?????? SELECT list_intersect(make_array(1,2,3), make_array(2,3,4)), From a693492aa1b2f13c0e2a333ee7b183f7a8b83dbd Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Mon, 18 Dec 2023 10:29:16 +0100 Subject: [PATCH 06/13] update tests --- datafusion/physical-expr/src/array_expressions.rs | 8 ++++---- datafusion/sqllogictest/test_files/array.slt | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index c53457c13332..e8019fcf43d7 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1875,7 +1875,7 @@ fn general_set_op( match (array1.data_type(), array2.data_type()) { (DataType::Null, DataType::List(field)) => { if set_op == SetOp::Intersect { - return make_array(dbg!(&[])); + return Ok(new_empty_array(&DataType::Null)); } let array = as_list_array(&array2)?; general_array_distinct::(array, field) @@ -1883,14 +1883,14 @@ fn general_set_op( (DataType::List(field), DataType::Null) => { if set_op == SetOp::Intersect { - return make_array(dbg!(&[])); + return make_array(&[]); } let array = as_list_array(&array1)?; general_array_distinct::(array, field) } (DataType::Null, DataType::LargeList(field)) => { if set_op == SetOp::Intersect { - return make_array(&[]); + return Ok(new_empty_array(&DataType::Null)); } let array = as_large_list_array(&array2)?; general_array_distinct::(array, field) @@ -1902,7 +1902,7 @@ fn general_set_op( let array = as_large_list_array(&array1)?; general_array_distinct::(array, field) } - (DataType::Null, DataType::Null) => make_array(&[]), + (DataType::Null, DataType::Null) => return Ok(new_empty_array(&DataType::Null)), (DataType::List(_), DataType::List(_)) => { let array1 = as_list_array(&array1)?; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 8c47ee51b304..84ca5da067ac 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2654,7 +2654,7 @@ select array_union(null, []); query ? select array_union(null, null); ---- -[] +NULL # array_union scalar function #11 query ? @@ -3615,7 +3615,7 @@ select array_intersect([1, 1, 2, 2, 3, 3], null); query ? select array_intersect(null, [1, 1, 2, 2, 3, 3]); ---- -[] +NULL query ? select array_intersect([], null); @@ -3625,12 +3625,12 @@ select array_intersect([], null); query ? select array_intersect(null, []); ---- -[] +NULL query ? select array_intersect(null, null); ---- -[] +NULL query ?????? SELECT list_intersect(make_array(1,2,3), make_array(2,3,4)), From 37f41dc7c9d7b9e27c4fa030a6d57ba07eead3b1 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Mon, 18 Dec 2023 12:53:05 +0100 Subject: [PATCH 07/13] fix clippy --- datafusion/physical-expr/src/array_expressions.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index e8019fcf43d7..976c25ac691a 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1800,14 +1800,14 @@ fn generic_set_lists( ) -> Result { if matches!(l.value_type(), DataType::Null) { let field = Arc::new(Field::new("item", r.value_type(), true)); - return general_array_distinct::(r, &field); + general_array_distinct::(r, &field); } else if matches!(r.value_type(), DataType::Null) { let field = Arc::new(Field::new("item", l.value_type(), true)); - return general_array_distinct::(l, &field); + general_array_distinct::(l, &field); } if l.value_type() != r.value_type() { - return internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); + internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); } let dt = l.value_type(); @@ -1883,7 +1883,7 @@ fn general_set_op( (DataType::List(field), DataType::Null) => { if set_op == SetOp::Intersect { - return make_array(&[]); + make_array(&[]); } let array = as_list_array(&array1)?; general_array_distinct::(array, field) @@ -1897,7 +1897,7 @@ fn general_set_op( } (DataType::LargeList(field), DataType::Null) => { if set_op == SetOp::Intersect { - return make_array(&[]); + make_array(&[]); } let array = as_large_list_array(&array1)?; general_array_distinct::(array, field) From 696780057417f3a83ead05f477361dcd7a53514b Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 19 Dec 2023 21:13:09 +0100 Subject: [PATCH 08/13] fix clippy --- datafusion/physical-expr/src/array_expressions.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 976c25ac691a..f09bcd3d007f 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1800,14 +1800,14 @@ fn generic_set_lists( ) -> Result { if matches!(l.value_type(), DataType::Null) { let field = Arc::new(Field::new("item", r.value_type(), true)); - general_array_distinct::(r, &field); + return general_array_distinct::(r, &field); } else if matches!(r.value_type(), DataType::Null) { let field = Arc::new(Field::new("item", l.value_type(), true)); - general_array_distinct::(l, &field); + return general_array_distinct::(l, &field); } if l.value_type() != r.value_type() { - internal_err!("{set_op} is not implemented for '{l:?}' and '{r:?}'"); + return internal_err!("{set_op:?} is not implemented for '{l:?}' and '{r:?}'"); } let dt = l.value_type(); @@ -1883,7 +1883,7 @@ fn general_set_op( (DataType::List(field), DataType::Null) => { if set_op == SetOp::Intersect { - make_array(&[]); + return make_array(&[]); } let array = as_list_array(&array1)?; general_array_distinct::(array, field) @@ -1897,7 +1897,7 @@ fn general_set_op( } (DataType::LargeList(field), DataType::Null) => { if set_op == SetOp::Intersect { - make_array(&[]); + return make_array(&[]); } let array = as_large_list_array(&array1)?; general_array_distinct::(array, field) @@ -2305,7 +2305,7 @@ pub fn array_has(args: &[ArrayRef]) -> Result { DataType::LargeList(_) => { general_array_has_dispatch::(&args[0], &args[1], ComparisonType::Single) } - _ => internal_err!("array_has does not support type '{array_type:?}'."), + _ => exec_err!("array_has does not support type '{array_type:?}'."), } } From 2cc8f2f7e268d4de4ecfe6c2af0bb5e1fe7801fa Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 19 Dec 2023 23:30:10 +0100 Subject: [PATCH 09/13] add tests for largelist --- datafusion/sqllogictest/test_files/array.slt | 245 +++++++++++++++++++ 1 file changed, 245 insertions(+) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 84ca5da067ac..b363080e18f2 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -231,6 +231,19 @@ AS VALUES (make_array(11, 22), make_array(11), make_array(11,22,33), make_array(11,33), make_array(11,33,55), make_array(22,44,66,88,11,33)) ; +statement ok +CREATE TABLE large_array_intersect_table_1D +AS + SELECT + arrow_cast(column1, 'LargeList(Int64)') as column1, + arrow_cast(column2, 'LargeList(Int64)') as column2, + arrow_cast(column3, 'LargeList(Int64)') as column3, + arrow_cast(column4, 'LargeList(Int64)') as column4, + arrow_cast(column5, 'LargeList(Int64)') as column5, + arrow_cast(column6, 'LargeList(Int64)') as column6 +FROM array_intersect_table_1D +; + statement ok CREATE TABLE array_intersect_table_1D_Float AS VALUES @@ -238,6 +251,19 @@ AS VALUES (make_array(3.0, 4.0, 5.0), make_array(2.0), make_array(1.0,2.0,3.0,4.0), make_array(2.0,5.0), make_array(2.22, 1.11), make_array(1.11, 3.33)) ; +statement ok +CREATE TABLE large_array_intersect_table_1D_Float +AS + SELECT + arrow_cast(column1, 'LargeList(Float64)') as column1, + arrow_cast(column2, 'LargeList(Float64)') as column2, + arrow_cast(column3, 'LargeList(Float64)') as column3, + arrow_cast(column4, 'LargeList(Float64)') as column4, + arrow_cast(column5, 'LargeList(Float64)') as column5, + arrow_cast(column6, 'LargeList(Float64)') as column6 +FROM array_intersect_table_1D_Float +; + statement ok CREATE TABLE array_intersect_table_1D_Boolean AS VALUES @@ -245,6 +271,19 @@ AS VALUES (make_array(false, false, false), make_array(false), make_array(true, false, true), make_array(true, true), make_array(true, true), make_array(false,false,true)) ; +statement ok +CREATE TABLE large_array_intersect_table_1D_Boolean +AS + SELECT + arrow_cast(column1, 'LargeList(Boolean)') as column1, + arrow_cast(column2, 'LargeList(Boolean)') as column2, + arrow_cast(column3, 'LargeList(Boolean)') as column3, + arrow_cast(column4, 'LargeList(Boolean)') as column4, + arrow_cast(column5, 'LargeList(Boolean)') as column5, + arrow_cast(column6, 'LargeList(Boolean)') as column6 +FROM array_intersect_table_1D_Boolean +; + statement ok CREATE TABLE array_intersect_table_1D_UTF8 AS VALUES @@ -252,6 +291,19 @@ AS VALUES (make_array('a', 'bc', 'def'), make_array('defg'), make_array('datafusion', 'rust', 'arrow'), make_array('datafusion', 'rust', 'arrow', 'python'), make_array('rust', 'arrow'), make_array('datafusion', 'rust', 'arrow')) ; +statement ok +CREATE TABLE large_array_intersect_table_1D_UTF8 +AS + SELECT + arrow_cast(column1, 'LargeList(Utf8)') as column1, + arrow_cast(column2, 'LargeList(Utf8)') as column2, + arrow_cast(column3, 'LargeList(Utf8)') as column3, + arrow_cast(column4, 'LargeList(Utf8)') as column4, + arrow_cast(column5, 'LargeList(Utf8)') as column5, + arrow_cast(column6, 'LargeList(Utf8)') as column6 +FROM array_intersect_table_1D_UTF8 +; + statement ok CREATE TABLE array_intersect_table_2D AS VALUES @@ -259,6 +311,17 @@ AS VALUES (make_array([3,4], [5]), make_array([3,4]), make_array([1,2,3,4], [5,6,7], [8,9,10]), make_array([1,2,3], [5,6,7], [8,9,10])) ; +statement ok +CREATE TABLE large_array_intersect_table_2D +AS + SELECT + arrow_cast(column1, 'LargeList(List(Int64))') as column1, + arrow_cast(column2, 'LargeList(List(Int64))') as column2, + arrow_cast(column3, 'LargeList(List(Int64))') as column3, + arrow_cast(column4, 'LargeList(List(Int64))') as column4 +FROM array_intersect_table_2D +; + statement ok CREATE TABLE array_intersect_table_2D_float AS VALUES @@ -266,6 +329,15 @@ AS VALUES (make_array([1.0, 2.0, 3.0], [1.1, 2.2], [3.3]), make_array([1.0], [1.1, 2.2], [3.3])) ; +statement ok +CREATE TABLE large_array_intersect_table_2D_Float +AS + SELECT + arrow_cast(column1, 'LargeList(List(Float64))') as column1, + arrow_cast(column2, 'LargeList(List(Float64))') as column2 +FROM array_intersect_table_2D_Float +; + statement ok CREATE TABLE array_intersect_table_3D AS VALUES @@ -273,6 +345,15 @@ AS VALUES (make_array([[1,2]]), make_array([[1,2]])) ; +statement ok +CREATE TABLE large_array_intersect_table_3D +AS + SELECT + arrow_cast(column1, 'LargeList(List(List(Int64)))') as column1, + arrow_cast(column2, 'LargeList(List(List(Int64)))') as column2 +FROM array_intersect_table_3D +; + statement ok CREATE TABLE arrays_values_without_nulls AS VALUES @@ -2589,24 +2670,44 @@ select array_union([1, 2, 3, 4], [5, 6, 3, 4]); ---- [1, 2, 3, 4, 5, 6] +query ? +select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, 6, 3, 4], 'LargeList(Int64)')); +---- +[1, 2, 3, 4, 5, 6] + # array_union scalar function #2 query ? select array_union([1, 2, 3, 4], [5, 6, 7, 8]); ---- [1, 2, 3, 4, 5, 6, 7, 8] +query ? +select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, 6, 7, 8], 'LargeList(Int64)')); +---- +[1, 2, 3, 4, 5, 6, 7, 8] + # array_union scalar function #3 query ? select array_union([1,2,3], []); ---- [1, 2, 3] +query ? +select array_union(arrow_cast([1,2,3], 'LargeList(Int64)'), arrow_cast([], 'LargeList(Null)')); +---- +[1, 2, 3] + # array_union scalar function #4 query ? select array_union([1, 2, 3, 4], [5, 4]); ---- [1, 2, 3, 4, 5] +query ? +select array_union(arrow_cast([1, 2, 3, 4], 'LargeList(Int64)'), arrow_cast([5, 4], 'LargeList(Int64)')); +---- +[1, 2, 3, 4, 5] + # array_union scalar function #5 statement ok CREATE TABLE arrays_with_repeating_elements_for_union @@ -2623,6 +2724,13 @@ select array_union(column1, column2) from arrays_with_repeating_elements_for_uni [2, 3] [3, 4] +query ? +select array_union(arrow_cast(column1, 'LargeList(Int64)'), arrow_cast(column2, 'LargeList(Int64)')) from arrays_with_repeating_elements_for_union; +---- +[1, 2] +[2, 3] +[3, 4] + statement ok drop table arrays_with_repeating_elements_for_union; @@ -2632,24 +2740,44 @@ select array_union([], []); ---- [] +query ? +select array_union(arrow_cast([], 'LargeList(Null)'), arrow_cast([], 'LargeList(Null)')); +---- +[] + # array_union scalar function #7 query ? select array_union([[null]], []); ---- [[]] +query ? +select array_union(arrow_cast([[null]], 'LargeList(List(Null))'), arrow_cast([], 'LargeList(Null)')); +---- +[[]] + # array_union scalar function #8 query ? select array_union([null], [null]); ---- [] +query ? +select array_union(arrow_cast([[null]], 'LargeList(List(Null))'), arrow_cast([[null]], 'LargeList(List(Null))')); +---- +[[]] + # array_union scalar function #9 query ? select array_union(null, []); ---- [] +query ? +select array_union(null, arrow_cast([], 'LargeList(Null)')); +---- +[] + # array_union scalar function #10 query ? select array_union(null, null); @@ -2662,24 +2790,44 @@ select array_union([1, 1, 2, 2, 3, 3], null); ---- [1, 2, 3] +query ? +select array_union(arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'), null); +---- +[1, 2, 3] + # array_union scalar function #12 query ? select array_union(null, [1, 1, 2, 2, 3, 3]); ---- [1, 2, 3] +query ? +select array_union(null, arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)')); +---- +[1, 2, 3] + # array_union scalar function #13 query ? select array_union([1.2, 3.0], [1.2, 3.0, 5.7]); ---- [1.2, 3.0, 5.7] +query ? +select array_union(arrow_cast([1.2, 3.0], 'LargeList(Float64)'), arrow_cast([1.2, 3.0, 5.7], 'LargeList(Float64)')); +---- +[1.2, 3.0, 5.7] + # array_union scalar function #14 query ? select array_union(['hello'], ['hello','datafusion']); ---- [hello, datafusion] +query ? +select array_union(arrow_cast(['hello'], 'LargeList(Utf8)'), arrow_cast(['hello','datafusion'], 'LargeList(Utf8)')); +---- +[hello, datafusion] + # list_to_string scalar function #4 (function alias `array_to_string`) query TTT @@ -3542,6 +3690,15 @@ from array_intersect_table_1D; [1] [1, 3] [1, 3] [11] [11, 33] [11, 33] +query ??? +select array_intersect(column1, column2), + array_intersect(column3, column4), + array_intersect(column5, column6) +from large_array_intersect_table_1D; +---- +[1] [1, 3] [1, 3] +[11] [11, 33] [11, 33] + query ??? select array_intersect(column1, column2), array_intersect(column3, column4), @@ -3560,6 +3717,15 @@ from array_intersect_table_1D_Boolean; [] [false, true] [false] [false] [true] [true] +query ??? +select array_intersect(column1, column2), + array_intersect(column3, column4), + array_intersect(column5, column6) +from large_array_intersect_table_1D_Boolean; +---- +[] [false, true] [false] +[false] [true] [true] + query ??? select array_intersect(column1, column2), array_intersect(column3, column4), @@ -3569,6 +3735,15 @@ from array_intersect_table_1D_UTF8; [bc] [arrow, rust] [] [] [arrow, datafusion, rust] [arrow, rust] +query ??? +select array_intersect(column1, column2), + array_intersect(column3, column4), + array_intersect(column5, column6) +from large_array_intersect_table_1D_UTF8; +---- +[bc] [arrow, rust] [] +[] [arrow, datafusion, rust] [arrow, rust] + query ?? select array_intersect(column1, column2), array_intersect(column3, column4) @@ -3577,6 +3752,15 @@ from array_intersect_table_2D; [] [[4, 5], [6, 7]] [[3, 4]] [[5, 6, 7], [8, 9, 10]] +query ?? +select array_intersect(column1, column2), + array_intersect(column3, column4) +from large_array_intersect_table_2D; +---- +[] [[4, 5], [6, 7]] +[[3, 4]] [[5, 6, 7], [8, 9, 10]] + + query ? select array_intersect(column1, column2) from array_intersect_table_2D_float; @@ -3584,6 +3768,13 @@ from array_intersect_table_2D_float; [[1.1, 2.2], [3.3]] [[1.1, 2.2], [3.3]] +query ? +select array_intersect(column1, column2) +from large_array_intersect_table_2D_float; +---- +[[1.1, 2.2], [3.3]] +[[1.1, 2.2], [3.3]] + query ? select array_intersect(column1, column2) from array_intersect_table_3D; @@ -3591,6 +3782,13 @@ from array_intersect_table_3D; [] [[[1, 2]]] +query ? +select array_intersect(column1, column2) +from large_array_intersect_table_3D; +---- +[] +[[[1, 2]]] + query ?????? SELECT array_intersect(make_array(1,2,3), make_array(2,3,4)), array_intersect(make_array(1,3,5), make_array(2,4,6)), @@ -3602,31 +3800,67 @@ SELECT array_intersect(make_array(1,2,3), make_array(2,3,4)), ---- [2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] +query ?????? +SELECT array_intersect(arrow_cast(make_array(1,2,3), 'LargeList(Int64)'), arrow_cast(make_array(2,3,4), 'LargeList(Int64)')), + array_intersect(arrow_cast(make_array(1,3,5), 'LargeList(Int64)'), arrow_cast(make_array(2,4,6), 'LargeList(Int64)')), + array_intersect(arrow_cast(make_array('aa','bb','cc'), 'LargeList(Utf8)'), arrow_cast(make_array('cc','aa','dd'), 'LargeList(Utf8)')), + array_intersect(arrow_cast(make_array(true, false), 'LargeList(Boolean)'), arrow_cast(make_array(true), 'LargeList(Boolean)')), + array_intersect(arrow_cast(make_array(1.1, 2.2, 3.3), 'LargeList(Float64)'), arrow_cast(make_array(2.2, 3.3, 4.4), 'LargeList(Float64)')), + array_intersect(arrow_cast(make_array([1, 1], [2, 2], [3, 3]), 'LargeList(List(Int64))'), arrow_cast(make_array([2, 2], [3, 3], [4, 4]), 'LargeList(List(Int64))')) +; +---- +[2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] + query ? select array_intersect([], []); ---- [] +query ? +select array_intersect(arrow_cast([], 'LargeList(Null)'), arrow_cast([], 'LargeList(Null)')); +---- +[] + query ? select array_intersect([1, 1, 2, 2, 3, 3], null); ---- [] +query ? +select array_intersect(arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)'), null); +---- +[] + query ? select array_intersect(null, [1, 1, 2, 2, 3, 3]); ---- NULL +query ? +select array_intersect(null, arrow_cast([1, 1, 2, 2, 3, 3], 'LargeList(Int64)')); +---- +NULL + query ? select array_intersect([], null); ---- [] +query ? +select array_intersect(arrow_cast([], 'LargeList(Null)'), null); +---- +[] + query ? select array_intersect(null, []); ---- NULL +query ? +select array_intersect(null, arrow_cast([], 'LargeList(Null)')); +---- +NULL + query ? select array_intersect(null, null); ---- @@ -3643,6 +3877,17 @@ SELECT list_intersect(make_array(1,2,3), make_array(2,3,4)), ---- [2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] +query ?????? +SELECT list_intersect(arrow_cast(make_array(1,2,3), 'LargeList(Int64)'), arrow_cast(make_array(2,3,4), 'LargeList(Int64)')), + list_intersect(arrow_cast(make_array(1,3,5), 'LargeList(Int64)'), arrow_cast(make_array(2,4,6), 'LargeList(Int64)')), + list_intersect(arrow_cast(make_array('aa','bb','cc'), 'LargeList(Utf8)'), arrow_cast(make_array('cc','aa','dd'), 'LargeList(Utf8)')), + list_intersect(arrow_cast(make_array(true, false), 'LargeList(Boolean)'), arrow_cast(make_array(true), 'LargeList(Boolean)')), + list_intersect(arrow_cast(make_array(1.1, 2.2, 3.3), 'LargeList(Float64)'), arrow_cast(make_array(2.2, 3.3, 4.4), 'LargeList(Float64)')), + list_intersect(arrow_cast(make_array([1, 1], [2, 2], [3, 3]), 'LargeList(List(Int64))'), arrow_cast(make_array([2, 2], [3, 3], [4, 4]), 'LargeList(List(Int64))')) +; +---- +[2, 3] [] [aa, cc] [true] [2.2, 3.3] [[2, 2], [3, 3]] + query BBBB select list_has_all(make_array(1,2,3), make_array(4,5,6)), list_has_all(make_array(1,2,3), make_array(1,2)), From ccbc651ffe325ff19daff9d0f54556f5d29c334d Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Tue, 19 Dec 2023 23:39:18 +0100 Subject: [PATCH 10/13] fix clippy --- datafusion/physical-expr/src/array_expressions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index f09bcd3d007f..bffab70c5d82 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1902,7 +1902,7 @@ fn general_set_op( let array = as_large_list_array(&array1)?; general_array_distinct::(array, field) } - (DataType::Null, DataType::Null) => return Ok(new_empty_array(&DataType::Null)), + (DataType::Null, DataType::Null) => Ok(new_empty_array(&DataType::Null)), (DataType::List(_), DataType::List(_)) => { let array1 = as_list_array(&array1)?; From df0d5a80563bbedc01797588424c7f50b9462510 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Wed, 20 Dec 2023 15:19:21 +0100 Subject: [PATCH 11/13] Add field parameter to generic_set_lists() function --- datafusion/physical-expr/src/array_expressions.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index bffab70c5d82..98077505bfee 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1796,6 +1796,7 @@ impl Display for SetOp { fn generic_set_lists( l: &GenericListArray, r: &GenericListArray, + field: Arc, set_op: SetOp, ) -> Result { if matches!(l.value_type(), DataType::Null) { @@ -1859,7 +1860,6 @@ fn generic_set_lists( } } - let field = Arc::new(Field::new("item", dt, true)); let offsets = OffsetBuffer::new(offsets.into()); let new_arrays_ref = new_arrays.iter().map(|v| v.as_ref()).collect::>(); let values = compute::concat(&new_arrays_ref)?; @@ -1904,15 +1904,15 @@ fn general_set_op( } (DataType::Null, DataType::Null) => Ok(new_empty_array(&DataType::Null)), - (DataType::List(_), DataType::List(_)) => { + (DataType::List(field), DataType::List(_)) => { let array1 = as_list_array(&array1)?; let array2 = as_list_array(&array2)?; - generic_set_lists::(array1, array2, set_op) + generic_set_lists::(array1, array2, field.clone(), set_op) } - (DataType::LargeList(_), DataType::LargeList(_)) => { + (DataType::LargeList(field), DataType::LargeList(_)) => { let array1 = as_large_list_array(&array1)?; let array2 = as_large_list_array(&array2)?; - generic_set_lists::(array1, array2, set_op) + generic_set_lists::(array1, array2, field.clone(), set_op) } (data_type1, data_type2) => { internal_err!( From e95eb38c04ab649be36049d8cacb2e9a04e9c3a5 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Wed, 20 Dec 2023 15:19:31 +0100 Subject: [PATCH 12/13] Add large array drop statements --- datafusion/sqllogictest/test_files/array.slt | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index b363080e18f2..4c4adbabfda5 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -4367,24 +4367,45 @@ drop table array_has_table_3D; statement ok drop table array_intersect_table_1D; +statement ok +drop table large_array_intersect_table_1D; + statement ok drop table array_intersect_table_1D_Float; +statement ok +drop table large_array_intersect_table_1D_Float; + statement ok drop table array_intersect_table_1D_Boolean; +statement ok +drop table large_array_intersect_table_1D_Boolean; + statement ok drop table array_intersect_table_1D_UTF8; +statement ok +drop table large_array_intersect_table_1D_UTF8; + statement ok drop table array_intersect_table_2D; +statement ok +drop table large_array_intersect_table_2D; + statement ok drop table array_intersect_table_2D_float; +statement ok +drop table large_array_intersect_table_2D_float; + statement ok drop table array_intersect_table_3D; +statement ok +drop table large_array_intersect_table_3D; + statement ok drop table arrays_values_without_nulls; From c34e6c0b2e235c664f85bbec786de7cf74634b8f Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Wed, 20 Dec 2023 15:20:34 +0100 Subject: [PATCH 13/13] fix clippy --- datafusion/physical-expr/src/array_expressions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 98077505bfee..274d1db4eb0d 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -1816,7 +1816,7 @@ fn generic_set_lists( let mut offsets = vec![OffsetSize::usize_as(0)]; let mut new_arrays = vec![]; - let converter = RowConverter::new(vec![SortField::new(dt.clone())])?; + let converter = RowConverter::new(vec![SortField::new(dt)])?; for (first_arr, second_arr) in l.iter().zip(r.iter()) { if let (Some(first_arr), Some(second_arr)) = (first_arr, second_arr) { let l_values = converter.convert_columns(&[first_arr])?;