From 56514bebca941f9fed91af42d8edc982c7ef1d11 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 16 Oct 2023 17:13:51 -0400 Subject: [PATCH] Support binary argumnets (via coercion) for `like` and `ilike` and string functions fixup --- datafusion/expr/src/built_in_function.rs | 23 +++- datafusion/expr/src/type_coercion/binary.rs | 113 +++++++++++++++--- .../expr/src/type_coercion/functions.rs | 11 +- datafusion/sqllogictest/test_files/binary.slt | 41 +++++-- 4 files changed, 159 insertions(+), 29 deletions(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index d7cfd9f4209e..942d83abe94d 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -1523,16 +1523,29 @@ impl FromStr for BuiltinScalarFunction { } } +/// Creates a function that returns the return type of a string function given +/// the type of its first argument. +/// +/// If the input type is `LargeUtf8` or `LargeBinary` the return type is +/// `$largeUtf8Type`, +/// +/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, macro_rules! make_utf8_to_return_type { ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { fn $FUNC(arg_type: &DataType, name: &str) -> Result { Ok(match arg_type { - DataType::LargeUtf8 => $largeUtf8Type, - DataType::Utf8 => $utf8Type, + DataType::LargeUtf8 => $largeUtf8Type, + // LargeBinary inputs are automatically coerced to Utf8 + DataType::LargeBinary => $largeUtf8Type, + DataType::Utf8 => $utf8Type, + // Binary inputs are automatically coerced to Utf8 + DataType::Binary => $utf8Type, DataType::Null => DataType::Null, DataType::Dictionary(_, value_type) => match **value_type { - DataType::LargeUtf8 => $largeUtf8Type, + DataType::LargeUtf8 => $largeUtf8Type, + DataType::LargeBinary => $largeUtf8Type, DataType::Utf8 => $utf8Type, + DataType::Binary => $utf8Type, DataType::Null => DataType::Null, _ => { return plan_err!( @@ -1553,8 +1566,10 @@ macro_rules! make_utf8_to_return_type { } }; } - +// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); + +// `utf8_to_str_type`: returns either a Int32 or Int64 based on the input type size. make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result { diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index ba3c21a15d4f..f94ce614605c 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -29,7 +29,11 @@ use datafusion_common::{plan_err, DataFusionError}; use crate::Operator; -/// The type signature of an instantiation of binary expression +/// The type signature of an instantiation of binary operator expression such as +/// `lhs + rhs` +/// +/// Note this is different than [`crate::signature::Signature`] which +/// describes the type signature of a function. struct Signature { /// The type to coerce the left argument to lhs: DataType, @@ -648,8 +652,9 @@ fn string_concat_internal_coercion( } } -/// Coercion rules for Strings: the type that both lhs and rhs can be -/// casted to for the purpose of a string computation +/// Coercion rules for string types (Utf8/LargeUtf8): If at least on argument is +/// a string type and both arguments can be coerced into a string type, coerce +/// to string type. fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -665,8 +670,30 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option } } -/// Coercion rules for Binaries: the type that both lhs and rhs can be -/// casted to for the purpose of a computation +/// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8): +/// If one argument is binary and the other is a string then coerce to string +/// (e.g. for `like`) +fn binary_to_string_coercion( + lhs_type: &DataType, + rhs_type: &DataType, +) -> Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (Binary, Utf8) => Some(Utf8), + (Binary, LargeUtf8) => Some(LargeUtf8), + (LargeBinary, Utf8) => Some(LargeUtf8), + (LargeBinary, LargeUtf8) => Some(LargeUtf8), + (Utf8, Binary) => Some(Utf8), + (Utf8, LargeBinary) => Some(LargeUtf8), + (LargeUtf8, Binary) => Some(LargeUtf8), + (LargeUtf8, LargeBinary) => Some(LargeUtf8), + _ => None, + } +} + +/// Coercion rules for binary types (Binary/LargeBinary): If at least on argument is +/// a binary type and both arguments can be coerced into a binary type, coerce +/// to binary type. fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -681,6 +708,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option /// This is a union of string coercion rules and dictionary coercion rules pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) + .or_else(|| binary_to_string_coercion(lhs_type, rhs_type)) .or_else(|| dictionary_coercion(lhs_type, rhs_type, false)) .or_else(|| null_coercion(lhs_type, rhs_type)) } @@ -763,7 +791,7 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { match (lhs_type, rhs_type) { @@ -917,11 +945,33 @@ mod tests { ); } + /// Test coercion rules for binary operators + /// + /// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that the + /// the result type is `$RESULT_TYPE` macro_rules! test_coercion_binary_rule { - ($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{ - let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?; - assert_eq!(lhs, $C_TYPE); - assert_eq!(rhs, $C_TYPE); + ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{ + let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?; + assert_eq!(lhs, $RESULT_TYPE); + assert_eq!(rhs, $RESULT_TYPE); + }}; + } + + /// Test coercion rules for like + /// + /// Applies coercion rules for both + /// * `$LHS_TYPE LIKE $RHS_TYPE` + /// * `$RHS_TYPE LIKE $LHS_TYPE` + /// + /// And asserts the result type is `$RESULT_TYPE` + macro_rules! test_like_rule { + ($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{ + println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE); + let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE); + assert_eq!(result, $RESULT_TYPE); + // reverse the order + let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE); + assert_eq!(result, $RESULT_TYPE); }}; } @@ -948,11 +998,46 @@ mod tests { } #[test] - fn test_type_coercion() -> Result<()> { - // test like coercion rule - let result = like_coercion(&DataType::Utf8, &DataType::Utf8); - assert_eq!(result, Some(DataType::Utf8)); + fn test_like_coercion() { + // string coerce to strings + test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8)); + test_like_rule!( + DataType::LargeUtf8, + DataType::Utf8, + Some(DataType::LargeUtf8) + ); + test_like_rule!( + DataType::Utf8, + DataType::LargeUtf8, + Some(DataType::LargeUtf8) + ); + test_like_rule!( + DataType::LargeUtf8, + DataType::LargeUtf8, + Some(DataType::LargeUtf8) + ); + + // Also coerce binary to strings + test_like_rule!(DataType::Binary, DataType::Utf8, Some(DataType::Utf8)); + test_like_rule!( + DataType::LargeBinary, + DataType::Utf8, + Some(DataType::LargeUtf8) + ); + test_like_rule!( + DataType::Binary, + DataType::LargeUtf8, + Some(DataType::LargeUtf8) + ); + test_like_rule!( + DataType::LargeBinary, + DataType::LargeUtf8, + Some(DataType::LargeUtf8) + ); + } + #[test] + fn test_type_coercion() -> Result<()> { test_coercion_binary_rule!( DataType::Utf8, DataType::Date32, diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 17ca40236d41..7771ea92abff 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -46,6 +46,8 @@ pub fn data_types( return Ok(current_types.to_vec()); } + // Try and coerce the argument types to match the signature, returning the + // coerced types from the first matching signature. for valid_types in valid_types { if let Some(types) = maybe_data_types(&valid_types, current_types) { return Ok(types); @@ -60,6 +62,7 @@ pub fn data_types( ) } +/// Returns a Vec of all possible valid argument types for the given signature. fn get_valid_types( signature: &TypeSignature, current_types: &[DataType], @@ -104,7 +107,12 @@ fn get_valid_types( Ok(valid_types) } -/// Try to coerce current_types into valid_types. +/// Try to coerce the current argument types to match the given `valid_types`. +/// +/// For example, if a function `func` accepts arguments of `(int64, int64)`, +/// but was called with `(int32, int64)`, this function could match the +/// valid_types by by coercing the first argument to `int64`, and would return +/// `Some([int64, int64])`. fn maybe_data_types( valid_types: &[DataType], current_types: &[DataType], @@ -220,6 +228,7 @@ fn coerced_from<'a>( Some(type_into.clone()) } Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()), + // Any type can be coerced into strings Utf8 | LargeUtf8 => Some(type_into.clone()), Null if can_cast_types(type_from, type_into) => Some(type_into.clone()), diff --git a/datafusion/sqllogictest/test_files/binary.slt b/datafusion/sqllogictest/test_files/binary.slt index 38f8a2e14ffc..0568ada3ad7d 100644 --- a/datafusion/sqllogictest/test_files/binary.slt +++ b/datafusion/sqllogictest/test_files/binary.slt @@ -217,30 +217,51 @@ SELECT largebinary FROM t ORDER BY largebinary; NULL # LIKE -# https://github.com/apache/arrow-datafusion/issues/7342 -query error DataFusion error: type_coercion -SELECT binary FROM t where binary LIKE '%F'; - -query error DataFusion error: type_coercion -SELECT largebinary FROM t where largebinary LIKE '%F'; +query ? +SELECT binary FROM t where binary LIKE '%F%'; +---- +466f6f +466f6f426172 +query ? +SELECT largebinary FROM t where largebinary LIKE '%F%'; +---- +466f6f +466f6f426172 # character_length function -# https://github.com/apache/arrow-datafusion/issues/7344 -query error DataFusion error: Error during planning: The "character_length" function can only accept strings, but got Binary\. +query TITI SELECT cast(binary as varchar) as str, character_length(binary) as binary_len, cast(largebinary as varchar) as large_str, character_length(binary) as largebinary_len from t; +---- +Foo 3 Foo 3 +NULL NULL NULL NULL +Bar 3 Bar 3 +FooBar 6 FooBar 6 + +query I +SELECT character_length(X'20'); +---- +1 + +# still errors on values that can not be coerced to utf8 +query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes from index 0 +SELECT character_length(X'c328'); # regexp_replace -# https://github.com/apache/arrow-datafusion/issues/7345 -query error DataFusion error: Error during planning: The "regexp_replace" function can only accept strings, but got Binary\. +query TTTT SELECT cast(binary as varchar) as str, regexp_replace(binary, 'F', 'f') as binary_replaced, cast(largebinary as varchar) as large_str, regexp_replace(largebinary, 'F', 'f') as large_binary_replaced from t; +---- +Foo foo Foo foo +NULL NULL NULL NULL +Bar Bar Bar Bar +FooBar fooBar FooBar fooBar