-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support Binary
/LargeBinary
--> Utf8
/LargeUtf8
in ilike and string functions
#7840
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1523,16 +1523,29 @@ impl FromStr for BuiltinScalarFunction { | |
} | ||
} | ||
|
||
/// Creates a function that returns the return type of a string function given | ||
/// the type of its first argument. | ||
/// | ||
/// If the input type is `LargeUtf8` or `LargeBinary` the return type is | ||
/// `$largeUtf8Type`, | ||
/// | ||
/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, | ||
macro_rules! make_utf8_to_return_type { | ||
($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { | ||
fn $FUNC(arg_type: &DataType, name: &str) -> Result<DataType> { | ||
Ok(match arg_type { | ||
DataType::LargeUtf8 => $largeUtf8Type, | ||
DataType::Utf8 => $utf8Type, | ||
DataType::LargeUtf8 => $largeUtf8Type, | ||
// LargeBinary inputs are automatically coerced to Utf8 | ||
DataType::LargeBinary => $largeUtf8Type, | ||
DataType::Utf8 => $utf8Type, | ||
// Binary inputs are automatically coerced to Utf8 | ||
DataType::Binary => $utf8Type, | ||
DataType::Null => DataType::Null, | ||
DataType::Dictionary(_, value_type) => match **value_type { | ||
DataType::LargeUtf8 => $largeUtf8Type, | ||
DataType::LargeUtf8 => $largeUtf8Type, | ||
DataType::LargeBinary => $largeUtf8Type, | ||
DataType::Utf8 => $utf8Type, | ||
DataType::Binary => $utf8Type, | ||
DataType::Null => DataType::Null, | ||
_ => { | ||
return plan_err!( | ||
|
@@ -1553,8 +1566,10 @@ macro_rules! make_utf8_to_return_type { | |
} | ||
}; | ||
} | ||
|
||
// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. | ||
make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); | ||
|
||
// `utf8_to_str_type`: returns either a Int32 or Int64 based on the input type size. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice find -- filed #7861 |
||
make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); | ||
|
||
fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> Result<DataType> { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,11 @@ use datafusion_common::{plan_err, DataFusionError}; | |
|
||
use crate::Operator; | ||
|
||
/// The type signature of an instantiation of binary expression | ||
/// The type signature of an instantiation of binary operator expression such as | ||
/// `lhs + rhs` | ||
/// | ||
/// Note this is different than [`crate::signature::Signature`] which | ||
/// describes the type signature of a function. | ||
struct Signature { | ||
/// The type to coerce the left argument to | ||
lhs: DataType, | ||
|
@@ -648,8 +652,9 @@ fn string_concat_internal_coercion( | |
} | ||
} | ||
|
||
/// Coercion rules for Strings: the type that both lhs and rhs can be | ||
/// casted to for the purpose of a string computation | ||
/// Coercion rules for string types (Utf8/LargeUtf8): If at least on argument is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. at least on argument -> at least one argument |
||
/// a string type and both arguments can be coerced into a string type, coerce | ||
/// to string type. | ||
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
use arrow::datatypes::DataType::*; | ||
match (lhs_type, rhs_type) { | ||
|
@@ -665,8 +670,30 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> | |
} | ||
} | ||
|
||
/// Coercion rules for Binaries: the type that both lhs and rhs can be | ||
/// casted to for the purpose of a computation | ||
/// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8): | ||
/// If one argument is binary and the other is a string then coerce to string | ||
/// (e.g. for `like`) | ||
fn binary_to_string_coercion( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the new rule, only used for I originally tried modifying
|
||
lhs_type: &DataType, | ||
rhs_type: &DataType, | ||
) -> Option<DataType> { | ||
use arrow::datatypes::DataType::*; | ||
match (lhs_type, rhs_type) { | ||
(Binary, Utf8) => Some(Utf8), | ||
(Binary, LargeUtf8) => Some(LargeUtf8), | ||
(LargeBinary, Utf8) => Some(LargeUtf8), | ||
(LargeBinary, LargeUtf8) => Some(LargeUtf8), | ||
(Utf8, Binary) => Some(Utf8), | ||
(Utf8, LargeBinary) => Some(LargeUtf8), | ||
(LargeUtf8, Binary) => Some(LargeUtf8), | ||
(LargeUtf8, LargeBinary) => Some(LargeUtf8), | ||
_ => None, | ||
} | ||
} | ||
|
||
/// Coercion rules for binary types (Binary/LargeBinary): If at least on argument is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. at least on argument -> at least one argument |
||
/// a binary type and both arguments can be coerced into a binary type, coerce | ||
/// to binary type. | ||
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
use arrow::datatypes::DataType::*; | ||
match (lhs_type, rhs_type) { | ||
|
@@ -681,6 +708,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> | |
/// This is a union of string coercion rules and dictionary coercion rules | ||
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
string_coercion(lhs_type, rhs_type) | ||
.or_else(|| binary_to_string_coercion(lhs_type, rhs_type)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the fix for |
||
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false)) | ||
.or_else(|| null_coercion(lhs_type, rhs_type)) | ||
} | ||
|
@@ -763,7 +791,7 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp | |
} | ||
} | ||
|
||
/// coercion rules from NULL type. Since NULL can be casted to most of types in arrow, | ||
/// coercion rules from NULL type. Since NULL can be casted to any other type in arrow, | ||
/// either lhs or rhs is NULL, if NULL can be casted to type of the other side, the coercion is valid. | ||
fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
match (lhs_type, rhs_type) { | ||
|
@@ -917,11 +945,33 @@ mod tests { | |
); | ||
} | ||
|
||
/// Test coercion rules for binary operators | ||
/// | ||
/// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that the | ||
/// the result type is `$RESULT_TYPE` | ||
macro_rules! test_coercion_binary_rule { | ||
($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{ | ||
let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?; | ||
assert_eq!(lhs, $C_TYPE); | ||
assert_eq!(rhs, $C_TYPE); | ||
($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the same logic, but I renamed the arguments for clarity |
||
let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?; | ||
assert_eq!(lhs, $RESULT_TYPE); | ||
assert_eq!(rhs, $RESULT_TYPE); | ||
}}; | ||
} | ||
|
||
/// Test coercion rules for like | ||
/// | ||
/// Applies coercion rules for both | ||
/// * `$LHS_TYPE LIKE $RHS_TYPE` | ||
/// * `$RHS_TYPE LIKE $LHS_TYPE` | ||
/// | ||
/// And asserts the result type is `$RESULT_TYPE` | ||
macro_rules! test_like_rule { | ||
($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{ | ||
println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE); | ||
let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE); | ||
assert_eq!(result, $RESULT_TYPE); | ||
// reverse the order | ||
let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE); | ||
assert_eq!(result, $RESULT_TYPE); | ||
}}; | ||
} | ||
|
||
|
@@ -948,11 +998,46 @@ mod tests { | |
} | ||
|
||
#[test] | ||
fn test_type_coercion() -> Result<()> { | ||
// test like coercion rule | ||
let result = like_coercion(&DataType::Utf8, &DataType::Utf8); | ||
assert_eq!(result, Some(DataType::Utf8)); | ||
fn test_like_coercion() { | ||
// string coerce to strings | ||
test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8)); | ||
test_like_rule!( | ||
DataType::LargeUtf8, | ||
DataType::Utf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
test_like_rule!( | ||
DataType::Utf8, | ||
DataType::LargeUtf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
test_like_rule!( | ||
DataType::LargeUtf8, | ||
DataType::LargeUtf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
|
||
// Also coerce binary to strings | ||
test_like_rule!(DataType::Binary, DataType::Utf8, Some(DataType::Utf8)); | ||
test_like_rule!( | ||
DataType::LargeBinary, | ||
DataType::Utf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
test_like_rule!( | ||
DataType::Binary, | ||
DataType::LargeUtf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
test_like_rule!( | ||
DataType::LargeBinary, | ||
DataType::LargeUtf8, | ||
Some(DataType::LargeUtf8) | ||
); | ||
} | ||
|
||
#[test] | ||
fn test_type_coercion() -> Result<()> { | ||
test_coercion_binary_rule!( | ||
DataType::Utf8, | ||
DataType::Date32, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,8 @@ pub fn data_types( | |
return Ok(current_types.to_vec()); | ||
} | ||
|
||
// Try and coerce the argument types to match the signature, returning the | ||
// coerced types from the first matching signature. | ||
for valid_types in valid_types { | ||
if let Some(types) = maybe_data_types(&valid_types, current_types) { | ||
return Ok(types); | ||
|
@@ -60,6 +62,7 @@ pub fn data_types( | |
) | ||
} | ||
|
||
/// Returns a Vec of all possible valid argument types for the given signature. | ||
fn get_valid_types( | ||
signature: &TypeSignature, | ||
current_types: &[DataType], | ||
|
@@ -104,7 +107,12 @@ fn get_valid_types( | |
Ok(valid_types) | ||
} | ||
|
||
/// Try to coerce current_types into valid_types. | ||
/// Try to coerce the current argument types to match the given `valid_types`. | ||
/// | ||
/// For example, if a function `func` accepts arguments of `(int64, int64)`, | ||
/// but was called with `(int32, int64)`, this function could match the | ||
/// valid_types by by coercing the first argument to `int64`, and would return | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. valid_types by by coercing -> valid_types by coercing |
||
/// `Some([int64, int64])`. | ||
fn maybe_data_types( | ||
valid_types: &[DataType], | ||
current_types: &[DataType], | ||
|
@@ -220,6 +228,7 @@ fn coerced_from<'a>( | |
Some(type_into.clone()) | ||
} | ||
Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()), | ||
// Any type can be coerced into strings | ||
Utf8 | LargeUtf8 => Some(type_into.clone()), | ||
Null if can_cast_types(type_from, type_into) => Some(type_into.clone()), | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -217,30 +217,51 @@ SELECT largebinary FROM t ORDER BY largebinary; | |
NULL | ||
|
||
# LIKE | ||
# https://github.com/apache/arrow-datafusion/issues/7342 | ||
query error DataFusion error: type_coercion | ||
SELECT binary FROM t where binary LIKE '%F'; | ||
|
||
query error DataFusion error: type_coercion | ||
SELECT largebinary FROM t where largebinary LIKE '%F'; | ||
query ? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They work 🎉 |
||
SELECT binary FROM t where binary LIKE '%F%'; | ||
---- | ||
466f6f | ||
466f6f426172 | ||
|
||
query ? | ||
SELECT largebinary FROM t where largebinary LIKE '%F%'; | ||
---- | ||
466f6f | ||
466f6f426172 | ||
|
||
# character_length function | ||
# https://github.com/apache/arrow-datafusion/issues/7344 | ||
query error DataFusion error: Error during planning: The "character_length" function can only accept strings, but got Binary\. | ||
query TITI | ||
SELECT | ||
cast(binary as varchar) as str, | ||
character_length(binary) as binary_len, | ||
cast(largebinary as varchar) as large_str, | ||
character_length(binary) as largebinary_len | ||
from t; | ||
---- | ||
Foo 3 Foo 3 | ||
NULL NULL NULL NULL | ||
Bar 3 Bar 3 | ||
FooBar 6 FooBar 6 | ||
|
||
query I | ||
SELECT character_length(X'20'); | ||
---- | ||
1 | ||
|
||
# still errors on values that can not be coerced to utf8 | ||
query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes from index 0 | ||
SELECT character_length(X'c328'); | ||
|
||
# regexp_replace | ||
# https://github.com/apache/arrow-datafusion/issues/7345 | ||
query error DataFusion error: Error during planning: The "regexp_replace" function can only accept strings, but got Binary\. | ||
query TTTT | ||
SELECT | ||
cast(binary as varchar) as str, | ||
regexp_replace(binary, 'F', 'f') as binary_replaced, | ||
cast(largebinary as varchar) as large_str, | ||
regexp_replace(largebinary, 'F', 'f') as large_binary_replaced | ||
from t; | ||
---- | ||
Foo foo Foo foo | ||
NULL NULL NULL NULL | ||
Bar Bar Bar Bar | ||
FooBar fooBar FooBar fooBar |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here is the fix for string functions taking
binary
argumentsThe new cases for
Binary/LargeBinary
are all that is new here -- the rest is just comments