-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Signature::Coercible with user defined implicit casting #14440
base: main
Are you sure you want to change the base?
Changes from all commits
386c7ed
579ffef
104da43
aae48ff
f585136
bad7348
c99e986
07f97d0
da84394
e0a889e
7a78a6d
2f8c2ad
f02bf6e
8054915
62da381
231d75b
44250fc
2f6b5d6
5387e5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,11 +19,13 @@ | |
//! and return types of functions in DataFusion. | ||
|
||
use std::fmt::Display; | ||
use std::hash::Hash; | ||
use std::num::NonZeroUsize; | ||
|
||
use crate::type_coercion::aggregates::NUMERICS; | ||
use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; | ||
use datafusion_common::types::{LogicalTypeRef, NativeType}; | ||
use indexmap::IndexSet; | ||
use itertools::Itertools; | ||
|
||
/// Constant that is used as a placeholder for any valid timezone. | ||
|
@@ -127,12 +129,11 @@ pub enum TypeSignature { | |
Exact(Vec<DataType>), | ||
/// One or more arguments belonging to the [`TypeSignatureClass`], in order. | ||
/// | ||
/// For example, `Coercible(vec![logical_float64()])` accepts | ||
/// arguments like `vec![Int32]` or `vec![Float32]` | ||
/// since i32 and f32 can be cast to f64 | ||
/// [`Coercion`] contains not only the desired type but also the allowed casts. | ||
/// For example, a function may expect a string argument while also allowing that argument to be cast from a binary type. | ||
/// | ||
/// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`]. | ||
Coercible(Vec<TypeSignatureClass>), | ||
Coercible(Vec<Coercion>), | ||
/// One or more arguments coercible to a single, comparable type. | ||
/// | ||
/// Each argument will be coerced to a single type using the | ||
|
@@ -209,14 +210,13 @@ impl TypeSignature { | |
#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] | ||
pub enum TypeSignatureClass { | ||
Timestamp, | ||
Date, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can use NativeType::Date instead, not need |
||
Time, | ||
Interval, | ||
Duration, | ||
Native(LogicalTypeRef), | ||
// TODO: | ||
// Numeric | ||
// Integer | ||
Integer, | ||
} | ||
|
||
impl Display for TypeSignatureClass { | ||
|
@@ -310,8 +310,8 @@ impl TypeSignature { | |
TypeSignature::Comparable(num) => { | ||
vec![format!("Comparable({num})")] | ||
} | ||
TypeSignature::Coercible(types) => { | ||
vec![Self::join_types(types, ", ")] | ||
TypeSignature::Coercible(coercions) => { | ||
vec![Self::join_types(coercions, ", ")] | ||
} | ||
TypeSignature::Exact(types) => { | ||
vec![Self::join_types(types, ", ")] | ||
|
@@ -365,7 +365,12 @@ impl TypeSignature { | |
} | ||
} | ||
|
||
/// get all possible types for the given `TypeSignature` | ||
/// This function is used specifically internally for `information_schema` | ||
/// We suggest not to rely on this function | ||
/// | ||
/// Get all possible types for `information_schema` from the given `TypeSignature` | ||
// | ||
// TODO: Make this function private | ||
pub fn get_possible_types(&self) -> Vec<Vec<DataType>> { | ||
match self { | ||
TypeSignature::Exact(types) => vec![types.clone()], | ||
|
@@ -378,31 +383,24 @@ impl TypeSignature { | |
.cloned() | ||
.map(|data_type| vec![data_type; *arg_count]) | ||
.collect(), | ||
TypeSignature::Coercible(types) => types | ||
TypeSignature::Coercible(coercions) => coercions | ||
.iter() | ||
.map(|logical_type| match logical_type { | ||
TypeSignatureClass::Native(l) => get_data_types(l.native()), | ||
TypeSignatureClass::Timestamp => { | ||
vec![ | ||
DataType::Timestamp(TimeUnit::Nanosecond, None), | ||
DataType::Timestamp( | ||
TimeUnit::Nanosecond, | ||
Some(TIMEZONE_WILDCARD.into()), | ||
), | ||
] | ||
} | ||
TypeSignatureClass::Date => { | ||
vec![DataType::Date64] | ||
} | ||
TypeSignatureClass::Time => { | ||
vec![DataType::Time64(TimeUnit::Nanosecond)] | ||
} | ||
TypeSignatureClass::Interval => { | ||
vec![DataType::Interval(IntervalUnit::DayTime)] | ||
} | ||
TypeSignatureClass::Duration => { | ||
vec![DataType::Duration(TimeUnit::Nanosecond)] | ||
.map(|c| { | ||
let mut all_types: IndexSet<DataType> = | ||
get_possible_types_from_signature_classes(&c.desired_type) | ||
.into_iter() | ||
.collect(); | ||
|
||
if let Some(implicit_coercion) = &c.implicit_coercion { | ||
let allowed_casts: Vec<DataType> = implicit_coercion | ||
.allowed_source_types | ||
.iter() | ||
.flat_map(get_possible_types_from_signature_classes) | ||
.collect(); | ||
all_types.extend(allowed_casts); | ||
} | ||
|
||
all_types.into_iter().collect::<Vec<_>>() | ||
}) | ||
.multi_cartesian_product() | ||
.collect(), | ||
|
@@ -431,6 +429,32 @@ impl TypeSignature { | |
} | ||
} | ||
|
||
fn get_possible_types_from_signature_classes( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this would make more sense to me as a method on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is used in @goldmedal |
||
signature_classes: &TypeSignatureClass, | ||
) -> Vec<DataType> { | ||
match signature_classes { | ||
TypeSignatureClass::Native(l) => get_data_types(l.native()), | ||
TypeSignatureClass::Timestamp => { | ||
vec![ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The rest of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is "simplified" because of some reasons, if we add all possible types, the combination will be huge. This function is used in |
||
DataType::Timestamp(TimeUnit::Nanosecond, None), | ||
DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), | ||
] | ||
} | ||
TypeSignatureClass::Time => { | ||
vec![DataType::Time64(TimeUnit::Nanosecond)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible |
||
} | ||
TypeSignatureClass::Interval => { | ||
vec![DataType::Interval(IntervalUnit::DayTime)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible |
||
} | ||
TypeSignatureClass::Duration => { | ||
vec![DataType::Duration(TimeUnit::Nanosecond)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible |
||
} | ||
TypeSignatureClass::Integer => { | ||
vec![DataType::Int64] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should encompass all possible ints. |
||
} | ||
} | ||
} | ||
|
||
fn get_data_types(native_type: &NativeType) -> Vec<DataType> { | ||
match native_type { | ||
NativeType::Null => vec![DataType::Null], | ||
|
@@ -460,6 +484,110 @@ fn get_data_types(native_type: &NativeType) -> Vec<DataType> { | |
} | ||
} | ||
|
||
#[derive(Debug, Clone, Eq, PartialOrd)] | ||
pub struct Coercion { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really like the idea of Coercion. Can this idea be used for user defined coercions or coercions to specific (Arrow) types? Also, is there any way to allow users to provide their own coercion rules? For example, if Sail / @shehabgamin wants to support converting numeric values to strings automatically, would he be express that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 this would be really great to have! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to ascii that binary to string is supported, you add numeric types in |
||
pub desired_type: TypeSignatureClass, | ||
implicit_coercion: Option<ImplicitCoercion>, | ||
} | ||
|
||
impl Coercion { | ||
pub fn new(desired_type: TypeSignatureClass) -> Self { | ||
Self { | ||
desired_type, | ||
implicit_coercion: None, | ||
} | ||
} | ||
|
||
/// Create a new coercion with implicit coercion rules. | ||
/// | ||
/// `allowed_source_types` defines the possible types that can be coerced to `desired_type`. | ||
/// `default_casted_type` is the default type to be used for coercion if we cast from other types via `allowed_source_types`. | ||
pub fn new_with_implicit_coercion( | ||
desired_type: TypeSignatureClass, | ||
allowed_source_types: Vec<TypeSignatureClass>, | ||
default_casted_type: NativeType, | ||
) -> Self { | ||
Self { | ||
desired_type, | ||
implicit_coercion: Some(ImplicitCoercion { | ||
allowed_source_types, | ||
default_casted_type, | ||
}), | ||
} | ||
} | ||
|
||
pub fn allowed_source_types(&self) -> &[TypeSignatureClass] { | ||
self.implicit_coercion | ||
.as_ref() | ||
.map(|c| c.allowed_source_types.as_slice()) | ||
.unwrap_or_default() | ||
} | ||
|
||
pub fn default_casted_type(&self) -> Option<&NativeType> { | ||
self.implicit_coercion | ||
.as_ref() | ||
.map(|c| &c.default_casted_type) | ||
} | ||
} | ||
|
||
impl Display for Coercion { | ||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
write!(f, "Coercion({}", self.desired_type)?; | ||
if let Some(implicit_coercion) = &self.implicit_coercion { | ||
write!(f, ", implicit_coercion={implicit_coercion}",) | ||
} else { | ||
write!(f, ")") | ||
} | ||
} | ||
} | ||
|
||
impl PartialEq for Coercion { | ||
fn eq(&self, other: &Self) -> bool { | ||
self.desired_type == other.desired_type | ||
&& self.implicit_coercion == other.implicit_coercion | ||
} | ||
} | ||
|
||
impl Hash for Coercion { | ||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) { | ||
self.desired_type.hash(state); | ||
self.implicit_coercion.hash(state); | ||
} | ||
} | ||
|
||
/// Implicit casting rules: the type classes an argument may be cast from, | ||
/// and the default type such an argument is cast to. | ||
#[derive(Debug, Clone, Eq, PartialOrd)] | ||
pub struct ImplicitCoercion { | ||
/// Type classes that are accepted as sources of an implicit cast. | ||
allowed_source_types: Vec<TypeSignatureClass>, | ||
/// Default type to be used for coercion when an argument is cast from one of `allowed_source_types`. | ||
/// | ||
/// For a class such as Timestamp there are multiple possible time units and timezones for a given | ||
/// `TypeSignatureClass`, so the concrete type to cast to has to be stated here. | ||
/// For types such as Int64 there is only one possible type, so the choice is unambiguous. | ||
default_casted_type: NativeType, | ||
} | ||
|
||
impl Display for ImplicitCoercion { | ||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
write!( | ||
f, | ||
"ImplicitCoercion({:?}, default_type={:?})", | ||
self.allowed_source_types, self.default_casted_type | ||
) | ||
} | ||
} | ||
|
||
impl PartialEq for ImplicitCoercion { | ||
fn eq(&self, other: &Self) -> bool { | ||
self.allowed_source_types == other.allowed_source_types | ||
&& self.default_casted_type == other.default_casted_type | ||
} | ||
} | ||
|
||
impl Hash for ImplicitCoercion { | ||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) { | ||
self.allowed_source_types.hash(state); | ||
self.default_casted_type.hash(state); | ||
} | ||
} | ||
|
||
/// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function. | ||
/// | ||
/// DataFusion will automatically coerce (cast) argument types to one of the supported | ||
|
@@ -536,11 +664,9 @@ impl Signature { | |
volatility, | ||
} | ||
} | ||
|
||
/// Target coerce types in order | ||
pub fn coercible( | ||
target_types: Vec<TypeSignatureClass>, | ||
volatility: Volatility, | ||
) -> Self { | ||
pub fn coercible(target_types: Vec<Coercion>, volatility: Volatility) -> Self { | ||
Self { | ||
type_signature: TypeSignature::Coercible(target_types), | ||
volatility, | ||
|
@@ -739,8 +865,8 @@ mod tests { | |
); | ||
|
||
let type_signature = TypeSignature::Coercible(vec![ | ||
TypeSignatureClass::Native(logical_string()), | ||
TypeSignatureClass::Native(logical_int64()), | ||
Coercion::new(TypeSignatureClass::Native(logical_string())), | ||
Coercion::new(TypeSignatureClass::Native(logical_int64())), | ||
]); | ||
let possible_types = type_signature.get_possible_types(); | ||
assert_eq!( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this API makes a lot of sense to me-- in fact I think it is pretty close to being able to express most other signatures.