diff --git a/Cargo.lock b/Cargo.lock index 3f597a88d971..77270af4d85f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6234,7 +6234,7 @@ dependencies = [ [[package]] name = "ethnum" version = "1.5.0" -source = "git+https://github.com/ariesdevil/ethnum-rs?rev=4cb05f1#4cb05f1e407f76b193d81eef71b5dd0b73216856" +source = "git+https://github.com/datafuse-extras/ethnum-rs?rev=4cb05f1#4cb05f1e407f76b193d81eef71b5dd0b73216856" dependencies = [ "borsh", "serde", @@ -11062,7 +11062,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "stable_deref_trait", ] @@ -14746,7 +14746,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.22.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "aho-corasick", "arc-swap", @@ -14796,7 +14796,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "bitpacking 0.9.2", ] @@ -14804,7 +14804,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "downcast-rs", "fastdivide", @@ -14819,7 +14819,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "async-trait", "byteorder", @@ -14842,7 +14842,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.11.0" -source = "git+https://github.com/b41sh/tantivy-jieba?rev=af84361#af843610bc3bea826329af07256598c413f0dd6a" +source = "git+https://github.com/datafuse-extras/tantivy-jieba?rev=124a8fc#124a8fc8c8a9f1389af5a9bfa497fb358ecc556e" dependencies = [ "jieba-rs", "lazy_static", @@ -14852,7 +14852,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "nom", ] @@ -14860,7 +14860,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -14871,7 +14871,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "murmurhash32", "rand_distr", @@ -14881,7 +14881,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" +source = "git+https://github.com/datafuse-extras/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 3b2372590785..642aa2cdbdfe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -310,7 +310,9 @@ serde_with = { version = "3.8.1" } serfig = "0.1.0" sled = { version = "0.34", default-features = false } stream-more = "0.1.3" -tantivy = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0" } +tantivy = "0.22.0" +tantivy-common = "0.7.0" +tantivy-jieba = "0.11.0" thiserror = { version = "1" } tikv-jemalloc-ctl = { version = "0.5.0", features = ["use_std"] } tokio = { version = "1.35.0", features = ["full"] } @@ -401,9 +403,12 @@ async-recursion = { git = "https://github.com/zhang2014/async-recursion.git", re backtrace = { git = "https://github.com/rust-lang/backtrace-rs.git", rev = "72265be" } color-eyre = { git = "https://github.com/eyre-rs/eyre.git", rev = "e5d92c3" } deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "57795da" } -ethnum = { git = "https://github.com/ariesdevil/ethnum-rs", rev = "4cb05f1" } +ethnum = { git = "https://github.com/datafuse-extras/ethnum-rs", rev = "4cb05f1" } openai_api_rust = { git = "https://github.com/datafuse-extras/openai-api", rev = "819a0ed" } orc-rust = { git = "https://github.com/datafuse-extras/datafusion-orc", rev = "03372b97" } recursive = { git = "https://github.com/zhang2014/recursive.git", rev = "6af35a1" } sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } +tantivy = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0" } +tantivy-common = { git = "https://github.com/datafuse-extras/tantivy", rev = "37aeac0", package = "tantivy-common" } +tantivy-jieba = { git = "https://github.com/datafuse-extras/tantivy-jieba", rev = "124a8fc" } xorfilter-rs = { git = "https://github.com/datafuse-extras/xorfilter", tag = "databend-alpha.4" } diff --git a/src/query/functions/src/lib.rs b/src/query/functions/src/lib.rs index 734bb4b74248..9d41e6b42a70 100644 --- a/src/query/functions/src/lib.rs +++ b/src/query/functions/src/lib.rs @@ -41,6 +41,15 @@ pub fn is_builtin_function(name: &str) -> bool { || ASYNC_FUNCTIONS.contains(&name) } +// The plan of search function, async function and udf contains some arguments defined in meta, +// which may be modified by user at any time. Those functions are not not suitable for caching. +pub fn is_cacheable_function(name: &str) -> bool { + BUILTIN_FUNCTIONS.contains(name) + || AggregateFunctionFactory::instance().contains(name) + || GENERAL_WINDOW_FUNCTIONS.contains(&name) + || GENERAL_LAMBDA_FUNCTIONS.contains(&name) +} + #[ctor] pub static BUILTIN_FUNCTIONS: FunctionRegistry = builtin_functions(); diff --git a/src/query/sql/src/planner/binder/aggregate.rs b/src/query/sql/src/planner/binder/aggregate.rs index 4da1e1933193..4b7e9af2c862 100644 --- a/src/query/sql/src/planner/binder/aggregate.rs +++ b/src/query/sql/src/planner/binder/aggregate.rs @@ -688,9 +688,7 @@ impl Binder { let f = |scalar: &ScalarExpr| { matches!( scalar, - ScalarExpr::AggregateFunction(_) - | ScalarExpr::WindowFunction(_) - | ScalarExpr::AsyncFunctionCall(_) + ScalarExpr::AggregateFunction(_) | ScalarExpr::WindowFunction(_) ) }; diff --git a/src/query/sql/src/planner/binder/project.rs b/src/query/sql/src/planner/binder/project.rs index b10268e70227..795c2a18f92b 100644 --- a/src/query/sql/src/planner/binder/project.rs +++ b/src/query/sql/src/planner/binder/project.rs @@ -107,11 +107,6 @@ impl Binder { ScalarExpr::WindowFunction(win) => { find_replaced_window_function(window_info, win, &item.alias).unwrap() } - ScalarExpr::AsyncFunctionCall(async_func) => self.create_derived_column_binding( - async_func.display_name.clone(), - async_func.return_type.as_ref().clone(), - Some(item.scalar.clone()), - ), _ => self.create_derived_column_binding( item.alias.clone(), item.scalar.data_type()?, diff --git a/src/query/sql/src/planner/binder/select.rs b/src/query/sql/src/planner/binder/select.rs index c1aa8aa3793f..c52f5679a6ce 100644 --- a/src/query/sql/src/planner/binder/select.rs +++ b/src/query/sql/src/planner/binder/select.rs @@ -91,9 +91,7 @@ impl Binder { let f = |scalar: &ScalarExpr| { matches!( scalar, - ScalarExpr::AggregateFunction(_) - | ScalarExpr::WindowFunction(_) - | ScalarExpr::AsyncFunctionCall(_) + ScalarExpr::AggregateFunction(_) | ScalarExpr::WindowFunction(_) ) }; diff --git a/src/query/sql/src/planner/planner_cache.rs b/src/query/sql/src/planner/planner_cache.rs index 2c83d7379122..8da310f3bb55 100644 --- a/src/query/sql/src/planner/planner_cache.rs +++ b/src/query/sql/src/planner/planner_cache.rs @@ -24,6 +24,7 @@ use databend_common_ast::ast::TableReference; use databend_common_catalog::table_context::TableContext; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; +use databend_common_functions::is_cacheable_function; use databend_common_settings::ChangeValue; use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CacheValue; @@ -162,8 +163,9 @@ impl TableRefVisitor { return; } - // If the function is score, we should not cache the plan - if func.name.name.to_lowercase() == "score" { + let func_name = func.name.name.to_lowercase(); + // If the function is not suitable for caching, we should not cache the plan + if !is_cacheable_function(&func_name) || func_name == "score" { self.cache_miss = true; } } diff --git a/src/query/sql/src/planner/semantic/async_function_rewriter.rs b/src/query/sql/src/planner/semantic/async_function_rewriter.rs index 63039f704412..1596fa3664df 100644 --- a/src/query/sql/src/planner/semantic/async_function_rewriter.rs +++ b/src/query/sql/src/planner/semantic/async_function_rewriter.rs @@ -87,6 +87,29 @@ impl AsyncFunctionRewriter { let new_expr = SExpr::create_unary(Arc::new(plan.into()), child_expr); Ok(new_expr) } + RelOperator::Filter(mut plan) => { + for scalar in &mut plan.predicates { + self.visit(scalar)?; + } + let child_expr = self.create_async_func_expr(s_expr.children[0].clone()); + let new_expr = SExpr::create_unary(Arc::new(plan.into()), child_expr); + Ok(new_expr) + } + RelOperator::Mutation(mut plan) => { + for matched_evaluator in plan.matched_evaluators.iter_mut() { + if let Some(condition) = matched_evaluator.condition.as_mut() { + self.visit(condition)?; + } + if let Some(update) = matched_evaluator.update.as_mut() { + for (_, scalar) in update.iter_mut() { + self.visit(scalar)?; + } + } + } + let child_expr = self.create_async_func_expr(s_expr.children[0].clone()); + let new_expr = SExpr::create_unary(Arc::new(plan.into()), child_expr); + Ok(new_expr) + } _ => Ok(s_expr), } } @@ -136,6 +159,12 @@ impl<'a> VisitorMut<'a> for AsyncFunctionRewriter { } fn visit_async_function_call(&mut self, async_func: &'a mut AsyncFunctionCall) -> Result<()> { + if self + .async_functions_map + .contains_key(&async_func.display_name) + { + return Ok(()); + } for (i, arg) in async_func.arguments.iter_mut().enumerate() { self.visit(arg)?; diff --git a/src/query/sql/src/planner/semantic/udf_rewriter.rs b/src/query/sql/src/planner/semantic/udf_rewriter.rs index 9b3084b9bf9e..1bde72239178 100644 --- a/src/query/sql/src/planner/semantic/udf_rewriter.rs +++ b/src/query/sql/src/planner/semantic/udf_rewriter.rs @@ -162,6 +162,9 @@ impl<'a> VisitorMut<'a> for UdfRewriter { if !udf.udf_type.match_type(self.script_udf) { return Ok(()); } + if self.udf_functions_map.contains_key(&udf.display_name) { + return Ok(()); + } let mut udf_arguments = Vec::with_capacity(udf.arguments.len()); diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 0ca11e3627c3..1d1372be7302 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -32,7 +32,7 @@ parquet = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } -tantivy-common = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0", package = "tantivy-common" } +tantivy-common = { workspace = true } tantivy-fst = "0.5" thiserror = { workspace = true } xorfilter-rs = { workspace = true, features = ["cbordata"] } diff --git a/src/query/storages/fuse/Cargo.toml b/src/query/storages/fuse/Cargo.toml index ab86128ebef2..2c1997d72c35 100644 --- a/src/query/storages/fuse/Cargo.toml +++ b/src/query/storages/fuse/Cargo.toml @@ -66,7 +66,7 @@ siphasher = "0.3.10" sys-info = "0.9" tantivy = { workspace = true } tantivy-fst = "0.5" -tantivy-jieba = { git = "https://github.com/b41sh/tantivy-jieba", rev = "af84361" } +tantivy-jieba = { workspace = true } thrift = "0.17.0" typetag = { workspace = true } uuid = { workspace = true } diff --git a/tests/sqllogictests/suites/query/functions/02_0077_function_dict_get.test b/tests/sqllogictests/suites/query/functions/02_0077_function_dict_get.test index 42376f086bc9..26decde87ef3 100644 --- a/tests/sqllogictests/suites/query/functions/02_0077_function_dict_get.test +++ b/tests/sqllogictests/suites/query/functions/02_0077_function_dict_get.test @@ -58,6 +58,12 @@ select id, dict_get(d2, 'name', id), dict_get(d2, 'age', id), dict_get(d2, 'sala 4 Tom 55 3000.55 0 5 NULL NULL NULL NULL +query ITI +select id, name, dict_get(d2, 'age', id) as age from t2 where age > 35 +---- +3 Lily 41 +4 Tom 55 + statement ok CREATE OR REPLACE DICTIONARY d3(id int, name string, age uint16, salary float, active bool) PRIMARY KEY name SOURCE(mysql(host='localhost' port='3106' username='root' password='123456' db='test' table='user'));