From b6ff23361140202cb1684baee5b39cc132e32df4 Mon Sep 17 00:00:00 2001 From: leiysky Date: Mon, 20 Jun 2022 16:06:01 +0800 Subject: [PATCH] fix error of semi join --- query/src/sql/exec/data_schema_builder.rs | 23 +++++--- .../sql/optimizer/heuristic/decorrelate.rs | 53 +++++++++++++++---- query/src/sql/planner/metadata.rs | 13 ++++- .../it/sql/optimizer/heuristic/subquery.rs | 24 +++++++++ .../heuristic/testdata/subquery.test | 46 ++++++++++++++++ .../20+_others/20_0001_planner_v2.result | 3 ++ .../20+_others/20_0001_planner_v2.sql | 7 +++ 7 files changed, 150 insertions(+), 19 deletions(-) diff --git a/query/src/sql/exec/data_schema_builder.rs b/query/src/sql/exec/data_schema_builder.rs index 7c8bb6d034b1..d48838366fe0 100644 --- a/query/src/sql/exec/data_schema_builder.rs +++ b/query/src/sql/exec/data_schema_builder.rs @@ -108,14 +108,23 @@ impl DataSchemaBuilder { for field in left.fields().iter() { fields.push(field.clone()); } - if join_type == &JoinType::Left { - for field in right.fields().iter() { - let nullable_field = DataField::new(field.name(), wrap_nullable(field.data_type())); - fields.push(nullable_field); + match join_type { + JoinType::Left => { + for field in right.fields().iter() { + let nullable_field = + DataField::new(field.name(), wrap_nullable(field.data_type())); + fields.push(nullable_field); + } } - } else { - for field in right.fields().iter() { - fields.push(field.clone()); + + JoinType::Semi | JoinType::Anti => { + // Empty right output schema for Semi and Anti join + } + + _ => { + for field in right.fields().iter() { + fields.push(field.clone()); + } } } DataSchemaRefExt::create(fields) diff --git a/query/src/sql/optimizer/heuristic/decorrelate.rs b/query/src/sql/optimizer/heuristic/decorrelate.rs index 7f90a7039361..3406c9e66ce0 100644 --- a/query/src/sql/optimizer/heuristic/decorrelate.rs +++ b/query/src/sql/optimizer/heuristic/decorrelate.rs @@ -18,12 +18,14 @@ use common_exception::Result; use crate::sql::binder::wrap_cast_if_needed; use crate::sql::binder::JoinCondition; use crate::sql::optimizer::heuristic::subquery_rewriter::SubqueryRewriter; +use crate::sql::optimizer::ColumnSet; use crate::sql::optimizer::RelExpr; use crate::sql::optimizer::SExpr; use crate::sql::plans::Filter; use crate::sql::plans::JoinType; use crate::sql::plans::LogicalInnerJoin; use crate::sql::plans::PatternPlan; +use crate::sql::plans::Project; use crate::sql::plans::RelOp; use crate::sql::plans::SubqueryExpr; use crate::sql::plans::SubqueryType; @@ -119,7 +121,8 @@ pub fn try_decorrelate_subquery(input: &SExpr, subquery: &SubqueryExpr) -> Resul // This is not necessary, but it is a good heuristic for most cases. let mut left_conditions = vec![]; let mut right_conditions = vec![]; - let mut extra_predicates = vec![]; + let mut left_filters = vec![]; + let mut right_filters = vec![]; for pred in filter.predicates.iter() { let join_condition = JoinCondition::new(pred, &input_prop, &filter_prop); match join_condition { @@ -128,8 +131,11 @@ pub fn try_decorrelate_subquery(input: &SExpr, subquery: &SubqueryExpr) -> Resul return Ok(None); } - JoinCondition::Left(_) | JoinCondition::Right(_) => { - extra_predicates.push(pred.clone()); + JoinCondition::Left(filter) => { + left_filters.push(filter.clone()); + } + JoinCondition::Right(filter) => { + right_filters.push(filter.clone()); } JoinCondition::Both { left, right } => { @@ -155,27 +161,52 @@ pub fn try_decorrelate_subquery(input: &SExpr, subquery: &SubqueryExpr) -> Resul }; // Rewrite plan to semi-join. - let left_child = input.clone(); + let mut left_child = input.clone(); + if !left_filters.is_empty() { + left_child = SExpr::create_unary( + Filter { + predicates: left_filters, + is_having: false, + } + .into(), + left_child, + ); + } + // Remove `Filter` from subquery. - let right_child = SExpr::create_unary( + let mut right_child = SExpr::create_unary( subquery.subquery.plan().clone(), SExpr::create_unary( subquery.subquery.child(0)?.plan().clone(), SExpr::create_leaf(filter_tree.child(0)?.plan().clone()), ), ); - let mut result = SExpr::create_binary(semi_join.into(), left_child, right_child); - - if !extra_predicates.is_empty() { - result = SExpr::create_unary( + if !right_filters.is_empty() { + right_child = SExpr::create_unary( Filter { - predicates: extra_predicates, + predicates: right_filters, is_having: false, } .into(), - result, + right_child, ); } + // Add project for join keys + let used_columns = semi_join + .right_conditions + .iter() + .fold(ColumnSet::new(), |v, acc| { + v.union(&acc.used_columns()).cloned().collect() + }); + right_child = SExpr::create_unary( + Project { + columns: used_columns, + } + .into(), + right_child, + ); + + let result = SExpr::create_binary(semi_join.into(), left_child, right_child); Ok(Some(result)) } diff --git a/query/src/sql/planner/metadata.rs b/query/src/sql/planner/metadata.rs index 41a24cee5ca5..d835d36ed2cb 100644 --- a/query/src/sql/planner/metadata.rs +++ b/query/src/sql/planner/metadata.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; use std::sync::Arc; use common_ast::ast::Expr; @@ -37,6 +38,16 @@ pub struct TableEntry { pub source: ReadDataSourcePlan, } +impl Debug for TableEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "TableEntry {{ index: {:?}, name: {:?}, catalog: {:?}, database: {:?} }}", + self.index, self.name, self.catalog, self.database + ) + } +} + impl TableEntry { pub fn new( index: IndexType, @@ -86,7 +97,7 @@ impl ColumnEntry { /// Metadata stores information about columns and tables used in a query. /// Tables and columns are identified with its unique index, notice that index value of a column can /// be same with that of a table. -#[derive(Clone, Default)] +#[derive(Clone, Debug, Default)] pub struct Metadata { tables: Vec, columns: Vec, diff --git a/query/tests/it/sql/optimizer/heuristic/subquery.rs b/query/tests/it/sql/optimizer/heuristic/subquery.rs index 6c4b9e30be2c..563ad8ce06fd 100644 --- a/query/tests/it/sql/optimizer/heuristic/subquery.rs +++ b/query/tests/it/sql/optimizer/heuristic/subquery.rs @@ -58,6 +58,30 @@ pub async fn test_heuristic_optimizer_subquery() -> Result<()> { .to_string(), rules: DEFAULT_REWRITE_RULES.clone(), }, + Suite { + comment: "".to_string(), + query: "select * from numbers(1) as t where exists (select number as a from numbers(1) where number = t.number)" + .to_string(), + rules: DEFAULT_REWRITE_RULES.clone(), + }, + Suite { + comment: "# Exists with different kinds of predicate".to_string(), + query: "select t.number from numbers(1) as t where exists (select * from numbers(1) where number = t.number and number = 0 and t.number < 10)" + .to_string(), + rules: DEFAULT_REWRITE_RULES.clone(), + }, + Suite { + comment: "# Exists with non-equi predicate".to_string(), + query: "select t.number from numbers(1) as t where exists (select * from numbers(1) where number = t.number and t.number < number)" + .to_string(), + rules: DEFAULT_REWRITE_RULES.clone(), + }, + Suite { + comment: "# Exists project required columns".to_string(), + query: "select t.number from numbers(1) as t where exists (select number as a, number as b, number as c from numbers(1) where number = t.number)" + .to_string(), + rules: DEFAULT_REWRITE_RULES.clone(), + }, ]; run_suites(ctx, &mut file, &suites).await diff --git a/query/tests/it/sql/optimizer/heuristic/testdata/subquery.test b/query/tests/it/sql/optimizer/heuristic/testdata/subquery.test index 3e7644021eb6..d6031c3e6e48 100644 --- a/query/tests/it/sql/optimizer/heuristic/testdata/subquery.test +++ b/query/tests/it/sql/optimizer/heuristic/testdata/subquery.test @@ -64,3 +64,49 @@ Project: [number] Scan: default.system.numbers +select * from numbers(1) as t where exists (select number as a from numbers(1) where number = t.number) +---- +Project: [number] + HashJoin: SEMI, build keys: [number], probe keys: [number] + Scan: default.system.numbers + Project: [number] + Scan: default.system.numbers + + +# Exists with different kinds of predicate +select t.number from numbers(1) as t where exists (select * from numbers(1) where number = t.number and number = 0 and t.number < 10) +---- +Project: [number] + HashJoin: SEMI, build keys: [number], probe keys: [number] + Filter: [number < 10] + Scan: default.system.numbers + Project: [number] + Filter: [number = 0] + Scan: default.system.numbers + + +# Exists with non-equi predicate +select t.number from numbers(1) as t where exists (select * from numbers(1) where number = t.number and t.number < number) +---- +Project: [number] + Filter: [subquery_3] + CrossApply + Scan: default.system.numbers + Project: [subquery] + EvalScalar: [count(*) > 0] + Aggregate: group items: [], aggregate functions: [count(*)] + Project: [number] + Filter: [number = number, number < number] + Scan: default.system.numbers + + +# Exists project required columns +select t.number from numbers(1) as t where exists (select number as a, number as b, number as c from numbers(1) where number = t.number) +---- +Project: [number] + HashJoin: SEMI, build keys: [number], probe keys: [number] + Scan: default.system.numbers + Project: [number] + Scan: default.system.numbers + + diff --git a/tests/suites/0_stateless/20+_others/20_0001_planner_v2.result b/tests/suites/0_stateless/20+_others/20_0001_planner_v2.result index 0cb30611a3ef..b03d19faf9be 100644 --- a/tests/suites/0_stateless/20+_others/20_0001_planner_v2.result +++ b/tests/suites/0_stateless/20+_others/20_0001_planner_v2.result @@ -387,3 +387,6 @@ NULL NULL NULL 1 2 3 4 +1 +0 +1 diff --git a/tests/suites/0_stateless/20+_others/20_0001_planner_v2.sql b/tests/suites/0_stateless/20+_others/20_0001_planner_v2.sql index 443b33b4e95c..38e419520cfd 100644 --- a/tests/suites/0_stateless/20+_others/20_0001_planner_v2.sql +++ b/tests/suites/0_stateless/20+_others/20_0001_planner_v2.sql @@ -305,6 +305,10 @@ CREATE FUNCTION notnull1 AS (p) -> not(is_null(p)); SELECT notnull1(null); SELECT notnull1('null'); +drop function a_plus_3; +drop function cal1; +drop function notnull1; + --set operator select '====Intersect Distinct==='; create table t1(a int, b int); @@ -339,5 +343,8 @@ drop table n; select * from numbers(5) as t where exists (select * from numbers(3) where number = t.number); select * from numbers(5) as t where not exists (select * from numbers(3) where number = t.number); +select * from numbers(5) as t where exists (select number as a from numbers(3) where number = t.number and number > 0 and t.number < 2); +select * from numbers(5) as t where exists (select * from numbers(3) where number > t.number); + set enable_planner_v2 = 0;