Skip to content

Commit

Permalink
[fix](nereids) refine row count estimation for mark join (#38270) (#3…
Browse files Browse the repository at this point in the history
…8297)

pick from master #38270
  • Loading branch information
xzj7019 authored Jul 25, 2024
1 parent 9a40cd5 commit 8ad4390
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ private static double estimateSemiOrAntiRowCountBySlotsEqual(Statistics leftStat
}

private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) {
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) {
double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) {
double sel = join.isMarkJoin() ? 1.0 : computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
if (join.getJoinType().isLeftSemiOrAntiJoin()) {
return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel)
.putColumnStatistics(leftStats.columnStatistics())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,34 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk]
--------------------PhysicalProject
----------------------PhysicalOlapScan[customer_address] apply RFs: RF3
--------------------PhysicalDistribute[DistributionSpecReplicated]
------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[customer] apply RFs: RF2
--------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------------PhysicalProject
------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------------PhysicalOlapScan[item]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------PhysicalProject
----------------------------------filter((date_dim.d_qoy = 1) and (date_dim.d_year = 2000))
------------------------------------PhysicalOlapScan[date_dim]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk]
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer] apply RFs: RF0
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer_address]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[item]
--------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------PhysicalProject
------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------PhysicalOlapScan[item]

Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,34 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk]
--------------------PhysicalProject
----------------------PhysicalOlapScan[customer_address] apply RFs: RF3
------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[customer] apply RFs: RF2
--------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------------PhysicalProject
------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------------PhysicalOlapScan[item]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------PhysicalProject
----------------------------------filter((date_dim.d_qoy = 1) and (date_dim.d_year = 2000))
------------------------------------PhysicalOlapScan[date_dim]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk]
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer] apply RFs: RF0
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer_address]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[item]
--------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------PhysicalProject
------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------PhysicalOlapScan[item]

Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
--------------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1
------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
----------------------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------------------------PhysicalProject
--------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212))
Expand All @@ -35,9 +35,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
--------------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
--------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
----------------------------------------PhysicalProject
------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0
------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF0
----------------------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------------------------PhysicalProject
--------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,34 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk]
--------------------PhysicalProject
----------------------PhysicalOlapScan[customer_address] apply RFs: RF3
------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=()
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[customer] apply RFs: RF2
--------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF2 i_item_sk->[ws_item_sk]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------------PhysicalProject
------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------------PhysicalOlapScan[item]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2
------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------PhysicalProject
----------------------------------filter((date_dim.d_qoy = 2) and (date_dim.d_year = 2000))
------------------------------------PhysicalOlapScan[date_dim]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[item]
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------PhysicalOlapScan[item]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=()
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[customer]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[customer_address]

42 changes: 23 additions & 19 deletions regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,34 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk]
--------------------PhysicalProject
----------------------PhysicalOlapScan[customer_address] apply RFs: RF3
------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ws_bill_customer_sk]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[customer] apply RFs: RF2
--------------------------PhysicalDistribute[DistributionSpecReplicated]
------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF2 i_item_sk->[ws_item_sk]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------------PhysicalProject
----------------------------------------PhysicalOlapScan[item]
--------------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------------PhysicalProject
------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------------PhysicalOlapScan[item]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3
------------------------------PhysicalDistribute[DistributionSpecReplicated]
--------------------------------PhysicalProject
----------------------------------filter((date_dim.d_qoy = 2) and (date_dim.d_year = 2000))
------------------------------------PhysicalOlapScan[date_dim]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=()
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[item]
--------------------------------PhysicalDistribute[DistributionSpecReplicated]
----------------------------------PhysicalProject
------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7))
--------------------------------------PhysicalOlapScan[item]
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk]
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[customer] apply RFs: RF0
--------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[customer_address]

0 comments on commit 8ad4390

Please sign in to comment.