Skip to content

Commit

Permalink
[opt](nereids) refine left semi/anti cost under short-cut opt (#37951)
Browse files Browse the repository at this point in the history
Refine the left semi/anti join cost computation under the short-cut
optimization for the case where the semi/anti join has a small left side
and a big right side, which the original solution could not support. This
PR reduces the left-style cost by reducing the right-side cost, which
improves the likelihood of choosing left-style joins.

Passes the performance tests on TPC-H, TPC-DS, and user cases.

previous work: #37060
  • Loading branch information
xzj7019 authored and dataroaring committed Jul 22, 2024
1 parent ad206d5 commit 18a69df
Show file tree
Hide file tree
Showing 30 changed files with 330 additions and 333 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,9 @@ public Cost visitPhysicalHashJoin(
);
}
double probeShortcutFactor = 1.0;
if (physicalHashJoin.getJoinType().isLeftSemiOrAntiJoin()
if (ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null
&& !ConnectContext.get().getStatementContext().isHasUnknownColStats()
&& physicalHashJoin.getJoinType().isLeftSemiOrAntiJoin()
&& physicalHashJoin.getOtherJoinConjuncts().isEmpty()
&& physicalHashJoin.getMarkJoinConjuncts().isEmpty()) {
// left semi/anti has short-cut opt, add probe side factor for distinguishing from the right ones
Expand All @@ -414,15 +416,14 @@ public Cost visitPhysicalHashJoin(
}
}
return CostV1.of(context.getSessionVariable(),
leftRowCount * probeShortcutFactor
+ rightRowCount * buildSideFactor
leftRowCount * probeShortcutFactor + rightRowCount * probeShortcutFactor * buildSideFactor
+ outputRowCount * probeSideFactor,
rightRowCount,
0
);
}
return CostV1.of(context.getSessionVariable(),
leftRowCount * probeShortcutFactor + rightRowCount + outputRowCount,
leftRowCount * probeShortcutFactor + rightRowCount * probeShortcutFactor + outputRowCount,
rightRowCount, 0
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1297,7 +1297,7 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) {
private double broadcastRightTableScaleFactor = 0.0;

@VariableMgr.VarAttr(name = LEFT_SEMI_OR_ANTI_PROBE_FACTOR)
private double leftSemiOrAntiProbeFactor = 0.1;
private double leftSemiOrAntiProbeFactor = 0.05;

@VariableMgr.VarAttr(name = BROADCAST_ROW_COUNT_LIMIT, needForward = true)
private double broadcastRowCountLimit = 30000000;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------------hashAgg[LOCAL]
--------------PhysicalUnion
----------------PhysicalProject
------------------hashJoin[RIGHT_SEMI_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = frequent_ss_items.item_sk)) otherCondition=() build RFs:RF4 cs_item_sk->[item_sk]
--------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4
------------------hashJoin[LEFT_SEMI_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = frequent_ss_items.item_sk)) otherCondition=()
--------------------PhysicalProject
----------------------hashJoin[LEFT_SEMI_JOIN shuffle] hashCondition=((catalog_sales.cs_bill_customer_sk = best_ss_customer.c_customer_sk)) otherCondition=()
------------------------PhysicalProject
Expand All @@ -64,17 +63,18 @@ PhysicalCteAnchor ( cteId=CTEId#0 )
------------------------------filter((date_dim.d_moy = 7) and (date_dim.d_year = 2000))
--------------------------------PhysicalOlapScan[date_dim]
------------------------PhysicalCteConsumer ( cteId=CTEId#2 )
--------------------PhysicalCteConsumer ( cteId=CTEId#0 )
----------------PhysicalProject
------------------hashJoin[RIGHT_SEMI_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = frequent_ss_items.item_sk)) otherCondition=() build RFs:RF6 ws_item_sk->[item_sk]
--------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF6
------------------hashJoin[LEFT_SEMI_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = frequent_ss_items.item_sk)) otherCondition=()
--------------------PhysicalProject
----------------------hashJoin[LEFT_SEMI_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = best_ss_customer.c_customer_sk)) otherCondition=()
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF5 d_date_sk->[ws_sold_date_sk]
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[ws_sold_date_sk]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_sales] apply RFs: RF5
------------------------------PhysicalOlapScan[web_sales] apply RFs: RF4
----------------------------PhysicalProject
------------------------------filter((date_dim.d_moy = 7) and (date_dim.d_year = 2000))
--------------------------------PhysicalOlapScan[date_dim]
------------------------PhysicalCteConsumer ( cteId=CTEId#2 )
--------------------PhysicalCteConsumer ( cteId=CTEId#0 )

Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalUnion
----------------PhysicalProject
------------------hashJoin[RIGHT_SEMI_JOIN shuffleBucket] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF3 i_manufact_id->[i_manufact_id]
--------------------PhysicalProject
----------------------filter((item.i_category = 'Books'))
------------------------PhysicalOlapScan[item] apply RFs: RF3
------------------hashJoin[LEFT_SEMI_JOIN bucketShuffle] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF3 i_manufact_id->[i_manufact_id]
--------------------hashAgg[GLOBAL]
----------------------PhysicalDistribute[DistributionSpecHash]
------------------------hashAgg[LOCAL]
Expand All @@ -31,12 +28,12 @@ PhysicalResultSink
------------------------------------filter((customer_address.ca_gmt_offset = -5.00))
--------------------------------------PhysicalOlapScan[customer_address]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[item]
----------------PhysicalProject
------------------hashJoin[RIGHT_SEMI_JOIN shuffleBucket] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF7 i_manufact_id->[i_manufact_id]
--------------------------------PhysicalOlapScan[item] apply RFs: RF3
--------------------PhysicalProject
----------------------filter((item.i_category = 'Books'))
------------------------PhysicalOlapScan[item] apply RFs: RF7
------------------------PhysicalOlapScan[item]
----------------PhysicalProject
------------------hashJoin[LEFT_SEMI_JOIN bucketShuffle] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF7 i_manufact_id->[i_manufact_id]
--------------------hashAgg[GLOBAL]
----------------------PhysicalDistribute[DistributionSpecHash]
------------------------hashAgg[LOCAL]
Expand All @@ -55,12 +52,12 @@ PhysicalResultSink
------------------------------------filter((customer_address.ca_gmt_offset = -5.00))
--------------------------------------PhysicalOlapScan[customer_address]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[item]
----------------PhysicalProject
------------------hashJoin[RIGHT_SEMI_JOIN shuffleBucket] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF11 i_manufact_id->[i_manufact_id]
--------------------------------PhysicalOlapScan[item] apply RFs: RF7
--------------------PhysicalProject
----------------------filter((item.i_category = 'Books'))
------------------------PhysicalOlapScan[item] apply RFs: RF11
------------------------PhysicalOlapScan[item]
----------------PhysicalProject
------------------hashJoin[LEFT_SEMI_JOIN bucketShuffle] hashCondition=((item.i_manufact_id = item.i_manufact_id)) otherCondition=() build RFs:RF11 i_manufact_id->[i_manufact_id]
--------------------hashAgg[GLOBAL]
----------------------PhysicalDistribute[DistributionSpecHash]
------------------------hashAgg[LOCAL]
Expand All @@ -79,5 +76,8 @@ PhysicalResultSink
------------------------------------filter((customer_address.ca_gmt_offset = -5.00))
--------------------------------------PhysicalOlapScan[customer_address]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[item]
--------------------------------PhysicalOlapScan[item] apply RFs: RF11
--------------------PhysicalProject
----------------------filter((item.i_category = 'Books'))
------------------------PhysicalOlapScan[item]

Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,38 @@ PhysicalResultSink
--------------hashAgg[LOCAL]
----------------PhysicalProject
------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
--------------------hashJoin[RIGHT_SEMI_JOIN shuffleBucket] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF5 d_date_sk->[cs_sold_date_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF5
--------------------------PhysicalProject
----------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
------------------------------PhysicalOlapScan[date_dim]
----------------------hashJoin[RIGHT_SEMI_JOIN shuffleBucket] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[ws_sold_date_sk]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_sales] apply RFs: RF4
----------------------------PhysicalProject
------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
--------------------------------PhysicalOlapScan[date_dim]
------------------------hashJoin[RIGHT_SEMI_JOIN shuffle] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
--------------------hashJoin[LEFT_SEMI_JOIN bucketShuffle] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
----------------------hashJoin[LEFT_SEMI_JOIN bucketShuffle] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
------------------------hashJoin[RIGHT_SEMI_JOIN shuffle] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF5 c_customer_sk->[ss_customer_sk]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
----------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[ss_sold_date_sk]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
--------------------------------PhysicalOlapScan[store_sales] apply RFs: RF4 RF5
------------------------------PhysicalProject
--------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
----------------------------------PhysicalOlapScan[date_dim]
--------------------------PhysicalProject
----------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF1 cd_demo_sk->[c_current_cdemo_sk]
----------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF3 cd_demo_sk->[c_current_cdemo_sk]
------------------------------PhysicalProject
--------------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk]
--------------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer] apply RFs: RF0 RF1
------------------------------------PhysicalOlapScan[customer] apply RFs: RF2 RF3
----------------------------------PhysicalProject
------------------------------------PhysicalOlapScan[customer_address]
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[customer_demographics]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
----------------------------PhysicalProject
------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
--------------------------------PhysicalOlapScan[date_dim]
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
--------------------------PhysicalProject
----------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
------------------------------PhysicalOlapScan[date_dim]

Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ PhysicalResultSink
----PhysicalDistribute[DistributionSpecGather]
------PhysicalTopN[LOCAL_SORT]
--------PhysicalProject
----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id]
----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id]
------------PhysicalProject
--------------hashAgg[GLOBAL]
----------------PhysicalDistribute[DistributionSpecHash]
------------------hashAgg[LOCAL]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[ws_item_sk]
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk]
--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk]
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 RF12
------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12
----------------------------PhysicalProject
------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date]
--------------------------------PhysicalProject
Expand All @@ -32,7 +32,7 @@ PhysicalResultSink
------------------------PhysicalProject
--------------------------PhysicalOlapScan[item] apply RFs: RF13
------------PhysicalProject
--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id]
--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id]
----------------PhysicalProject
------------------hashAgg[GLOBAL]
--------------------PhysicalDistribute[DistributionSpecHash]
Expand Down Expand Up @@ -63,11 +63,11 @@ PhysicalResultSink
--------------------PhysicalDistribute[DistributionSpecHash]
----------------------hashAgg[LOCAL]
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[cs_item_sk]
--------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
----------------------------PhysicalProject
------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk]
------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk]
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3
----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 RF3
--------------------------------PhysicalProject
----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date]
------------------------------------PhysicalProject
Expand Down
Loading

0 comments on commit 18a69df

Please sign in to comment.