-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement LimitPushDown for ExecutionPlan #9815
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! The [`LimitPushdown`] optimization rule is designed to improve the
//! performance of query execution by pushing the LIMIT clause down through
//! the execution plan as far as possible, ideally directly to the
//! [`CoalesceBatchesExec`] (reducing its effective batch target). This means
//! that instead of processing a large amount of data and then applying the
//! limit at the end, the system tries to limit the amount of data being
//! processed throughout the execution of the query.
|
||
use std::sync::Arc; | ||
|
||
use crate::physical_optimizer::PhysicalOptimizerRule; | ||
|
||
use crate::physical_plan::limit::GlobalLimitExec; | ||
use crate::physical_plan::ExecutionPlan; | ||
use datafusion_common::config::ConfigOptions; | ||
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; | ||
use datafusion_common::Result; | ||
|
||
use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec; | ||
use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; | ||
|
||
/// Physical optimizer rule that pushes a `LIMIT`'s fetch count down the
/// execution plan, ideally all the way to a `CoalesceBatchesExec`, so that
/// upstream operators can stop producing rows once the limit is satisfied.
pub struct LimitPushdown {}
|
||
impl LimitPushdown { | ||
#[allow(missing_docs)] | ||
pub fn new() -> Self { | ||
Self {} | ||
} | ||
} | ||
impl Default for LimitPushdown { | ||
fn default() -> Self { | ||
Self::new() | ||
} | ||
} | ||
impl PhysicalOptimizerRule for LimitPushdown { | ||
fn optimize( | ||
&self, | ||
plan: Arc<dyn ExecutionPlan>, | ||
_config: &ConfigOptions, | ||
) -> Result<Arc<dyn ExecutionPlan>> { | ||
// we traverse the treenode to try to push down the limit same logic as project push down | ||
plan.transform_down(&push_down_limit).data() | ||
} | ||
|
||
fn name(&self) -> &str { | ||
"LimitPushdown" | ||
} | ||
|
||
fn schema_check(&self) -> bool { | ||
true | ||
} | ||
} | ||
impl LimitPushdown {} | ||
// try to push down current limit, based on the son | ||
fn push_down_limit( | ||
plan: Arc<dyn ExecutionPlan>, | ||
) -> Result<Transformed<Arc<dyn ExecutionPlan>>> { | ||
// for pattern like GlobalLimit -> CoalescePartitionsExec -> CoalesceBatchesExec , we convert it into | ||
// GlobalLimit->CloalescePartitionExec->CoalesceBatchesExec(new fetch) | ||
if let Some(global_limit) = plan.as_any().downcast_ref::<GlobalLimitExec>() { | ||
let input = global_limit.input().as_any(); | ||
if let Some(coalesce_partition_batch) = | ||
input.downcast_ref::<CoalescePartitionsExec>() | ||
{ | ||
let new_input = coalesce_partition_batch.input().as_any(); | ||
if let Some(coalesce_batch) = new_input.downcast_ref::<CoalesceBatchesExec>() | ||
{ | ||
Ok(Transformed::yes(generate_new_limit_pattern( | ||
global_limit, | ||
coalesce_batch, | ||
))) | ||
} else { | ||
Ok(Transformed::no(plan)) | ||
} | ||
} else { | ||
Ok(Transformed::no(plan)) | ||
} | ||
} else { | ||
Ok(Transformed::no(plan)) | ||
} | ||
} | ||
// generate corresponding pattern | ||
fn generate_new_limit_pattern( | ||
limit_exec: &GlobalLimitExec, | ||
coalesce_batch: &CoalesceBatchesExec, | ||
) -> Arc<dyn ExecutionPlan> { | ||
let mut grand_exec = CoalesceBatchesExec::new( | ||
coalesce_batch.input().clone(), | ||
coalesce_batch.target_batch_size(), | ||
); | ||
grand_exec.set_inner_fetch(limit_exec.fetch()); | ||
let grand_child = Arc::new(grand_exec); | ||
Arc::new(GlobalLimitExec::new( | ||
Arc::new(CoalescePartitionsExec::new(grand_child)), | ||
limit_exec.skip(), | ||
limit_exec.fetch(), | ||
)) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
//! vectorized processing by upstream operators. | ||
|
||
use std::any::Any; | ||
|
||
use std::pin::Pin; | ||
use std::sync::Arc; | ||
use std::task::{Context, Poll}; | ||
|
@@ -49,6 +50,8 @@ pub struct CoalesceBatchesExec { | |
/// Execution metrics | ||
metrics: ExecutionPlanMetricsSet, | ||
cache: PlanProperties, | ||
/// the fetch count | ||
fetch: Option<usize>, | ||
} | ||
|
||
impl CoalesceBatchesExec { | ||
|
@@ -60,6 +63,7 @@ impl CoalesceBatchesExec { | |
target_batch_size, | ||
metrics: ExecutionPlanMetricsSet::new(), | ||
cache, | ||
fetch: None, | ||
} | ||
} | ||
|
||
|
@@ -83,6 +87,9 @@ impl CoalesceBatchesExec { | |
input.execution_mode(), // Execution Mode | ||
) | ||
} | ||
pub fn set_inner_fetch(&mut self, siz: Option<usize>) { | ||
self.fetch = siz; | ||
} | ||
} | ||
|
||
impl DisplayAs for CoalesceBatchesExec { | ||
|
@@ -93,11 +100,19 @@ impl DisplayAs for CoalesceBatchesExec { | |
) -> std::fmt::Result { | ||
match t { | ||
DisplayFormatType::Default | DisplayFormatType::Verbose => { | ||
write!( | ||
f, | ||
"CoalesceBatchesExec: target_batch_size={}", | ||
self.target_batch_size | ||
) | ||
if let Some(fetch) = self.fetch { | ||
write!( | ||
f, | ||
"CoalesceBatchesExec: target_batch_size={} fetch= {}", | ||
self.target_batch_size, fetch | ||
) | ||
} else { | ||
write!( | ||
f, | ||
"CoalesceBatchesExec: target_batch_size={}", | ||
self.target_batch_size | ||
) | ||
} | ||
} | ||
} | ||
} | ||
|
@@ -148,6 +163,11 @@ impl ExecutionPlan for CoalesceBatchesExec { | |
input: self.input.execute(partition, context)?, | ||
schema: self.input.schema(), | ||
target_batch_size: self.target_batch_size, | ||
// When no limit was pushed down, never cut the stream short.
fetch: self.fetch.unwrap_or(usize::MAX),
buffer: Vec::new(), | ||
buffered_rows: 0, | ||
is_closed: false, | ||
|
@@ -171,6 +191,8 @@ struct CoalesceBatchesStream { | |
schema: SchemaRef, | ||
/// Minimum number of rows for coalesces batches | ||
target_batch_size: usize, | ||
/// fetch count passed by upper LimitExec | ||
fetch: usize, | ||
/// Buffered batches | ||
buffer: Vec<RecordBatch>, | ||
/// Buffered row count | ||
|
@@ -216,7 +238,8 @@ impl CoalesceBatchesStream { | |
match input_batch { | ||
Poll::Ready(x) => match x { | ||
Some(Ok(batch)) => { | ||
if batch.num_rows() >= self.target_batch_size | ||
if (batch.num_rows() >= self.target_batch_size | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need some tests of this new logic added to repartition exec |
||
|| batch.num_rows() >= self.fetch) | ||
&& self.buffer.is_empty() | ||
{ | ||
return Poll::Ready(Some(Ok(batch))); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,6 +167,10 @@ impl ExecutionPlan for GlobalLimitExec { | |
|
||
// GlobalLimitExec requires a single input partition | ||
if 1 != self.input.output_partitioning().partition_count() { | ||
println!( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this println should be removed |
||
"**************** \n partition_count is {:?} \n **************** \n", | ||
self.input.output_partitioning().partition_count() | ||
); | ||
return internal_err!("GlobalLimitExec requires a single input partition"); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -123,7 +123,7 @@ Limit: skip=0, fetch=5 | |
physical_plan | ||
GlobalLimitExec: skip=0, fetch=5 | ||
--CoalescePartitionsExec | ||
----CoalesceBatchesExec: target_batch_size=8192 | ||
----CoalesceBatchesExec: target_batch_size=8192 fetch= 5 | ||
------FilterExec: c3@2 > 0 | ||
--------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1 | ||
----------StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't we also push the limit to the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe we can also set a `fetch` count for `CoalesceBatchesExec`
without changing the plan order. A global fetch count could be carried
across the subtree until a breaking plan is encountered, but I don't know
whether that would add real capability. Can there be plans which cannot be
swapped with the limit but also do not break the required fetch count?