-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix panic propagation in CoalescePartitions
, consolidates panic propagation into RecordBatchReceiverStream
#6507
Changes from 17 commits
0fbc3dd
742597a
e1c827a
8ffc015
e5c4e03
e7c0c6f
3f80690
d50c819
76270f0
2cea75e
5ae3620
a531afe
56a26eb
b1a817c
6a50ee9
79fcbfa
70a3d57
fb17af8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,25 +20,19 @@ | |
|
||
use std::any::Any; | ||
use std::sync::Arc; | ||
use std::task::Poll; | ||
|
||
use futures::Stream; | ||
use tokio::sync::mpsc; | ||
|
||
use arrow::datatypes::SchemaRef; | ||
use arrow::record_batch::RecordBatch; | ||
|
||
use super::common::AbortOnDropMany; | ||
use super::expressions::PhysicalSortExpr; | ||
use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; | ||
use super::{RecordBatchStream, Statistics}; | ||
use super::stream::{ObservedStream, RecordBatchReceiverStream}; | ||
use super::Statistics; | ||
use crate::error::{DataFusionError, Result}; | ||
use crate::physical_plan::{ | ||
DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning, | ||
}; | ||
|
||
use super::SendableRecordBatchStream; | ||
use crate::physical_plan::common::spawn_execution; | ||
use datafusion_execution::TaskContext; | ||
|
||
/// Merge execution plan executes partitions in parallel and combines them into a single | ||
|
@@ -137,27 +131,17 @@ impl ExecutionPlan for CoalescePartitionsExec { | |
// use a stream that allows each sender to put in at | ||
// least one result in an attempt to maximize | ||
// parallelism. | ||
let (sender, receiver) = | ||
mpsc::channel::<Result<RecordBatch>>(input_partitions); | ||
let mut builder = | ||
RecordBatchReceiverStream::builder(self.schema(), input_partitions); | ||
|
||
// spawn independent tasks whose resulting streams (of batches) | ||
// are sent to the channel for consumption. | ||
let mut join_handles = Vec::with_capacity(input_partitions); | ||
for part_i in 0..input_partitions { | ||
join_handles.push(spawn_execution( | ||
self.input.clone(), | ||
sender.clone(), | ||
part_i, | ||
context.clone(), | ||
)); | ||
builder.run_input(self.input.clone(), part_i, context.clone()); | ||
} | ||
|
||
Ok(Box::pin(MergeStream { | ||
input: receiver, | ||
schema: self.schema(), | ||
baseline_metrics, | ||
drop_helper: AbortOnDropMany(join_handles), | ||
})) | ||
let stream = builder.build(); | ||
Ok(Box::pin(ObservedStream::new(stream, baseline_metrics))) | ||
} | ||
} | ||
} | ||
|
@@ -183,32 +167,6 @@ impl ExecutionPlan for CoalescePartitionsExec { | |
} | ||
} | ||
|
||
struct MergeStream { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I basically taught |
||
schema: SchemaRef, | ||
input: mpsc::Receiver<Result<RecordBatch>>, | ||
baseline_metrics: BaselineMetrics, | ||
#[allow(unused)] | ||
drop_helper: AbortOnDropMany<()>, | ||
} | ||
|
||
impl Stream for MergeStream { | ||
type Item = Result<RecordBatch>; | ||
|
||
fn poll_next( | ||
mut self: std::pin::Pin<&mut Self>, | ||
cx: &mut std::task::Context<'_>, | ||
) -> Poll<Option<Self::Item>> { | ||
let poll = self.input.poll_recv(cx); | ||
self.baseline_metrics.record_poll(poll) | ||
} | ||
} | ||
|
||
impl RecordBatchStream for MergeStream { | ||
fn schema(&self) -> SchemaRef { | ||
self.schema.clone() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
|
||
|
@@ -218,7 +176,9 @@ mod tests { | |
use super::*; | ||
use crate::physical_plan::{collect, common}; | ||
use crate::prelude::SessionContext; | ||
use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; | ||
use crate::test::exec::{ | ||
assert_strong_count_converges_to_zero, BlockingExec, PanicExec, | ||
}; | ||
use crate::test::{self, assert_is_pending}; | ||
|
||
#[tokio::test] | ||
|
@@ -270,4 +230,19 @@ mod tests { | |
|
||
Ok(()) | ||
} | ||
|
||
#[tokio::test] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the new test from @nvartolomei |
||
#[should_panic(expected = "PanickingStream did panic")] | ||
async fn test_panic() { | ||
let session_ctx = SessionContext::new(); | ||
let task_ctx = session_ctx.task_ctx(); | ||
let schema = | ||
Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); | ||
|
||
let panicking_exec = Arc::new(PanicExec::new(Arc::clone(&schema), 2)); | ||
let coalesce_partitions_exec = | ||
Arc::new(CoalescePartitionsExec::new(panicking_exec)); | ||
|
||
collect(coalesce_partitions_exec, task_ctx).await.unwrap(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am quite pleased that this is now all encapsulated into
RecordBatchReceiverStream