-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support InsertInto Sorted ListingTable #7743
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,6 +223,7 @@ impl TableProvider for MemTable { | |
input, | ||
sink, | ||
self.schema.clone(), | ||
None, | ||
))) | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -73,6 +73,8 @@ pub struct FileSinkExec { | |||||
sink_schema: SchemaRef, | ||||||
/// Schema describing the structure of the output data. | ||||||
count_schema: SchemaRef, | ||||||
/// Optional required sort order for output data. | ||||||
sort_order: Option<Vec<Option<Vec<PhysicalSortRequirement>>>>, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since FileSink can have only a single input, I think it only needs a single sort order per `FileSinkExec`. In other words, I think this could be simplified to `Option<Vec<PhysicalSortRequirement>>`.
Suggested change
And then adjust `required_input_ordering` appropriately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes sense. Will do! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just pushed an update with this change. |
||||||
} | ||||||
|
||||||
impl fmt::Debug for FileSinkExec { | ||||||
|
@@ -87,12 +89,14 @@ impl FileSinkExec { | |||||
input: Arc<dyn ExecutionPlan>, | ||||||
sink: Arc<dyn DataSink>, | ||||||
sink_schema: SchemaRef, | ||||||
sort_order: Option<Vec<Option<Vec<PhysicalSortRequirement>>>>, | ||||||
) -> Self { | ||||||
Self { | ||||||
input, | ||||||
sink, | ||||||
sink_schema, | ||||||
count_schema: make_count_schema(), | ||||||
sort_order, | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -192,16 +196,20 @@ impl ExecutionPlan for FileSinkExec { | |||||
} | ||||||
|
||||||
fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> { | ||||||
// Require that the InsertExec gets the data in the order the | ||||||
// The input order is either explicitly set (such as by a ListingTable), | ||||||
// or require that the [FileSinkExec] gets the data in the order the | ||||||
// input produced it (otherwise the optimizer may choose to reorder | ||||||
// the input which could result in unintended / poor UX) | ||||||
// | ||||||
// More rationale: | ||||||
// https://github.com/apache/arrow-datafusion/pull/6354#discussion_r1195284178 | ||||||
vec![self | ||||||
.input | ||||||
.output_ordering() | ||||||
.map(PhysicalSortRequirement::from_sort_exprs)] | ||||||
match &self.sort_order { | ||||||
Some(requirements) => requirements.clone(), | ||||||
None => vec![self | ||||||
.input | ||||||
.output_ordering() | ||||||
.map(PhysicalSortRequirement::from_sort_exprs)], | ||||||
} | ||||||
} | ||||||
|
||||||
fn maintains_input_order(&self) -> Vec<bool> { | ||||||
|
@@ -221,6 +229,7 @@ impl ExecutionPlan for FileSinkExec { | |||||
sink: self.sink.clone(), | ||||||
sink_schema: self.sink_schema.clone(), | ||||||
count_schema: self.count_schema.clone(), | ||||||
sort_order: self.sort_order.clone(), | ||||||
})) | ||||||
} | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,35 @@ LOCATION '../../testing/data/csv/aggregate_test_100.csv' | |
statement ok | ||
set datafusion.execution.target_partitions = 8; | ||
|
||
statement ok | ||
CREATE EXTERNAL TABLE | ||
ordered_insert_test(a bigint, b bigint) | ||
STORED AS csv | ||
LOCATION 'test_files/scratch/insert_to_external/insert_to_ordered/' | ||
WITH ORDER (a ASC, B DESC) | ||
OPTIONS( | ||
create_local_path 'true', | ||
insert_mode 'append_new_files', | ||
); | ||
|
||
query II | ||
INSERT INTO ordered_insert_test values (5, 1), (4, 2), (7,7), (7,8), (7,9), (7,10), (3, 3), (2, 4), (1, 5); | ||
---- | ||
9 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please also add an `EXPLAIN` test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just pushed an update with explain test. |
||
|
||
query II | ||
SELECT * from ordered_insert_test; | ||
---- | ||
1 5 | ||
2 4 | ||
3 3 | ||
4 2 | ||
5 1 | ||
7 10 | ||
7 9 | ||
7 8 | ||
7 7 | ||
|
||
statement ok | ||
CREATE EXTERNAL TABLE | ||
single_file_test(a bigint, b bigint) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍