-
Notifications
You must be signed in to change notification settings - Fork 254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use column projection during update #322
Changes from all commits
4825c2d
9317aa6
e4f7456
63eb1e3
21a3ec9
088b83b
0a619a9
1c3919e
4ad2336
d79ceb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
#include "lance/arrow/dataset.h" | ||
|
||
#include <arrow/array.h> | ||
#include <arrow/array/concatenate.h> | ||
#include <arrow/dataset/api.h> | ||
#include <arrow/status.h> | ||
#include <arrow/table.h> | ||
|
@@ -316,6 +317,39 @@ ::arrow::Result<std::shared_ptr<UpdaterBuilder>> LanceDataset::NewUpdate( | |
std::move(new_field)); | ||
} | ||
|
||
::arrow::Result<std::shared_ptr<LanceDataset>> LanceDataset::AddColumn( | ||
const std::shared_ptr<::arrow::Field>& field, ::arrow::compute::Expression expression) { | ||
if (!expression.IsScalarExpression()) { | ||
return ::arrow::Status::Invalid( | ||
"LanceDataset::AddColumn: expression is not a scalar expression."); | ||
} | ||
ARROW_ASSIGN_OR_RAISE(expression, expression.Bind(*schema())); | ||
ARROW_ASSIGN_OR_RAISE(auto builder, NewUpdate(field)); | ||
ARROW_ASSIGN_OR_RAISE(auto updater, builder->Finish()); | ||
|
||
// TODO: add projection via FieldRef. | ||
while (true) { | ||
ARROW_ASSIGN_OR_RAISE(auto batch, updater->Next()); | ||
if (!batch) { | ||
break; | ||
} | ||
|
||
ARROW_ASSIGN_OR_RAISE(auto datum, | ||
::arrow::compute::ExecuteScalarExpression(expression, *schema(), batch)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this bind the expression or is it required to be bound before the AddColumn method is called? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the expression is bound when entering this |
||
std::shared_ptr<::arrow::Array> arr; | ||
if (datum.is_scalar()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok so this is a constant literal value? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this is for case like |
||
ARROW_ASSIGN_OR_RAISE(arr, CreateArray(datum.scalar(), batch->num_rows())); | ||
} else if (datum.is_chunked_array()) { | ||
auto chunked_arr = datum.chunked_array(); | ||
ARROW_ASSIGN_OR_RAISE(arr, ::arrow::Concatenate(chunked_arr->chunks())); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ExtensionArray's cannot be concatenated currently - tho compute expressions won't either so ExtensionArray's probably won't make it past ExecuteScalarExpression? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add a test as follow up? But also, there is no function / kernel is available for extension types right now, this method might fail earlier. |
||
} else { | ||
arr = datum.make_array(); | ||
} | ||
ARROW_RETURN_NOT_OK(updater->UpdateBatch(arr)); | ||
} | ||
return updater->Finish(); | ||
} | ||
|
||
::arrow::Result<std::shared_ptr<::arrow::dataset::Dataset>> LanceDataset::ReplaceSchema( | ||
[[maybe_unused]] std::shared_ptr<::arrow::Schema> schema) const { | ||
return std::make_shared<LanceDataset>(*this); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
again the problem here is if the compute expression contains aggregates
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be checked via
bool Expression::IsScalarExpression() const
.I can throw a invalid status from AddColumn