From 2240ee3f1eede6e4cd352277aa1f165b94d2ae90 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 27 Jun 2024 11:24:08 -0300 Subject: [PATCH] GH-42116: [C++] Support list-view typed arrays in array_take and array_filter (#42117) ### Rationale for this change Completing the type coverage in array_take and array_filter. ### What changes are included in this PR? Add support for `ListView` and `LargeListView` in `"array_take"`, `"array_filter"` and all the functions that indirectly rely on these to do their thing. ### Are these changes tested? New test cases were added. * GitHub Issue: #42116 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute/kernels/vector_selection.cc | 36 ++-- .../vector_selection_filter_internal.cc | 4 + .../kernels/vector_selection_internal.cc | 74 ++++++++ .../kernels/vector_selection_internal.h | 4 + .../kernels/vector_selection_take_internal.cc | 2 + .../compute/kernels/vector_selection_test.cc | 178 +++++++++++------- 6 files changed, 207 insertions(+), 91 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 64c3db204c9ee..b265673e23c86 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -68,12 +68,10 @@ using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- // DropNull Implementation -Result> GetDropNullFilter(const Array& values, - MemoryPool* memory_pool) { - auto bitmap_buffer = values.null_bitmap(); - std::shared_ptr out_array = std::make_shared( - values.length(), bitmap_buffer, nullptr, 0, values.offset()); - return out_array; +std::shared_ptr MakeDropNullFilter(const Array& values) { + auto& bitmap_buffer = values.null_bitmap(); + return std::make_shared(values.length(), bitmap_buffer, nullptr, 0, + values.offset()); } Result DropNullArray(const std::shared_ptr& values, ExecContext* ctx) { @@ -86,8 +84,7 @@ Result DropNullArray(const std::shared_ptr& values, ExecContext* c if (values->type()->id() == Type::type::NA) { return std::make_shared(0); } - ARROW_ASSIGN_OR_RAISE(auto drop_null_filter, - GetDropNullFilter(*values, ctx->memory_pool())); + auto drop_null_filter = Datum{MakeDropNullFilter(*values)}; return Filter(values, drop_null_filter, FilterOptions::Defaults(), ctx); } @@ -185,19 +182,16 @@ class DropNullMetaFunction : public MetaFunction { Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { - switch (args[0].kind()) { - case Datum::ARRAY: { - return DropNullArray(args[0].make_array(), ctx); - } break; - case Datum::CHUNKED_ARRAY: { - return DropNullChunkedArray(args[0].chunked_array(), ctx); - } break; - case Datum::RECORD_BATCH: { - return DropNullRecordBatch(args[0].record_batch(), ctx); - } break; - case Datum::TABLE: { - return DropNullTable(args[0].table(), ctx); - } break; + auto& values = args[0]; + switch (values.kind()) { + case Datum::ARRAY: + return DropNullArray(values.make_array(), ctx); + case Datum::CHUNKED_ARRAY: + return DropNullChunkedArray(values.chunked_array(), ctx); + case Datum::RECORD_BATCH: + return DropNullRecordBatch(values.record_batch(), ctx); + case Datum::TABLE: + return DropNullTable(values.table(), ctx); default: break; } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 5e24331fe96f2..bf67a474f31e2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -1101,6 +1101,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), plain_filter, ExtensionFilterExec}, {InputType(Type::LIST), plain_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), plain_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), plain_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), plain_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), plain_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), plain_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), plain_filter, SparseUnionFilterExec}, @@ -1119,6 +1121,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(Type::EXTENSION), ree_filter, ExtensionFilterExec}, {InputType(Type::LIST), ree_filter, ListFilterExec}, {InputType(Type::LARGE_LIST), ree_filter, LargeListFilterExec}, + {InputType(Type::LIST_VIEW), ree_filter, ListViewFilterExec}, + {InputType(Type::LARGE_LIST_VIEW), ree_filter, LargeListViewFilterExec}, {InputType(Type::FIXED_SIZE_LIST), ree_filter, FSLFilterExec}, {InputType(Type::DENSE_UNION), ree_filter, DenseUnionFilterExec}, {InputType(Type::SPARSE_UNION), ree_filter, SparseUnionFilterExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 1009bea5e7b1b..7189d42850e79 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -612,6 +612,63 @@ struct ListSelectionImpl : public Selection, Type> { } }; +template +struct ListViewSelectionImpl : public Selection, Type> { + using offset_type = typename Type::offset_type; + + using Base = Selection, Type>; + LIFT_BASE_MEMBERS(); + + TypedBufferBuilder offsets_builder; + TypedBufferBuilder sizes_builder; + + ListViewSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, + ExecResult* out) + : Base(ctx, batch, output_length, out), + offsets_builder(ctx->memory_pool()), + sizes_builder(ctx->memory_pool()) {} + + template + Status GenerateOutput() { + auto* offsets = this->values.template GetValues(1); + auto* sizes = this->values.template GetValues(2); + + offset_type null_list_view_offset = 0; + Adapter adapter(this); + RETURN_NOT_OK(adapter.Generate( + [&](int64_t index) { + offset_type value_offset = offsets[index]; + offset_type value_length = sizes[index]; + offsets_builder.UnsafeAppend(value_offset); + sizes_builder.UnsafeAppend(value_length); + null_list_view_offset = value_offset + value_length; + return Status::OK(); + }, + [&]() { + // 0 could be appended here, but by adding the last offset, we keep + // the buffer compatible with how offsets behave in ListType as well. + // The invariant that `offsets[i] + sizes[i] <= values.length` is + // trivially maintained by having `sizes[i]` set to 0 here. + offsets_builder.UnsafeAppend(null_list_view_offset); + sizes_builder.UnsafeAppend(0); + return Status::OK(); + })); + return Status::OK(); + } + + Status Init() override { + RETURN_NOT_OK(offsets_builder.Reserve(output_length)); + return sizes_builder.Reserve(output_length); + } + + Status Finish() override { + RETURN_NOT_OK(offsets_builder.Finish(&out->buffers[1])); + RETURN_NOT_OK(sizes_builder.Finish(&out->buffers[2])); + out->child_data = {this->values.child_data[0].ToArrayData()}; + return Status::OK(); + } +}; + struct DenseUnionSelectionImpl : public Selection { using Base = Selection; @@ -858,6 +915,15 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return FilterExec>(ctx, batch, out); } +Status ListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + +Status LargeListViewFilterExec(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { + return FilterExec>(ctx, batch, out); +} + Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; @@ -914,6 +980,14 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* return TakeExec>(ctx, batch, out); } +Status ListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + +Status LargeListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return TakeExec>(ctx, batch, out); +} + Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index c5075d6dfe87b..887bf08354120 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -67,6 +67,8 @@ void VisitPlainxREEFilterOutputSegments( Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*); @@ -76,6 +78,8 @@ Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status ListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +Status LargeListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); Status SparseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 8b3f0431e6505..c45cc552a2cc5 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -745,6 +745,8 @@ void PopulateTakeKernels(std::vector* out) { {InputType(Type::EXTENSION), take_indices, ExtensionTake}, {InputType(Type::LIST), take_indices, ListTakeExec}, {InputType(Type::LARGE_LIST), take_indices, LargeListTakeExec}, + {InputType(Type::LIST_VIEW), take_indices, ListViewTakeExec}, + {InputType(Type::LARGE_LIST_VIEW), take_indices, LargeListViewTakeExec}, {InputType(Type::FIXED_SIZE_LIST), take_indices, FSLTakeExec}, {InputType(Type::DENSE_UNION), take_indices, DenseUnionTakeExec}, {InputType(Type::SPARSE_UNION), take_indices, SparseUnionTakeExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index cafd88901576c..aba016d6b7e8d 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -679,18 +679,38 @@ TYPED_TEST(TestFilterKernelWithString, FilterDictionary) { this->AssertFilterDictionary(dict, "[3, 4, 2]", "[null, 1, 0]", "[null, 4]"); } +const auto kListAndListViewTypes = std::vector>{ + list(int32()), + list_view(int32()), +}; + +const auto kNestedListAndListViewTypes = std::vector>{ + list(list(int32())), + list_view(list_view(int32())), + list(list_view(int32())), + list_view(list(int32())), +}; + +const auto kLargeListAndListViewTypes = std::vector>{ + large_list(int32()), + large_list_view(int32()), +}; + class TestFilterKernelWithList : public TestFilterKernel { public: }; TEST_F(TestFilterKernelWithList, FilterListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - this->AssertFilter(list(int32()), list_json, "[0, 0, 0, 0]", "[]"); - this->AssertFilter(list(int32()), list_json, "[0, 1, 1, null]", "[[1,2], null, null]"); - this->AssertFilter(list(int32()), list_json, "[0, 0, 1, null]", "[null, null]"); - this->AssertFilter(list(int32()), list_json, "[1, 0, 0, 1]", "[[], [3]]"); - this->AssertFilter(list(int32()), list_json, "[1, 1, 1, 1]", list_json); - this->AssertFilter(list(int32()), list_json, "[0, 1, 0, 1]", "[[1,2], [3]]"); + for (auto& type : kListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + this->AssertFilter(type, list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(type, list_json, "[0, 1, 1, null]", "[[1,2], null, null]"); + this->AssertFilter(type, list_json, "[0, 0, 1, null]", "[null, null]"); + this->AssertFilter(type, list_json, "[1, 0, 0, 1]", "[[], [3]]"); + this->AssertFilter(type, list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(type, list_json, "[0, 1, 0, 1]", "[[1,2], [3]]"); + } } TEST_F(TestFilterKernelWithList, FilterListListInt32) { @@ -700,32 +720,36 @@ TEST_F(TestFilterKernelWithList, FilterListListInt32) { null, [[3, null], null] ])"; - auto type = list(list(int32())); - this->AssertFilter(type, list_json, "[0, 0, 0, 0]", "[]"); - this->AssertFilter(type, list_json, "[0, 1, 1, null]", R"([ - [[1], [2, null, 2], []], - null, - null - ])"); - this->AssertFilter(type, list_json, "[0, 0, 1, null]", "[null, null]"); - this->AssertFilter(type, list_json, "[1, 0, 0, 1]", R"([ - [], - [[3, null], null] - ])"); - this->AssertFilter(type, list_json, "[1, 1, 1, 1]", list_json); - this->AssertFilter(type, list_json, "[0, 1, 0, 1]", R"([ - [[1], [2, null, 2], []], - [[3, null], null] - ])"); + for (auto& type : kNestedListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + this->AssertFilter(type, list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(type, list_json, "[0, 1, 1, null]", R"([ + [[1], [2, null, 2], []], + null, + null + ])"); + this->AssertFilter(type, list_json, "[0, 0, 1, null]", "[null, null]"); + this->AssertFilter(type, list_json, "[1, 0, 0, 1]", R"([ + [], + [[3, null], null] + ])"); + this->AssertFilter(type, list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(type, list_json, "[0, 1, 0, 1]", R"([ + [[1], [2, null, 2], []], + [[3, null], null] + ])"); + } } class TestFilterKernelWithLargeList : public TestFilterKernel {}; TEST_F(TestFilterKernelWithLargeList, FilterListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - this->AssertFilter(large_list(int32()), list_json, "[0, 0, 0, 0]", "[]"); - this->AssertFilter(large_list(int32()), list_json, "[0, 1, 1, null]", - "[[1,2], null, null]"); + for (auto& type : kLargeListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + this->AssertFilter(type, list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(type, list_json, "[0, 1, 1, null]", "[[1,2], null, null]"); + } } class TestFilterKernelWithFixedSizeList : public TestFilterKernel { @@ -1449,17 +1473,18 @@ class TestTakeKernelWithList : public TestTakeKernelTyped {}; TEST_F(TestTakeKernelWithList, TakeListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - CheckTake(list(int32()), list_json, "[]", "[]"); - CheckTake(list(int32()), list_json, "[3, 2, 1]", "[[3], null, [1,2]]"); - CheckTake(list(int32()), list_json, "[null, 3, 0]", "[null, [3], []]"); - CheckTake(list(int32()), list_json, "[null, null]", "[null, null]"); - CheckTake(list(int32()), list_json, "[3, 0, 0, 3]", "[[3], [], [], [3]]"); - CheckTake(list(int32()), list_json, "[0, 1, 2, 3]", list_json); - CheckTake(list(int32()), list_json, "[0, 0, 0, 0, 0, 0, 1]", - "[[], [], [], [], [], [], [1, 2]]"); - - this->TestNoValidityBitmapButUnknownNullCount(list(int32()), "[[], [1,2], [3]]", - "[0, 1, 0]"); + for (auto& type : kListAndListViewTypes) { + CheckTake(type, list_json, "[]", "[]"); + CheckTake(type, list_json, "[3, 2, 1]", "[[3], null, [1,2]]"); + CheckTake(type, list_json, "[null, 3, 0]", "[null, [3], []]"); + CheckTake(type, list_json, "[null, null]", "[null, null]"); + CheckTake(type, list_json, "[3, 0, 0, 3]", "[[3], [], [], [3]]"); + CheckTake(type, list_json, "[0, 1, 2, 3]", list_json); + CheckTake(type, list_json, "[0, 0, 0, 0, 0, 0, 1]", + "[[], [], [], [], [], [], [1, 2]]"); + + this->TestNoValidityBitmapButUnknownNullCount(type, "[[], [1,2], [3]]", "[0, 1, 0]"); + } } TEST_F(TestTakeKernelWithList, TakeListListInt32) { @@ -1469,35 +1494,40 @@ TEST_F(TestTakeKernelWithList, TakeListListInt32) { null, [[3, null], null] ])"; - auto type = list(list(int32())); - CheckTake(type, list_json, "[]", "[]"); - CheckTake(type, list_json, "[3, 2, 1]", R"([ - [[3, null], null], - null, - [[1], [2, null, 2], []] - ])"); - CheckTake(type, list_json, "[null, 3, 0]", R"([ - null, - [[3, null], null], - [] - ])"); - CheckTake(type, list_json, "[null, null]", "[null, null]"); - CheckTake(type, list_json, "[3, 0, 0, 3]", - "[[[3, null], null], [], [], [[3, null], null]]"); - CheckTake(type, list_json, "[0, 1, 2, 3]", list_json); - CheckTake(type, list_json, "[0, 0, 0, 0, 0, 0, 1]", - "[[], [], [], [], [], [], [[1], [2, null, 2], []]]"); + for (auto& type : kNestedListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + CheckTake(type, list_json, "[]", "[]"); + CheckTake(type, list_json, "[3, 2, 1]", R"([ + [[3, null], null], + null, + [[1], [2, null, 2], []] + ])"); + CheckTake(type, list_json, "[null, 3, 0]", R"([ + null, + [[3, null], null], + [] + ])"); + CheckTake(type, list_json, "[null, null]", "[null, null]"); + CheckTake(type, list_json, "[3, 0, 0, 3]", + "[[[3, null], null], [], [], [[3, null], null]]"); + CheckTake(type, list_json, "[0, 1, 2, 3]", list_json); + CheckTake(type, list_json, "[0, 0, 0, 0, 0, 0, 1]", + "[[], [], [], [], [], [], [[1], [2, null, 2], []]]"); - this->TestNoValidityBitmapButUnknownNullCount( - type, "[[[1], [2, null, 2], []], [[3, null]]]", "[0, 1, 0]"); + this->TestNoValidityBitmapButUnknownNullCount( + type, "[[[1], [2, null, 2], []], [[3, null]]]", "[0, 1, 0]"); + } } class TestTakeKernelWithLargeList : public TestTakeKernelTyped {}; TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - CheckTake(large_list(int32()), list_json, "[]", "[]"); - CheckTake(large_list(int32()), list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); + for (auto& type : kLargeListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + CheckTake(type, list_json, "[]", "[]"); + CheckTake(type, list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); + } } class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped { @@ -2275,8 +2305,11 @@ class TestDropNullKernelWithList : public TestDropNullKernelTyped {}; TEST_F(TestDropNullKernelWithList, DropNullListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - CheckDropNull(list(int32()), list_json, "[[], [1,2], [3]]"); - this->TestNoValidityBitmapButUnknownNullCount(list(int32()), "[[], [1,2], [3]]"); + for (const auto& type : kListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + CheckDropNull(type, list_json, "[[], [1,2], [3]]"); + this->TestNoValidityBitmapButUnknownNullCount(type, "[[], [1,2], [3]]"); + } } TEST_F(TestDropNullKernelWithList, DropNullListListInt32) { @@ -2286,22 +2319,27 @@ TEST_F(TestDropNullKernelWithList, DropNullListListInt32) { null, [[3, null], null] ])"; - auto type = list(list(int32())); - CheckDropNull(type, list_json, R"([ - [], - [[1], [2, null, 2], []], - [[3, null], null] - ])"); + for (auto& type : kNestedListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + CheckDropNull(type, list_json, R"([ + [], + [[1], [2, null, 2], []], + [[3, null], null] + ])"); - this->TestNoValidityBitmapButUnknownNullCount(type, - "[[[1], [2, null, 2], []], [[3, null]]]"); + this->TestNoValidityBitmapButUnknownNullCount( + type, "[[[1], [2, null, 2], []], [[3, null]]]"); + } } class TestDropNullKernelWithLargeList : public TestDropNullKernelTyped {}; TEST_F(TestDropNullKernelWithLargeList, DropNullLargeListInt32) { std::string list_json = "[[], [1,2], null, [3]]"; - CheckDropNull(large_list(int32()), list_json, "[[], [1,2], [3]]"); + for (auto& type : kLargeListAndListViewTypes) { + ARROW_SCOPED_TRACE("type = ", *type); + CheckDropNull(type, list_json, "[[], [1,2], [3]]"); + } this->TestNoValidityBitmapButUnknownNullCount( fixed_size_list(int32(), 3), "[[1, null, 3], [4, 5, 6], [7, 8, null]]");