From 44d1b6153b505ad76869198d59881b900c6057b8 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 12:18:38 -0300 Subject: [PATCH] GH-35765: [C++] Split vector_selection.cc into more compilation units (#35751) ### Rationale for this change When working on #35001 I had a hard time figuring where to place the code for all possible combinations of filters and REE data. `vector_selection.cc` is hard to follow with so many kernels implemented in a single file. This PR splits the two biggest ones: filter and take. Stuff that can be shared by both stays in `vector_selection_internal.cc` and `vector_selection.cc` is concerned with the registering of the functions and a few smaller kernels. ### What changes are included in this PR? - [x] `vector_selection_(internal|take|filter).(cc|h)` source files were extracted from `vector_selection.cc` ### Are these changes tested? Yes, by existing tests. * Closes: #35765 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 + .../arrow/compute/kernels/vector_selection.cc | 2194 +---------------- .../vector_selection_filter_internal.cc | 922 +++++++ .../vector_selection_filter_internal.h | 39 + .../kernels/vector_selection_internal.cc | 814 ++++++ .../kernels/vector_selection_internal.h | 69 + .../kernels/vector_selection_take_internal.cc | 740 ++++++ .../kernels/vector_selection_take_internal.h | 40 + 8 files changed, 2635 insertions(+), 2186 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_filter_internal.h create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_internal.h create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_take_internal.h diff --git 
a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d928cdf58b11a..88aa79270a3f9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -415,6 +415,9 @@ list(APPEND compute/kernels/util_internal.cc compute/kernels/vector_hash.cc compute/kernels/vector_selection.cc + compute/kernels/vector_selection_filter_internal.cc + compute/kernels/vector_selection_internal.cc + compute/kernels/vector_selection_take_internal.cc compute/row/encode_internal.cc compute/row/compare_internal.cc compute/row/grouper.cc diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index f1b09583deb84..3469744966942 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -32,6 +32,8 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/kernels/vector_selection_filter_internal.h" +#include "arrow/compute/kernels/vector_selection_take_internal.h" #include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -58,2119 +60,11 @@ using internal::OptionalBitIndexer; namespace compute { namespace internal { -int64_t GetFilterOutputSize(const ArraySpan& filter, - FilterOptions::NullSelectionBehavior null_selection) { - int64_t output_size = 0; - - if (filter.MayHaveNulls()) { - const uint8_t* filter_is_valid = filter.buffers[0].data; - BinaryBitBlockCounter bit_counter(filter.buffers[1].data, filter.offset, - filter_is_valid, filter.offset, filter.length); - int64_t position = 0; - if (null_selection == FilterOptions::EMIT_NULL) { - while (position < filter.length) { - BitBlockCount block = bit_counter.NextOrNotWord(); - output_size += block.popcount; - position += block.length; - } - } else { - while (position < filter.length) { - BitBlockCount block = bit_counter.NextAndWord(); - output_size += 
block.popcount; - position += block.length; - } - } - } else { - // The filter has no nulls, so we can use CountSetBits - output_size = CountSetBits(filter.buffers[1].data, filter.offset, filter.length); - } - return output_size; -} - -namespace { - -template -Result> GetTakeIndicesImpl( - const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, - MemoryPool* memory_pool) { - using T = typename IndexType::c_type; - - const uint8_t* filter_data = filter.buffers[1].data; - const bool have_filter_nulls = filter.MayHaveNulls(); - const uint8_t* filter_is_valid = filter.buffers[0].data; - - if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) { - // Most complex case: the filter may have nulls and we don't drop them. - // The logic is ternary: - // - filter is null: emit null - // - filter is valid and true: emit index - // - filter is valid and false: don't emit anything - - typename TypeTraits::BuilderType builder(memory_pool); - - // The position relative to the start of the filter - T position = 0; - // The current position taking the filter offset into account - int64_t position_with_offset = filter.offset; - - // To count blocks where filter_data[i] || !filter_is_valid[i] - BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid, - filter.offset, filter.length); - BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length); - while (position < filter.length) { - // true OR NOT valid - BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord(); - if (selected_or_null_block.NoneSet()) { - position += selected_or_null_block.length; - position_with_offset += selected_or_null_block.length; - continue; - } - RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount)); - - // If the values are all valid and the selected_or_null_block is full, - // then we can infer that all the values are true and skip the bit checking - BitBlockCount is_valid_block = 
is_valid_counter.NextWord(); - - if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) { - // All the values are selected and non-null - for (int64_t i = 0; i < selected_or_null_block.length; ++i) { - builder.UnsafeAppend(position++); - } - position_with_offset += selected_or_null_block.length; - } else { - // Some of the values are false or null - for (int64_t i = 0; i < selected_or_null_block.length; ++i) { - if (bit_util::GetBit(filter_is_valid, position_with_offset)) { - if (bit_util::GetBit(filter_data, position_with_offset)) { - builder.UnsafeAppend(position); - } - } else { - // Null slot, so append a null - builder.UnsafeAppendNull(); - } - ++position; - ++position_with_offset; - } - } - } - std::shared_ptr result; - RETURN_NOT_OK(builder.FinishInternal(&result)); - return result; - } - - // Other cases don't emit nulls and are therefore simpler. - TypedBufferBuilder builder(memory_pool); - - if (have_filter_nulls) { - // The filter may have nulls, so we scan the validity bitmap and the filter - // data bitmap together. 
- DCHECK_EQ(null_selection, FilterOptions::DROP); - - // The position relative to the start of the filter - T position = 0; - // The current position taking the filter offset into account - int64_t position_with_offset = filter.offset; - - BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid, - filter.offset, filter.length); - while (position < filter.length) { - BitBlockCount and_block = filter_counter.NextAndWord(); - RETURN_NOT_OK(builder.Reserve(and_block.popcount)); - if (and_block.AllSet()) { - // All the values are selected and non-null - for (int64_t i = 0; i < and_block.length; ++i) { - builder.UnsafeAppend(position++); - } - position_with_offset += and_block.length; - } else if (!and_block.NoneSet()) { - // Some of the values are false or null - for (int64_t i = 0; i < and_block.length; ++i) { - if (bit_util::GetBit(filter_is_valid, position_with_offset) && - bit_util::GetBit(filter_data, position_with_offset)) { - builder.UnsafeAppend(position); - } - ++position; - ++position_with_offset; - } - } else { - position += and_block.length; - position_with_offset += and_block.length; - } - } - } else { - // The filter has no nulls, so we need only look for true values - RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns( - filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) { - // Append the consecutive run of indices - RETURN_NOT_OK(builder.Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - builder.UnsafeAppend(static_cast(offset + i)); - } - return Status::OK(); - })); - } - - const int64_t length = builder.length(); - std::shared_ptr out_buffer; - RETURN_NOT_OK(builder.Finish(&out_buffer)); - return std::make_shared(TypeTraits::type_singleton(), length, - BufferVector{nullptr, out_buffer}, /*null_count=*/0); -} - -} // namespace - -Result> GetTakeIndices( - const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, - MemoryPool* memory_pool) { - DCHECK_EQ(filter.type->id(), 
Type::BOOL); - if (filter.length <= std::numeric_limits::max()) { - return GetTakeIndicesImpl(filter, null_selection, memory_pool); - } else if (filter.length <= std::numeric_limits::max()) { - return GetTakeIndicesImpl(filter, null_selection, memory_pool); - } else { - // Arrays over 4 billion elements, not especially likely. - return Status::NotImplemented( - "Filter length exceeds UINT32_MAX, " - "consider a different strategy for selecting elements"); - } -} - namespace { using FilterState = OptionsWrapper; using TakeState = OptionsWrapper; -Status PreallocateData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->Allocate(length * bit_width / 8)); - } - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte -// C-type based types. Use common implementation for every byte width and only -// generate code for unsigned integer indices, since after boundschecking to -// check for negative numbers in the indices we can safely reinterpret_cast -// signed integers as unsigned. - -/// \brief The Take implementation for primitive (fixed-width) types does not -/// use the logical Arrow type but rather the physical C type. This way we -/// only generate one take function for each byte width. -/// -/// This function assumes that the indices have been boundschecked. 
-template -struct PrimitiveTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const ValueCType* values_data = values.GetValues(1); - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const IndexCType* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->GetMutableValues(1); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - out[position] = values_data[indices_data[position]]; - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some indices but not all are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - out[position] = values_data[indices_data[position]]; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else { - memset(out + position, 0, 
sizeof(ValueCType) * block.length); - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - out[position] = values_data[indices_data[position]]; - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position) && - bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // index is not null && value is not null - out[position] = values_data[indices_data[position]]; - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else { - memset(out + position, 0, sizeof(ValueCType) * block.length); - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template -struct BooleanTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const uint8_t* values_data = values.buffers[1].data; - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const IndexCType* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->buffers[1]->mutable_data(); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - 
// If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - // Avoid uninitialized data in values array - bit_util::SetBitsTo(out, out_offset, indices.length, false); - - auto PlaceDataBit = [&](int64_t loc, IndexCType index) { - bit_util::SetBitTo(out, out_offset + loc, - bit_util::GetBit(values_data, values_offset + index)); - }; - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - PlaceDataBit(position, indices_data[position]); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - } - ++position; - } - } else { - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - bit_util::SetBit(out_is_valid, 
out_offset + position); - PlaceDataBit(position, indices_data[position]); - ++valid_count; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - PlaceDataBit(position, indices_data[position]); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } - } - ++position; - } - } else { - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template