From 44d1b6153b505ad76869198d59881b900c6057b8 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 12:18:38 -0300 Subject: [PATCH] GH-35765: [C++] Split vector_selection.cc into more compilation units (#35751) ### Rationale for this change When working on #35001 I had a hard time figuring where to place the code for all possible combinations of filters and REE data. `vector_selection.cc` is hard to follow with so many kernels implemented in a single file. This PR splits the two biggest ones: filter and take. Stuff that can be shared by both stays in `vector_selection_internal.cc` and `vector_selection.cc` is concerned with the registering of the functions and a few smaller kernels. ### What changes are included in this PR? - [x] `vector_selection_(internal|take|filter).(cc|h)` source files were extracted from `vector_selection.cc` ### Are these changes tested? Yes, by existing tests. * Closes: #35765 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 + .../arrow/compute/kernels/vector_selection.cc | 2194 +---------------- .../vector_selection_filter_internal.cc | 922 +++++++ .../vector_selection_filter_internal.h | 39 + .../kernels/vector_selection_internal.cc | 814 ++++++ .../kernels/vector_selection_internal.h | 69 + .../kernels/vector_selection_take_internal.cc | 740 ++++++ .../kernels/vector_selection_take_internal.h | 40 + 8 files changed, 2635 insertions(+), 2186 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_filter_internal.h create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_internal.h create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_selection_take_internal.h diff --git 
a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d928cdf58b11a..88aa79270a3f9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -415,6 +415,9 @@ list(APPEND compute/kernels/util_internal.cc compute/kernels/vector_hash.cc compute/kernels/vector_selection.cc + compute/kernels/vector_selection_filter_internal.cc + compute/kernels/vector_selection_internal.cc + compute/kernels/vector_selection_take_internal.cc compute/row/encode_internal.cc compute/row/compare_internal.cc compute/row/grouper.cc diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index f1b09583deb84..3469744966942 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -32,6 +32,8 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/kernels/vector_selection_filter_internal.h" +#include "arrow/compute/kernels/vector_selection_take_internal.h" #include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -58,2119 +60,11 @@ using internal::OptionalBitIndexer; namespace compute { namespace internal { -int64_t GetFilterOutputSize(const ArraySpan& filter, - FilterOptions::NullSelectionBehavior null_selection) { - int64_t output_size = 0; - - if (filter.MayHaveNulls()) { - const uint8_t* filter_is_valid = filter.buffers[0].data; - BinaryBitBlockCounter bit_counter(filter.buffers[1].data, filter.offset, - filter_is_valid, filter.offset, filter.length); - int64_t position = 0; - if (null_selection == FilterOptions::EMIT_NULL) { - while (position < filter.length) { - BitBlockCount block = bit_counter.NextOrNotWord(); - output_size += block.popcount; - position += block.length; - } - } else { - while (position < filter.length) { - BitBlockCount block = bit_counter.NextAndWord(); - output_size += 
block.popcount; - position += block.length; - } - } - } else { - // The filter has no nulls, so we can use CountSetBits - output_size = CountSetBits(filter.buffers[1].data, filter.offset, filter.length); - } - return output_size; -} - -namespace { - -template -Result> GetTakeIndicesImpl( - const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, - MemoryPool* memory_pool) { - using T = typename IndexType::c_type; - - const uint8_t* filter_data = filter.buffers[1].data; - const bool have_filter_nulls = filter.MayHaveNulls(); - const uint8_t* filter_is_valid = filter.buffers[0].data; - - if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) { - // Most complex case: the filter may have nulls and we don't drop them. - // The logic is ternary: - // - filter is null: emit null - // - filter is valid and true: emit index - // - filter is valid and false: don't emit anything - - typename TypeTraits::BuilderType builder(memory_pool); - - // The position relative to the start of the filter - T position = 0; - // The current position taking the filter offset into account - int64_t position_with_offset = filter.offset; - - // To count blocks where filter_data[i] || !filter_is_valid[i] - BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid, - filter.offset, filter.length); - BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length); - while (position < filter.length) { - // true OR NOT valid - BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord(); - if (selected_or_null_block.NoneSet()) { - position += selected_or_null_block.length; - position_with_offset += selected_or_null_block.length; - continue; - } - RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount)); - - // If the values are all valid and the selected_or_null_block is full, - // then we can infer that all the values are true and skip the bit checking - BitBlockCount is_valid_block = 
is_valid_counter.NextWord(); - - if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) { - // All the values are selected and non-null - for (int64_t i = 0; i < selected_or_null_block.length; ++i) { - builder.UnsafeAppend(position++); - } - position_with_offset += selected_or_null_block.length; - } else { - // Some of the values are false or null - for (int64_t i = 0; i < selected_or_null_block.length; ++i) { - if (bit_util::GetBit(filter_is_valid, position_with_offset)) { - if (bit_util::GetBit(filter_data, position_with_offset)) { - builder.UnsafeAppend(position); - } - } else { - // Null slot, so append a null - builder.UnsafeAppendNull(); - } - ++position; - ++position_with_offset; - } - } - } - std::shared_ptr result; - RETURN_NOT_OK(builder.FinishInternal(&result)); - return result; - } - - // Other cases don't emit nulls and are therefore simpler. - TypedBufferBuilder builder(memory_pool); - - if (have_filter_nulls) { - // The filter may have nulls, so we scan the validity bitmap and the filter - // data bitmap together. 
- DCHECK_EQ(null_selection, FilterOptions::DROP); - - // The position relative to the start of the filter - T position = 0; - // The current position taking the filter offset into account - int64_t position_with_offset = filter.offset; - - BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid, - filter.offset, filter.length); - while (position < filter.length) { - BitBlockCount and_block = filter_counter.NextAndWord(); - RETURN_NOT_OK(builder.Reserve(and_block.popcount)); - if (and_block.AllSet()) { - // All the values are selected and non-null - for (int64_t i = 0; i < and_block.length; ++i) { - builder.UnsafeAppend(position++); - } - position_with_offset += and_block.length; - } else if (!and_block.NoneSet()) { - // Some of the values are false or null - for (int64_t i = 0; i < and_block.length; ++i) { - if (bit_util::GetBit(filter_is_valid, position_with_offset) && - bit_util::GetBit(filter_data, position_with_offset)) { - builder.UnsafeAppend(position); - } - ++position; - ++position_with_offset; - } - } else { - position += and_block.length; - position_with_offset += and_block.length; - } - } - } else { - // The filter has no nulls, so we need only look for true values - RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns( - filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) { - // Append the consecutive run of indices - RETURN_NOT_OK(builder.Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - builder.UnsafeAppend(static_cast(offset + i)); - } - return Status::OK(); - })); - } - - const int64_t length = builder.length(); - std::shared_ptr out_buffer; - RETURN_NOT_OK(builder.Finish(&out_buffer)); - return std::make_shared(TypeTraits::type_singleton(), length, - BufferVector{nullptr, out_buffer}, /*null_count=*/0); -} - -} // namespace - -Result> GetTakeIndices( - const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, - MemoryPool* memory_pool) { - DCHECK_EQ(filter.type->id(), 
Type::BOOL); - if (filter.length <= std::numeric_limits::max()) { - return GetTakeIndicesImpl(filter, null_selection, memory_pool); - } else if (filter.length <= std::numeric_limits::max()) { - return GetTakeIndicesImpl(filter, null_selection, memory_pool); - } else { - // Arrays over 4 billion elements, not especially likely. - return Status::NotImplemented( - "Filter length exceeds UINT32_MAX, " - "consider a different strategy for selecting elements"); - } -} - namespace { using FilterState = OptionsWrapper; using TakeState = OptionsWrapper; -Status PreallocateData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->Allocate(length * bit_width / 8)); - } - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte -// C-type based types. Use common implementation for every byte width and only -// generate code for unsigned integer indices, since after boundschecking to -// check for negative numbers in the indices we can safely reinterpret_cast -// signed integers as unsigned. - -/// \brief The Take implementation for primitive (fixed-width) types does not -/// use the logical Arrow type but rather the physical C type. This way we -/// only generate one take function for each byte width. -/// -/// This function assumes that the indices have been boundschecked. 
-template -struct PrimitiveTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const ValueCType* values_data = values.GetValues(1); - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const IndexCType* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->GetMutableValues(1); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - // If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - out[position] = values_data[indices_data[position]]; - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some indices but not all are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - out[position] = values_data[indices_data[position]]; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else { - memset(out + position, 0, 
sizeof(ValueCType) * block.length); - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - out[position] = values_data[indices_data[position]]; - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position) && - bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // index is not null && value is not null - out[position] = values_data[indices_data[position]]; - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } else { - out[position] = ValueCType{}; - } - ++position; - } - } else { - memset(out + position, 0, sizeof(ValueCType) * block.length); - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template -struct BooleanTakeImpl { - static void Exec(const ArraySpan& values, const ArraySpan& indices, - ArrayData* out_arr) { - const uint8_t* values_data = values.buffers[1].data; - const uint8_t* values_is_valid = values.buffers[0].data; - auto values_offset = values.offset; - - const IndexCType* indices_data = indices.GetValues(1); - const uint8_t* indices_is_valid = indices.buffers[0].data; - auto indices_offset = indices.offset; - - auto out = out_arr->buffers[1]->mutable_data(); - auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - - 
// If either the values or indices have nulls, we preemptively zero out the - // out validity bitmap so that we don't have to use ClearBit in each - // iteration for nulls. - if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); - } - // Avoid uninitialized data in values array - bit_util::SetBitsTo(out, out_offset, indices.length, false); - - auto PlaceDataBit = [&](int64_t loc, IndexCType index) { - bit_util::SetBitTo(out, out_offset + loc, - bit_util::GetBit(values_data, values_offset + index)); - }; - - OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, - indices.length); - int64_t position = 0; - int64_t valid_count = 0; - while (position < indices.length) { - BitBlockCount block = indices_bit_counter.NextBlock(); - if (values.null_count == 0) { - // Values are never null, so things are easier - valid_count += block.popcount; - if (block.popcount == block.length) { - // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); - for (int64_t i = 0; i < block.length; ++i) { - PlaceDataBit(position, indices_data[position]); - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); - PlaceDataBit(position, indices_data[position]); - } - ++position; - } - } else { - position += block.length; - } - } else { - // Values have nulls, so we must do random access into the values bitmap - if (block.popcount == block.length) { - // Faster path: indices are not null but values may be - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - bit_util::SetBit(out_is_valid, 
out_offset + position); - PlaceDataBit(position, indices_data[position]); - ++valid_count; - } - ++position; - } - } else if (block.popcount > 0) { - // Slow path: some but not all indices are null. Since we are doing - // random access in general we have to check the value nullness one by - // one. - for (int64_t i = 0; i < block.length; ++i) { - if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { - // index is not null - if (bit_util::GetBit(values_is_valid, - values_offset + indices_data[position])) { - // value is not null - PlaceDataBit(position, indices_data[position]); - bit_util::SetBit(out_is_valid, out_offset + position); - ++valid_count; - } - } - ++position; - } - } else { - position += block.length; - } - } - } - out_arr->null_count = out_arr->length - valid_count; - } -}; - -template