Skip to content

Commit

Permalink
binary_slice kernel for fixed size binary
Browse files Browse the repository at this point in the history
  • Loading branch information
js8544 committed Dec 16, 2023
1 parent 431c4ea commit 097324d
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 0 deletions.
28 changes: 28 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2436,6 +2436,7 @@ void AddAsciiStringReplaceSlice(FunctionRegistry* registry) {

namespace {
struct SliceBytesTransform : StringSliceTransformBase {
using StringSliceTransformBase::StringSliceTransformBase;
int64_t MaxCodeunits(int64_t ninputs, int64_t input_bytes) override {
const SliceOptions& opt = *this->options;
if ((opt.start >= 0) != (opt.stop >= 0)) {
Expand Down Expand Up @@ -2568,6 +2569,27 @@ struct SliceBytesTransform : StringSliceTransformBase {

return dest - output;
}

/// \brief Compute the byte width of every output value when slicing
/// fixed-size binary values of width `input_width_32`.
///
/// Negative `start` / `stop` are taken relative to the end of the value, and
/// both bounds are then clamped into [0, input_width_32]. The result is the
/// number of positions visited when walking from `start` towards `stop`
/// (exclusive) in increments of `step`.
///
/// \param options slice bounds and step
/// \param input_width_32 byte width of the input fixed-size binary type
/// \return the output byte width; 0 when the range is empty or `step` is 0
static int32_t FixedOutputSize(SliceOptions options, int32_t input_width_32) {
  const int64_t step = options.step;
  // Guard the divisions below: a zero step has no meaningful output width.
  if (step == 0) {
    return 0;
  }

  const auto input_width = static_cast<int64_t>(input_width_32);
  int64_t start = options.start;
  int64_t stop = options.stop;

  // Explicit <int64_t> keeps both arguments the same type on every platform;
  // a `0L` literal is `long`, which is not `int64_t` on LLP64 targets or
  // wherever int64_t is `long long`, and breaks template deduction there.
  if (start < 0) {
    start = std::max<int64_t>(0, start + input_width);
  }
  if (stop < 0) {
    stop = std::max<int64_t>(0, stop + input_width);
  }
  // NOTE(review): for a negative step, a `start` at or beyond the end is
  // clamped to input_width rather than input_width - 1; confirm this agrees
  // with the transform that actually copies the bytes.
  start = std::min<int64_t>(start, input_width);
  stop = std::min<int64_t>(stop, input_width);

  // From here on start/stop lie in [0, input_width], so their difference is
  // small. Dividing before adding the step-sized term avoids the signed
  // overflow that `stop - start + step - 1` can hit for a very large step.
  if (step > 0) {
    if (start >= stop) {
      return 0;
    }
    // ceil((stop - start) / step) for a positive span.
    return static_cast<int32_t>((stop - start - 1) / step + 1);
  }

  if (start <= stop) {
    return 0;
  }
  // ceil((start - stop) / -step) without negating step (which could overflow).
  return static_cast<int32_t>((stop - start + 1) / step + 1);
}
};

template <typename Type>
Expand All @@ -2594,6 +2616,12 @@ void AddAsciiStringSlice(FunctionRegistry* registry) {
DCHECK_OK(
func->AddKernel({ty}, ty, std::move(exec), SliceBytesTransform::State::Init));
}
using TransformExec = FixedSizeBinaryTransformExecWithState<SliceBytesTransform>;
ScalarKernel fsb_kernel({InputType(Type::FIXED_SIZE_BINARY)},
OutputType(TransformExec::OutputType), TransformExec::Exec,
StringSliceTransformBase::State::Init);
fsb_kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
DCHECK_OK(func->AddKernel(std::move(fsb_kernel)));
DCHECK_OK(registry->AddFunction(std::move(func)));
}

Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,8 @@ struct StringSliceTransformBase : public StringTransformBase {
using State = OptionsWrapper<SliceOptions>;

const SliceOptions* options;
// Default construction does not bind any options: the `options` pointer has no
// initializer and is assigned later in PreExec from the kernel state.
StringSliceTransformBase() = default;
// Bind the transform directly to caller-supplied options. Only the address of
// `options` is stored, so the referenced object must outlive this transform.
explicit StringSliceTransformBase(const SliceOptions& options) : options{&options} {}

Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) override {
options = &State::Get(ctx);
Expand Down
90 changes: 90 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include "arrow/type_fwd.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
Expand Down Expand Up @@ -712,6 +713,95 @@ TEST_F(TestFixedSizeBinaryKernels, BinaryLength) {
"[6, null, 6]");
}

// binary_slice on fixed-size binary input: a plain positive range, two
// negative-index edge cases, and the two error paths (no options, zero step).
TEST_F(TestFixedSizeBinaryKernels, SliceBytesBasic) {
  const char* values_json = R"(["abcabc", "defdef"])";

  // Plain positive range [2, 4).
  SliceOptions middle{2, 4};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ca", "fd"])",
             &middle);

  // A negative start that resolves past `stop` yields zero-width output.
  SliceOptions empty_range{-3, 1};
  CheckUnary("binary_slice", values_json, fixed_size_binary(0), R"(["", ""])",
             &empty_range);

  // A negative start reaching before the beginning of the value.
  SliceOptions before_begin{-10, -3};
  CheckUnary("binary_slice", values_json, fixed_size_binary(3), R"(["abc", "def"])",
             &before_begin);

  auto single_value = ArrayFromJSON(this->type(), R"(["foobaz"])");

  // Calling without options must be rejected.
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      Invalid,
      testing::HasSubstr("Function 'binary_slice' cannot be called without options"),
      CallFunction("binary_slice", {single_value}));

  // A zero step must be rejected.
  SliceOptions zero_step{2, 4, 0};
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      Invalid, testing::HasSubstr("Slice step cannot be zero"),
      CallFunction("binary_slice", {single_value}, &zero_step));
}

// binary_slice with non-negative start and stop, using forward and backward steps.
TEST_F(TestFixedSizeBinaryKernels, SliceBytesPosPos) {
  const char* values_json = R"(["abcabc", "defdef"])";

  // Forward, every second byte.
  SliceOptions forward{1, 5, 2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ba", "ed"])",
             &forward);

  // Backward, every second byte.
  SliceOptions backward{5, 0, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(3), R"(["cab", "fde"])",
             &backward);
}

// binary_slice with a non-negative start and a negative stop.
TEST_F(TestFixedSizeBinaryKernels, SliceBytesPosNeg) {
  const char* values_json = R"(["abcabc", "defdef"])";

  // Default step.
  SliceOptions unit_step{2, -1};
  CheckUnary("binary_slice", values_json, fixed_size_binary(3), R"(["cab", "fde"])",
             &unit_step);

  // Forward, every second byte.
  SliceOptions forward{1, -1, 2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ba", "ed"])",
             &forward);

  // Backward, every second byte.
  SliceOptions backward{5, -4, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ca", "fd"])",
             &backward);

  // Same backward walk with the stop moved to -6.
  SliceOptions backward_far{5, -6, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(3), R"(["cab", "fde"])",
             &backward_far);
}

// binary_slice with both start and stop negative.
TEST_F(TestFixedSizeBinaryKernels, SliceBytesNegNeg) {
  const char* values_json = R"(["abcabc", "defdef"])";

  // Default step.
  SliceOptions unit_step{-2, -1};
  CheckUnary("binary_slice", values_json, fixed_size_binary(1), R"(["b", "e"])",
             &unit_step);

  // Forward, every second byte.
  SliceOptions forward{-4, -1, 2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["cb", "fe"])",
             &forward);

  // Backward, every second byte.
  SliceOptions backward{-1, -3, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(1), R"(["c", "f"])",
             &backward);

  // Same backward walk with the stop moved to -4.
  SliceOptions backward_far{-1, -4, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ca", "fd"])",
             &backward_far);
}

// binary_slice with a negative start and a non-negative stop.
TEST_F(TestFixedSizeBinaryKernels, SliceBytesNegPos) {
  const char* values_json = R"(["abcabc", "defdef"])";

  // Default step; expected output is zero-width.
  SliceOptions unit_step{-2, 4};
  CheckUnary("binary_slice", values_json, fixed_size_binary(0), R"(["", ""])",
             &unit_step);

  // Forward, every second byte.
  SliceOptions forward{-4, 5, 2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["cb", "fe"])",
             &forward);

  // Backward, every second byte.
  SliceOptions backward{-1, 1, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(2), R"(["ca", "fd"])",
             &backward);

  // Same backward walk with the stop moved to 0.
  SliceOptions backward_far{-1, 0, -2};
  CheckUnary("binary_slice", values_json, fixed_size_binary(3), R"(["cab", "fde"])",
             &backward_far);
}

TEST_F(TestFixedSizeBinaryKernels, BinaryReplaceSlice) {
ReplaceSliceOptions options{0, 1, "XX"};
CheckUnary("binary_replace_slice", "[]", fixed_size_binary(7), "[]", &options);
Expand Down

0 comments on commit 097324d

Please sign in to comment.