Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37710: [C++][Integration] Add C++ Utf8View implementation #37792

Merged
merged 5 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ struct ScalarFromArraySlotImpl {
return Finish(a.GetString(index_));
}

Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); }

Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }

Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "arrow/array/validate.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

Expand Down Expand Up @@ -89,6 +90,33 @@ LargeStringArray::LargeStringArray(int64_t length,

Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

BinaryViewArray::BinaryViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
SetData(std::move(data));
}

BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
SetData(
ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset));
}

std::string_view BinaryViewArray::GetView(int64_t i) const {
const std::shared_ptr<Buffer>* data_buffers = data_->buffers.data() + 2;
return util::FromBinaryView(raw_values_[i], data_buffers);
}

StringViewArray::StringViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
SetData(std::move(data));
}

Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/arrow/array/array_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
Expand Down Expand Up @@ -217,6 +218,65 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// BinaryView and StringView

/// Concrete Array class for variable-size binary view data using the
/// BinaryViewType::c_type struct to reference in-line or out-of-line string values
class ARROW_EXPORT BinaryViewArray : public FlatArray {
public:
using TypeClass = BinaryViewType;
using IteratorType = stl::ArrayIterator<BinaryViewArray>;
using c_type = BinaryViewType::c_type;

explicit BinaryViewArray(std::shared_ptr<ArrayData> data);

BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector data_buffers,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

// For API compatibility with BinaryArray etc.
std::string_view GetView(int64_t i) const;
std::string GetString(int64_t i) const { return std::string{GetView(i)}; }

const auto& values() const { return data_->buffers[1]; }
const c_type* raw_values() const { return raw_values_; }

std::optional<std::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}

IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }

protected:
using FlatArray::FlatArray;

void SetData(std::shared_ptr<ArrayData> data) {
FlatArray::SetData(std::move(data));
raw_values_ = data_->GetValuesSafe<c_type>(1);
}

const c_type* raw_values_;
};

/// Concrete Array class for variable-size string view (utf-8) data using
/// BinaryViewType::c_type to reference in-line or out-of-line string values
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
public:
using TypeClass = StringViewType;

explicit StringViewArray(std::shared_ptr<ArrayData> data);

using BinaryViewArray::BinaryViewArray;

/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// Fixed width binary

Expand Down
138 changes: 121 additions & 17 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,21 @@

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/validate.h"
#include "arrow/buffer.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/testing/builder.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/visit_data_inline.h"

namespace arrow {
Expand Down Expand Up @@ -365,38 +369,134 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets();

TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }

// Produce an Array of index/offset views from a std::vector of index/offset
// BinaryViewType::c_type
Result<std::shared_ptr<StringViewArray>> MakeBinaryViewArray(
BufferVector data_buffers, const std::vector<BinaryViewType::c_type>& views,
bool validate = true) {
auto length = static_cast<int64_t>(views.size());
auto arr = std::make_shared<StringViewArray>(
utf8_view(), length, Buffer::FromVector(views), std::move(data_buffers));
if (validate) {
RETURN_NOT_OK(arr->ValidateFull());
}
return arr;
}

TEST(StringViewArray, Validate) {
// Since this is a test of validation, we need to be able to construct invalid arrays.
auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)");
auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy");

// empty array is valid
EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok());

// empty array with some data buffers is valid
EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok());

// inline views need not have a corresponding buffer
EXPECT_THAT(MakeBinaryViewArray({},
{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
util::ToInlineBinaryView("inline me"),
}),
Ok());

// non-inline views are expected to reference only buffers managed by the array
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Ok());

// views may not reference data buffers not present in the array
EXPECT_THAT(
MakeBinaryViewArray(
{}, {util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0)}),
Raises(StatusCode::IndexError));
// ... or ranges which overflow the referenced data buffer
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s}, {util::ToBinaryView(
"supe", static_cast<int32_t>(buffer_s->size() + 50), 0, 0)}),
Raises(StatusCode::IndexError));

// Additionally, the prefixes of non-inline views must match the data buffer
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Raises(StatusCode::Invalid));

// Invalid string views which are masked by a null bit do not cause validation to fail
auto invalid_but_masked =
MakeBinaryViewArray(
{buffer_s},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", 50, 40, 30)},
/*validate=*/false)
.ValueOrDie()
->data();
invalid_but_masked->null_count = 2;
invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2);
EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok());

// overlapping views are allowed
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s},
{
util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("uper", static_cast<int32_t>(buffer_s->size() - 1), 0,
1),
util::ToBinaryView("perc", static_cast<int32_t>(buffer_s->size() - 2), 0,
2),
util::ToBinaryView("erca", static_cast<int32_t>(buffer_s->size() - 3), 0,
3),
}),
Ok());
}

template <typename T>
class TestUTF8Array : public ::testing::Test {
public:
using TypeClass = T;
using offset_type = typename TypeClass::offset_type;
using ArrayType = typename TypeTraits<TypeClass>::ArrayType;

Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
std::string_view data, int64_t offset = 0) {
ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
/*null_bitmap=*/nullptr, /*null_count=*/0, offset);
return arr.ValidateUTF8();
std::shared_ptr<DataType> type() const {
if constexpr (is_binary_view_like_type<TypeClass>::value) {
return TypeClass::is_utf8 ? utf8_view() : binary_view();
} else {
return TypeTraits<TypeClass>::type_singleton();
}
}

Status ValidateUTF8(const std::string& json) {
auto ty = TypeTraits<T>::type_singleton();
auto arr = ArrayFromJSON(ty, json);
return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
Status ValidateUTF8(const Array& arr) {
return checked_cast<const ArrayType&>(arr).ValidateUTF8();
}

Status ValidateUTF8(std::vector<std::string> values) {
std::shared_ptr<Array> arr;
ArrayFromVector<T, std::string>(type(), values, &arr);
return ValidateUTF8(*arr);
}

void TestValidateUTF8() {
ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff
ASSERT_OK(
ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])")));
ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff

ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"}));

// More tests in TestValidateData() above
// (ValidateFull() calls ValidateUTF8() internally)
}
};

TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes);
TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes);

TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }

Expand Down Expand Up @@ -883,11 +983,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
void SetUp() override { type_ = TypeTraits<TypeClass>::type_singleton(); }

void TestBasics() {
auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])");
auto array = ArrayFromJSON(
type_,
R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])");
BinaryAppender appender;
ArraySpanVisitor<TypeClass> visitor;
ASSERT_OK(visitor.Visit(*array->data(), &appender));
ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"}));
ASSERT_THAT(appender.data,
::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me",
"allocate_me_aaaaa", "allocate_me_bbbb"}));
ARROW_UNUSED(visitor); // Workaround weird MSVC warning
}

Expand All @@ -904,7 +1008,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
std::shared_ptr<DataType> type_;
};

TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes);
TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes);

TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); }

Expand Down
8 changes: 7 additions & 1 deletion cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -382,10 +382,12 @@ static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseType
float64(),
binary(),
large_binary(),
binary_view(),
fixed_size_binary(3),
decimal(16, 4),
utf8(),
large_utf8(),
utf8_view(),
list(utf8()),
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
list(large_utf8()),
Expand Down Expand Up @@ -601,12 +603,15 @@ static ScalarVector GetScalars() {
std::make_shared<DurationScalar>(60, duration(TimeUnit::SECOND)),
std::make_shared<BinaryScalar>(hello),
std::make_shared<LargeBinaryScalar>(hello),
std::make_shared<BinaryViewScalar>(hello),
std::make_shared<FixedSizeBinaryScalar>(
hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
std::make_shared<StringScalar>(hello),
std::make_shared<LargeStringScalar>(hello),
std::make_shared<StringViewScalar>(hello),
std::make_shared<StringViewScalar>(Buffer::FromString("long string; not inlined")),
std::make_shared<ListScalar>(ArrayFromJSON(int8(), "[1, 2, 3]")),
ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"),
std::make_shared<LargeListScalar>(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")),
Expand Down Expand Up @@ -647,13 +652,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {

for (int64_t length : {16}) {
for (auto scalar : scalars) {
ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString());
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length));
ASSERT_OK(array->ValidateFull());
ASSERT_EQ(array->length(), length);
ASSERT_EQ(array->null_count(), 0);

// test case for ARROW-13321
for (int64_t i : std::vector<int64_t>{0, length / 2, length - 1}) {
for (int64_t i : {int64_t{0}, length / 2, length - 1}) {
ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i));
AssertScalarsEqual(*s, *scalar, /*verbose=*/true);
}
Expand Down
Loading
Loading