From 27981976f7dfd4c4ab7f60677f849439e038fc55 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 25 Oct 2023 17:15:05 -0400 Subject: [PATCH] revert binary_like, review comments --- cpp/src/arrow/array/builder_base.cc | 3 +-- cpp/src/arrow/array/builder_binary.h | 8 ++++++ cpp/src/arrow/array/concatenate_test.cc | 3 ++- cpp/src/arrow/array/diff.cc | 31 +++++++++++----------- cpp/src/arrow/array/diff_test.cc | 13 +++++++++ cpp/src/arrow/integration/json_internal.cc | 7 ++--- cpp/src/arrow/pretty_print.cc | 17 +++++------- cpp/src/arrow/type_traits.h | 10 +++---- cpp/src/arrow/util/binary_view_util.h | 4 +-- 9 files changed, 55 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 76f0d9fc3f679..d3502a0ab645a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -123,8 +123,7 @@ struct AppendScalarImpl { Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } template - enable_if_t::value || is_string_like_type::value, Status> - Visit(const T&) { + enable_if_has_string_view Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index f5b335b87a26d..3e87cf2403610 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -583,8 +583,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { data_builder_(pool, alignment), data_heap_builder_(pool, alignment) {} + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. int64_t current_block_bytes_remaining() const { return data_heap_builder_.current_remaining_bytes(); } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index e49f65e15e15d..0ef1136ea78f8 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -163,7 +163,8 @@ TEST_F(ConcatenateTest, StringType) { TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability); + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); ASSERT_OK((**out).ValidateFull()); }); } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 800f19b752726..f267c52c0d696 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -476,30 +476,31 @@ class MakeFormatterImpl { return Status::OK(); } - // format Binary, LargeBinary and FixedSizeBinary in hexadecimal template - enable_if_binary_like Visit(const T&) { + enable_if_has_string_view Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << HexEncode(checked_cast(array).GetView(index)); + std::string_view view = checked_cast(array).GetView(index); + if constexpr (T::is_utf8) { + // format String and StringView with \"\n\r\t\\ escaped + *os << '"' << Escape(view) << '"'; + } else { + // format Binary, LargeBinary, BinaryView, and FixedSizeBinary in hexadecimal + *os << HexEncode(view); + } }; return Status::OK(); } - // format Strings with \"\n\r\t\\ escaped + // format Decimals with Decimal___Array::FormatValue template - enable_if_string_like Visit(const T&) { - using ArrayType = typename TypeTraits::ArrayType; - impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << "\"" << Escape(checked_cast(array).GetView(index)) << "\""; - }; - return Status::OK(); - } - - // format Decimals with Decimal128Array::FormatValue - Status Visit(const Decimal128Type&) { + enable_if_decimal Visit(const T&) { impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << checked_cast(array).FormatValue(index); + if constexpr (T::type_id == Type::DECIMAL128) { + *os << checked_cast(array).FormatValue(index); + } else { + *os << checked_cast(array).FormatValue(index); + } }; return Status::OK(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index e322006732f9a..4c2f39eddf863 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -596,6 +596,19 @@ TEST_F(DiffTest, UnifiedDiffFormatter) { @@ -5, +8 @@ +79 +11 +)"); + } + + for (const auto& type : { + decimal128(10, 4), + decimal256(10, 4), + }) { + base_ = ArrayFromJSON(type, R"(["123.4567", "-78.9000"])"); + target_ = ArrayFromJSON(type, R"(["123.4567", "-123.4567"])"); + AssertDiffAndFormat(R"( +@@ -1, +1 @@ +--78.9000 ++-123.4567 )"); } } diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index c4f44d21bed38..59749c36a958e 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -109,7 +109,7 @@ std::string GetTimeUnitName(TimeUnit::type unit) { Result GetStringView(const rj::Value& str) { if (!str.IsString()) { - return Status::Invalid("field was not a string line ", __LINE__); + return Status::Invalid("field was not a string"); } return std::string_view{str.GetString(), str.GetStringLength()}; } @@ -1364,10 +1364,7 @@ class ArrayReader { RETURN_NOT_OK(builder.AppendNull()); continue; } - - DCHECK(json_val.IsString()); - std::string_view val{ - json_val.GetString()}; // XXX can we use json_val.GetStringLength()? + ARROW_ASSIGN_OR_RAISE(auto val, GetStringView(json_val)); int64_t offset_start = ParseOffset(json_offsets[i]); int64_t offset_end = ParseOffset(json_offsets[i + 1]); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index a53568074fdd6..262576bea95e3 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -229,18 +229,13 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_string_like WriteDataValues(const ArrayType& array) { + enable_if_has_string_view WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { - (*sink_) << "\"" << array.GetView(i) << "\""; - return Status::OK(); - }); - } - - template - enable_if_t::value && !is_decimal_type::value, Status> - WriteDataValues(const ArrayType& array) { - return WriteValues(array, [&](int64_t i) { - (*sink_) << HexEncode(array.GetView(i)); + if constexpr (T::is_utf8) { + (*sink_) << "\"" << array.GetView(i) << "\""; + } else { + (*sink_) << HexEncode(array.GetView(i)); + } return Status::OK(); }); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 738aee08a3b65..9d8cafacf397b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -659,8 +659,7 @@ using enable_if_string_view = enable_if_t::value, R>; template using is_string_like_type = - std::integral_constant::value && T::is_utf8) || - is_string_view_type::value>; + std::integral_constant::value && T::is_utf8>; template using enable_if_string_like = enable_if_t::value, R>; @@ -683,9 +682,10 @@ template using enable_if_fixed_width_type = enable_if_t::value, R>; template -using is_binary_like_type = std::integral_constant< - bool, (is_base_binary_type::value && !is_string_like_type::value) || - is_binary_view_type::value || is_fixed_size_binary_type::value>; +using is_binary_like_type = + std::integral_constant::value && + !is_string_like_type::value) || + is_fixed_size_binary_type::value>; template using enable_if_binary_like = enable_if_t::value, R>; diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h index 13f05e0e7c808..94f7a5bdfa667 100644 --- a/cpp/src/arrow/util/binary_view_util.h +++ b/cpp/src/arrow/util/binary_view_util.h @@ -75,8 +75,8 @@ bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, if (l_size_and_prefix != r_size_and_prefix) return false; if (l.is_inline()) { - // The inline part is zeroed at construction, so we can compare - // a word at a time if data extends past 'prefix_'. + // The columnar spec mandates that the inlined part be zero-padded, so we can compare + // a word at a time regardless of the exact size. int64_t l_inlined, r_inlined; memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined)); memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined));