From 53323d46edc82571c52b2690aadd47290068d467 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 29 Jul 2022 23:07:47 +0200 Subject: [PATCH 001/107] add type-only parts from rle branch --- cpp/src/arrow/type.cc | 21 +++++++++++++++++++++ cpp/src/arrow/type.h | 35 +++++++++++++++++++++++++++++++++++ cpp/src/arrow/type_fwd.h | 6 ++++++ 3 files changed, 62 insertions(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index efff07db6671f..59a9b09cafad1 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -714,6 +714,15 @@ Result> DenseUnionType::Make( return std::make_shared(fields, type_codes); } +// ---------------------------------------------------------------------- +// Run-length encoded type + +std::string RunLengthEncodedType::ToString() const { + std::stringstream s; + s << name() << "<" << encoded_type()->ToString() << ">"; + return s.str(); +} + // ---------------------------------------------------------------------- // Struct type @@ -2136,6 +2145,14 @@ std::string UnionType::ComputeFingerprint() const { return ss.str(); } +std::string RunLengthEncodedType::ComputeFingerprint() const { + std::stringstream ss; + ss << TypeIdFingerprint(*this) << "{"; + ss << encoded_type()->fingerprint(); + ss << "}"; + return ss.str(); +} + std::string TimeType::ComputeFingerprint() const { std::stringstream ss; ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_); @@ -2272,6 +2289,10 @@ std::shared_ptr struct_(const std::vector>& fie return std::make_shared(fields); } +std::shared_ptr run_length_encoded(std::shared_ptr encoded_type) { + return std::make_shared(std::move(encoded_type)); +} + std::shared_ptr sparse_union(FieldVector child_fields, std::vector type_codes) { if (type_codes.empty()) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f3ac2d62d8268..3de2002e5feb6 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1201,6 +1201,41 @@ class ARROW_EXPORT DenseUnionType : public UnionType { std::string name() const override { return "dense_union"; } }; +/// \brief Type class for run-length encoded data +class ARROW_EXPORT EncodingType { + public: + EncodingType(std::shared_ptr encoded_type) : encoded_type_{encoded_type} {} + + const std::shared_ptr& encoded_type() const { return encoded_type_; } + + private: + std::shared_ptr encoded_type_; +}; + +/// \brief Type class for run-length encoded data +class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType { + public: + static constexpr Type::type type_id = Type::RUN_LENGTH_ENCODED; + + static constexpr const char* type_name() { return "run_length_encoded"; } + + RunLengthEncodedType(std::shared_ptr encoded_type) + : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(std::move(encoded_type)) {} + + DataTypeLayout layout() const override { + // Fixed width layout means fixed size per logical item, which our buffers don't have + return DataTypeLayout( + {DataTypeLayout::VariableWidth(), DataTypeLayout::VariableWidth()}); + } + + std::string ToString() const override; + + std::string name() const override { return "run_length_encoded"; } + + private: + std::string ComputeFingerprint() const override; +}; + /// @} // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 84a50a12eb30d..c045bc4b84939 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -405,6 +405,8 @@ struct Type { /// Calendar interval type with three fields. INTERVAL_MONTH_DAY_NANO, + RUN_LENGTH_ENCODED, + // Leave this at the end MAX_ID }; @@ -547,6 +549,10 @@ std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); std::shared_ptr ARROW_EXPORT struct_(const std::vector>& fields); +/// \brief Create a RunLengthEncoded instance +std::shared_ptr ARROW_EXPORT +run_length_encoded(std::shared_ptr encoded_type); + /// \brief Create a SparseUnionType instance std::shared_ptr ARROW_EXPORT sparse_union(FieldVector child_fields, std::vector type_codes = {}); From 4eb0b46242c6bbcce43a65812cd95ffd9ece7a3f Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 20 Jul 2022 17:21:59 +0200 Subject: [PATCH 002/107] handle rle type in ToString for type ids --- cpp/src/arrow/type.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 59a9b09cafad1..d81cde440e851 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -157,6 +157,7 @@ std::string ToString(Type::type id) { TO_STRING_CASE(DENSE_UNION) TO_STRING_CASE(SPARSE_UNION) TO_STRING_CASE(DICTIONARY) + TO_STRING_CASE(RUN_LENGTH_ENCODED) TO_STRING_CASE(EXTENSION) #undef TO_STRING_CASE From b4f94c0a5cbdfb70177aa64015cd71b842d2b8d6 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 20 Jul 2022 17:38:20 +0200 Subject: [PATCH 003/107] handle RLE in ARROW_GENERATE_FOR_ALL_TYPES --- cpp/src/arrow/visitor_generate.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 265c76197a4d4..c5622480474af 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -63,6 +63,7 @@ namespace arrow { ACTION(SparseUnion); \ ACTION(DenseUnion); \ ACTION(Dictionary); \ + ACTION(RunLengthEncoded); \ ACTION(Extension) } // namespace arrow From 12df4bc687d8e77af1dfa5793434cf6addeebb02 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 20 Jul 2022 17:39:36 +0200 Subject: [PATCH 004/107] imlement NotImplemented status for rle in MakeFormatterImpl --- cpp/src/arrow/array/diff.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 16f4f9c7638a8..44251c193cb46 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -633,6 +633,10 @@ class MakeFormatterImpl { return Status::NotImplemented("formatting diffs between arrays of type ", t); } + Status Visit(const RunLengthEncodedType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + template Formatter MakeTimeFormatter(const std::string& fmt_str) { return [fmt_str](const Array& array, int64_t index, std::ostream* os) { From bf685674c88070a3d4b3fa4460e1dffdc20ab984 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 27 Jul 2022 17:36:05 +0200 Subject: [PATCH 005/107] add RunLengthEncodedArray class --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_base.cc | 4 ++ cpp/src/arrow/array/array_encoded.cc | 41 ++++++++++++++ cpp/src/arrow/array/array_encoded.h | 81 ++++++++++++++++++++++++++++ cpp/src/arrow/type_fwd.h | 2 + 5 files changed, 129 insertions(+) create mode 100644 cpp/src/arrow/array/array_encoded.cc create mode 100644 cpp/src/arrow/array/array_encoded.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 93dd1297bd744..04b5498204bfe 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -140,6 +140,7 @@ set(ARROW_SRCS array/array_binary.cc array/array_decimal.cc array/array_dict.cc + array/array_encoded.cc array/array_nested.cc array/array_primitive.cc array/builder_adaptive.cc diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 5d27b2aedfb47..aa895ab88333c 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -143,6 +143,10 @@ struct ScalarFromArraySlotImpl { return Status::OK(); } + Status Visit(const RunLengthEncodedArray& a) { + return Status::NotImplemented("Creating scalar from encoded array"); + } + Status Visit(const ExtensionArray& a) { ARROW_ASSIGN_OR_RAISE(auto storage, a.storage()->GetScalar(index_)); out_ = std::make_shared(std::move(storage), a.type()); diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc new file mode 100644 index 0000000000000..b31fd3a34a7b1 --- /dev/null +++ b/cpp/src/arrow/array/array_encoded.cc @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/array/array_encoded.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// RunLengthEncodedArray + +RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& data) { + ARROW_CHECK_EQ(data->type->id(), Type::RUN_LENGTH_ENCODED); + SetData(data); +} + +RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& type, + int64_t length, + std::shared_ptr& values_array, + std::shared_ptr run_ends_buffer, + int64_t null_count, int64_t offset) { + ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); + SetData(ArrayData::Make(type, length, {run_ends_buffer}, null_count, offset)); + data_->child_data.push_back(std::move(values_array->data())); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h new file mode 100644 index 0000000000000..26f46e0f30e58 --- /dev/null +++ b/cpp/src/arrow/array/array_encoded.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and +// Union + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \addtogroup encoded-arrays +/// +/// @{ + +// ---------------------------------------------------------------------- +// RunLengthEncoded + +/// Concrete Array class for run-length encoded data +class ARROW_EXPORT RunLengthEncodedArray : public Array { + public: + using TypeClass = RunLengthEncodedType; + + explicit RunLengthEncodedArray(const std::shared_ptr& data); + + RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, + std::shared_ptr& values_array, + std::shared_ptr run_ends_buffer, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct a RunLengthEncodedArray from values and run ends arrays + /// + /// The length and data type are automatically inferred from the arguments. + /// They need to be the same length. + static Result> Make(std::shared_ptr& values_array, + std::shared_ptr& run_ends_array, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + const RunLengthEncodedType* encoding_type() const; + const DataType* encoded_type() const; + + // Return a shared pointer in case the requestor desires to share ownership + // with this array. The returned array has its offset, length and null + // count adjusted. + std::shared_ptr values() const; + int64_t* run_ends() const; +}; + +/// @} + +} // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index c045bc4b84939..8fa4c3bceb848 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -179,6 +179,8 @@ class DenseUnionArray; class DenseUnionBuilder; struct DenseUnionScalar; +class RunLengthEncodedArray; + template class NumericArray; From b90a5fe621ddfcb320c5ce7a052047e06b006770 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 27 Jul 2022 18:39:39 +0200 Subject: [PATCH 006/107] type_fwd: add RunLengthEncodedType --- cpp/src/arrow/type_fwd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 8fa4c3bceb848..c90192e661b13 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -179,6 +179,7 @@ class DenseUnionArray; class DenseUnionBuilder; struct DenseUnionScalar; +class RunLengthEncodedType; class RunLengthEncodedArray; template From 21259d80262beeeaa618adec0a67cd8c320006ae Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 27 Jul 2022 18:55:23 +0200 Subject: [PATCH 007/107] include new array_encoded header in array.h --- cpp/src/arrow/array.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 918c7617446e3..295d7918cc3ed 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -38,6 +38,7 @@ #include "arrow/array/array_binary.h" // IWYU pragma: keep #include "arrow/array/array_decimal.h" // IWYU pragma: keep #include "arrow/array/array_dict.h" // IWYU pragma: keep +#include "arrow/array/array_encoded.h" // IWYU pragma: keep #include "arrow/array/array_nested.h" // IWYU pragma: keep #include "arrow/array/array_primitive.h" // IWYU pragma: keep #include "arrow/array/data.h" // IWYU pragma: keep From 2689543ef4abb847ee6f353a60e8a69135956777 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 18:55:52 +0200 Subject: [PATCH 008/107] introduce type traits for rle/encoding types --- cpp/src/arrow/type_traits.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 221b35ce57323..557ab3edeafaf 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -381,6 +381,17 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return large_utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = RunLengthEncodedArray; + // TODO: using BuilderType = RunLengthEncodedBuilder; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int64_t)); + } + constexpr static bool is_parameter_free = false; +}; + /// @} /// \addtogroup c-type-traits @@ -749,6 +760,18 @@ using is_interval_type = std::is_base_of; template using enable_if_interval = enable_if_t::value, R>; +template +using is_encoding_type = std::is_base_of; + +template +using enable_if_encoding = enable_if_t::value, R>; + +template +using is_run_length_encoded_type = std::is_base_of; + +template +using enable_if_run_length_encoded = enable_if_t::value, R>; + template using is_dictionary_type = std::is_base_of; From 83754f58fa2f8156251f95a924b55b96ba658c25 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 18:58:00 +0200 Subject: [PATCH 009/107] add methods for rle in type visitor abstract classes --- cpp/src/arrow/visitor.cc | 2 ++ cpp/src/arrow/visitor.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index d22efc942ed9a..550a9b4aba788 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -69,6 +69,7 @@ ARRAY_VISITOR_DEFAULT(DenseUnionArray) ARRAY_VISITOR_DEFAULT(DictionaryArray) ARRAY_VISITOR_DEFAULT(Decimal128Array) ARRAY_VISITOR_DEFAULT(Decimal256Array) +ARRAY_VISITOR_DEFAULT(RunLengthEncodedArray) ARRAY_VISITOR_DEFAULT(ExtensionArray) #undef ARRAY_VISITOR_DEFAULT @@ -118,6 +119,7 @@ TYPE_VISITOR_DEFAULT(StructType) TYPE_VISITOR_DEFAULT(SparseUnionType) TYPE_VISITOR_DEFAULT(DenseUnionType) TYPE_VISITOR_DEFAULT(DictionaryType) +TYPE_VISITOR_DEFAULT(RunLengthEncodedType) TYPE_VISITOR_DEFAULT(ExtensionType) #undef TYPE_VISITOR_DEFAULT diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 7f83c9ebab025..c7ca714b0a080 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -68,6 +68,7 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const SparseUnionArray& array); virtual Status Visit(const DenseUnionArray& array); virtual Status Visit(const DictionaryArray& array); + virtual Status Visit(const RunLengthEncodedArray& array); virtual Status Visit(const ExtensionArray& array); }; @@ -116,6 +117,7 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const SparseUnionType& type); virtual Status Visit(const DenseUnionType& type); virtual Status Visit(const DictionaryType& type); + virtual Status Visit(const RunLengthEncodedType& type); virtual Status Visit(const ExtensionType& type); }; From 735fa04781057ad4f91cf8171d67085206ef9467 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 29 Jul 2022 23:17:03 +0200 Subject: [PATCH 010/107] split ARROW_GENERATE_FOR_ALL_TYPES the scalar version visits all types that can exist as a Scalar. Currently this true for all types we have. This will change once we add run-length encoding, which is an array encoding. --- cpp/src/arrow/scalar.cc | 8 ++--- cpp/src/arrow/visitor_generate.h | 58 +++++++++++++++++--------------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 21e1cdedc2a5e..ef0fdb378e33c 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -768,7 +768,7 @@ struct MakeNullImpl { std::shared_ptr Finish() && { // Should not fail. - DCHECK_OK(VisitTypeInline(*type_, this)); + DCHECK_OK(VisitScalarTypeInline(*type_, this)); return std::move(out_); } @@ -835,7 +835,7 @@ struct ScalarParseImpl { Status FinishWithBuffer() { return Finish(Buffer::FromString(std::string(s_))); } Result> Finish() && { - RETURN_NOT_OK(VisitTypeInline(*type_, this)); + RETURN_NOT_OK(VisitScalarTypeInline(*type_, this)); return std::move(out_); } @@ -1101,7 +1101,7 @@ struct ToTypeVisitor : CastImplVisitor { template Status Visit(const ToType&) { FromTypeVisitor unpack_from_type{from_, to_type_, out_}; - return VisitTypeInline(*from_.type, &unpack_from_type); + return VisitScalarTypeInline(*from_.type, &unpack_from_type); } Status Visit(const NullType&) { @@ -1128,7 +1128,7 @@ Result> Scalar::CastTo(std::shared_ptr to) con if (is_valid) { out->is_valid = true; ToTypeVisitor unpack_to_type{*this, to, out.get()}; - RETURN_NOT_OK(VisitTypeInline(*to, &unpack_to_type)); + RETURN_NOT_OK(VisitScalarTypeInline(*to, &unpack_to_type)); } return out; } diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 265c76197a4d4..9074237084dd4 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -35,34 +35,36 @@ namespace arrow { ACTION(Float); \ ACTION(Double) -#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ - ACTION(Null); \ - ACTION(Boolean); \ - ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ - ACTION(String); \ - ACTION(Binary); \ - ACTION(LargeString); \ - ACTION(LargeBinary); \ - ACTION(FixedSizeBinary); \ - ACTION(Duration); \ - ACTION(Date32); \ - ACTION(Date64); \ - ACTION(Timestamp); \ - ACTION(Time32); \ - ACTION(Time64); \ - ACTION(MonthDayNanoInterval); \ - ACTION(MonthInterval); \ - ACTION(DayTimeInterval); \ - ACTION(Decimal128); \ - ACTION(Decimal256); \ - ACTION(List); \ - ACTION(LargeList); \ - ACTION(Map); \ - ACTION(FixedSizeList); \ - ACTION(Struct); \ - ACTION(SparseUnion); \ - ACTION(DenseUnion); \ - ACTION(Dictionary); \ +#define ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(ACTION) \ + ACTION(Null); \ + ACTION(Boolean); \ + ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ + ACTION(String); \ + ACTION(Binary); \ + ACTION(LargeString); \ + ACTION(LargeBinary); \ + ACTION(FixedSizeBinary); \ + ACTION(Duration); \ + ACTION(Date32); \ + ACTION(Date64); \ + ACTION(Timestamp); \ + ACTION(Time32); \ + ACTION(Time64); \ + ACTION(MonthDayNanoInterval); \ + ACTION(MonthInterval); \ + ACTION(DayTimeInterval); \ + ACTION(Decimal128); \ + ACTION(Decimal256); \ + ACTION(List); \ + ACTION(LargeList); \ + ACTION(Map); \ + ACTION(FixedSizeList); \ + ACTION(Struct); \ + ACTION(SparseUnion); \ + ACTION(DenseUnion); \ + ACTION(Dictionary); \ ACTION(Extension) +#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(ACTION) + } // namespace arrow From 672b2a551520f584eb1e02f8c36493c215dda166 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 19:18:28 +0200 Subject: [PATCH 011/107] actually add scalar type visitor --- cpp/src/arrow/visit_scalar_inline.h | 2 +- cpp/src/arrow/visit_type_inline.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/visit_scalar_inline.h b/cpp/src/arrow/visit_scalar_inline.h index f3e8108e9c362..16fc27e2b295b 100644 --- a/cpp/src/arrow/visit_scalar_inline.h +++ b/cpp/src/arrow/visit_scalar_inline.h @@ -52,7 +52,7 @@ namespace arrow { template inline Status VisitScalarInline(const Scalar& scalar, VISITOR* visitor) { switch (scalar.type->id()) { - ARROW_GENERATE_FOR_ALL_TYPES(SCALAR_VISIT_INLINE); + ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(SCALAR_VISIT_INLINE); default: break; } diff --git a/cpp/src/arrow/visit_type_inline.h b/cpp/src/arrow/visit_type_inline.h index 6897d12b3cf97..5a9a59fc82261 100644 --- a/cpp/src/arrow/visit_type_inline.h +++ b/cpp/src/arrow/visit_type_inline.h @@ -55,6 +55,16 @@ inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) { return Status::NotImplemented("Type not implemented"); } +template +inline Status VisitScalarTypeInline(const DataType& type, VISITOR* visitor) { + switch (type.id()) { + ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(TYPE_VISIT_INLINE); + default: + break; + } + return Status::NotImplemented("Type not implemented"); +} + #undef TYPE_VISIT_INLINE #define TYPE_ID_VISIT_INLINE(TYPE_CLASS) \ From 7091760317e397485892d0ce60284b48dc248ccb Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 29 Jul 2022 23:49:14 +0200 Subject: [PATCH 012/107] add comments --- cpp/src/arrow/visit_type_inline.h | 20 ++++++++++++++++++++ cpp/src/arrow/visitor_generate.h | 1 + 2 files changed, 21 insertions(+) diff --git a/cpp/src/arrow/visit_type_inline.h b/cpp/src/arrow/visit_type_inline.h index 5a9a59fc82261..a7079b4f1db31 100644 --- a/cpp/src/arrow/visit_type_inline.h +++ b/cpp/src/arrow/visit_type_inline.h @@ -55,6 +55,26 @@ inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) { return Status::NotImplemented("Type not implemented"); } +/// \brief Calls `visitor` with the corresponding concrete type class. This function does +/// the same as VisitTypeInline, except that it excludes types that can't exist as a +/// Scalar. +/// +/// \tparam VISITOR Visitor type that implements Visit() for Arrow types that can exist +/// as a Scalar. +/// \return Status +/// +/// A visitor is a type that implements specialized logic for each Arrow type. +/// Example usage: +/// +/// ``` +/// class ExampleVisitor { +/// arrow::Status Visit(const arrow::Int32Type& type) { ... } +/// arrow::Status Visit(const arrow::Int64Type& type) { ... } +/// ... +/// } +/// ExampleVisitor visitor; +/// VisitTypeInline(some_type, &visitor); +/// ``` template inline Status VisitScalarTypeInline(const DataType& type, VISITOR* visitor) { switch (type.id()) { diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 9074237084dd4..07b885b455869 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -35,6 +35,7 @@ namespace arrow { ACTION(Float); \ ACTION(Double) +// all types that can exist as a Scalar #define ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(ACTION) \ ACTION(Null); \ ACTION(Boolean); \ From e4d480a12cc55a7ab5bb98f996307a0f8595d411 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 19:15:13 +0200 Subject: [PATCH 013/107] stub rle type in various visitors --- cpp/src/arrow/array/concatenate.cc | 4 ++++ cpp/src/arrow/array/diff.cc | 4 ++++ cpp/src/arrow/ipc/metadata_internal.cc | 4 ++++ cpp/src/arrow/ipc/reader.cc | 4 ++++ cpp/src/arrow/ipc/writer.cc | 4 ++++ cpp/src/arrow/pretty_print.cc | 4 ++++ cpp/src/arrow/testing/json_internal.cc | 12 ++++++++++++ 7 files changed, 36 insertions(+) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 9f77fbb5f433c..bc5ac483a6336 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -436,6 +436,10 @@ class ConcatenateImpl { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("concatenation of ", type); + } + Status Visit(const ExtensionType& e) { // XXX can we just concatenate their storage? return Status::NotImplemented("concatenation of ", e); diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 44251c193cb46..04dfa701afe0f 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -118,6 +118,10 @@ struct ValueComparatorVisitor { return Status::NotImplemented("dictionary type"); } + Status Visit(const RunLengthEncodedType&) { + return Status::NotImplemented("dictionary type"); + } + ValueComparator Create(const DataType& type) { DCHECK_OK(VisitTypeInline(type, this)); return out; diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 3e5da9d656768..720be09dde7bb 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -689,6 +689,10 @@ class FieldToFlatbufferVisitor { return VisitType(*checked_cast(type).value_type()); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("run-length encoded type in ipc"); + } + Status Visit(const ExtensionType& type) { RETURN_NOT_OK(VisitType(*type.storage_type())); extra_type_metadata_[kExtensionTypeKeyName] = type.extension_name(); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 0972d7e85ca7a..7d6838db7f08b 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -421,6 +421,10 @@ class ArrayLoader { return LoadType(*type.index_type()); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("run-length encoded array in ipc"); + } + Status Visit(const ExtensionType& type) { return LoadType(*type.storage_type()); } BatchDataReadRequest& read_request() { return read_request_; } diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index d015ee3f4d98c..088ebff3ccfea 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -526,6 +526,10 @@ class RecordBatchSerializer { return VisitType(*array.indices()); } + Status Visit(const RunLengthEncodedArray& type) { + return Status::NotImplemented("run-length encoded array in ipc"); + } + Status Visit(const ExtensionArray& array) { return VisitType(*array.storage()); } Status VisitType(const Array& values) { return VisitArrayInline(values, this); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index ac92287f1bcc4..4dedd97d6f131 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -376,6 +376,10 @@ class ArrayPrinter : public PrettyPrinter { return PrettyPrint(*array.indices(), ChildOptions(true), sink_); } + Status Visit(const RunLengthEncodedArray& array) { + return Status::NotImplemented("printing run-length encoded array"); + } + Status Print(const Array& array) { RETURN_NOT_OK(VisitArrayInline(array, this)); Flush(); diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc index c88e95df016e3..8e6574a6bfd1d 100644 --- a/cpp/src/arrow/testing/json_internal.cc +++ b/cpp/src/arrow/testing/json_internal.cc @@ -437,6 +437,10 @@ class SchemaWriter { Status Visit(const DictionaryType& type) { return VisitType(*type.value_type()); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("run-length encoded type in JSON"); + } + Status Visit(const ExtensionType& type) { return Status::NotImplemented(type.name()); } private: @@ -743,6 +747,10 @@ class ArrayWriter { return WriteChildren(type.fields(), children); } + Status Visit(const RunLengthEncodedArray& type) { + return Status::NotImplemented("run-length encoded array in JSON"); + } + Status Visit(const ExtensionArray& array) { return VisitArrayValues(*array.storage()); } private: @@ -1542,6 +1550,10 @@ class ArrayReader { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("run-length encoded array in JSON"); + } + Status Visit(const ExtensionType& type) { ArrayReader parser(obj_, pool_, field_->WithType(type.storage_type())); ARROW_ASSIGN_OR_RAISE(data_, parser.Parse()); From 05f7404b0547752cb0756e03f2945c1ab3116149 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 19:16:50 +0200 Subject: [PATCH 014/107] more stubs --- cpp/src/arrow/compare.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index c5406ee583faf..ef1cb31201a58 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -387,6 +387,10 @@ class RangeDataEqualsImpl { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("comparing run-length encoded data"); + } + Status Visit(const ExtensionType& type) { // Compare storages result_ &= CompareWithType(*type.storage_type()); @@ -680,6 +684,12 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const RunLengthEncodedType& left) { + const auto& right = checked_cast(right_); + result_ = left.encoded_type()->Equals(right.encoded_type()); + return Status::OK(); + } + Status Visit(const ExtensionType& left) { result_ = left.ExtensionEquals(static_cast(right_)); return Status::OK(); From f62046bfe7a2fa0e8e8bd28acff6ffd6580f7cd1 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 28 Jul 2022 19:23:17 +0200 Subject: [PATCH 015/107] more stubs --- cpp/src/arrow/compute/exec/ir_consumer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/exec/ir_consumer.cc b/cpp/src/arrow/compute/exec/ir_consumer.cc index f17dbf1ed7962..b3b81413924eb 100644 --- a/cpp/src/arrow/compute/exec/ir_consumer.cc +++ b/cpp/src/arrow/compute/exec/ir_consumer.cc @@ -289,6 +289,7 @@ struct ConvertLiteralImpl { Status Visit(const LargeStringType& t) { return NotImplemented(); } Status Visit(const LargeBinaryType& t) { return NotImplemented(); } Status Visit(const LargeListType& t) { return NotImplemented(); } + Status Visit(const RunLengthEncodedType& t) { return NotImplemented(); } template Status Visit(const T& t) { From cf548bc5d11e3571aa267855b5a908ae091796fc Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 00:34:55 +0200 Subject: [PATCH 016/107] formatting --- cpp/src/arrow/visitor_generate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index a36af54fda72b..5242ad4160a46 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -66,7 +66,7 @@ namespace arrow { ACTION(Dictionary); \ ACTION(Extension) -#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ +#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ ARROW_GENERATE_FOR_ALL_SCALAR_TYPES(ACTION); \ ACTION(RunLengthEncoded) From 8e21a091bffdf77123861a69a8fea8fed51266f0 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 00:35:16 +0200 Subject: [PATCH 017/107] update/stub more visitors --- cpp/src/arrow/array/util.cc | 6 +++++- cpp/src/arrow/array/validate.cc | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index c0cdcab730ca4..5dca10e5c3402 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -284,6 +284,10 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("Byte-swap run-length encoded data"); + } + const std::shared_ptr& data_; std::shared_ptr out_; }; @@ -542,7 +546,7 @@ class RepeatedArrayFactory { : pool_(pool), scalar_(scalar), length_(length) {} Result> Create() { - RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this)); + RETURN_NOT_OK(VisitScalarTypeInline(*scalar_.type, this)); return out_; } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 05155d64b6acb..90b48e757cc40 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -413,6 +413,10 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { + return Status::NotImplemented("validating RLE"); + } + Status Visit(const ExtensionType& type) { // Visit storage return ValidateWithType(*type.storage_type()); From 0a3353029dafc310b18d79f5a39f3ee89d666797 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 00:38:29 +0200 Subject: [PATCH 018/107] one more visitor --- cpp/src/arrow/json/test_common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 508be0c9102bb..a981470e34a05 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -129,6 +129,8 @@ struct GenerateImpl { Status Visit(const UnionType& t) { return NotImplemented(t); } + Status Visit(const RunLengthEncodedType& t) { return NotImplemented(t); } + Status NotImplemented(const DataType& t) { return Status::NotImplemented("random generation of arrays of type ", t); } From a59335c3d8e18864698333ee05ed232ab663a8ec Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 00:43:24 +0200 Subject: [PATCH 019/107] gtest_util: add rle to type list --- cpp/src/arrow/testing/gtest_util.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index c5ab367befe27..6d82a049a5677 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -101,6 +101,7 @@ std::vector AllTypeIds() { Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY, + Type::RUN_LENGTH_ENCODED, Type::EXTENSION, Type::INTERVAL_MONTH_DAY_NANO}; } From 7741fe9ef095ea90475533d772d8f8d05be1dbf6 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 00:45:06 +0200 Subject: [PATCH 020/107] remove some unused methods for now --- cpp/src/arrow/array/array_encoded.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 26f46e0f30e58..4ed1c982f4f76 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -56,24 +56,6 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { std::shared_ptr& values_array, std::shared_ptr run_ends_buffer, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - /// \brief Construct a RunLengthEncodedArray from values and run ends arrays - /// - /// The length and data type are automatically inferred from the arguments. - /// They need to be the same length. - static Result> Make(std::shared_ptr& values_array, - std::shared_ptr& run_ends_array, - int64_t null_count = kUnknownNullCount, - int64_t offset = 0); - - const RunLengthEncodedType* encoding_type() const; - const DataType* encoded_type() const; - - // Return a shared pointer in case the requestor desires to share ownership - // with this array. The returned array has its offset, length and null - // count adjusted. - std::shared_ptr values() const; - int64_t* run_ends() const; }; /// @} From 77b565096f808f093417becc186d62f7b982985b Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 01:57:26 +0200 Subject: [PATCH 021/107] add rle_util --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/rle_util.cc | 24 ++++++++ cpp/src/arrow/util/rle_util.h | 85 +++++++++++++++++++++++++++++ cpp/src/arrow/util/rle_util_test.cc | 43 +++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 cpp/src/arrow/util/rle_util.cc create mode 100644 cpp/src/arrow/util/rle_util.h create mode 100644 cpp/src/arrow/util/rle_util_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 04b5498204bfe..bc7a3532c9b7c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -217,6 +217,7 @@ set(ARROW_SRCS util/key_value_metadata.cc util/memory.cc util/mutex.cc + util/rle_util.cc util/string.cc util/string_builder.cc util/task_group.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index cd1a8967eeb19..6746cbc4d3533 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -61,6 +61,7 @@ add_arrow_test(utility-test range_test.cc reflection_test.cc rle_encoding_test.cc + rle_util_test.cc small_vector_test.cc stl_util_test.cc string_test.cc diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc new file mode 100644 index 0000000000000..afcf2a81775fc --- /dev/null +++ b/cpp/src/arrow/util/rle_util.cc @@ -0,0 +1,24 @@ +#include "arrow/util/rle_util.h" +#include +#include "arrow/builder.h" + +namespace arrow { +namespace rle_util { + +int64_t FindPhysicalOffset(const int64_t* accumulated_run_lengths, + int64_t physical_length, int64_t logical_offset) { + auto it = std::upper_bound(accumulated_run_lengths, + accumulated_run_lengths + physical_length, logical_offset); + return std::distance(accumulated_run_lengths, it); +} + +void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { + auto& child = array->child_data[0]; + auto builder = MakeBuilder(child->type).ValueOrDie(); + ARROW_CHECK_OK(builder->AppendNulls(offset)); + ARROW_CHECK_OK(builder->AppendArraySlice(ArraySpan(*child), 0, child->length)); + array->child_data[0] = builder->Finish().ValueOrDie()->Slice(offset)->data(); +} + +} // namespace rle_util +} // namespace arrow diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h new file mode 100644 index 0000000000000..25c227407a2ce --- /dev/null +++ b/cpp/src/arrow/util/rle_util.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +#include "arrow/array/data.h" + +namespace arrow { +namespace rle_util { + +int64_t FindPhysicalOffset(const int64_t* accumulated_run_lengths, + int64_t physical_length, int64_t logical_offset); + +static const int64_t* RunEnds(const ArraySpan& span) { + return span.GetValues(0, /*absolute_offset=*/0); +} + +static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[0]; } + +template +void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { + const int64_t a_physical_offset = + rle_util::FindPhysicalOffset(RunEnds(a), DataArray(a).length, a.offset); + const int64_t b_physical_offset = + rle_util::FindPhysicalOffset(RunEnds(b), DataArray(b).length, b.offset); + + ARROW_DCHECK_EQ(a.length, b.length); + const int64_t logical_length = a.length; + + // run indices from the start of the run ends buffers without offset + int64_t a_run_index = a_physical_offset; + int64_t b_run_index = b_physical_offset; + int64_t logical_position = 0; + + while (logical_position < logical_length) { + // logical indices of the end of the run we are currently in in each input + const int64_t a_run_end = RunEnds(a)[a_run_index] - a.offset; + const int64_t b_run_end = RunEnds(b)[b_run_index] - b.offset; + + ARROW_DCHECK_GT(a_run_end, logical_position); + ARROW_DCHECK_GT(b_run_end, logical_position); + + const int64_t merged_run_end = std::min(a_run_end, b_run_end); + const int64_t merged_run_length = merged_run_end - logical_position; + + // callback to code that wants to work on the data - we give it physical indices + // including all offsets. This includes the additional offset the data array may have. + callback(merged_run_length, a_run_index + DataArray(a).offset, + b_run_index + DataArray(b).offset); + + logical_position = merged_run_end; + if (logical_position == a_run_end) { + a_run_index++; + } + if (logical_position == b_run_end) { + b_run_index++; + } + } +} + +// TODO: this may fit better into some testing header +void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset); + +} // namespace rle_util +} // namespace arrow diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc new file mode 100644 index 0000000000000..73dfef9430162 --- /dev/null +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/rle_util.h" + +namespace arrow { +namespace rle_util { + +TEST(TestRleUtil, FindPhysicalOffsetTest) { + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1}, 1, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 1), 1); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 2), 2); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 3), 3); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 1), 0); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 2), 1); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 3), 2); + ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 4, 6}, 3, 3), 1); +} + +} // namespace rle_util +} // namespace arrow From 3fcddddd50e53e5074e7181764d8fd624c76a71d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Sat, 30 Jul 2022 02:00:14 +0200 Subject: [PATCH 022/107] update the rle_util functions - use int32 - calculate physical offset based on buffer size, instead of incorrectly using the physical size --- cpp/src/arrow/util/rle_util.cc | 7 ++++--- cpp/src/arrow/util/rle_util.h | 8 ++++---- cpp/src/arrow/util/rle_util_test.cc | 21 +++++++++++---------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index afcf2a81775fc..d27ec6770f1af 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -5,10 +5,11 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int64_t* accumulated_run_lengths, - int64_t physical_length, int64_t logical_offset) { +int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths, int64_t buffer_size, + int64_t logical_offset) { auto it = std::upper_bound(accumulated_run_lengths, - accumulated_run_lengths + physical_length, logical_offset); + accumulated_run_lengths + buffer_size / sizeof(int32_t), + logical_offset); return std::distance(accumulated_run_lengths, it); } diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 25c227407a2ce..85a620157075e 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,11 +28,11 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int64_t* accumulated_run_lengths, - int64_t physical_length, int64_t logical_offset); +int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths, + int64_t buffer_size, int64_t logical_offset); -static const int64_t* RunEnds(const ArraySpan& span) { - return span.GetValues(0, /*absolute_offset=*/0); +static const int32_t* RunEnds(const ArraySpan& span) { + return span.GetValues(0, /*absolute_offset=*/0); } static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[0]; } diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 73dfef9430162..fc1ec536b4746 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -27,16 +27,17 @@ namespace arrow { namespace rle_util { TEST(TestRleUtil, FindPhysicalOffsetTest) { - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1}, 1, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 1), 1); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 2), 2); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){1, 2, 3}, 3, 3), 3); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 1), 0); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 2), 1); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 3, 4}, 3, 3), 2); - ASSERT_EQ(FindPhysicalOffset((const int64_t[]){2, 4, 6}, 3, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1}, 1, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 1), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 2), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 3), 3); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 1), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 2), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 3), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3*4, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4, 5}, 3*4+3, 100), 2); } } // namespace rle_util From 3bfef481d8e900feaccfd3e578c3083a560f9bec Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 1 Aug 2022 20:00:26 +0200 Subject: [PATCH 023/107] type: RLE only has one buffer --- cpp/src/arrow/type.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3de2002e5feb6..7e2d2153e780d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1224,8 +1224,7 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType DataTypeLayout layout() const override { // Fixed width layout means fixed size per logical item, which our buffers don't have - return DataTypeLayout( - {DataTypeLayout::VariableWidth(), DataTypeLayout::VariableWidth()}); + return DataTypeLayout({DataTypeLayout::VariableWidth()}); } std::string ToString() const override; From 0f391a41921c12deda78a53ffbbddf57b96f0daa Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 1 Aug 2022 20:15:22 +0200 Subject: [PATCH 024/107] naming --- cpp/src/arrow/util/rle_util.cc | 8 ++++---- cpp/src/arrow/util/rle_util.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index d27ec6770f1af..14077d12e4cc6 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -5,12 +5,12 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths, int64_t buffer_size, +int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, int64_t logical_offset) { - auto it = std::upper_bound(accumulated_run_lengths, - accumulated_run_lengths + buffer_size / sizeof(int32_t), + auto it = std::upper_bound(run_ends, + run_ends + buffer_size / sizeof(int32_t), logical_offset); - return std::distance(accumulated_run_lengths, it); + return std::distance(run_ends, it); } void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 85a620157075e..e10326443e480 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,7 +28,7 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths, +int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, int64_t logical_offset); static const int32_t* RunEnds(const ArraySpan& span) { From 8380e794e14a73d56ef1ba6a7c8312b111bfe284 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 1 Aug 2022 20:51:13 +0200 Subject: [PATCH 025/107] wip: testing for VisitMergedRuns --- cpp/src/arrow/util/rle_util_test.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index fc1ec536b4746..f542f47a3ef8f 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -40,5 +40,20 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4, 5}, 3*4+3, 100), 2); } +TEST(TestRleUtil, VisitMergedRuns) { + /*std::vector left = {5, 10, 5, 5}; + std::vector right = {5, 4, 16};*/ + std::vector merged = {5, 4, 6, 5, 5}; + + size_t merged_position = 0; + + rle_util::VisitMergedRuns() + + + ASSERT_EQ(merged_position, merged.size()); + + +} + } // namespace rle_util } // namespace arrow From 6878b5bcd995e96e2c4130b45d4b49f68986c129 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:07:57 +0200 Subject: [PATCH 026/107] array_encoded: add methods for working with RunLengthEncodedArray --- cpp/src/arrow/array/array_encoded.cc | 32 ++++++++++++++++++++++++++++ cpp/src/arrow/array/array_encoded.h | 23 ++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index b31fd3a34a7b1..0e979946107c4 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/array/array_encoded.h" +#include "arrow/array/util.h" #include "arrow/util/logging.h" namespace arrow { @@ -38,4 +39,35 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty data_->child_data.push_back(std::move(values_array->data())); } +Result> RunLengthEncodedArray::Make( + std::shared_ptr& values_array, std::shared_ptr& run_ends_array, + int64_t logical_length, int64_t null_count, int64_t offset) { + if (run_ends_array->type_id() != Type::INT32) { + return Status::Invalid("Run ends array must be int32 type"); + } + int64_t physical_length = values_array->length(); + if (run_ends_array->null_count() != 0) { + return Status::Invalid("Run ends array cannot contain null values"); + } + + std::shared_ptr run_ends_buffer = SliceBuffer( + run_ends_array->data()->buffers[1], run_ends_array->offset(), physical_length); + + return std::make_shared( + run_length_encoded(values_array->type()), logical_length, values_array, + std::move(run_ends_buffer), null_count, offset); +} + +std::shared_ptr RunLengthEncodedArray::values_array() const { + return MakeArray(data()->child_data[0]); +} + +std::shared_ptr RunLengthEncodedArray::run_ends_buffer() const { + return data()->buffers[0]; +} + +const int32_t* RunLengthEncodedArray::run_ends() const { + return data()->GetValuesSafe(0, 0); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 4ed1c982f4f76..88f93e9f8cc5a 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -56,6 +56,29 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { std::shared_ptr& values_array, std::shared_ptr run_ends_buffer, int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct a RunLengthEncodedArray from values and run ends arrays + /// + /// The data type is automatically inferred from the arguments. + /// The run_ends_array and values_array must be the same length. + static Result> Make( + std::shared_ptr& values_array, std::shared_ptr& run_ends_array, + int64_t logical_length, int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Returns an array holding the values of each run. This function does apply + /// neiher the physical offset to the array + std::shared_ptr values_array() const; + + /// \brief Returns the buffer holding the logical indexes of each run end. This function + /// does apply neiher the physical offset to the start of the buffer nor the logical + /// offset to the values. + std::shared_ptr run_ends_buffer() const; + + /// \brief Returns a pointer to logical indexes of each run end. This function does + /// apply neiher the physical offset to the start of the buffer nor the logical offset + /// to the values. This function does only work if the run ends are stored in a CPU + /// buffer. + const int32_t* run_ends() const; }; /// @} From fe8c4a407b7427d4f22ae02d46839a608af2b24c Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:10:09 +0200 Subject: [PATCH 027/107] add rle array tests --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_encoded_test.cc | 97 +++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 cpp/src/arrow/array/array_encoded_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 04b5498204bfe..43624a10ac123 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -701,6 +701,7 @@ add_arrow_test(array_test array/array_test.cc array/array_binary_test.cc array/array_dict_test.cc + array/array_encoded_test.cc array/array_list_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc new file mode 100644 index 0000000000000..8106822d4514d --- /dev/null +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(string_values, int32_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); +} + +TEST(RunLengthEncodedArray, FromRunEndsAndValues) { + std::shared_ptr rle_array; + + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + ASSERT_EQ(rle_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); + ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); + ASSERT_EQ(rle_array->offset(), 0); + // explicitly access null count variable so it is not calculated automatically + ASSERT_EQ(rle_array->data()->null_count, kUnknownNullCount); + + // explicitly passing offset and null_count + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(string_values, int32_values, 2, 0, 1)); + ASSERT_EQ(rle_array->length(), 2); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); + ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); + ASSERT_EQ(rle_array->offset(), 1); + // explicitly access null count variable so it is not calculated automatically + ASSERT_EQ(rle_array->data()->null_count, 0); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, "Run ends array must be int32 type", + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Run ends array must be int32 type", + RunLengthEncodedArray::Make(int32_values, int32_only_null, 3)); +} + +TEST(RunLengthEncodedArray, Accessors) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(string_values, int32_values, 3)); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); + ASSERT_EQ(rle_array->run_ends_buffer()->data(), + int32_values->data()->buffers[1]->data()); + ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1, 0)); +} + +} // anonymous namespace + +} // namespace arrow From adfcac50f177d90641b736c22006e95f33119d5d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:33:22 +0200 Subject: [PATCH 028/107] make HasValidityBitmap return false for rle --- cpp/src/arrow/type.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 7e2d2153e780d..6156bf77231ec 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2138,6 +2138,7 @@ static inline bool HasValidityBitmap(Type::type id) { case Type::NA: case Type::DENSE_UNION: case Type::SPARSE_UNION: + case Type::RUN_LENGTH_ENCODED: return false; default: return true; From caf19be6021c6a8f409ffa3926456435c550c6b9 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:42:26 +0200 Subject: [PATCH 029/107] make null count of rle arrays always zero --- cpp/src/arrow/array/array_encoded.cc | 12 ++++++------ cpp/src/arrow/array/array_encoded.h | 5 ++--- cpp/src/arrow/array/array_encoded_test.cc | 6 +++--- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 0e979946107c4..07450462376b3 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -33,15 +33,15 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty int64_t length, std::shared_ptr& values_array, std::shared_ptr run_ends_buffer, - int64_t null_count, int64_t offset) { + int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); - SetData(ArrayData::Make(type, length, {run_ends_buffer}, null_count, offset)); + SetData(ArrayData::Make(type, length, {run_ends_buffer}, offset)); data_->child_data.push_back(std::move(values_array->data())); } Result> RunLengthEncodedArray::Make( std::shared_ptr& values_array, std::shared_ptr& run_ends_array, - int64_t logical_length, int64_t null_count, int64_t offset) { + int64_t logical_length, int64_t offset) { if (run_ends_array->type_id() != Type::INT32) { return Status::Invalid("Run ends array must be int32 type"); } @@ -53,9 +53,9 @@ Result> RunLengthEncodedArray::Make( std::shared_ptr run_ends_buffer = SliceBuffer( run_ends_array->data()->buffers[1], run_ends_array->offset(), physical_length); - return std::make_shared( - run_length_encoded(values_array->type()), logical_length, values_array, - std::move(run_ends_buffer), null_count, offset); + return std::make_shared(run_length_encoded(values_array->type()), + logical_length, values_array, + std::move(run_ends_buffer), offset); } std::shared_ptr RunLengthEncodedArray::values_array() const { diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 88f93e9f8cc5a..e3dd64d274d67 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -54,8 +54,7 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, std::shared_ptr& values_array, - std::shared_ptr run_ends_buffer, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); + std::shared_ptr run_ends_buffer, int64_t offset = 0); /// \brief Construct a RunLengthEncodedArray from values and run ends arrays /// @@ -63,7 +62,7 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { /// The run_ends_array and values_array must be the same length. static Result> Make( std::shared_ptr& values_array, std::shared_ptr& run_ends_array, - int64_t logical_length, int64_t null_count = kUnknownNullCount, int64_t offset = 0); + int64_t logical_length, int64_t offset = 0); /// \brief Returns an array holding the values of each run. This function does apply /// neiher the physical offset to the array diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 8106822d4514d..a4f348d47f805 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -64,11 +64,11 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); ASSERT_EQ(rle_array->offset(), 0); // explicitly access null count variable so it is not calculated automatically - ASSERT_EQ(rle_array->data()->null_count, kUnknownNullCount); + ASSERT_EQ(rle_array->data()->null_count, 0); - // explicitly passing offset and null_count + // explicitly passing offset ASSERT_OK_AND_ASSIGN(rle_array, - RunLengthEncodedArray::Make(string_values, int32_values, 2, 0, 1)); + RunLengthEncodedArray::Make(string_values, int32_values, 2, 1)); ASSERT_EQ(rle_array->length(), 2); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); From 0ef01289a922b2c9785d33dee5d3f7deb37efe13 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:46:58 +0200 Subject: [PATCH 030/107] handle rle in GetNumBuffers --- cpp/src/arrow/array/data.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 0cfa9fcd2e11a..01554591b57ca 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -195,6 +195,7 @@ int GetNumBuffers(const DataType& type) { case Type::NA: case Type::STRUCT: case Type::FIXED_SIZE_LIST: + case Type::RUN_LENGTH_ENCODED: return 1; case Type::BINARY: case Type::LARGE_BINARY: From d503b98fcbd0aa5b1e1a24680cdf58c9a04eccca Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:52:32 +0200 Subject: [PATCH 031/107] fix setting offset in RunLengthEncodedArray::Make --- cpp/src/arrow/array/array_encoded.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 07450462376b3..5c446f27a146a 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -35,7 +35,7 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty std::shared_ptr run_ends_buffer, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); - SetData(ArrayData::Make(type, length, {run_ends_buffer}, offset)); + SetData(ArrayData::Make(type, length, {run_ends_buffer}, 0, offset)); data_->child_data.push_back(std::move(values_array->data())); } From 47e186b273654f57b917ffaeb2633a714830cc38 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 2 Aug 2022 23:55:12 +0200 Subject: [PATCH 032/107] fix testing status strings --- cpp/src/arrow/array/array_encoded_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index a4f348d47f805..39011e07b0b5b 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -76,10 +76,10 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { // explicitly access null count variable so it is not calculated automatically ASSERT_EQ(rle_array->data()->null_count, 0); - ASSERT_RAISES_WITH_MESSAGE(Invalid, "Run ends array must be int32 type", + ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array must be int32 type", RunLengthEncodedArray::Make(int32_values, string_values, 3)); ASSERT_RAISES_WITH_MESSAGE( - Invalid, "Run ends array must be int32 type", + Invalid, "Invalid: Run ends array cannot contain null values", RunLengthEncodedArray::Make(int32_values, int32_only_null, 3)); } From 1624e1e304f7db950da097cf0d8ebbb178a79bdd Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 00:27:01 +0200 Subject: [PATCH 033/107] mark constructors as explicit --- cpp/src/arrow/type.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 6156bf77231ec..9aae8cd75e3b9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1204,7 +1204,8 @@ class ARROW_EXPORT DenseUnionType : public UnionType { /// \brief Type class for run-length encoded data class ARROW_EXPORT EncodingType { public: - EncodingType(std::shared_ptr encoded_type) : encoded_type_{encoded_type} {} + explicit EncodingType(std::shared_ptr encoded_type) + : encoded_type_{encoded_type} {} const std::shared_ptr& encoded_type() const { return encoded_type_; } @@ -1219,7 +1220,7 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType static constexpr const char* type_name() { return "run_length_encoded"; } - RunLengthEncodedType(std::shared_ptr encoded_type) + explicit RunLengthEncodedType(std::shared_ptr encoded_type) : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(std::move(encoded_type)) {} DataTypeLayout layout() const override { From b65f2a68b36d9595c1333455aa3b50e38d2b1aa2 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 00:30:56 +0200 Subject: [PATCH 034/107] doxygen: add group for encoded arrays --- cpp/src/arrow/array.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 295d7918cc3ed..7ea95d5e3bf99 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -34,6 +34,10 @@ /// @{ /// @} +/// \defgroup encoded-arrays Concrete classes for encoded arrays +/// @{ +/// @} + #include "arrow/array/array_base.h" // IWYU pragma: keep #include "arrow/array/array_binary.h" // IWYU pragma: keep #include "arrow/array/array_decimal.h" // IWYU pragma: keep From adc4e31bf98bb3ab0906f144077e2ad96a0e012c Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 00:39:40 +0200 Subject: [PATCH 035/107] fix comment --- cpp/src/arrow/array/array_encoded.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index e3dd64d274d67..c5e4542b94fad 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and -// Union +// Array accessor classes run-length encoded arrays #pragma once From c94e1dd4d6d9a9b207cef417308cd2664bf2c7af Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 18:27:57 +0200 Subject: [PATCH 036/107] wip: test for VisitMergedRuns --- cpp/src/arrow/util/rle_util_test.cc | 37 ++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index f542f47a3ef8f..232838981d568 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -40,17 +40,42 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4, 5}, 3*4+3, 100), 2); } + TEST(TestRleUtil, VisitMergedRuns) { - /*std::vector left = {5, 10, 5, 5}; - std::vector right = {5, 4, 16};*/ - std::vector merged = {5, 4, 6, 5, 5}; + const auto left_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 1050]"); + const auto right_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); + const std::vector expected_run_lengths = {5, 4, 6, 5, 5, 25}; + const std::vector expected_left_visits = {5, 4, 6, 5, 5}; + const std::vector expected_right_visits = {5, 4, 6, 5, 5}; + const int32_t left_parent_offset = 1000; + const int32_t left_child_offset = 100; + const int32_t right_parent_offset = 200; + const int32_t right_child_offset = 100; + + std::shared_ptr left_child = std::make_shared(left_child_offset + left_run_ends->length()); + std::shared_ptr right_child = std::make_shared(right_child_offset + right_run_ends->length()); + + left_child = left_child->Slice(left_child_offset); + right_child = right_child->Slice(right_child_offset); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, RunLengthEncodedArray::Make(left_run_ends, left_child, 1026)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, RunLengthEncodedArray::Make(right_run_ends, right_child, 2026)); + + left_array = left_array->Slice(left_parent_offset); + left_array = left_array->Slice(left_parent_offset); - size_t merged_position = 0; + size_t position = 0; - rle_util::VisitMergedRuns() + rle_util::VisitMergedRuns(ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), + [&position, &expected_run_lengths, &expected_left_visits, expected_right_visits](int64_t run_length, int64_t left_index, int64_t right_index) { + ASSERT_EQ(run_length, expected_run_lengths[position]); + ASSERT_EQ(left_index, expected_left_visits[position]); + ASSERT_EQ(run_length, expected_right_visits[position]); + position++; + }); - ASSERT_EQ(merged_position, merged.size()); + ASSERT_EQ(position, expected_run_lengths.size()); } From 25e9e2db9cf2caac458cd4c2650c8ab6a28e724e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 18:29:32 +0200 Subject: [PATCH 037/107] make shared_ptr reference parameters const --- cpp/src/arrow/array/array_encoded.cc | 5 +++-- cpp/src/arrow/array/array_encoded.h | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 5c446f27a146a..6f69073dafd50 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -40,8 +40,9 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty } Result> RunLengthEncodedArray::Make( - std::shared_ptr& values_array, std::shared_ptr& run_ends_array, - int64_t logical_length, int64_t offset) { + const std::shared_ptr& values_array, + const std::shared_ptr& run_ends_array, int64_t logical_length, + int64_t offset) { if (run_ends_array->type_id() != Type::INT32) { return Status::Invalid("Run ends array must be int32 type"); } diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index c5e4542b94fad..26e0fbb233d52 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -52,16 +52,18 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { explicit RunLengthEncodedArray(const std::shared_ptr& data); RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, - std::shared_ptr& values_array, - std::shared_ptr run_ends_buffer, int64_t offset = 0); + const std::shared_ptr& values_array, + const std::shared_ptr run_ends_buffer, + int64_t offset = 0); /// \brief Construct a RunLengthEncodedArray from values and run ends arrays /// /// The data type is automatically inferred from the arguments. /// The run_ends_array and values_array must be the same length. static Result> Make( - std::shared_ptr& values_array, std::shared_ptr& run_ends_array, - int64_t logical_length, int64_t offset = 0); + const std::shared_ptr& values_array, + const std::shared_ptr& run_ends_array, int64_t logical_length, + int64_t offset = 0); /// \brief Returns an array holding the values of each run. This function does apply /// neiher the physical offset to the array From 8260b922adab75cf73f001341918344af7db7835 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 18:32:56 +0200 Subject: [PATCH 038/107] actually fix const reference parameters --- cpp/src/arrow/array/array_encoded.cc | 2 +- cpp/src/arrow/array/array_encoded.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 6f69073dafd50..dbe6791dbbc6c 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -31,7 +31,7 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& d RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, - std::shared_ptr& values_array, + const std::shared_ptr& values_array, std::shared_ptr run_ends_buffer, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 26e0fbb233d52..3dd085c91446e 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -53,8 +53,7 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values_array, - const std::shared_ptr run_ends_buffer, - int64_t offset = 0); + std::shared_ptr run_ends_buffer, int64_t offset = 0); /// \brief Construct a RunLengthEncodedArray from values and run ends arrays /// From 4324f726a3a849eb98ac418f01b1cabea988ec3c Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 20:14:23 +0200 Subject: [PATCH 039/107] type_traits: remove misleading bytes_required method for rle --- cpp/src/arrow/type_traits.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 557ab3edeafaf..8ec8ac08f1b79 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -386,9 +386,6 @@ struct TypeTraits { using ArrayType = RunLengthEncodedArray; // TODO: using BuilderType = RunLengthEncodedBuilder; - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int64_t)); - } constexpr static bool is_parameter_free = false; }; From b318d4332ed45311a737d42236f9d60ec709f693 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 20:19:50 +0200 Subject: [PATCH 040/107] type_internal: stub visitor for rle --- cpp/src/arrow/engine/substrait/type_internal.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 6c65b32e2a95d..a9a01ee036e74 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -323,6 +323,7 @@ struct DataTypeToProtoImpl { Status Visit(const SparseUnionType& t) { return NotImplemented(t); } Status Visit(const DenseUnionType& t) { return NotImplemented(t); } Status Visit(const DictionaryType& t) { return NotImplemented(t); } + Status Visit(const RunLengthEncodedType& t) { return NotImplemented(t); } Status Visit(const MapType& t) { // FIXME assert default field names; custom ones won't roundtrip From 557a058fa95295bc40554429d2c5a0c53daa50f7 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 20:22:26 +0200 Subject: [PATCH 041/107] type: move run ends into child and set children array in constructor --- cpp/src/arrow/type.cc | 6 ++++++ cpp/src/arrow/type.h | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d81cde440e851..dfb19555b9146 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -718,6 +718,12 @@ Result> DenseUnionType::Make( // ---------------------------------------------------------------------- // Run-length encoded type +RunLengthEncodedType::RunLengthEncodedType(std::shared_ptr encoded_type) + : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(std::move(encoded_type)) { + children_ = {std::make_shared("run_ends", int32(), false), + std::make_shared("values", encoded_type, true)}; +} + std::string RunLengthEncodedType::ToString() const { std::stringstream s; s << name() << "<" << encoded_type()->ToString() << ">"; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9aae8cd75e3b9..36c8d72d33293 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1220,12 +1220,11 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType static constexpr const char* type_name() { return "run_length_encoded"; } - explicit RunLengthEncodedType(std::shared_ptr encoded_type) - : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(std::move(encoded_type)) {} + explicit RunLengthEncodedType(std::shared_ptr encoded_type); DataTypeLayout layout() const override { // Fixed width layout means fixed size per logical item, which our buffers don't have - return DataTypeLayout({DataTypeLayout::VariableWidth()}); + return DataTypeLayout({}); } std::string ToString() const override; From e0bf74b21e4f9209e365a48435e93cddd4226f4e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 20:34:36 +0200 Subject: [PATCH 042/107] update RunLengthEncodedArray class --- cpp/src/arrow/array/array_encoded.cc | 21 +++++++-------------- cpp/src/arrow/array/array_encoded.h | 19 ++++++------------- cpp/src/arrow/array/array_encoded_test.cc | 14 ++------------ 3 files changed, 15 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index dbe6791dbbc6c..a5a8d03237055 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -32,10 +32,11 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& d RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values_array, - std::shared_ptr run_ends_buffer, + const std::shared_ptr& run_ends_array, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); - SetData(ArrayData::Make(type, length, {run_ends_buffer}, 0, offset)); + SetData(ArrayData::Make(type, length, {}, 0, offset)); + data_->child_data.push_back(std::move(run_ends_array->data())); data_->child_data.push_back(std::move(values_array->data())); } @@ -46,29 +47,21 @@ Result> RunLengthEncodedArray::Make( if (run_ends_array->type_id() != Type::INT32) { return Status::Invalid("Run ends array must be int32 type"); } - int64_t physical_length = values_array->length(); if (run_ends_array->null_count() != 0) { return Status::Invalid("Run ends array cannot contain null values"); } - std::shared_ptr run_ends_buffer = SliceBuffer( - run_ends_array->data()->buffers[1], run_ends_array->offset(), physical_length); - return std::make_shared(run_length_encoded(values_array->type()), logical_length, values_array, - std::move(run_ends_buffer), offset); + run_ends_array, offset); } std::shared_ptr RunLengthEncodedArray::values_array() const { - return MakeArray(data()->child_data[0]); + return MakeArray(data()->child_data[1]); } -std::shared_ptr RunLengthEncodedArray::run_ends_buffer() const { - return data()->buffers[0]; -} - -const int32_t* RunLengthEncodedArray::run_ends() const { - return data()->GetValuesSafe(0, 0); +std::shared_ptr RunLengthEncodedArray::run_ends_array() const { + return MakeArray(data()->child_data[0]); } } // namespace arrow diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 3dd085c91446e..a4d34d037337c 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -53,7 +53,7 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values_array, - std::shared_ptr run_ends_buffer, int64_t offset = 0); + const std::shared_ptr& run_ends_array, int64_t offset = 0); /// \brief Construct a RunLengthEncodedArray from values and run ends arrays /// @@ -64,20 +64,13 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { const std::shared_ptr& run_ends_array, int64_t logical_length, int64_t offset = 0); - /// \brief Returns an array holding the values of each run. This function does apply - /// neiher the physical offset to the array + /// \brief Returns an array holding the values of each run. This function does apply the + /// physical offset to the array std::shared_ptr values_array() const; - /// \brief Returns the buffer holding the logical indexes of each run end. This function - /// does apply neiher the physical offset to the start of the buffer nor the logical - /// offset to the values. - std::shared_ptr run_ends_buffer() const; - - /// \brief Returns a pointer to logical indexes of each run end. This function does - /// apply neiher the physical offset to the start of the buffer nor the logical offset - /// to the values. This function does only work if the run ends are stored in a CPU - /// buffer. - const int32_t* run_ends() const; + /// \brief Returns an array holding the logical indexes of each run end. This function + /// does apply the physical offset to the array + std::shared_ptr run_ends_array() const; }; /// @} diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 39011e07b0b5b..e47eaa342e3a8 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -61,9 +61,8 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(int32_values, int32_values, 3)); ASSERT_EQ(rle_array->length(), 3); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); - ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); ASSERT_EQ(rle_array->offset(), 0); - // explicitly access null count variable so it is not calculated automatically ASSERT_EQ(rle_array->data()->null_count, 0); // explicitly passing offset @@ -71,7 +70,7 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(string_values, int32_values, 2, 1)); ASSERT_EQ(rle_array->length(), 2); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); - ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1)); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); ASSERT_EQ(rle_array->offset(), 1); // explicitly access null count variable so it is not calculated automatically ASSERT_EQ(rle_array->data()->null_count, 0); @@ -83,15 +82,6 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(int32_values, int32_only_null, 3)); } -TEST(RunLengthEncodedArray, Accessors) { - ASSERT_OK_AND_ASSIGN(auto rle_array, - RunLengthEncodedArray::Make(string_values, int32_values, 3)); - ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); - ASSERT_EQ(rle_array->run_ends_buffer()->data(), - int32_values->data()->buffers[1]->data()); - ASSERT_EQ(rle_array->run_ends(), int32_values->data()->GetValues(1, 0)); -} - } // anonymous namespace } // namespace arrow From 2565e2f49cbad5e8f426cb8ea22b6340c7f89ee8 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 20:44:50 +0200 Subject: [PATCH 043/107] pandas: correctly detect rle type as not supported --- cpp/src/arrow/python/arrow_to_pandas.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 8f9d1cb45b983..19ba688f2831f 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -1197,6 +1197,7 @@ struct ObjectWriterVisitor { enable_if_t::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || From d8623d7ec8f1dbd38d0c2c08ce643f7019ae9a6b Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:04:45 +0200 Subject: [PATCH 044/107] fix copy-paste error in VisitMergedRuns test --- cpp/src/arrow/util/rle_util_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 232838981d568..fb8f78f3650ce 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -62,7 +62,7 @@ TEST(TestRleUtil, VisitMergedRuns) { ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, RunLengthEncodedArray::Make(right_run_ends, right_child, 2026)); left_array = left_array->Slice(left_parent_offset); - left_array = left_array->Slice(left_parent_offset); + right_array = right_array->Slice(right_parent_offset); size_t position = 0; From 23ca398a250e6a655991cf374c56b5429a186ca5 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:05:33 +0200 Subject: [PATCH 045/107] remove invalid testcase in FindPhysicalOffset test --- cpp/src/arrow/util/rle_util_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index fb8f78f3650ce..02c5be400b243 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -37,7 +37,6 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 2), 1); ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 3), 2); ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3*4, 3), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4, 5}, 3*4+3, 100), 2); } From 13b49ac92470e8eae5116314bf249470e2c90487 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:11:24 +0200 Subject: [PATCH 046/107] order RunLengthEncodedArray arguments like the child arrays in format --- cpp/src/arrow/array/array_encoded.cc | 7 +++---- cpp/src/arrow/array/array_encoded.h | 8 ++++---- cpp/src/arrow/array/array_encoded_test.cc | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index a5a8d03237055..d6e0e8802db9f 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -31,8 +31,8 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& d RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& values_array, const std::shared_ptr& run_ends_array, + const std::shared_ptr& values_array, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); SetData(ArrayData::Make(type, length, {}, 0, offset)); @@ -41,9 +41,8 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty } Result> RunLengthEncodedArray::Make( - const std::shared_ptr& values_array, - const std::shared_ptr& run_ends_array, int64_t logical_length, - int64_t offset) { + const std::shared_ptr& run_ends_array, + const std::shared_ptr& values_array, int64_t logical_length, int64_t offset) { if (run_ends_array->type_id() != Type::INT32) { return Status::Invalid("Run ends array must be int32 type"); } diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index a4d34d037337c..1e31b69c2d728 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -52,16 +52,16 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { explicit RunLengthEncodedArray(const std::shared_ptr& data); RunLengthEncodedArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& values_array, - const std::shared_ptr& run_ends_array, int64_t offset = 0); + const std::shared_ptr& run_ends_array, + const std::shared_ptr& values_array, int64_t offset = 0); /// \brief Construct a RunLengthEncodedArray from values and run ends arrays /// /// The data type is automatically inferred from the arguments. /// The run_ends_array and values_array must be the same length. static Result> Make( - const std::shared_ptr& values_array, - const std::shared_ptr& run_ends_array, int64_t logical_length, + const std::shared_ptr& run_ends_array, + const std::shared_ptr& values_array, int64_t logical_length, int64_t offset = 0); /// \brief Returns an array holding the values of each run. This function does apply the diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index e47eaa342e3a8..925dd1d04d89c 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -46,7 +46,7 @@ auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); TEST(RunLengthEncodedArray, MakeArray) { ASSERT_OK_AND_ASSIGN(auto rle_array, - RunLengthEncodedArray::Make(string_values, int32_values, 3)); + RunLengthEncodedArray::Make(int32_values, string_values, 3)); auto array_data = rle_array->data(); auto new_array = MakeArray(array_data); ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); @@ -67,7 +67,7 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { // explicitly passing offset ASSERT_OK_AND_ASSIGN(rle_array, - RunLengthEncodedArray::Make(string_values, int32_values, 2, 1)); + RunLengthEncodedArray::Make(int32_values, string_values, 2, 1)); ASSERT_EQ(rle_array->length(), 2); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); From d81cc30e49c2259ccdcf24fb60c2e04d0e4bfeb1 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:19:09 +0200 Subject: [PATCH 047/107] hopefully fix compiling parquet --- cpp/src/parquet/arrow/path_internal.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 8002f13e79941..2a6be6f889cd0 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -802,6 +802,10 @@ class PathBuilder { return Status::OK(); } + Status Visit(const ::arrow::StructArray& array) { + return Status::NotImplemented("arrow rle array in Parquet"); + } + Status Visit(const ::arrow::FixedSizeListArray& array) { MaybeAddNullable(array); int32_t list_size = array.list_type()->list_size(); From 587860f5a05e4614ee091d1b003c80f27a965924 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:21:15 +0200 Subject: [PATCH 048/107] fix test --- cpp/src/arrow/array/array_encoded_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 925dd1d04d89c..adc27a562ef55 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -76,7 +76,7 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { ASSERT_EQ(rle_array->data()->null_count, 0); ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array must be int32 type", - RunLengthEncodedArray::Make(int32_values, string_values, 3)); + RunLengthEncodedArray::Make(string_values, int32_values, 3)); ASSERT_RAISES_WITH_MESSAGE( Invalid, "Invalid: Run ends array cannot contain null values", RunLengthEncodedArray::Make(int32_values, int32_only_null, 3)); From 634eafe30785c92f97e55b00c4ff1b2197b431af Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:29:28 +0200 Subject: [PATCH 049/107] fix RunLengthEncodedArray constructor calls --- cpp/src/arrow/array/array_encoded.cc | 4 ++-- cpp/src/arrow/array/array_encoded_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index d6e0e8802db9f..e5b068ea045b3 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -51,8 +51,8 @@ Result> RunLengthEncodedArray::Make( } return std::make_shared(run_length_encoded(values_array->type()), - logical_length, values_array, - run_ends_array, offset); + logical_length, run_ends_array, values_array, + offset); } std::shared_ptr RunLengthEncodedArray::values_array() const { diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index adc27a562ef55..8b295f6ce2935 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -79,7 +79,7 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(string_values, int32_values, 3)); ASSERT_RAISES_WITH_MESSAGE( Invalid, "Invalid: Run ends array cannot contain null values", - RunLengthEncodedArray::Make(int32_values, int32_only_null, 3)); + RunLengthEncodedArray::Make(int32_only_null, int32_values, 3)); } } // anonymous namespace From 60527e0d814bc2432adebe9ffbad877967831e5e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:38:03 +0200 Subject: [PATCH 050/107] rle_util_test: formatting --- cpp/src/arrow/util/rle_util_test.cc | 56 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 02c5be400b243..81631594a30de 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -28,21 +28,22 @@ namespace rle_util { TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1}, 1, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 1), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 2), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3*4, 3), 3); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 1), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 2), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3*4, 3), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3*4, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 1), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 2), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 3), 3); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 1), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 2), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 3), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3 * 4, 3), 1); } - TEST(TestRleUtil, VisitMergedRuns) { - const auto left_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 1050]"); - const auto right_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); + const auto left_run_ends = ArrayFromJSON( + int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 1050]"); + const auto right_run_ends = + ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); const std::vector expected_run_lengths = {5, 4, 6, 5, 5, 25}; const std::vector expected_left_visits = {5, 4, 6, 5, 5}; const std::vector expected_right_visits = {5, 4, 6, 5, 5}; @@ -51,32 +52,35 @@ TEST(TestRleUtil, VisitMergedRuns) { const int32_t right_parent_offset = 200; const int32_t right_child_offset = 100; - std::shared_ptr left_child = std::make_shared(left_child_offset + left_run_ends->length()); - std::shared_ptr right_child = std::make_shared(right_child_offset + right_run_ends->length()); + std::shared_ptr left_child = + std::make_shared(left_child_offset + left_run_ends->length()); + std::shared_ptr right_child = + std::make_shared(right_child_offset + right_run_ends->length()); left_child = left_child->Slice(left_child_offset); right_child = right_child->Slice(right_child_offset); - ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, RunLengthEncodedArray::Make(left_run_ends, left_child, 1026)); - ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, RunLengthEncodedArray::Make(right_run_ends, right_child, 2026)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, + RunLengthEncodedArray::Make(left_run_ends, left_child, 1026)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, + RunLengthEncodedArray::Make(right_run_ends, right_child, 2026)); left_array = left_array->Slice(left_parent_offset); right_array = right_array->Slice(right_parent_offset); size_t position = 0; - rle_util::VisitMergedRuns(ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), - [&position, &expected_run_lengths, &expected_left_visits, expected_right_visits](int64_t run_length, int64_t left_index, int64_t right_index) { - ASSERT_EQ(run_length, expected_run_lengths[position]); - ASSERT_EQ(left_index, expected_left_visits[position]); - ASSERT_EQ(run_length, expected_right_visits[position]); - position++; - }); - + rle_util::VisitMergedRuns( + ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), + [&position, &expected_run_lengths, &expected_left_visits, expected_right_visits]( + int64_t run_length, int64_t left_index, int64_t right_index) { + ASSERT_EQ(run_length, expected_run_lengths[position]); + ASSERT_EQ(left_index, expected_left_visits[position]); + ASSERT_EQ(run_length, expected_right_visits[position]); + position++; + }); ASSERT_EQ(position, expected_run_lengths.size()); - - } } // namespace rle_util From aaab59c9495ef2c170012296d59002a4b4f4327e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:38:48 +0200 Subject: [PATCH 051/107] update rle utilitties for new format --- cpp/src/arrow/util/rle_util.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index e10326443e480..6603efa6fd79b 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,14 +28,16 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* run_ends, - int64_t buffer_size, int64_t logical_offset); +int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, + int64_t logical_offset); + +static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } static const int32_t* RunEnds(const ArraySpan& span) { - return span.GetValues(0, /*absolute_offset=*/0); + return RunEndsArray(span).GetValues(1); } -static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[0]; } +static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[1]; } template void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { From 117da6f428000c88ea851b92131f9611603a2ed8 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 3 Aug 2022 22:39:33 +0200 Subject: [PATCH 052/107] rle_util: formatting --- cpp/src/arrow/util/rle_util.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index 14077d12e4cc6..34e6649782ed0 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -7,8 +7,7 @@ namespace rle_util { int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, int64_t logical_offset) { - auto it = std::upper_bound(run_ends, - run_ends + buffer_size / sizeof(int32_t), + auto it = std::upper_bound(run_ends, run_ends + buffer_size / sizeof(int32_t), logical_offset); return std::distance(run_ends, it); } From 30d7d672edc010b9e4943453ca2378276539438d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 16:48:16 +0200 Subject: [PATCH 053/107] give rle type one buffer since I found examples of code assuming one --- cpp/src/arrow/array/array_encoded.cc | 6 +++--- cpp/src/arrow/array/array_encoded_test.cc | 3 +++ cpp/src/arrow/type.h | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index e5b068ea045b3..624bf8cc9ba33 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -35,7 +35,7 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty const std::shared_ptr& values_array, int64_t offset) { ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); - SetData(ArrayData::Make(type, length, {}, 0, offset)); + SetData(ArrayData::Make(type, length, {NULLPTR}, 0, offset)); data_->child_data.push_back(std::move(run_ends_array->data())); data_->child_data.push_back(std::move(values_array->data())); } @@ -51,8 +51,8 @@ Result> RunLengthEncodedArray::Make( } return std::make_shared(run_length_encoded(values_array->type()), - logical_length, run_ends_array, values_array, - offset); + logical_length, run_ends_array, + values_array, offset); } std::shared_ptr RunLengthEncodedArray::values_array() const { diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 8b295f6ce2935..8a476ee560ff7 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -52,6 +52,7 @@ TEST(RunLengthEncodedArray, MakeArray) { ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); // should be the exact same ArrayData object ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); } TEST(RunLengthEncodedArray, FromRunEndsAndValues) { @@ -64,6 +65,8 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); ASSERT_EQ(rle_array->offset(), 0); ASSERT_EQ(rle_array->data()->null_count, 0); + // one dummy buffer, since code may assume there is exactly one buffer + ASSERT_EQ(rle_array->data()->buffers.size(), 1); // explicitly passing offset ASSERT_OK_AND_ASSIGN(rle_array, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 36c8d72d33293..c8de369879314 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1223,8 +1223,9 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType explicit RunLengthEncodedType(std::shared_ptr encoded_type); DataTypeLayout layout() const override { - // Fixed width layout means fixed size per logical item, which our buffers don't have - return DataTypeLayout({}); + // always add one that is NULLPTR to make code, since existing code may assume there is at least + // one buffer + return DataTypeLayout({DataTypeLayout::AlwaysNull()}); } std::string ToString() const override; From 81520bcee6bb5c53dc3d66d90fb97011f912e89f Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 16:50:17 +0200 Subject: [PATCH 054/107] formatting --- cpp/src/arrow/type.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index c8de369879314..9086a8ba478be 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1223,8 +1223,8 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType explicit RunLengthEncodedType(std::shared_ptr encoded_type); DataTypeLayout layout() const override { - // always add one that is NULLPTR to make code, since existing code may assume there is at least - // one buffer + // always add one that is NULLPTR to make code, since existing code may assume there + // is at least one buffer return DataTypeLayout({DataTypeLayout::AlwaysNull()}); } From 05e98c6b810d10bc805c121b4862ae8b2395bf56 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 16:53:39 +0200 Subject: [PATCH 055/107] fix comment --- cpp/src/arrow/type.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9086a8ba478be..1d2b60e69e3e2 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1201,7 +1201,9 @@ class ARROW_EXPORT DenseUnionType : public UnionType { std::string name() const override { return "dense_union"; } }; -/// \brief Type class for run-length encoded data +/// \brief Type class class that can be subclassed by all encoding types, that allowes +/// users to check if arrays are compatible besides the encoding independent of which +/// exact encoding they use class ARROW_EXPORT EncodingType { public: explicit EncodingType(std::shared_ptr encoded_type) From c78fbf31ab4e7a17b48c292b27f3c62f3952ebcf Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 18:15:56 +0200 Subject: [PATCH 056/107] rle_util: fixes and make FindPhysicalOffset take element count instread of bytes --- cpp/src/arrow/util/rle_util.cc | 9 ++++---- cpp/src/arrow/util/rle_util.h | 6 ++--- cpp/src/arrow/util/rle_util_test.cc | 36 +++++++++++++++++++---------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index 34e6649782ed0..e578513e3c0a5 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -5,11 +5,12 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, +int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset) { - auto it = std::upper_bound(run_ends, run_ends + buffer_size / sizeof(int32_t), - logical_offset); - return std::distance(run_ends, it); + auto it = std::upper_bound(run_ends, run_ends + num_run_ends, logical_offset); + int64_t result = std::distance(run_ends, it); + ARROW_DCHECK_LT(result, num_run_ends); + return result; } void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 6603efa6fd79b..fd5793aa124b2 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,7 +28,7 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t buffer_size, +int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset); static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } @@ -42,9 +42,9 @@ static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_dat template void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { const int64_t a_physical_offset = - rle_util::FindPhysicalOffset(RunEnds(a), DataArray(a).length, a.offset); + rle_util::FindPhysicalOffset(RunEnds(a), RunEndsArray(a).length, a.offset); const int64_t b_physical_offset = - rle_util::FindPhysicalOffset(RunEnds(b), DataArray(b).length, b.offset); + rle_util::FindPhysicalOffset(RunEnds(b), RunEndsArray(b).length, b.offset); ARROW_DCHECK_EQ(a.length, b.length); const int64_t logical_length = a.length; diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 81631594a30de..d8a7eb8ffb6c2 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -37,6 +37,10 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 2), 1); ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 3), 2); ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3 * 4, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, + 1015, 1020, 1025, 1050}, + 15 * 4, 1000), + 10); } TEST(TestRleUtil, VisitMergedRuns) { @@ -45,12 +49,12 @@ TEST(TestRleUtil, VisitMergedRuns) { const auto right_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); const std::vector expected_run_lengths = {5, 4, 6, 5, 5, 25}; - const std::vector expected_left_visits = {5, 4, 6, 5, 5}; - const std::vector expected_right_visits = {5, 4, 6, 5, 5}; + const std::vector expected_left_visits = {110, 111, 111, 112, 113, 114}; + const std::vector expected_right_visits = {205, 206, 207, 207, 207, 208}; const int32_t left_parent_offset = 1000; const int32_t left_child_offset = 100; - const int32_t right_parent_offset = 200; - const int32_t right_child_offset = 100; + const int32_t right_parent_offset = 2000; + const int32_t right_child_offset = 200; std::shared_ptr left_child = std::make_shared(left_child_offset + left_run_ends->length()); @@ -61,22 +65,30 @@ TEST(TestRleUtil, VisitMergedRuns) { right_child = right_child->Slice(right_child_offset); ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, - RunLengthEncodedArray::Make(left_run_ends, left_child, 1026)); + RunLengthEncodedArray::Make(left_run_ends, left_child, 1050)); ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, - RunLengthEncodedArray::Make(right_run_ends, right_child, 2026)); + RunLengthEncodedArray::Make(right_run_ends, right_child, 2050)); left_array = left_array->Slice(left_parent_offset); right_array = right_array->Slice(right_parent_offset); size_t position = 0; - rle_util::VisitMergedRuns( - ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), - [&position, &expected_run_lengths, &expected_left_visits, expected_right_visits]( - int64_t run_length, int64_t left_index, int64_t right_index) { - ASSERT_EQ(run_length, expected_run_lengths[position]); + std::cout << left_array << std::endl; + std::cout << left_array->data() << std::endl; + std::cout << left_array->data()->length << std::endl; + std::cout << left_array->data()->buffers.data() << std::endl; + + auto arrspan = ArraySpan(*left_array->data()); + + std::cout << position << std::endl; + + rle_util:: + VisitMergedRuns(ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), [&position /*, &expected_run_lengths, &expected_left_visits, &expected_right_visits*/](int64_t run_length, int64_t left_index, int64_t right_index) { + std::cout << position << std::endl; + /*ASSERT_EQ(run_length, expected_run_lengths[position]); ASSERT_EQ(left_index, expected_left_visits[position]); - ASSERT_EQ(run_length, expected_right_visits[position]); + ASSERT_EQ(run_length, expected_right_visits[position]);*/ position++; }); From f032fbc8b59724dbfd8f64bfb66e50977353ca23 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 18:35:45 +0200 Subject: [PATCH 057/107] rle_util tests fixed --- cpp/src/arrow/util/rle_util_test.cc | 39 +++++++++++------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index d8a7eb8ffb6c2..f65ba946c9346 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -28,18 +28,17 @@ namespace rle_util { TEST(TestRleUtil, FindPhysicalOffsetTest) { ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1}, 1, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 1), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 2), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3 * 4, 3), 3); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 1), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 2), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3 * 4, 3), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3 * 4, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 1), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 2), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 1), 0); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 2), 1); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 3), 2); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 3), 1); ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 1050}, - 15 * 4, 1000), + 15, 1000), 10); } @@ -74,21 +73,13 @@ TEST(TestRleUtil, VisitMergedRuns) { size_t position = 0; - std::cout << left_array << std::endl; - std::cout << left_array->data() << std::endl; - std::cout << left_array->data()->length << std::endl; - std::cout << left_array->data()->buffers.data() << std::endl; - - auto arrspan = ArraySpan(*left_array->data()); - - std::cout << position << std::endl; - - rle_util:: - VisitMergedRuns(ArraySpan(*left_array->data()), ArraySpan(*left_array->data()), [&position /*, &expected_run_lengths, &expected_left_visits, &expected_right_visits*/](int64_t run_length, int64_t left_index, int64_t right_index) { - std::cout << position << std::endl; - /*ASSERT_EQ(run_length, expected_run_lengths[position]); + rle_util::VisitMergedRuns( + ArraySpan(*left_array->data()), ArraySpan(*right_array->data()), + [&position, &expected_run_lengths, &expected_left_visits, &expected_right_visits]( + int64_t run_length, int64_t left_index, int64_t right_index) { + ASSERT_EQ(run_length, expected_run_lengths[position]); ASSERT_EQ(left_index, expected_left_visits[position]); - ASSERT_EQ(run_length, expected_right_visits[position]);*/ + ASSERT_EQ(right_index, expected_right_visits[position]); position++; }); From 9775fb8252979fa99520e6729b59199c60e8762e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 18:51:36 +0200 Subject: [PATCH 058/107] stub rle in another visitor in parquet --- cpp/src/parquet/column_writer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f7898c02d479d..fa71d7dbfb546 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -128,6 +128,7 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Struct); NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); + NOT_IMPLEMENTED_VISIT(RunLengthEncoded); NOT_IMPLEMENTED_VISIT(Extension); #undef NOT_IMPLEMENTED_VISIT From ffdfb1cc49b7f862b0f9ede0c14bedebe99f5285 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 20:53:59 +0200 Subject: [PATCH 059/107] rle_util: add comments --- cpp/src/arrow/util/rle_util.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index fd5793aa124b2..06a9ae58061d0 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,17 +28,27 @@ namespace arrow { namespace rle_util { +/// \brief Get the child array holding the data values from an RLE array int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset); +/// \brief Get the child array holding the data values from an RLE array static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } +/// \brief Get a pointer to run ends values of an static const int32_t* RunEnds(const ArraySpan& span) { return RunEndsArray(span).GetValues(1); } +/// \brief Get the child array holding the data values from an RLE array static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[1]; } +/// \brief Iterate over two run-length encoded arrays in segments of runs that are inside run +/// boundaries in each input. A callback is called on each of these segments +/// \param[in] a first input as ArraySpan +/// \param[in] b second input as ArraySpan +/// \param[in] callback taking 3 int64_t arguments: the length of the current run segment, and a +/// phyiscal index into the data array arrays of a and b. Offsets are already applied. template void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { const int64_t a_physical_offset = From 8bb5828045a26501bb55229ef1dd4d817920a6d1 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 21:19:12 +0200 Subject: [PATCH 060/107] fix comments --- cpp/src/arrow/util/rle_util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 06a9ae58061d0..0e79ddf4ba7b0 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,14 +28,14 @@ namespace arrow { namespace rle_util { -/// \brief Get the child array holding the data values from an RLE array +/// \brief Get the physical offset from a logical offset given run end values int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset); /// \brief Get the child array holding the data values from an RLE array static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } -/// \brief Get a pointer to run ends values of an +/// \brief Get a pointer to run ends values of an RLE array static const int32_t* RunEnds(const ArraySpan& span) { return RunEndsArray(span).GetValues(1); } From d2731a05526422b177acbe98a5d796c8cbd52865 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 23:11:01 +0200 Subject: [PATCH 061/107] builder_base: use VisitScalarTypeInline --- cpp/src/arrow/array/builder_base.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index ff37cee5ba1ee..25f5e710eb6b5 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -279,7 +279,7 @@ struct AppendScalarImpl { return Status::NotImplemented("AppendScalar for type ", type); } - Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); } + Status Convert() { return VisitScalarTypeInline(*(*scalars_begin_)->type, this); } const std::shared_ptr* scalars_begin_; const std::shared_ptr* scalars_end_; From ff3ab7c25cf95cf4bd8037de9164a08dcbba3eed Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 23:14:36 +0200 Subject: [PATCH 062/107] remove no longer used visitor stub --- cpp/src/arrow/array/builder_base.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 25f5e710eb6b5..387b1ff39b721 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -275,10 +275,6 @@ struct AppendScalarImpl { return Status::OK(); } - Status Visit(const DataType& type) { - return Status::NotImplemented("AppendScalar for type ", type); - } - Status Convert() { return VisitScalarTypeInline(*(*scalars_begin_)->type, this); } const std::shared_ptr* scalars_begin_; From 1b646b6ee4b9b70fc92d283ddd3d50cfd1d6c12a Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 4 Aug 2022 23:38:16 +0200 Subject: [PATCH 063/107] formatting --- cpp/src/arrow/util/rle_util.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 0e79ddf4ba7b0..f2c49f8d7d088 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -43,12 +43,13 @@ static const int32_t* RunEnds(const ArraySpan& span) { /// \brief Get the child array holding the data values from an RLE array static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[1]; } -/// \brief Iterate over two run-length encoded arrays in segments of runs that are inside run -/// boundaries in each input. A callback is called on each of these segments +/// \brief Iterate over two run-length encoded arrays in segments of runs that are inside +/// run boundaries in each input. A callback is called on each of these segments /// \param[in] a first input as ArraySpan /// \param[in] b second input as ArraySpan -/// \param[in] callback taking 3 int64_t arguments: the length of the current run segment, and a -/// phyiscal index into the data array arrays of a and b. Offsets are already applied. +/// \param[in] callback taking 3 int64_t arguments: the length of the current run segment, +/// and a phyiscal index into the data array arrays of a and b. Offsets are already +/// applied. template void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { const int64_t a_physical_offset = From af3e625771c82c45c241a645b4d7f7076797f92a Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 00:08:55 +0200 Subject: [PATCH 064/107] Revert "remove no longer used visitor stub" This reverts commit ff3ab7c25cf95cf4bd8037de9164a08dcbba3eed. --- cpp/src/arrow/array/builder_base.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 387b1ff39b721..25f5e710eb6b5 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -275,6 +275,10 @@ struct AppendScalarImpl { return Status::OK(); } + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + Status Convert() { return VisitScalarTypeInline(*(*scalars_begin_)->type, this); } const std::shared_ptr* scalars_begin_; From 71e7bde7efa6c692eb9d84f20264a584db193590 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 00:18:01 +0200 Subject: [PATCH 065/107] add more user friendly methods to get physical offset and length --- cpp/src/arrow/array/array_encoded.cc | 11 +++++++++++ cpp/src/arrow/array/array_encoded.h | 9 +++++++++ cpp/src/arrow/util/rle_util.cc | 11 +++++++++++ cpp/src/arrow/util/rle_util.h | 12 +++++++++++- 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 624bf8cc9ba33..2fb56383127d9 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -18,6 +18,7 @@ #include "arrow/array/array_encoded.h" #include "arrow/array/util.h" #include "arrow/util/logging.h" +#include "arrow/util/rle_util.h" namespace arrow { @@ -63,4 +64,14 @@ std::shared_ptr RunLengthEncodedArray::run_ends_array() const { return MakeArray(data()->child_data[0]); } +int64_t RunLengthEncodedArray::GetPhysicalOffset() const { + const ArraySpan span(*this->data_); + return rle_util::GetPhysicalOffset(span); +} + +int64_t RunLengthEncodedArray::GetPhysicalLength() const { + const ArraySpan span(*this->data_); + return rle_util::GetPhysicalLength(span); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_encoded.h b/cpp/src/arrow/array/array_encoded.h index 1e31b69c2d728..b7cc3479a2fab 100644 --- a/cpp/src/arrow/array/array_encoded.h +++ b/cpp/src/arrow/array/array_encoded.h @@ -71,6 +71,15 @@ class ARROW_EXPORT RunLengthEncodedArray : public Array { /// \brief Returns an array holding the logical indexes of each run end. This function /// does apply the physical offset to the array std::shared_ptr run_ends_array() const; + + /// \brief Get the physical offset of the RLE array. Warning: calling this may result in + /// in an O(log(N)) binary search on the run ends buffer + int64_t GetPhysicalOffset() const; + + /// \brief Get the physical offset of the RLE array. Avoid calling this method in a + /// context where you can easily calculate the value yourself. Calling this can result + /// in an O(log(N)) binary search on the run ends buffer + int64_t GetPhysicalLength() const; }; /// @} diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index e578513e3c0a5..cec795db91164 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -21,5 +21,16 @@ void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { array->child_data[0] = builder->Finish().ValueOrDie()->Slice(offset)->data(); } +int64_t GetPhysicalOffset(const ArraySpan& span) { + // TODO: caching + return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); +} + +int64_t GetPhysicalLength(const ArraySpan& span) { + // TODO: caching + return FindPhysicalOffset(RunEnds(span) + GetPhysicalOffset(span), + RunEndsArray(span).length, span.offset); +} + } // namespace rle_util } // namespace arrow diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index f2c49f8d7d088..6325a9d20e44e 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -28,10 +28,20 @@ namespace arrow { namespace rle_util { -/// \brief Get the physical offset from a logical offset given run end values +/// \brief Get the physical offset from a logical offset given run end values using binary +/// search int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset); +/// \brief Get the physical offset of an RLE ArraySpan. Warning: calling this may result +/// in in an O(log(N)) binary search on the run ends buffer +int64_t GetPhysicalOffset(const ArraySpan& span); + +/// \brief Get the physical length of an RLE ArraySpan. Avoid calling this method in a +/// context where you can easily calculate the value yourself. Calling this can result in +/// an O(log(N)) binary search on the run ends buffer +int64_t GetPhysicalLength(const ArraySpan& span); + /// \brief Get the child array holding the data values from an RLE array static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } From ec1e004a7557ff7860bbfb8d8841ca8c5db3d2bb Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 00:24:01 +0200 Subject: [PATCH 066/107] add test --- cpp/src/arrow/array/array_encoded_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 8a476ee560ff7..d6650fc8ae8b6 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -85,6 +85,16 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(int32_only_null, int32_values, 3)); } +TEST(RunLengthEncodedArray, OffsetLength) { + auto run_ends = ArrayFromJSON(int32(), "[1000000, 2000000, 3000000, 4000000, 5000000]"); + auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "RLE"])"); + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(run_ends, values, 50000)); + auto slice = std::dynamic_pointer_cast(rle_array->Slice(1999999, 5)); + ASSERT_EQ(slice->GetPhysicalLength(), 2); + ASSERT_EQ(slice->GetPhysicalOffset(), 1); +} + } // anonymous namespace } // namespace arrow From 63b5ce25548bf1636dbfc4ecf9b491d62524c277 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 00:47:57 +0200 Subject: [PATCH 067/107] fix GetPhysicalLength method --- cpp/src/arrow/util/rle_util.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index cec795db91164..f5437aef5ec87 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -28,8 +28,14 @@ int64_t GetPhysicalOffset(const ArraySpan& span) { int64_t GetPhysicalLength(const ArraySpan& span) { // TODO: caching - return FindPhysicalOffset(RunEnds(span) + GetPhysicalOffset(span), - RunEndsArray(span).length, span.offset); + if (span.length == 0) { + return 0; + } else { + // find the offset of the last element and add 1 + return FindPhysicalOffset(RunEnds(span) + GetPhysicalOffset(span), + RunEndsArray(span).length, span.offset + span.length - 1) + + 1; + } } } // namespace rle_util From 1d31bdda8b59f64d418650fed493a74b56dd3c75 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 00:48:33 +0200 Subject: [PATCH 068/107] better test for physical offset/length --- cpp/src/arrow/array/array_encoded_test.cc | 25 ++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index d6650fc8ae8b6..4ae365158dd26 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -86,13 +86,32 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { } TEST(RunLengthEncodedArray, OffsetLength) { - auto run_ends = ArrayFromJSON(int32(), "[1000000, 2000000, 3000000, 4000000, 5000000]"); + auto run_ends = ArrayFromJSON(int32(), "[100, 200, 300, 400, 500]"); auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "RLE"])"); ASSERT_OK_AND_ASSIGN(auto rle_array, - RunLengthEncodedArray::Make(run_ends, values, 50000)); - auto slice = std::dynamic_pointer_cast(rle_array->Slice(1999999, 5)); + RunLengthEncodedArray::Make(run_ends, values, 500)); + + ASSERT_EQ(rle_array->GetPhysicalLength(), 5); + ASSERT_EQ(rle_array->GetPhysicalOffset(), 0); + + auto slice = std::dynamic_pointer_cast(rle_array->Slice(199, 5)); ASSERT_EQ(slice->GetPhysicalLength(), 2); ASSERT_EQ(slice->GetPhysicalOffset(), 1); + + auto slice2 = + std::dynamic_pointer_cast(rle_array->Slice(199, 101)); + ASSERT_EQ(slice2->GetPhysicalLength(), 2); + ASSERT_EQ(slice2->GetPhysicalOffset(), 1); + + auto slice3 = + std::dynamic_pointer_cast(rle_array->Slice(100, 200)); + ASSERT_EQ(slice3->GetPhysicalLength(), 2); + ASSERT_EQ(slice3->GetPhysicalOffset(), 1); + + auto slice4 = + std::dynamic_pointer_cast(rle_array->Slice(0, 150)); + ASSERT_EQ(slice4->GetPhysicalLength(), 2); + ASSERT_EQ(slice4->GetPhysicalOffset(), 0); } } // anonymous namespace From 6f58c4b20cfb334b476cf8152dc703fccc78bac4 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 02:14:44 +0200 Subject: [PATCH 069/107] VisitMergedRuns test: test inverted case --- cpp/src/arrow/util/rle_util_test.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index f65ba946c9346..be2c0a3efadcf 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -44,7 +44,7 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { TEST(TestRleUtil, VisitMergedRuns) { const auto left_run_ends = ArrayFromJSON( - int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 1050]"); + int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); const auto right_run_ends = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); const std::vector expected_run_lengths = {5, 4, 6, 5, 5, 25}; @@ -60,7 +60,7 @@ TEST(TestRleUtil, VisitMergedRuns) { std::shared_ptr right_child = std::make_shared(right_child_offset + right_run_ends->length()); - left_child = left_child->Slice(left_child_offset); + left_child = left_child->Slice(left_child_offset, 50); right_child = right_child->Slice(right_child_offset); ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, @@ -72,7 +72,6 @@ TEST(TestRleUtil, VisitMergedRuns) { right_array = right_array->Slice(right_parent_offset); size_t position = 0; - rle_util::VisitMergedRuns( ArraySpan(*left_array->data()), ArraySpan(*right_array->data()), [&position, &expected_run_lengths, &expected_left_visits, &expected_right_visits]( @@ -82,6 +81,19 @@ TEST(TestRleUtil, VisitMergedRuns) { ASSERT_EQ(right_index, expected_right_visits[position]); position++; }); + ASSERT_EQ(position, expected_run_lengths.size()); + + // test the same data with left/right swapped + position = 0; + rle_util::VisitMergedRuns( + ArraySpan(*right_array->data()), ArraySpan(*left_array->data()), + [&position, &expected_run_lengths, &expected_left_visits, &expected_right_visits]( + int64_t run_length, int64_t right_index, int64_t left_index) { + ASSERT_EQ(run_length, expected_run_lengths[position]); + ASSERT_EQ(left_index, expected_left_visits[position]); + ASSERT_EQ(right_index, expected_right_visits[position]); + position++; + }); ASSERT_EQ(position, expected_run_lengths.size()); } From 2c211c4a633a41b38a55478464afd83f01692a4e Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 5 Aug 2022 02:16:22 +0200 Subject: [PATCH 070/107] VisitMergedRuns: fix handling both arrays ending inside a run --- cpp/src/arrow/util/rle_util.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 6325a9d20e44e..c88554db755f1 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -77,9 +77,12 @@ void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callba while (logical_position < logical_length) { // logical indices of the end of the run we are currently in in each input - const int64_t a_run_end = RunEnds(a)[a_run_index] - a.offset; - const int64_t b_run_end = RunEnds(b)[b_run_index] - b.offset; + int64_t a_run_end = RunEnds(a)[a_run_index] - a.offset; + int64_t b_run_end = RunEnds(b)[b_run_index] - b.offset; + // the logical length may end in the middle of a run + a_run_end = std::min(a_run_end, logical_length); + b_run_end = std::min(b_run_end, logical_length); ARROW_DCHECK_GT(a_run_end, logical_position); ARROW_DCHECK_GT(b_run_end, logical_position); From 1b6fc849e2c99f332b18e097f18ecb3d5631aac8 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 10 Aug 2022 15:25:09 +0200 Subject: [PATCH 071/107] implement rle_util visitor variant for a single array --- cpp/src/arrow/util/rle_util.h | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index c88554db755f1..06deb2acdf560 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -76,7 +76,7 @@ void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callba int64_t logical_position = 0; while (logical_position < logical_length) { - // logical indices of the end of the run we are currently in in each input + // logical indices of the end of the run we are currently in each input int64_t a_run_end = RunEnds(a)[a_run_index] - a.offset; int64_t b_run_end = RunEnds(b)[b_run_index] - b.offset; @@ -104,6 +104,39 @@ void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callba } } +/// \brief Iterate over the runs of a run-length encoded array. A callback is called on each of these segments +/// \param[in] input as ArraySpan +/// \param[in] callback taking 2 int64_t arguments: the length of the current run segment, +/// and a phyiscal index into the data array array. Offsets are already applied. +template +void VisitRuns(const ArraySpan& span, CallbackType callback) { + const int64_t physical_offset = + rle_util::FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); + const int64_t logical_length = span.length; + + // run indices from the start of the run ends buffers without offset + int64_t run_index = physical_offset; + int64_t logical_position = 0; + + while (logical_position < logical_length) { + // logical index of the end of the run we are currently in input + int64_t run_end = RunEnds(span)[run_index] - span.offset; + // the logical length may end in the middle of a run + run_end = std::min(run_end, logical_length); + ARROW_DCHECK_GT(run_end, logical_position); + + const int64_t run_length = run_end - logical_position; + + // callback to code that wants to work on the data - we give it physical indices + // including all offsets. This includes the additional offset the data array may have. + callback(run_length, run_index + DataArray(span).offset); + + logical_position = run_end; + run_index++; + } +} + + // TODO: this may fit better into some testing header void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset); From 66f2d81dbf3409a579ccda06c6733843850c5091 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 10 Aug 2022 17:21:37 +0200 Subject: [PATCH 072/107] add merged rle iterator to replace visitor --- cpp/src/arrow/util/rle_util.h | 105 ++++++++++++++++++++++++++-- cpp/src/arrow/util/rle_util_test.cc | 10 +++ 2 files changed, 110 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 06deb2acdf560..8c8da5d4587d3 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -104,10 +104,106 @@ void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callba } } -/// \brief Iterate over the runs of a run-length encoded array. A callback is called on each of these segments -/// \param[in] input as ArraySpan -/// \param[in] callback taking 2 int64_t arguments: the length of the current run segment, -/// and a phyiscal index into the data array array. Offsets are already applied. +template +class MergedRunsIterator { + public: + // end iterator + MergedRunsIterator() {} + + // TODO: genereric constructor + MergedRunsIterator(const ArraySpan& a) { + static_assert(NUM_INPUTS == 1, "incorrect number of inputs"); + + inputs[0] = std::ref(a); + + logical_length = a.length; + + for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { + ArraySpan& input = inputs[input_id]; + run_index[input_id] = rle_util::FindPhysicalOffset( + RunEnds(input), RunEndsArray(input).length, input.offset); + } + } + + MergedRunsIterator(const ArraySpan& a, const ArraySpan& b) { + static_assert(NUM_INPUTS == 2, "incorrect number of inputs"); + + inputs[0] = std::ref(a); + inputs[1] = std::ref(b); + + ARROW_DCHECK_EQ(a.length, b.length); + logical_length = a.length; + + for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { + ArraySpan& input = inputs[input_id]; + run_index[input_id] = rle_util::FindPhysicalOffset( + RunEnds(input), RunEndsArray(input).length, input.offset); + } + } + + MergedRunsIterator(const MergedRunsIterator& other) = default; + + MergedRunsIterator& operator++() { + logical_position += merged_run_length; + + for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { + if (logical_position == run_end[input_id]) { + run_index[input_id]++; + } + } + if (!isEnd()) { + FindMergedRun(); + } + + return *this; + } + + MergedRunsIterator& operator++(int) { + auto result = *this; + ++(this); + } + + bool operator==(const MergedRunsIterator& other) const { + return (isEnd() && other.isEnd()) || (logical_position == other.logical_position); + } + + bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } + + int64_t physical_index(int64_t input_id) const { return run_index[input_id]; } + int64_t run_length() const { return merged_run_length; } + + private: + void FindMergedRun() { + int64_t merged_run_end = std::numeric_limits::max(); + for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { + // logical indices of the end of the run we are currently in each input + run_end[input_id] = + RunEnds(inputs[input_id])[run_index[input_id]] - inputs[input_id].offset; + // the logical length may end in the middle of a run, in case the array was sliced + run_end[input_id] = std::min(run_end[input_id], logical_length); + ARROW_DCHECK_GT(run_end[input_id], logical_position); + + merged_run_end = std::min(merged_run_end, run_end[input_id]); + } + merged_run_length = merged_run_end - logical_position; + } + + bool isEnd() const { return logical_position == logical_length; } + + std::array, NUM_INPUTS> inputs; + std::array physical_offset; + std::array run_index; + // logical indices of the end of the run we are currently in each input + std::array run_end; + int64_t logical_position = 0; + int64_t logical_length = 0; + int64_t merged_run_length; +}; + +/// \brief Iterate over the runs of a run-length encoded array. A callback is called on +/// each of these segments \param[in] input as ArraySpan \param[in] callback taking 2 +/// int64_t arguments: the length of the current run segment, and a phyiscal index into +/// the data array array. Offsets are already applied. template void VisitRuns(const ArraySpan& span, CallbackType callback) { const int64_t physical_offset = @@ -136,7 +232,6 @@ void VisitRuns(const ArraySpan& span, CallbackType callback) { } } - // TODO: this may fit better into some testing header void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset); diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index be2c0a3efadcf..6f3b16c391e4a 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -96,6 +96,16 @@ TEST(TestRleUtil, VisitMergedRuns) { }); ASSERT_EQ(position, expected_run_lengths.size()); + + position = 0; + for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), + ArraySpan(*right_array->data())); + it != MergedRunsIterator<2>(); ++it) { + ASSERT_EQ(it.run_length(), expected_run_lengths[position]); + ASSERT_EQ(it.physical_index(0), expected_left_visits[position]); + ASSERT_EQ(it.physical_index(1), expected_right_visits[position]); + position++; + } } } // namespace rle_util From dd3eeb39e2cf8855fa03db282abf5b8db64ba5fd Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 14:04:03 +0200 Subject: [PATCH 073/107] MergedRunsIterator: use pointer instead of reference_wrapper --- cpp/src/arrow/util/rle_util.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 8c8da5d4587d3..1bc9485cde4bf 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -128,16 +128,16 @@ class MergedRunsIterator { MergedRunsIterator(const ArraySpan& a, const ArraySpan& b) { static_assert(NUM_INPUTS == 2, "incorrect number of inputs"); - inputs[0] = std::ref(a); - inputs[1] = std::ref(b); + inputs[0] = &a; + inputs[1] = &b; ARROW_DCHECK_EQ(a.length, b.length); logical_length = a.length; for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - ArraySpan& input = inputs[input_id]; + const ArraySpan* input = inputs[input_id]; run_index[input_id] = rle_util::FindPhysicalOffset( - RunEnds(input), RunEndsArray(input).length, input.offset); + RunEnds(*input), RunEndsArray(*input).length, input->offset); } } @@ -159,8 +159,9 @@ class MergedRunsIterator { } MergedRunsIterator& operator++(int) { - auto result = *this; - ++(this); + MergedRunsIterator& result = *this; + ++(*this); + return result; } bool operator==(const MergedRunsIterator& other) const { @@ -178,7 +179,7 @@ class MergedRunsIterator { for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { // logical indices of the end of the run we are currently in each input run_end[input_id] = - RunEnds(inputs[input_id])[run_index[input_id]] - inputs[input_id].offset; + RunEnds(*inputs[input_id])[run_index[input_id]] - inputs[input_id]->offset; // the logical length may end in the middle of a run, in case the array was sliced run_end[input_id] = std::min(run_end[input_id], logical_length); ARROW_DCHECK_GT(run_end[input_id], logical_position); @@ -190,7 +191,7 @@ class MergedRunsIterator { bool isEnd() const { return logical_position == logical_length; } - std::array, NUM_INPUTS> inputs; + std::array inputs; std::array physical_offset; std::array run_index; // logical indices of the end of the run we are currently in each input From 7f376953b2fe61ed8bf69d64d2f6d0f033958999 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 14:22:17 +0200 Subject: [PATCH 074/107] fix single input MergedRunsIterator constructor --- cpp/src/arrow/util/rle_util.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 1bc9485cde4bf..ee1721c81a66b 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -114,14 +114,14 @@ class MergedRunsIterator { MergedRunsIterator(const ArraySpan& a) { static_assert(NUM_INPUTS == 1, "incorrect number of inputs"); - inputs[0] = std::ref(a); + inputs[0] = &a; - logical_length = a.length; + logical_length = a->length; for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - ArraySpan& input = inputs[input_id]; + const ArraySpan* input = inputs[input_id]; run_index[input_id] = rle_util::FindPhysicalOffset( - RunEnds(input), RunEndsArray(input).length, input.offset); + RunEnds(*input), RunEndsArray(*input).length, input->offset); } } From fea250ec60498c9b05b55f8aa6004cab687cbf36 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 14:24:33 +0200 Subject: [PATCH 075/107] fix . -> mixup --- cpp/src/arrow/util/rle_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index ee1721c81a66b..fc046e8964be6 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -116,7 +116,7 @@ class MergedRunsIterator { inputs[0] = &a; - logical_length = a->length; + logical_length = a.length; for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { const ArraySpan* input = inputs[input_id]; From b45bd6af5f4b420413decea8b5d1b491672ed8b4 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 15:52:15 +0200 Subject: [PATCH 076/107] fix rle iterator --- cpp/src/arrow/util/rle_util.h | 9 +++++++-- cpp/src/arrow/util/rle_util_test.cc | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index fc046e8964be6..df5e0be8886f6 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -123,6 +123,7 @@ class MergedRunsIterator { run_index[input_id] = rle_util::FindPhysicalOffset( RunEnds(*input), RunEndsArray(*input).length, input->offset); } + FindMergedRun(); } MergedRunsIterator(const ArraySpan& a, const ArraySpan& b) { @@ -139,6 +140,7 @@ class MergedRunsIterator { run_index[input_id] = rle_util::FindPhysicalOffset( RunEnds(*input), RunEndsArray(*input).length, input->offset); } + FindMergedRun(); } MergedRunsIterator(const MergedRunsIterator& other) = default; @@ -165,12 +167,15 @@ class MergedRunsIterator { } bool operator==(const MergedRunsIterator& other) const { - return (isEnd() && other.isEnd()) || (logical_position == other.logical_position); + return (isEnd() && other.isEnd()) || + (!isEnd() && !other.isEnd() && logical_position == other.logical_position); } bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } - int64_t physical_index(int64_t input_id) const { return run_index[input_id]; } + int64_t physical_index(int64_t input_id) const { + return run_index[input_id] + DataArray(*inputs[input_id]).offset; + } int64_t run_length() const { return merged_run_length; } private: diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 6f3b16c391e4a..c57b30222da9f 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -106,6 +106,7 @@ TEST(TestRleUtil, VisitMergedRuns) { ASSERT_EQ(it.physical_index(1), expected_right_visits[position]); position++; } + ASSERT_EQ(position, expected_run_lengths.size()); } } // namespace rle_util From f4b3f8a5506f6d1f0fb9fdef6ead500952a41650 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 16:51:14 +0200 Subject: [PATCH 077/107] fix AddArtificialOffsetInChildArray --- cpp/src/arrow/util/rle_util.cc | 4 ++-- cpp/src/arrow/util/rle_util_test.cc | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index f5437aef5ec87..7ed07160db711 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -14,11 +14,11 @@ int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, } void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { - auto& child = array->child_data[0]; + auto& child = array->child_data[1]; auto builder = MakeBuilder(child->type).ValueOrDie(); ARROW_CHECK_OK(builder->AppendNulls(offset)); ARROW_CHECK_OK(builder->AppendArraySlice(ArraySpan(*child), 0, child->length)); - array->child_data[0] = builder->Finish().ValueOrDie()->Slice(offset)->data(); + array->child_data[1] = builder->Finish().ValueOrDie()->Slice(offset)->data(); } int64_t GetPhysicalOffset(const ArraySpan& span) { diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index c57b30222da9f..00887bea4ce63 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -42,6 +42,15 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { 10); } +TEST(TestRleUtil, ArtificalOffset) { + const auto values = ArrayFromJSON(int32(), "[1, 2, 3]"); + const auto run_ends = ArrayFromJSON(int32(), "[10, 20, 30]"); + ASSERT_OK_AND_ASSIGN(auto array, RunLengthEncodedArray::Make(run_ends, values, 30)); + AddArtificialOffsetInChildArray(array->data().get(), 100); + ASSERT_ARRAYS_EQUAL(*values, *array->values_array()); + ASSERT_EQ(array->values_array()->offset(), 100); +} + TEST(TestRleUtil, VisitMergedRuns) { const auto left_run_ends = ArrayFromJSON( int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); From c2dcab46e9c93427b9eb4d0356d84f3e7935191b Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 16:59:15 +0200 Subject: [PATCH 078/107] also test rle iterator on single input array --- cpp/src/arrow/util/rle_util_test.cc | 46 +++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 00887bea4ce63..2386e151f0608 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -116,6 +116,52 @@ TEST(TestRleUtil, VisitMergedRuns) { position++; } ASSERT_EQ(position, expected_run_lengths.size()); + + // left array only + const std::vector left_only_run_lengths = {5, 10, 5, 5, 25}; + + position = 0; + for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), + ArraySpan(*left_array->data())); + it != MergedRunsIterator<2>(); ++it) { + ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); + ASSERT_EQ(it.physical_index(0), 110 + position); + ASSERT_EQ(it.physical_index(1), 110 + position); + position++; + } + ASSERT_EQ(position, left_only_run_lengths.size()); + + position = 0; + for (auto it = MergedRunsIterator<1>(ArraySpan(*left_array->data())); + it != MergedRunsIterator<1>(); ++it) { + ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); + ASSERT_EQ(it.physical_index(0), 110 + position); + position++; + } + ASSERT_EQ(position, left_only_run_lengths.size()); + + // right array only + const std::vector right_only_run_lengths = {5, 4, 16, 25}; + + position = 0; + for (auto it = MergedRunsIterator<2>(ArraySpan(*right_array->data()), + ArraySpan(*right_array->data())); + it != MergedRunsIterator<2>(); ++it) { + ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); + ASSERT_EQ(it.physical_index(0), 205 + position); + ASSERT_EQ(it.physical_index(1), 205 + position); + position++; + } + ASSERT_EQ(position, right_only_run_lengths.size()); + + position = 0; + for (auto it = MergedRunsIterator<1>(ArraySpan(*right_array->data())); + it != MergedRunsIterator<1>(); ++it) { + ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); + ASSERT_EQ(it.physical_index(0), 205 + position); + position++; + } + ASSERT_EQ(position, right_only_run_lengths.size()); } } // namespace rle_util From fa740629ff165637f3cb8779b4bcdc51d33dbc7c Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 17:07:06 +0200 Subject: [PATCH 079/107] naming: DataArray -> ValuesArray --- cpp/src/arrow/util/rle_util.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index df5e0be8886f6..1647523873023 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -51,7 +51,7 @@ static const int32_t* RunEnds(const ArraySpan& span) { } /// \brief Get the child array holding the data values from an RLE array -static const ArraySpan& DataArray(const ArraySpan& span) { return span.child_data[1]; } +static const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; } /// \brief Iterate over two run-length encoded arrays in segments of runs that are inside /// run boundaries in each input. A callback is called on each of these segments @@ -91,8 +91,8 @@ void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callba // callback to code that wants to work on the data - we give it physical indices // including all offsets. This includes the additional offset the data array may have. - callback(merged_run_length, a_run_index + DataArray(a).offset, - b_run_index + DataArray(b).offset); + callback(merged_run_length, a_run_index + ValuesArray(a).offset, + b_run_index + ValuesArray(b).offset); logical_position = merged_run_end; if (logical_position == a_run_end) { @@ -174,7 +174,7 @@ class MergedRunsIterator { bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } int64_t physical_index(int64_t input_id) const { - return run_index[input_id] + DataArray(*inputs[input_id]).offset; + return run_index[input_id] + ValuesArray(*inputs[input_id]).offset; } int64_t run_length() const { return merged_run_length; } @@ -231,7 +231,7 @@ void VisitRuns(const ArraySpan& span, CallbackType callback) { // callback to code that wants to work on the data - we give it physical indices // including all offsets. This includes the additional offset the data array may have. - callback(run_length, run_index + DataArray(span).offset); + callback(run_length, run_index + ValuesArray(span).offset); logical_position = run_end; run_index++; From b8942693f6969b1d8b15456ad6ac5baa9020b240 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 17:12:14 +0200 Subject: [PATCH 080/107] remove old VisitMergedRuns/VisitRuns functions that are replaced by iterator --- cpp/src/arrow/util/rle_util.h | 83 +---------------------------- cpp/src/arrow/util/rle_util_test.cc | 26 --------- 2 files changed, 1 insertion(+), 108 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 1647523873023..d4804c2aba470 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -54,56 +54,7 @@ static const int32_t* RunEnds(const ArraySpan& span) { static const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; } /// \brief Iterate over two run-length encoded arrays in segments of runs that are inside -/// run boundaries in each input. A callback is called on each of these segments -/// \param[in] a first input as ArraySpan -/// \param[in] b second input as ArraySpan -/// \param[in] callback taking 3 int64_t arguments: the length of the current run segment, -/// and a phyiscal index into the data array arrays of a and b. Offsets are already -/// applied. -template -void VisitMergedRuns(const ArraySpan& a, const ArraySpan& b, CallbackType callback) { - const int64_t a_physical_offset = - rle_util::FindPhysicalOffset(RunEnds(a), RunEndsArray(a).length, a.offset); - const int64_t b_physical_offset = - rle_util::FindPhysicalOffset(RunEnds(b), RunEndsArray(b).length, b.offset); - - ARROW_DCHECK_EQ(a.length, b.length); - const int64_t logical_length = a.length; - - // run indices from the start of the run ends buffers without offset - int64_t a_run_index = a_physical_offset; - int64_t b_run_index = b_physical_offset; - int64_t logical_position = 0; - - while (logical_position < logical_length) { - // logical indices of the end of the run we are currently in each input - int64_t a_run_end = RunEnds(a)[a_run_index] - a.offset; - int64_t b_run_end = RunEnds(b)[b_run_index] - b.offset; - - // the logical length may end in the middle of a run - a_run_end = std::min(a_run_end, logical_length); - b_run_end = std::min(b_run_end, logical_length); - ARROW_DCHECK_GT(a_run_end, logical_position); - ARROW_DCHECK_GT(b_run_end, logical_position); - - const int64_t merged_run_end = std::min(a_run_end, b_run_end); - const int64_t merged_run_length = merged_run_end - logical_position; - - // callback to code that wants to work on the data - we give it physical indices - // including all offsets. This includes the additional offset the data array may have. - callback(merged_run_length, a_run_index + ValuesArray(a).offset, - b_run_index + ValuesArray(b).offset); - - logical_position = merged_run_end; - if (logical_position == a_run_end) { - a_run_index++; - } - if (logical_position == b_run_end) { - b_run_index++; - } - } -} - +/// run boundaries in each input template class MergedRunsIterator { public: @@ -206,38 +157,6 @@ class MergedRunsIterator { int64_t merged_run_length; }; -/// \brief Iterate over the runs of a run-length encoded array. A callback is called on -/// each of these segments \param[in] input as ArraySpan \param[in] callback taking 2 -/// int64_t arguments: the length of the current run segment, and a phyiscal index into -/// the data array array. Offsets are already applied. -template -void VisitRuns(const ArraySpan& span, CallbackType callback) { - const int64_t physical_offset = - rle_util::FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); - const int64_t logical_length = span.length; - - // run indices from the start of the run ends buffers without offset - int64_t run_index = physical_offset; - int64_t logical_position = 0; - - while (logical_position < logical_length) { - // logical index of the end of the run we are currently in input - int64_t run_end = RunEnds(span)[run_index] - span.offset; - // the logical length may end in the middle of a run - run_end = std::min(run_end, logical_length); - ARROW_DCHECK_GT(run_end, logical_position); - - const int64_t run_length = run_end - logical_position; - - // callback to code that wants to work on the data - we give it physical indices - // including all offsets. This includes the additional offset the data array may have. - callback(run_length, run_index + ValuesArray(span).offset); - - logical_position = run_end; - run_index++; - } -} - // TODO: this may fit better into some testing header void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset); diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 2386e151f0608..03221193c15cf 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -80,32 +80,6 @@ TEST(TestRleUtil, VisitMergedRuns) { left_array = left_array->Slice(left_parent_offset); right_array = right_array->Slice(right_parent_offset); - size_t position = 0; - rle_util::VisitMergedRuns( - ArraySpan(*left_array->data()), ArraySpan(*right_array->data()), - [&position, &expected_run_lengths, &expected_left_visits, &expected_right_visits]( - int64_t run_length, int64_t left_index, int64_t right_index) { - ASSERT_EQ(run_length, expected_run_lengths[position]); - ASSERT_EQ(left_index, expected_left_visits[position]); - ASSERT_EQ(right_index, expected_right_visits[position]); - position++; - }); - ASSERT_EQ(position, expected_run_lengths.size()); - - // test the same data with left/right swapped - position = 0; - rle_util::VisitMergedRuns( - ArraySpan(*right_array->data()), ArraySpan(*left_array->data()), - [&position, &expected_run_lengths, &expected_left_visits, &expected_right_visits]( - int64_t run_length, int64_t right_index, int64_t left_index) { - ASSERT_EQ(run_length, expected_run_lengths[position]); - ASSERT_EQ(left_index, expected_left_visits[position]); - ASSERT_EQ(right_index, expected_right_visits[position]); - position++; - }); - - ASSERT_EQ(position, expected_run_lengths.size()); - position = 0; for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), ArraySpan(*right_array->data())); From d3e5cacbc0eebfed5d98b46572cb11665e679bdc Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 17:14:04 +0200 Subject: [PATCH 081/107] mark rle_util accessors as inline --- cpp/src/arrow/util/rle_util.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index d4804c2aba470..9dbdda7abd67f 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -43,15 +43,19 @@ int64_t GetPhysicalOffset(const ArraySpan& span); int64_t GetPhysicalLength(const ArraySpan& span); /// \brief Get the child array holding the data values from an RLE array -static const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; } +static inline const ArraySpan& RunEndsArray(const ArraySpan& span) { + return span.child_data[0]; +} /// \brief Get a pointer to run ends values of an RLE array -static const int32_t* RunEnds(const ArraySpan& span) { +static inline const int32_t* RunEnds(const ArraySpan& span) { return RunEndsArray(span).GetValues(1); } /// \brief Get the child array holding the data values from an RLE array -static const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; } +static inline const ArraySpan& ValuesArray(const ArraySpan& span) { + return span.child_data[1]; +} /// \brief Iterate over two run-length encoded arrays in segments of runs that are inside /// run boundaries in each input From f8ee87aeab287523c0becb1d6446ecb021657d9f Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 17:15:14 +0200 Subject: [PATCH 082/107] rename rle iterator test --- cpp/src/arrow/util/rle_util_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 03221193c15cf..20ad6abb6c45a 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -51,7 +51,7 @@ TEST(TestRleUtil, ArtificalOffset) { ASSERT_EQ(array->values_array()->offset(), 100); } -TEST(TestRleUtil, VisitMergedRuns) { +TEST(TestRleUtil, MergedRunsInterator) { const auto left_run_ends = ArrayFromJSON( int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); const auto right_run_ends = @@ -80,7 +80,7 @@ TEST(TestRleUtil, VisitMergedRuns) { left_array = left_array->Slice(left_parent_offset); right_array = right_array->Slice(right_parent_offset); - position = 0; + size_t position = 0; for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), ArraySpan(*right_array->data())); it != MergedRunsIterator<2>(); ++it) { From d350081b0be00b54e2b64e44a76a9e15f4059c50 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 15 Aug 2022 17:47:14 +0200 Subject: [PATCH 083/107] fix rle stub in parquet path_internal --- cpp/src/parquet/arrow/path_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 2a6be6f889cd0..e3fe3670f60a5 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -802,7 +802,7 @@ class PathBuilder { return Status::OK(); } - Status Visit(const ::arrow::StructArray& array) { + Status Visit(const ::arrow::RunLengthEncodedArray& array) { return Status::NotImplemented("arrow rle array in Parquet"); } From 99a264a550b2af236dc7378edd0d53f293d9a13f Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 19 Aug 2022 01:42:48 +0200 Subject: [PATCH 084/107] fix GetPhysicalLength function --- cpp/src/arrow/array/array_encoded_test.cc | 6 +++--- cpp/src/arrow/util/rle_util.cc | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 4ae365158dd26..778a688a504bf 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -104,9 +104,9 @@ TEST(RunLengthEncodedArray, OffsetLength) { ASSERT_EQ(slice2->GetPhysicalOffset(), 1); auto slice3 = - std::dynamic_pointer_cast(rle_array->Slice(100, 200)); - ASSERT_EQ(slice3->GetPhysicalLength(), 2); - ASSERT_EQ(slice3->GetPhysicalOffset(), 1); + std::dynamic_pointer_cast(rle_array->Slice(400, 100)); + ASSERT_EQ(slice3->GetPhysicalLength(), 1); + ASSERT_EQ(slice3->GetPhysicalOffset(), 4); auto slice4 = std::dynamic_pointer_cast(rle_array->Slice(0, 150)); diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index 7ed07160db711..be3dea1fb5ae0 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -32,8 +32,10 @@ int64_t GetPhysicalLength(const ArraySpan& span) { return 0; } else { // find the offset of the last element and add 1 - return FindPhysicalOffset(RunEnds(span) + GetPhysicalOffset(span), - RunEndsArray(span).length, span.offset + span.length - 1) + + int64_t physical_offset = GetPhysicalOffset(span); + return FindPhysicalOffset(RunEnds(span) + physical_offset, + RunEndsArray(span).length - physical_offset, + span.offset + span.length - 1) + 1; } } From 2d03dbf54f41d2c15f7c6d5c8cb07d2b4dd56f42 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 24 Aug 2022 14:53:25 +0200 Subject: [PATCH 085/107] rle compare --- cpp/src/arrow/array/array_encoded_test.cc | 24 +++++++++++++++++++++++ cpp/src/arrow/compare.cc | 14 ++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 778a688a504bf..62261b48115b2 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -114,6 +114,30 @@ TEST(RunLengthEncodedArray, OffsetLength) { ASSERT_EQ(slice4->GetPhysicalOffset(), 0); } +TEST(RunLengthEncodedArray, Compare) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 30)); + + // second array object that is exactly the same as rle_array + auto standard_equals = MakeArray(rle_array->data()->Copy()); + ASSERT_ARRAYS_EQUAL(*rle_array, *standard_equals); + + ASSERT_FALSE(rle_array->Slice(0, 29)->Equals(*rle_array->Slice(1, 29))); + + // array that is logically the same as our rle_array, but has 2 small runs for the first + // value instead of one large + auto duplicate_run_ends = ArrayFromJSON(int32(), "[5, 10, 20, 30]"); + auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "Hello", "World", null])"); + ASSERT_OK_AND_ASSIGN(auto duplicate_array, RunLengthEncodedArray::Make( + duplicate_run_ends, string_values, 30)); + ASSERT_ARRAYS_EQUAL(*rle_array, *duplicate_array); + + ASSERT_OK_AND_ASSIGN(auto empty_array, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[]"), + ArrayFromJSON(binary(), "[]"), 0)); + ASSERT_ARRAYS_EQUAL(*empty_array, *MakeArray(empty_array->data()->Copy())); +} + } // anonymous namespace } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index ef1cb31201a58..811799266cab4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -46,6 +46,7 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/memory.h" +#include "arrow/util/rle_util.h" #include "arrow/visit_scalar_inline.h" #include "arrow/visit_type_inline.h" @@ -388,7 +389,18 @@ class RangeDataEqualsImpl { } Status Visit(const RunLengthEncodedType& type) { - return Status::NotImplemented("comparing run-length encoded data"); + for (auto it = rle_util::MergedRunsIterator<2>(ArraySpan(left_), ArraySpan(right_)); + it != rle_util::MergedRunsIterator<2>(); ++it) { + RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[1], + *right_.child_data[1], + left_start_idx_ + it.physical_index(0), + right_start_idx_ + it.physical_index(1), 1); + if (!impl.Compare()) { + result_ = false; + return Status::OK(); + } + } + return Status::OK(); } Status Visit(const ExtensionType& type) { From bbd81f6b85a38680fbb19fc9e4539405401671b5 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 24 Aug 2022 17:01:15 +0200 Subject: [PATCH 086/107] byte-swapping RLE arrays should now just work we no longer have any custom buffer, just standard child arrays --- cpp/src/arrow/array/util.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 5dca10e5c3402..f3020f3039f69 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -237,6 +237,7 @@ class ArrayDataEndianSwapper { Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } Status Visit(const FixedSizeListType& type) { return Status::OK(); } Status Visit(const StructType& type) { return Status::OK(); } + Status Visit(const RunLengthEncodedType& type) { return Status::OK(); } Status Visit(const UnionType& type) { out_->buffers[1] = data_->buffers[1]; if (type.mode() == UnionMode::DENSE) { @@ -284,10 +285,6 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status Visit(const RunLengthEncodedType& type) { - return Status::NotImplemented("Byte-swap run-length encoded data"); - } - const std::shared_ptr& data_; std::shared_ptr out_; }; From 59f5fa2dd49eab57c3232f634790be241049b8fa Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 1 Sep 2022 16:35:16 +0200 Subject: [PATCH 087/107] fix handling of 0 length arrays at the end of run ends array --- cpp/src/arrow/array/array_encoded_test.cc | 5 +++++ cpp/src/arrow/util/rle_util.cc | 2 +- cpp/src/arrow/util/rle_util.h | 3 ++- cpp/src/arrow/util/rle_util_test.cc | 5 +++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 778a688a504bf..e31ae0a8d47dd 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -112,6 +112,11 @@ TEST(RunLengthEncodedArray, OffsetLength) { std::dynamic_pointer_cast(rle_array->Slice(0, 150)); ASSERT_EQ(slice4->GetPhysicalLength(), 2); ASSERT_EQ(slice4->GetPhysicalOffset(), 0); + + auto zero_length_at_end = + std::dynamic_pointer_cast(rle_array->Slice(500, 0)); + ASSERT_EQ(zero_length_at_end->GetPhysicalLength(), 0); + ASSERT_EQ(zero_length_at_end->GetPhysicalOffset(), 5); } } // anonymous namespace diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index be3dea1fb5ae0..a2bb482eb0efc 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -9,7 +9,7 @@ int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset) { auto it = std::upper_bound(run_ends, run_ends + num_run_ends, logical_offset); int64_t result = std::distance(run_ends, it); - ARROW_DCHECK_LT(result, num_run_ends); + ARROW_DCHECK_LE(result, num_run_ends); return result; } diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 9dbdda7abd67f..30e7f0da7ceeb 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -29,7 +29,8 @@ namespace arrow { namespace rle_util { /// \brief Get the physical offset from a logical offset given run end values using binary -/// search +/// search. Returns num_run_ends if the physical offset is not within the first +/// num_run_ends elements. int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, int64_t logical_offset); diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 20ad6abb6c45a..6bf14b8ef7599 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -40,6 +40,11 @@ TEST(TestRleUtil, FindPhysicalOffsetTest) { 1015, 1020, 1025, 1050}, 15, 1000), 10); + + // out-of-range logical offset should return num_run_ends + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 6), 3); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 10000), 3); + ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 0, 5), 0); } TEST(TestRleUtil, ArtificalOffset) { From a35b6513358ca3353d89973a7c7b1302fa6c3130 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 01:10:02 +0200 Subject: [PATCH 088/107] fix diff error message --- cpp/src/arrow/array/diff.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 04dfa701afe0f..0569e94b63c53 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -119,7 +119,7 @@ struct ValueComparatorVisitor { } Status Visit(const RunLengthEncodedType&) { - return Status::NotImplemented("dictionary type"); + return Status::NotImplemented("run-length encoded type"); } ValueComparator Create(const DataType& type) { From f9ddf79ebf054531536cd81b154acc168964971f Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 03:22:40 +0200 Subject: [PATCH 089/107] fix offset handling --- cpp/src/arrow/array/array_encoded_test.cc | 14 ++++++++++++++ cpp/src/arrow/compare.cc | 10 +++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 62261b48115b2..c40838617ac58 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -136,6 +136,20 @@ TEST(RunLengthEncodedArray, Compare) { RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[]"), ArrayFromJSON(binary(), "[]"), 0)); ASSERT_ARRAYS_EQUAL(*empty_array, *MakeArray(empty_array->data()->Copy())); + + // threee different slices that have the value [3, 3, 3, 4, 4, 4, 4] + ASSERT_OK_AND_ASSIGN(auto different_offsets_a, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[2, 5, 12, 58, 60]"), + ArrayFromJSON(int64(), "[1, 2, 3, 4, 5]"), 60)); + ASSERT_OK_AND_ASSIGN(auto different_offsets_b, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[81, 86, 99, 100]"), + ArrayFromJSON(int64(), "[2, 3, 4, 5]"), 100)); + ASSERT_OK_AND_ASSIGN(auto different_offsets_c, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[3, 7]"), + ArrayFromJSON(int64(), "[3, 4]"), 7)); + ASSERT_ARRAYS_EQUAL(*different_offsets_a->Slice(9, 7), *different_offsets_b->Slice(83, 7)); + ASSERT_ARRAYS_EQUAL(*different_offsets_a->Slice(9, 7), *different_offsets_c); + ASSERT_ARRAYS_EQUAL(*different_offsets_b->Slice(83, 7), *different_offsets_c); } } // anonymous namespace diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 811799266cab4..d654b74cbcad3 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -389,12 +389,16 @@ class RangeDataEqualsImpl { } Status Visit(const RunLengthEncodedType& type) { - for (auto it = rle_util::MergedRunsIterator<2>(ArraySpan(left_), ArraySpan(right_)); + auto left_span = ArraySpan(left_); + auto right_span = ArraySpan(right_); + left_span.SetSlice(left_.offset + left_start_idx_, range_length_); + right_span.SetSlice(right_.offset + right_start_idx_, range_length_); + for (auto it = rle_util::MergedRunsIterator<2>(left_span, right_span); it != rle_util::MergedRunsIterator<2>(); ++it) { RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[1], *right_.child_data[1], - left_start_idx_ + it.physical_index(0), - right_start_idx_ + it.physical_index(1), 1); + it.index_into_values(0), + it.index_into_values(1), 1); if (!impl.Compare()) { result_ = false; return Status::OK(); From f55717daf6aa3c8d506b20f282aede617eb36b9a Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 03:24:47 +0200 Subject: [PATCH 090/107] fix zero-lenght arrays in rle iterator --- cpp/src/arrow/util/rle_util.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 30e7f0da7ceeb..58ea4a3a13adf 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -79,7 +79,9 @@ class MergedRunsIterator { run_index[input_id] = rle_util::FindPhysicalOffset( RunEnds(*input), RunEndsArray(*input).length, input->offset); } - FindMergedRun(); + if (!isEnd()) { + FindMergedRun(); + } } MergedRunsIterator(const ArraySpan& a, const ArraySpan& b) { @@ -96,7 +98,9 @@ class MergedRunsIterator { run_index[input_id] = rle_util::FindPhysicalOffset( RunEnds(*input), RunEndsArray(*input).length, input->offset); } - FindMergedRun(); + if (!isEnd()) { + FindMergedRun(); + } } MergedRunsIterator(const MergedRunsIterator& other) = default; @@ -153,7 +157,6 @@ class MergedRunsIterator { bool isEnd() const { return logical_position == logical_length; } std::array inputs; - std::array physical_offset; std::array run_index; // logical indices of the end of the run we are currently in each input std::array run_end; From 5bdc98431370b044c267053e956875ba70287714 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 04:17:12 +0200 Subject: [PATCH 091/107] rle_util_test: avoid using array span objects beyond thier lifetime --- cpp/src/arrow/util/rle_util_test.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 6bf14b8ef7599..57e6a46d4a48f 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -81,13 +81,14 @@ TEST(TestRleUtil, MergedRunsInterator) { RunLengthEncodedArray::Make(left_run_ends, left_child, 1050)); ASSERT_OK_AND_ASSIGN(std::shared_ptr right_array, RunLengthEncodedArray::Make(right_run_ends, right_child, 2050)); - left_array = left_array->Slice(left_parent_offset); right_array = right_array->Slice(right_parent_offset); + ArraySpan left_span(*left_array->data()); + ArraySpan right_span(*right_array->data()); size_t position = 0; - for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), - ArraySpan(*right_array->data())); + for (auto it = MergedRunsIterator<2>(left_span, + right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), expected_run_lengths[position]); ASSERT_EQ(it.physical_index(0), expected_left_visits[position]); @@ -100,8 +101,8 @@ TEST(TestRleUtil, MergedRunsInterator) { const std::vector left_only_run_lengths = {5, 10, 5, 5, 25}; position = 0; - for (auto it = MergedRunsIterator<2>(ArraySpan(*left_array->data()), - ArraySpan(*left_array->data())); + for (auto it = MergedRunsIterator<2>(left_span, + left_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); ASSERT_EQ(it.physical_index(0), 110 + position); @@ -111,7 +112,7 @@ TEST(TestRleUtil, MergedRunsInterator) { ASSERT_EQ(position, left_only_run_lengths.size()); position = 0; - for (auto it = MergedRunsIterator<1>(ArraySpan(*left_array->data())); + for (auto it = MergedRunsIterator<1>(left_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); ASSERT_EQ(it.physical_index(0), 110 + position); @@ -123,8 +124,8 @@ TEST(TestRleUtil, MergedRunsInterator) { const std::vector right_only_run_lengths = {5, 4, 16, 25}; position = 0; - for (auto it = MergedRunsIterator<2>(ArraySpan(*right_array->data()), - ArraySpan(*right_array->data())); + for (auto it = MergedRunsIterator<2>(right_span, + right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); ASSERT_EQ(it.physical_index(0), 205 + position); @@ -134,7 +135,7 @@ TEST(TestRleUtil, MergedRunsInterator) { ASSERT_EQ(position, right_only_run_lengths.size()); position = 0; - for (auto it = MergedRunsIterator<1>(ArraySpan(*right_array->data())); + for (auto it = MergedRunsIterator<1>(right_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); ASSERT_EQ(it.physical_index(0), 205 + position); From 286fe6c41ccc5193798525e390d065b1fa2abe6d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 04:58:48 +0200 Subject: [PATCH 092/107] rle iterator: add more accessor variants --- cpp/src/arrow/util/rle_util.h | 24 +++++++++++++----- cpp/src/arrow/util/rle_util_test.cc | 39 +++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 58ea4a3a13adf..15e24fa30cfeb 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -106,7 +106,7 @@ class MergedRunsIterator { MergedRunsIterator(const MergedRunsIterator& other) = default; MergedRunsIterator& operator++() { - logical_position += merged_run_length; + logical_position = merged_run_end; for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { if (logical_position == run_end[input_id]) { @@ -133,14 +133,27 @@ class MergedRunsIterator { bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } - int64_t physical_index(int64_t input_id) const { + + /// \brief returns a physical index into the values array buffers of a given input, pointing to + /// the value of the current run. The index includes the array offset, so it can be used to access + /// a buffer directly + int64_t index_into_buffer(int64_t input_id) const { return run_index[input_id] + ValuesArray(*inputs[input_id]).offset; } - int64_t run_length() const { return merged_run_length; } + /// \brief returns a physical index into the values array of a given input, pointing to the value + /// of the current run + int64_t index_into_array(int64_t input_id) const { + return run_index[input_id]; + } + /// \brief returns the logical length of the current run + int64_t run_length() const { return merged_run_end - logical_position; } + /// \brief returns the accumulated length of all runs from the beginning of the array including + /// the current one + int64_t accumulated_run_length() const { return merged_run_end; } private: void FindMergedRun() { - int64_t merged_run_end = std::numeric_limits::max(); + merged_run_end = std::numeric_limits::max(); for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { // logical indices of the end of the run we are currently in each input run_end[input_id] = @@ -151,7 +164,6 @@ class MergedRunsIterator { merged_run_end = std::min(merged_run_end, run_end[input_id]); } - merged_run_length = merged_run_end - logical_position; } bool isEnd() const { return logical_position == logical_length; } @@ -162,7 +174,7 @@ class MergedRunsIterator { std::array run_end; int64_t logical_position = 0; int64_t logical_length = 0; - int64_t merged_run_length; + int64_t merged_run_end; }; // TODO: this may fit better into some testing header diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 57e6a46d4a48f..02b466cd878c1 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -87,13 +87,18 @@ TEST(TestRleUtil, MergedRunsInterator) { ArraySpan right_span(*right_array->data()); size_t position = 0; + size_t logical_position = 0; for (auto it = MergedRunsIterator<2>(left_span, right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), expected_run_lengths[position]); - ASSERT_EQ(it.physical_index(0), expected_left_visits[position]); - ASSERT_EQ(it.physical_index(1), expected_right_visits[position]); + ASSERT_EQ(it.index_into_buffer(0), expected_left_visits[position]); + ASSERT_EQ(it.index_into_buffer(1), expected_right_visits[position]); + ASSERT_EQ(it.index_into_array(0), expected_left_visits[position] - left_child_offset); + ASSERT_EQ(it.index_into_array(1), expected_right_visits[position] - right_child_offset); position++; + logical_position += it.run_length(); + ASSERT_EQ(it.accumulated_run_length(), logical_position); } ASSERT_EQ(position, expected_run_lengths.size()); @@ -101,22 +106,31 @@ TEST(TestRleUtil, MergedRunsInterator) { const std::vector left_only_run_lengths = {5, 10, 5, 5, 25}; position = 0; + logical_position = 0; for (auto it = MergedRunsIterator<2>(left_span, left_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); - ASSERT_EQ(it.physical_index(0), 110 + position); - ASSERT_EQ(it.physical_index(1), 110 + position); + ASSERT_EQ(it.index_into_buffer(0), 110 + position); + ASSERT_EQ(it.index_into_buffer(1), 110 + position); + ASSERT_EQ(it.index_into_array(0), 10 + position); + ASSERT_EQ(it.index_into_array(1), 10 + position); position++; + logical_position += it.run_length(); + ASSERT_EQ(it.accumulated_run_length(), logical_position); } ASSERT_EQ(position, left_only_run_lengths.size()); position = 0; + logical_position = 0; for (auto it = MergedRunsIterator<1>(left_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); - ASSERT_EQ(it.physical_index(0), 110 + position); + ASSERT_EQ(it.index_into_buffer(0), 110 + position); + ASSERT_EQ(it.index_into_array(0), 10 + position); position++; + logical_position += it.run_length(); + ASSERT_EQ(it.accumulated_run_length(), logical_position); } ASSERT_EQ(position, left_only_run_lengths.size()); @@ -124,22 +138,31 @@ TEST(TestRleUtil, MergedRunsInterator) { const std::vector right_only_run_lengths = {5, 4, 16, 25}; position = 0; + logical_position = 0; for (auto it = MergedRunsIterator<2>(right_span, right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); - ASSERT_EQ(it.physical_index(0), 205 + position); - ASSERT_EQ(it.physical_index(1), 205 + position); + ASSERT_EQ(it.index_into_buffer(0), 205 + position); + ASSERT_EQ(it.index_into_buffer(1), 205 + position); + ASSERT_EQ(it.index_into_array(0), 5 + position); + ASSERT_EQ(it.index_into_array(1), 5 + position); position++; + logical_position += it.run_length(); + ASSERT_EQ(it.accumulated_run_length(), logical_position); } ASSERT_EQ(position, right_only_run_lengths.size()); position = 0; + logical_position = 0; for (auto it = MergedRunsIterator<1>(right_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); - ASSERT_EQ(it.physical_index(0), 205 + position); + ASSERT_EQ(it.index_into_buffer(0), 205 + position); + ASSERT_EQ(it.index_into_array(0), 5 + position); position++; + logical_position += it.run_length(); + ASSERT_EQ(it.accumulated_run_length(), logical_position); } ASSERT_EQ(position, right_only_run_lengths.size()); } From f6d78d8044f190112eb832d6e6fac9e57e2797ff Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 04:59:40 +0200 Subject: [PATCH 093/107] formatting --- cpp/src/arrow/util/rle_util.h | 19 ++++++++----------- cpp/src/arrow/util/rle_util_test.cc | 18 +++++++----------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index 15e24fa30cfeb..b183da6c74ad4 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -133,22 +133,19 @@ class MergedRunsIterator { bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } - - /// \brief returns a physical index into the values array buffers of a given input, pointing to - /// the value of the current run. The index includes the array offset, so it can be used to access - /// a buffer directly + /// \brief returns a physical index into the values array buffers of a given input, + /// pointing to the value of the current run. The index includes the array offset, so it + /// can be used to access a buffer directly int64_t index_into_buffer(int64_t input_id) const { return run_index[input_id] + ValuesArray(*inputs[input_id]).offset; } - /// \brief returns a physical index into the values array of a given input, pointing to the value - /// of the current run - int64_t index_into_array(int64_t input_id) const { - return run_index[input_id]; - } + /// \brief returns a physical index into the values array of a given input, pointing to + /// the value of the current run + int64_t index_into_array(int64_t input_id) const { return run_index[input_id]; } /// \brief returns the logical length of the current run int64_t run_length() const { return merged_run_end - logical_position; } - /// \brief returns the accumulated length of all runs from the beginning of the array including - /// the current one + /// \brief returns the accumulated length of all runs from the beginning of the array + /// including the current one int64_t accumulated_run_length() const { return merged_run_end; } private: diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 02b466cd878c1..5a00259b9e96b 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -88,14 +88,14 @@ TEST(TestRleUtil, MergedRunsInterator) { size_t position = 0; size_t logical_position = 0; - for (auto it = MergedRunsIterator<2>(left_span, - right_span); + for (auto it = MergedRunsIterator<2>(left_span, right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), expected_run_lengths[position]); ASSERT_EQ(it.index_into_buffer(0), expected_left_visits[position]); ASSERT_EQ(it.index_into_buffer(1), expected_right_visits[position]); ASSERT_EQ(it.index_into_array(0), expected_left_visits[position] - left_child_offset); - ASSERT_EQ(it.index_into_array(1), expected_right_visits[position] - right_child_offset); + ASSERT_EQ(it.index_into_array(1), + expected_right_visits[position] - right_child_offset); position++; logical_position += it.run_length(); ASSERT_EQ(it.accumulated_run_length(), logical_position); @@ -107,8 +107,7 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<2>(left_span, - left_span); + for (auto it = MergedRunsIterator<2>(left_span, left_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); ASSERT_EQ(it.index_into_buffer(0), 110 + position); @@ -123,8 +122,7 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<1>(left_span); - it != MergedRunsIterator<1>(); ++it) { + for (auto it = MergedRunsIterator<1>(left_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); ASSERT_EQ(it.index_into_buffer(0), 110 + position); ASSERT_EQ(it.index_into_array(0), 10 + position); @@ -139,8 +137,7 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<2>(right_span, - right_span); + for (auto it = MergedRunsIterator<2>(right_span, right_span); it != MergedRunsIterator<2>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); ASSERT_EQ(it.index_into_buffer(0), 205 + position); @@ -155,8 +152,7 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<1>(right_span); - it != MergedRunsIterator<1>(); ++it) { + for (auto it = MergedRunsIterator<1>(right_span); it != MergedRunsIterator<1>(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); ASSERT_EQ(it.index_into_buffer(0), 205 + position); ASSERT_EQ(it.index_into_array(0), 5 + position); From 335d550025b54eec9a575fe3f6b99227a6216063 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Wed, 7 Sep 2022 16:52:27 +0200 Subject: [PATCH 094/107] rle_compare: use index_into_array --- cpp/src/arrow/compare.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index d654b74cbcad3..f03f4744e8f3a 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -397,8 +397,8 @@ class RangeDataEqualsImpl { it != rle_util::MergedRunsIterator<2>(); ++it) { RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[1], *right_.child_data[1], - it.index_into_values(0), - it.index_into_values(1), 1); + it.index_into_array(0), + it.index_into_array(1), 1); if (!impl.Compare()) { result_ = false; return Status::OK(); From 25bc98b30814517ad41430b7d833c2c49dd435e0 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 8 Sep 2022 17:36:36 +0200 Subject: [PATCH 095/107] mark rle diff as not supported correctly --- cpp/src/arrow/array/diff.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 0569e94b63c53..f457ee9814b09 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -386,6 +386,8 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::RUN_LENGTH_ENCODED) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } From de64932b8feba021fcb9e0f5c3d02254e8cac335 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 8 Sep 2022 23:53:49 +0200 Subject: [PATCH 096/107] formatting --- cpp/src/arrow/compare.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f03f4744e8f3a..99f1fd632ee36 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -396,8 +396,7 @@ class RangeDataEqualsImpl { for (auto it = rle_util::MergedRunsIterator<2>(left_span, right_span); it != rle_util::MergedRunsIterator<2>(); ++it) { RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[1], - *right_.child_data[1], - it.index_into_array(0), + *right_.child_data[1], it.index_into_array(0), it.index_into_array(1), 1); if (!impl.Compare()) { result_ = false; From a9e6aba748712f8fa7fd382571c08591c282e1eb Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Thu, 8 Sep 2022 23:54:25 +0200 Subject: [PATCH 097/107] more formatting --- cpp/src/arrow/array/array_encoded_test.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index de2b886af9ab8..e63d27007546c 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -143,16 +143,19 @@ TEST(RunLengthEncodedArray, Compare) { ASSERT_ARRAYS_EQUAL(*empty_array, *MakeArray(empty_array->data()->Copy())); // threee different slices that have the value [3, 3, 3, 4, 4, 4, 4] - ASSERT_OK_AND_ASSIGN(auto different_offsets_a, - RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[2, 5, 12, 58, 60]"), - ArrayFromJSON(int64(), "[1, 2, 3, 4, 5]"), 60)); - ASSERT_OK_AND_ASSIGN(auto different_offsets_b, - RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[81, 86, 99, 100]"), - ArrayFromJSON(int64(), "[2, 3, 4, 5]"), 100)); + ASSERT_OK_AND_ASSIGN( + auto different_offsets_a, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[2, 5, 12, 58, 60]"), + ArrayFromJSON(int64(), "[1, 2, 3, 4, 5]"), 60)); + ASSERT_OK_AND_ASSIGN( + auto different_offsets_b, + RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[81, 86, 99, 100]"), + ArrayFromJSON(int64(), "[2, 3, 4, 5]"), 100)); ASSERT_OK_AND_ASSIGN(auto different_offsets_c, RunLengthEncodedArray::Make(ArrayFromJSON(int32(), "[3, 7]"), ArrayFromJSON(int64(), "[3, 4]"), 7)); - ASSERT_ARRAYS_EQUAL(*different_offsets_a->Slice(9, 7), *different_offsets_b->Slice(83, 7)); + ASSERT_ARRAYS_EQUAL(*different_offsets_a->Slice(9, 7), + *different_offsets_b->Slice(83, 7)); ASSERT_ARRAYS_EQUAL(*different_offsets_a->Slice(9, 7), *different_offsets_c); ASSERT_ARRAYS_EQUAL(*different_offsets_b->Slice(83, 7), *different_offsets_c); } From 2eeb0fd69aa67f06c78ad4bc310e78c663305f95 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 13 Sep 2022 12:16:58 +0200 Subject: [PATCH 098/107] fix rle type construction add test --- cpp/src/arrow/type.cc | 4 ++-- cpp/src/arrow/type_test.cc | 21 +++++++++++++++++++++ cpp/src/arrow/type_traits.h | 1 + 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index dfb19555b9146..01652d52cc185 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -719,9 +719,9 @@ Result> DenseUnionType::Make( // Run-length encoded type RunLengthEncodedType::RunLengthEncodedType(std::shared_ptr encoded_type) - : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(std::move(encoded_type)) { + : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(encoded_type) { children_ = {std::make_shared("run_ends", int32(), false), - std::make_shared("values", encoded_type, true)}; + std::make_shared("values", std::move(encoded_type), true)}; } std::string RunLengthEncodedType::ToString() const { diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 2d1a0078edadd..444921d8bed05 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1821,6 +1821,27 @@ TEST(TypesTest, TestDecimalEquals) { AssertTypeNotEqual(t5, t10); } +TEST(TypesTest, TestRunLengthEncodedType) { + auto int8_make_shared = std::make_shared(list(int8())); + auto int8_factory = run_length_encoded(list(int8())); + auto int32_factory = run_length_encoded(list(int32())); + + ASSERT_EQ(*int8_make_shared, *int8_factory); + ASSERT_NE(*int8_make_shared, *int32_factory); + + ASSERT_EQ(int8_factory->id(), Type::RUN_LENGTH_ENCODED); + ASSERT_EQ(int32_factory->id(), Type::RUN_LENGTH_ENCODED); + + auto int8_rle_type = std::dynamic_pointer_cast(int8_factory); + auto int32_rle_type = std::dynamic_pointer_cast(int32_factory); + + ASSERT_EQ(*int8_rle_type->encoded_type(), *list(int8())); + ASSERT_EQ(*int32_rle_type->encoded_type(), *list(int32())); + + ASSERT_TRUE(int8_rle_type->field(0)->Equals(Field("run_ends", int32(), false))); + ASSERT_TRUE(int8_rle_type->field(1)->Equals(Field("values", list(int8()), true))); +} + #define TEST_PREDICATE(all_types, type_predicate) \ for (auto type : all_types) { \ ASSERT_EQ(type_predicate(type->id()), type_predicate(*type)); \ diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 698068e7d19de..f9a538fe58995 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -1192,6 +1192,7 @@ static inline bool is_nested(Type::type type_id) { case Type::STRUCT: case Type::SPARSE_UNION: case Type::DENSE_UNION: + case Type::RUN_LENGTH_ENCODED: return true; default: break; From 3b5ace16b4e82a2c8c5419bef08c48797aef9728 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 13 Sep 2022 12:23:25 +0200 Subject: [PATCH 099/107] test rle type string --- cpp/src/arrow/type_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 444921d8bed05..f975ef057124a 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1840,6 +1840,8 @@ TEST(TypesTest, TestRunLengthEncodedType) { ASSERT_TRUE(int8_rle_type->field(0)->Equals(Field("run_ends", int32(), false))); ASSERT_TRUE(int8_rle_type->field(1)->Equals(Field("values", list(int8()), true))); + + ASSERT_EQ(int8_factory->ToString(), "run_length_encoded>"); } #define TEST_PREDICATE(all_types, type_predicate) \ From 6733c8f7261d7e1056938378fc894380cf353c51 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 13 Sep 2022 17:46:28 +0200 Subject: [PATCH 100/107] rle_util_test: fix too big Slice() on NullArray --- cpp/src/arrow/util/rle_util_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index 5a00259b9e96b..ac0c17fdbca16 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -74,7 +74,7 @@ TEST(TestRleUtil, MergedRunsInterator) { std::shared_ptr right_child = std::make_shared(right_child_offset + right_run_ends->length()); - left_child = left_child->Slice(left_child_offset, 50); + left_child = left_child->Slice(left_child_offset); right_child = right_child->Slice(right_child_offset); ASSERT_OK_AND_ASSIGN(std::shared_ptr left_array, From 0d80e3a43270b566c31a242e680f95afe568f4ff Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 13 Sep 2022 19:36:44 +0200 Subject: [PATCH 101/107] add diagram for MergedRunsInterator test --- cpp/src/arrow/util/rle_util_test.cc | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index ac0c17fdbca16..bc4c231ceeef3 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -57,6 +57,42 @@ TEST(TestRleUtil, ArtificalOffset) { } TEST(TestRleUtil, MergedRunsInterator) { + /* Construct the following two test arrays with a lot of different offstes to test the + * RLE iterator: left: + * + * child offset: 0 + * | + * +---+---+---+---+---+---+---+---+---+----+----+----+----+----+-----+ + * run_ends | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |1000|1005|1015|1020|1025|30000| + * (Int32Array) +---+---+---+---+---+---+---+---+---+----+----+----+----+----+-----+ + * ---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+-----+ + * values ... | | | | | | | | | | | | | | | | + * (NullArray) ---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+-----+ + * |<--------------- slice of NullArray------------------------------>| + * | | logical length: 50 | + * child offset: 100 |<-------------------->| + * | physical length: 5 | + * | | + * logical offset: 1000 + * physical offset: 10 + * + * right: + * child offset: 0 + * | + * +---+---+---+---+---+------+------+------+------+ + * run_ends | 1 | 2 | 3 | 4 | 5 | 2005 | 2009 | 2025 | 2050 | + * (Int32Array) +---+---+---+---+---+------+------+------+------+ + * ---+---+---+---+---+---+------+------+------+------+ + * values ... | | | | | | | | | | + * (NullArray) ---+---+---+---+---+---+------+------+------+------+ + * |<-------- slice of NullArray------------------>| + * | | logical length: 50 | + * child offset: 200 |<-------------------->| + * | physical length: 4 + * | + * logical offset: 2000 + * physical offset: 5 + */ const auto left_run_ends = ArrayFromJSON( int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); const auto right_run_ends = From 87315598cd75b7e2dc68309e75b199107b0d75cd Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Tue, 13 Sep 2022 20:00:23 +0200 Subject: [PATCH 102/107] fix typo --- cpp/src/arrow/util/rle_util_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index bc4c231ceeef3..f09e8e983058f 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -57,7 +57,7 @@ TEST(TestRleUtil, ArtificalOffset) { } TEST(TestRleUtil, MergedRunsInterator) { - /* Construct the following two test arrays with a lot of different offstes to test the + /* Construct the following two test arrays with a lot of different offsets to test the * RLE iterator: left: * * child offset: 0 From 22726617ac1286705f3e7860cb957a84c4ec8334 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 26 Sep 2022 17:33:55 +0200 Subject: [PATCH 103/107] rle type/array: support non-int32 run ends arrays --- cpp/src/arrow/array/array_encoded.cc | 10 +++--- cpp/src/arrow/array/array_encoded_test.cc | 41 +++++++++++++++++------ cpp/src/arrow/type.cc | 24 +++++++++---- cpp/src/arrow/type.h | 7 +++- cpp/src/arrow/type_fwd.h | 4 +-- cpp/src/arrow/type_test.cc | 17 +++++++--- 6 files changed, 74 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded.cc b/cpp/src/arrow/array/array_encoded.cc index 624bf8cc9ba33..1081c25b7f3f6 100644 --- a/cpp/src/arrow/array/array_encoded.cc +++ b/cpp/src/arrow/array/array_encoded.cc @@ -43,16 +43,16 @@ RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr& ty Result> RunLengthEncodedArray::Make( const std::shared_ptr& run_ends_array, const std::shared_ptr& values_array, int64_t logical_length, int64_t offset) { - if (run_ends_array->type_id() != Type::INT32) { - return Status::Invalid("Run ends array must be int32 type"); + if (!RunLengthEncodedType::RunEndsTypeValid(*run_ends_array->type())) { + return Status::Invalid("Run ends array must be int16, int32 or int64 type"); } if (run_ends_array->null_count() != 0) { return Status::Invalid("Run ends array cannot contain null values"); } - return std::make_shared(run_length_encoded(values_array->type()), - logical_length, run_ends_array, - values_array, offset); + return std::make_shared( + run_length_encoded(run_ends_array->type(), values_array->type()), logical_length, + run_ends_array, values_array, offset); } std::shared_ptr RunLengthEncodedArray::values_array() const { diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index 8a476ee560ff7..da61841bf82a3 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -40,11 +40,27 @@ using internal::checked_cast; namespace { -auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); -auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); -auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); +class TestRunLengthEncodedArray + : public ::testing::TestWithParam> { + protected: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + std::shared_ptr size_values; + std::shared_ptr size_only_null; -TEST(RunLengthEncodedArray, MakeArray) { + virtual void SetUp() override { + std::shared_ptr run_ends_type = GetParam(); + + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); + int16_values = ArrayFromJSON(int16(), "[10, 20, 30]"); + size_values = ArrayFromJSON(run_ends_type, "[10, 20, 30]"); + size_only_null = ArrayFromJSON(run_ends_type, "[null, null, null]"); + } +}; + +TEST_P(TestRunLengthEncodedArray, MakeArray) { ASSERT_OK_AND_ASSIGN(auto rle_array, RunLengthEncodedArray::Make(int32_values, string_values, 3)); auto array_data = rle_array->data(); @@ -55,14 +71,14 @@ TEST(RunLengthEncodedArray, MakeArray) { ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); } -TEST(RunLengthEncodedArray, FromRunEndsAndValues) { +TEST_P(TestRunLengthEncodedArray, FromRunEndsAndValues) { std::shared_ptr rle_array; ASSERT_OK_AND_ASSIGN(rle_array, - RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + RunLengthEncodedArray::Make(size_values, int32_values, 3)); ASSERT_EQ(rle_array->length(), 3); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); - ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *size_values); ASSERT_EQ(rle_array->offset(), 0); ASSERT_EQ(rle_array->data()->null_count, 0); // one dummy buffer, since code may assume there is exactly one buffer @@ -70,21 +86,24 @@ TEST(RunLengthEncodedArray, FromRunEndsAndValues) { // explicitly passing offset ASSERT_OK_AND_ASSIGN(rle_array, - RunLengthEncodedArray::Make(int32_values, string_values, 2, 1)); + RunLengthEncodedArray::Make(size_values, string_values, 2, 1)); ASSERT_EQ(rle_array->length(), 2); ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); - ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *size_values); ASSERT_EQ(rle_array->offset(), 1); // explicitly access null count variable so it is not calculated automatically ASSERT_EQ(rle_array->data()->null_count, 0); - ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array must be int32 type", + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Run ends array must be int16, int32 or int64 type", RunLengthEncodedArray::Make(string_values, int32_values, 3)); ASSERT_RAISES_WITH_MESSAGE( Invalid, "Invalid: Run ends array cannot contain null values", - RunLengthEncodedArray::Make(int32_only_null, int32_values, 3)); + RunLengthEncodedArray::Make(size_only_null, int32_values, 3)); } +INSTANTIATE_TEST_SUITE_P(EncodedArrayTests, TestRunLengthEncodedArray, + ::testing::Values(int16(), int32(), int64())); } // anonymous namespace } // namespace arrow diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 01652d52cc185..aa0398122f612 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -718,18 +718,26 @@ Result> DenseUnionType::Make( // ---------------------------------------------------------------------- // Run-length encoded type -RunLengthEncodedType::RunLengthEncodedType(std::shared_ptr encoded_type) +RunLengthEncodedType::RunLengthEncodedType(std::shared_ptr run_ends_type, + std::shared_ptr encoded_type) : NestedType(Type::RUN_LENGTH_ENCODED), EncodingType(encoded_type) { - children_ = {std::make_shared("run_ends", int32(), false), + assert(RunEndsTypeValid(*run_ends_type)); + children_ = {std::make_shared("run_ends", run_ends_type, false), std::make_shared("values", std::move(encoded_type), true)}; } std::string RunLengthEncodedType::ToString() const { std::stringstream s; - s << name() << "<" << encoded_type()->ToString() << ">"; + s << name() << "ToString() + << ", values: " << encoded_type()->ToString() << ">"; return s.str(); } +bool RunLengthEncodedType::RunEndsTypeValid(const DataType& run_ends_type) { + return run_ends_type.id() == Type::INT16 || run_ends_type.id() == Type::INT32 || + run_ends_type.id() == Type::INT64; +} + // ---------------------------------------------------------------------- // Struct type @@ -2155,7 +2163,8 @@ std::string UnionType::ComputeFingerprint() const { std::string RunLengthEncodedType::ComputeFingerprint() const { std::stringstream ss; ss << TypeIdFingerprint(*this) << "{"; - ss << encoded_type()->fingerprint(); + ss << run_ends_type()->fingerprint() << ";"; + ss << encoded_type()->fingerprint() << ";"; ss << "}"; return ss.str(); } @@ -2296,8 +2305,11 @@ std::shared_ptr struct_(const std::vector>& fie return std::make_shared(fields); } -std::shared_ptr run_length_encoded(std::shared_ptr encoded_type) { - return std::make_shared(std::move(encoded_type)); +std::shared_ptr run_length_encoded( + std::shared_ptr run_ends_type, + std::shared_ptr encoded_type) { + return std::make_shared(std::move(run_ends_type), + std::move(encoded_type)); } std::shared_ptr sparse_union(FieldVector child_fields, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 5525c78cb2772..d474538da2061 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1222,7 +1222,8 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType static constexpr const char* type_name() { return "run_length_encoded"; } - explicit RunLengthEncodedType(std::shared_ptr encoded_type); + explicit RunLengthEncodedType(std::shared_ptr run_ends_type, + std::shared_ptr encoded_type); DataTypeLayout layout() const override { // always add one that is NULLPTR to make code, since existing code may assume there @@ -1230,10 +1231,14 @@ class ARROW_EXPORT RunLengthEncodedType : public NestedType, public EncodingType return DataTypeLayout({DataTypeLayout::AlwaysNull()}); } + const std::shared_ptr& run_ends_type() const { return fields()[0]->type(); } + std::string ToString() const override; std::string name() const override { return "run_length_encoded"; } + static bool RunEndsTypeValid(const DataType& run_ends_type); + private: std::string ComputeFingerprint() const override; }; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index c90192e661b13..2b681c87c532b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -553,8 +553,8 @@ std::shared_ptr ARROW_EXPORT struct_(const std::vector>& fields); /// \brief Create a RunLengthEncoded instance -std::shared_ptr ARROW_EXPORT -run_length_encoded(std::shared_ptr encoded_type); +std::shared_ptr ARROW_EXPORT run_length_encoded( + std::shared_ptr run_ends_type, std::shared_ptr encoded_type); /// \brief Create a SparseUnionType instance std::shared_ptr ARROW_EXPORT sparse_union(FieldVector child_fields, diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index f975ef057124a..ef4b0d9470d17 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1822,9 +1822,9 @@ TEST(TypesTest, TestDecimalEquals) { } TEST(TypesTest, TestRunLengthEncodedType) { - auto int8_make_shared = std::make_shared(list(int8())); - auto int8_factory = run_length_encoded(list(int8())); - auto int32_factory = run_length_encoded(list(int32())); + auto int8_make_shared = std::make_shared(int32(), list(int8())); + auto int8_factory = run_length_encoded(int32(), list(int8())); + auto int32_factory = run_length_encoded(int32(), list(int32())); ASSERT_EQ(*int8_make_shared, *int8_factory); ASSERT_NE(*int8_make_shared, *int32_factory); @@ -1841,7 +1841,16 @@ TEST(TypesTest, TestRunLengthEncodedType) { ASSERT_TRUE(int8_rle_type->field(0)->Equals(Field("run_ends", int32(), false))); ASSERT_TRUE(int8_rle_type->field(1)->Equals(Field("values", list(int8()), true))); - ASSERT_EQ(int8_factory->ToString(), "run_length_encoded>"); + auto int16_int32 = run_length_encoded(int16(), list(int32())); + auto int64_int32 = run_length_encoded(int64(), list(int32())); + ASSERT_NE(*int32_factory, *int16_int32); + ASSERT_NE(*int32_factory, *int64_int32); + ASSERT_NE(*int16_int32, *int64_int32); + + ASSERT_EQ(int8_factory->ToString(), + "run_length_encoded>"); + ASSERT_EQ(int16_int32->ToString(), + "run_length_encoded>"); } #define TEST_PREDICATE(all_types, type_predicate) \ From bda49e774c201900d161dd35514090504abc83ed Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Mon, 26 Sep 2022 17:52:58 +0200 Subject: [PATCH 104/107] type_fwd: C++17 compatibility --- cpp/src/arrow/type_fwd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 10881a3e3f0ce..df60e76357b80 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -553,7 +553,7 @@ ARROW_EXPORT std::shared_ptr struct_( const std::vector>& fields); /// \brief Create a RunLengthEncoded instance -std::shared_ptr ARROW_EXPORT run_length_encoded( +ARROW_EXPORT std::shared_ptr run_length_encoded( std::shared_ptr run_ends_type, std::shared_ptr encoded_type); /// \brief Create a SparseUnionType instance From 90a0814922eb20c10bebac515a131f15354599cf Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 7 Oct 2022 20:05:59 +0200 Subject: [PATCH 105/107] rle_util: support different types for run ends array --- cpp/src/arrow/util/rle_util.cc | 39 ++++-- cpp/src/arrow/util/rle_util.h | 191 +++++++++++++++++----------- cpp/src/arrow/util/rle_util_test.cc | 107 +++++++++------- 3 files changed, 213 insertions(+), 124 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index a2bb482eb0efc..d5a998293bab0 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -5,7 +5,8 @@ namespace arrow { namespace rle_util { -int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, +template +int64_t FindPhysicalOffset(const RunEndsType* run_ends, int64_t num_run_ends, int64_t logical_offset) { auto it = std::upper_bound(run_ends, run_ends + num_run_ends, logical_offset); int64_t result = std::distance(run_ends, it); @@ -23,7 +24,27 @@ void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { int64_t GetPhysicalOffset(const ArraySpan& span) { // TODO: caching - return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); + if (span.type->id() == Type::INT16) { + return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, + span.offset); + } else if (span.type->id() == Type::INT32) { + return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, + span.offset); + } else { + ARROW_CHECK(span.type->id() == Type::INT64); + return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, + span.offset); + } +} + +template +static int64_t GetPhysicalLengthInternal(const ArraySpan& span) { + // find the offset of the last element and add 1 + int64_t physical_offset = GetPhysicalOffset(span); + return FindPhysicalOffset(RunEnds(span) + physical_offset, + RunEndsArray(span).length - physical_offset, + span.offset + span.length - 1) + + 1; } int64_t GetPhysicalLength(const ArraySpan& span) { @@ -31,12 +52,14 @@ int64_t GetPhysicalLength(const ArraySpan& span) { if (span.length == 0) { return 0; } else { - // find the offset of the last element and add 1 - int64_t physical_offset = GetPhysicalOffset(span); - return FindPhysicalOffset(RunEnds(span) + physical_offset, - RunEndsArray(span).length - physical_offset, - span.offset + span.length - 1) + - 1; + if (span.type->id() == Type::INT16) { + return GetPhysicalLengthInternal(span); + } else if (span.type->id() == Type::INT32) { + return GetPhysicalLengthInternal(span); + } else { + ARROW_CHECK(span.type->id() == Type::INT64); + return GetPhysicalLengthInternal(span); + } } } diff --git a/cpp/src/arrow/util/rle_util.h b/cpp/src/arrow/util/rle_util.h index b183da6c74ad4..2976e5ce9769e 100644 --- a/cpp/src/arrow/util/rle_util.h +++ b/cpp/src/arrow/util/rle_util.h @@ -31,7 +31,8 @@ namespace rle_util { /// \brief Get the physical offset from a logical offset given run end values using binary /// search. Returns num_run_ends if the physical offset is not within the first /// num_run_ends elements. -int64_t FindPhysicalOffset(const int32_t* run_ends, int64_t num_run_ends, +template +int64_t FindPhysicalOffset(const RunEndsType* run_ends, int64_t num_run_ends, int64_t logical_offset); /// \brief Get the physical offset of an RLE ArraySpan. Warning: calling this may result @@ -49,8 +50,9 @@ static inline const ArraySpan& RunEndsArray(const ArraySpan& span) { } /// \brief Get a pointer to run ends values of an RLE array -static inline const int32_t* RunEnds(const ArraySpan& span) { - return RunEndsArray(span).GetValues(1); +template +static inline const RunEndsType* RunEnds(const ArraySpan& span) { + return RunEndsArray(span).GetValues(1); } /// \brief Get the child array holding the data values from an RLE array @@ -60,63 +62,60 @@ static inline const ArraySpan& ValuesArray(const ArraySpan& span) { /// \brief Iterate over two run-length encoded arrays in segments of runs that are inside /// run boundaries in each input -template +template class MergedRunsIterator { public: - // end iterator - MergedRunsIterator() {} - - // TODO: genereric constructor - MergedRunsIterator(const ArraySpan& a) { - static_assert(NUM_INPUTS == 1, "incorrect number of inputs"); - - inputs[0] = &a; - - logical_length = a.length; - - for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - const ArraySpan* input = inputs[input_id]; - run_index[input_id] = rle_util::FindPhysicalOffset( - RunEnds(*input), RunEndsArray(*input).length, input->offset); - } - if (!isEnd()) { - FindMergedRun(); + static constexpr size_t NUM_INPUTS = sizeof...(RunEndsTypes); + template + explicit MergedRunsIterator(InputTypes&... array_spans) : inputs(array_spans...) { + static_assert(sizeof...(InputTypes) == sizeof...(RunEndsTypes), + "number of run ends types and input ArraySpans must be the same"); + if constexpr (NUM_INPUTS == 0) { + // end interator + logical_length_ = 0; + } else { + logical_length_ = FindCommonLength(); + if (!isEnd()) { + FindMergedRun(); + } } } - MergedRunsIterator(const ArraySpan& a, const ArraySpan& b) { - static_assert(NUM_INPUTS == 2, "incorrect number of inputs"); - - inputs[0] = &a; - inputs[1] = &b; - - ARROW_DCHECK_EQ(a.length, b.length); - logical_length = a.length; - - for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - const ArraySpan* input = inputs[input_id]; - run_index[input_id] = rle_util::FindPhysicalOffset( - RunEnds(*input), RunEndsArray(*input).length, input->offset); - } - if (!isEnd()) { - FindMergedRun(); + /*explicit MergedRunsIterator(ArraySpan array_span) : inputs(Input(array_span)) + { + //static_assert(sizeof...(InputTypes) == sizeof...(RunEndsTypes), "number of run ends + types and input ArraySpans must be the same"); if constexpr (NUM_INPUTS == 0) { + // end interator + logical_length_ = 0; + } else { + logical_length_ = FindCommonLength(); + if (!isEnd()) { + FindMergedRun(); + } } } + explicit MergedRunsIterator(ArraySpan array_span_a, ArraySpan array_span_b) : + inputs(Input(array_span_a), Input(array_span_b)) { + //static_assert(sizeof...(InputTypes) == sizeof...(RunEndsTypes), "number of run ends + types and input ArraySpans must be the same"); if constexpr (NUM_INPUTS == 0) { + // end interator + logical_length_ = 0; + } else { + logical_length_ = FindCommonLength(); + if (!isEnd()) { + FindMergedRun(); + } + } + }*/ MergedRunsIterator(const MergedRunsIterator& other) = default; MergedRunsIterator& operator++() { - logical_position = merged_run_end; - - for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - if (logical_position == run_end[input_id]) { - run_index[input_id]++; - } - } + logical_position_ = merged_run_end_; + IncrementInputs(); if (!isEnd()) { FindMergedRun(); } - return *this; } @@ -126,52 +125,100 @@ class MergedRunsIterator { return result; } - bool operator==(const MergedRunsIterator& other) const { + template + bool operator==(const MergedRunsIterator& other) const { return (isEnd() && other.isEnd()) || - (!isEnd() && !other.isEnd() && logical_position == other.logical_position); + (!isEnd() && !other.isEnd() && logical_position_ == other.logical_position()); } - bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); } + template + bool operator!=(const MergedRunsIterator& other) const { + return !(*this == other); + } /// \brief returns a physical index into the values array buffers of a given input, /// pointing to the value of the current run. The index includes the array offset, so it /// can be used to access a buffer directly - int64_t index_into_buffer(int64_t input_id) const { - return run_index[input_id] + ValuesArray(*inputs[input_id]).offset; + template + int64_t index_into_buffer() const { + auto& input = std::get(inputs); + return input.run_index + ValuesArray(input.array_span).offset; } /// \brief returns a physical index into the values array of a given input, pointing to /// the value of the current run - int64_t index_into_array(int64_t input_id) const { return run_index[input_id]; } + template + int64_t index_into_array() const { + return std::get(inputs).run_index; + } /// \brief returns the logical length of the current run - int64_t run_length() const { return merged_run_end - logical_position; } + int64_t run_length() const { return merged_run_end_ - logical_position_; } /// \brief returns the accumulated length of all runs from the beginning of the array /// including the current one - int64_t accumulated_run_length() const { return merged_run_end; } + int64_t accumulated_run_length() const { return merged_run_end_; } + + bool isEnd() const { return logical_position_ == logical_length_; } + int64_t logical_position() const { return logical_position_; } private: + template + struct Input { + Input(const ArraySpan& array_span) : array_span{array_span} { + run_ends = RunEnds(array_span); + run_index = rle_util::FindPhysicalOffset(run_ends, RunEndsArray(array_span).length, + array_span.offset); + // actual value found later by FindMergedRun: + current_run_end = 0; + } + + const ArraySpan& array_span; + const RunEndsType* run_ends; + int64_t run_index; + int64_t current_run_end; + }; + + template void FindMergedRun() { - merged_run_end = std::numeric_limits::max(); - for (size_t input_id = 0; input_id < NUM_INPUTS; input_id++) { - // logical indices of the end of the run we are currently in each input - run_end[input_id] = - RunEnds(*inputs[input_id])[run_index[input_id]] - inputs[input_id]->offset; - // the logical length may end in the middle of a run, in case the array was sliced - run_end[input_id] = std::min(run_end[input_id], logical_length); - ARROW_DCHECK_GT(run_end[input_id], logical_position); - - merged_run_end = std::min(merged_run_end, run_end[input_id]); + if constexpr (input_id == 0) { + merged_run_end_ = std::numeric_limits::max(); + } + auto& input = std::get(inputs); + // logical indices of the end of the run we are currently in each input + input.current_run_end = input.run_ends[input.run_index] - input.array_span.offset; + // the logical length may end in the middle of a run, in case the array was sliced + input.current_run_end = std::min(input.current_run_end, logical_length_); + ARROW_DCHECK_GT(input.current_run_end, logical_position_); + merged_run_end_ = std::min(merged_run_end_, input.current_run_end); + if constexpr (input_id < NUM_INPUTS - 1) { + FindMergedRun(); + } + } + + template + int64_t FindCommonLength() { + int64_t our_length = std::get(inputs).array_span.length; + if constexpr (input_id < NUM_INPUTS - 1) { + int64_t other_length = FindCommonLength(); + ARROW_CHECK_EQ(our_length, other_length) + << "MergedRunsIteratror can only be used on arrays of the same length"; } + return our_length; } - bool isEnd() const { return logical_position == logical_length; } + template + void IncrementInputs() { + auto& input = std::get(inputs); + if (logical_position_ == input.current_run_end) { + input.run_index++; + } + if constexpr (input_id < NUM_INPUTS - 1) { + IncrementInputs(); + } + } - std::array inputs; - std::array run_index; - // logical indices of the end of the run we are currently in each input - std::array run_end; - int64_t logical_position = 0; - int64_t logical_length = 0; - int64_t merged_run_end; + std::tuple...> inputs; + int64_t logical_position_ = 0; + int64_t logical_length_ = 0; + int64_t merged_run_end_; }; // TODO: this may fit better into some testing header diff --git a/cpp/src/arrow/util/rle_util_test.cc b/cpp/src/arrow/util/rle_util_test.cc index f09e8e983058f..b20bd8b89cea5 100644 --- a/cpp/src/arrow/util/rle_util_test.cc +++ b/cpp/src/arrow/util/rle_util_test.cc @@ -21,42 +21,50 @@ #include "arrow/builder.h" #include "arrow/compute/api_vector.h" #include "arrow/testing/gtest_util.h" +#include "arrow/type_traits.h" #include "arrow/util/rle_util.h" namespace arrow { namespace rle_util { -TEST(TestRleUtil, FindPhysicalOffsetTest) { - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1}, 1, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 1), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 2), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 0), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 1), 0); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 2), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 3), 2); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 3), 1); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, - 1015, 1020, 1025, 1050}, +template +struct RleUtilTest : public ::testing::Test {}; +TYPED_TEST_SUITE_P(RleUtilTest); + +TYPED_TEST_P(RleUtilTest, PhysicalOffset) { + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){1}, 1, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){1, 2, 3}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){1, 2, 3}, 3, 1), 1); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){1, 2, 3}, 3, 2), 2); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 3, 4}, 3, 0), 0); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 3, 4}, 3, 1), 0); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 3, 4}, 3, 2), 1); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 3, 4}, 3, 3), 2); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 4, 6}, 3, 3), 1); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, + 1015, 1020, 1025, 1050}, 15, 1000), 10); // out-of-range logical offset should return num_run_ends - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 6), 3); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 10000), 3); - ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 0, 5), 0); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 4, 6}, 3, 6), 3); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 4, 6}, 3, 10000), 3); + ASSERT_EQ(FindPhysicalOffset((const TypeParam[]){2, 4, 6}, 0, 5), 0); } -TEST(TestRleUtil, ArtificalOffset) { +TYPED_TEST_P(RleUtilTest, ArtificalOffset) { + const std::shared_ptr run_ends_type = + std::make_shared::ArrowType>(); + const auto values = ArrayFromJSON(int32(), "[1, 2, 3]"); - const auto run_ends = ArrayFromJSON(int32(), "[10, 20, 30]"); + const auto run_ends = ArrayFromJSON(run_ends_type, "[10, 20, 30]"); ASSERT_OK_AND_ASSIGN(auto array, RunLengthEncodedArray::Make(run_ends, values, 30)); AddArtificialOffsetInChildArray(array->data().get(), 100); ASSERT_ARRAYS_EQUAL(*values, *array->values_array()); ASSERT_EQ(array->values_array()->offset(), 100); } -TEST(TestRleUtil, MergedRunsInterator) { +TYPED_TEST_P(RleUtilTest, MergedRunsInterator) { /* Construct the following two test arrays with a lot of different offsets to test the * RLE iterator: left: * @@ -93,10 +101,13 @@ TEST(TestRleUtil, MergedRunsInterator) { * logical offset: 2000 * physical offset: 5 */ + const std::shared_ptr run_ends_type = + std::make_shared::ArrowType>(); + const auto left_run_ends = ArrayFromJSON( - int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); + run_ends_type, "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 1025, 30000]"); const auto right_run_ends = - ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); + ArrayFromJSON(run_ends_type, "[1, 2, 3, 4, 5, 2005, 2009, 2025, 2050]"); const std::vector expected_run_lengths = {5, 4, 6, 5, 5, 25}; const std::vector expected_left_visits = {110, 111, 111, 112, 113, 114}; const std::vector expected_right_visits = {205, 206, 207, 207, 207, 208}; @@ -124,13 +135,14 @@ TEST(TestRleUtil, MergedRunsInterator) { size_t position = 0; size_t logical_position = 0; - for (auto it = MergedRunsIterator<2>(left_span, right_span); - it != MergedRunsIterator<2>(); ++it) { + for (auto it = MergedRunsIterator(left_span, right_span); + it != MergedRunsIterator(); ++it) { ASSERT_EQ(it.run_length(), expected_run_lengths[position]); - ASSERT_EQ(it.index_into_buffer(0), expected_left_visits[position]); - ASSERT_EQ(it.index_into_buffer(1), expected_right_visits[position]); - ASSERT_EQ(it.index_into_array(0), expected_left_visits[position] - left_child_offset); - ASSERT_EQ(it.index_into_array(1), + ASSERT_EQ(it.template index_into_buffer<0>(), expected_left_visits[position]); + ASSERT_EQ(it.template index_into_buffer<1>(), expected_right_visits[position]); + ASSERT_EQ(it.template index_into_array<0>(), + expected_left_visits[position] - left_child_offset); + ASSERT_EQ(it.template index_into_array<1>(), expected_right_visits[position] - right_child_offset); position++; logical_position += it.run_length(); @@ -143,13 +155,13 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<2>(left_span, left_span); - it != MergedRunsIterator<2>(); ++it) { + for (auto it = MergedRunsIterator(left_span, left_span); + it != MergedRunsIterator(); ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); - ASSERT_EQ(it.index_into_buffer(0), 110 + position); - ASSERT_EQ(it.index_into_buffer(1), 110 + position); - ASSERT_EQ(it.index_into_array(0), 10 + position); - ASSERT_EQ(it.index_into_array(1), 10 + position); + ASSERT_EQ(it.template index_into_buffer<0>(), 110 + position); + ASSERT_EQ(it.template index_into_buffer<1>(), 110 + position); + ASSERT_EQ(it.template index_into_array<0>(), 10 + position); + ASSERT_EQ(it.template index_into_array<1>(), 10 + position); position++; logical_position += it.run_length(); ASSERT_EQ(it.accumulated_run_length(), logical_position); @@ -158,10 +170,11 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<1>(left_span); it != MergedRunsIterator<1>(); ++it) { + for (auto it = MergedRunsIterator(left_span); it != MergedRunsIterator(); + ++it) { ASSERT_EQ(it.run_length(), left_only_run_lengths[position]); - ASSERT_EQ(it.index_into_buffer(0), 110 + position); - ASSERT_EQ(it.index_into_array(0), 10 + position); + ASSERT_EQ(it.template index_into_buffer<0>(), 110 + position); + ASSERT_EQ(it.template index_into_array<0>(), 10 + position); position++; logical_position += it.run_length(); ASSERT_EQ(it.accumulated_run_length(), logical_position); @@ -173,13 +186,13 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<2>(right_span, right_span); - it != MergedRunsIterator<2>(); ++it) { + for (auto it = MergedRunsIterator(right_span, right_span); + it != MergedRunsIterator(); ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); - ASSERT_EQ(it.index_into_buffer(0), 205 + position); - ASSERT_EQ(it.index_into_buffer(1), 205 + position); - ASSERT_EQ(it.index_into_array(0), 5 + position); - ASSERT_EQ(it.index_into_array(1), 5 + position); + ASSERT_EQ(it.template index_into_buffer<0>(), 205 + position); + ASSERT_EQ(it.template index_into_buffer<1>(), 205 + position); + ASSERT_EQ(it.template index_into_array<0>(), 5 + position); + ASSERT_EQ(it.template index_into_array<1>(), 5 + position); position++; logical_position += it.run_length(); ASSERT_EQ(it.accumulated_run_length(), logical_position); @@ -188,10 +201,11 @@ TEST(TestRleUtil, MergedRunsInterator) { position = 0; logical_position = 0; - for (auto it = MergedRunsIterator<1>(right_span); it != MergedRunsIterator<1>(); ++it) { + for (auto it = MergedRunsIterator(right_span); it != MergedRunsIterator(); + ++it) { ASSERT_EQ(it.run_length(), right_only_run_lengths[position]); - ASSERT_EQ(it.index_into_buffer(0), 205 + position); - ASSERT_EQ(it.index_into_array(0), 5 + position); + ASSERT_EQ(it.template index_into_buffer<0>(), 205 + position); + ASSERT_EQ(it.template index_into_array<0>(), 5 + position); position++; logical_position += it.run_length(); ASSERT_EQ(it.accumulated_run_length(), logical_position); @@ -199,5 +213,10 @@ TEST(TestRleUtil, MergedRunsInterator) { ASSERT_EQ(position, right_only_run_lengths.size()); } +REGISTER_TYPED_TEST_SUITE_P(RleUtilTest, PhysicalOffset, ArtificalOffset, + MergedRunsInterator); +using RunEndsTypes = testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(RleUtilTestt, RleUtilTest, RunEndsTypes); + } // namespace rle_util } // namespace arrow From 8c2378806951eb2483dd95926175cbeed1acc6c4 Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 7 Oct 2022 20:21:25 +0200 Subject: [PATCH 106/107] fix run-ends type detection in GetPhysicalOffset and GetPhysicalLength --- cpp/src/arrow/util/rle_util.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/rle_util.cc b/cpp/src/arrow/util/rle_util.cc index d5a998293bab0..83a6878cc0bf3 100644 --- a/cpp/src/arrow/util/rle_util.cc +++ b/cpp/src/arrow/util/rle_util.cc @@ -24,14 +24,15 @@ void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { int64_t GetPhysicalOffset(const ArraySpan& span) { // TODO: caching - if (span.type->id() == Type::INT16) { + auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); - } else if (span.type->id() == Type::INT32) { + } else if (type_id == Type::INT32) { return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); } else { - ARROW_CHECK(span.type->id() == Type::INT64); + ARROW_CHECK(type_id == Type::INT64); return FindPhysicalOffset(RunEnds(span), RunEndsArray(span).length, span.offset); } @@ -52,12 +53,13 @@ int64_t GetPhysicalLength(const ArraySpan& span) { if (span.length == 0) { return 0; } else { - if (span.type->id() == Type::INT16) { + auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { return GetPhysicalLengthInternal(span); - } else if (span.type->id() == Type::INT32) { + } else if (type_id == Type::INT32) { return GetPhysicalLengthInternal(span); } else { - ARROW_CHECK(span.type->id() == Type::INT64); + ARROW_CHECK_EQ(type_id, Type::INT64); return GetPhysicalLengthInternal(span); } } From 8534f88816ede47b7d766a7de206d9f9548dfb1d Mon Sep 17 00:00:00 2001 From: Tobias Zagorni Date: Fri, 7 Oct 2022 20:22:16 +0200 Subject: [PATCH 107/107] test mutltple run ends types in rle offset/length test --- cpp/src/arrow/array/array_encoded_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/array_encoded_test.cc b/cpp/src/arrow/array/array_encoded_test.cc index ca1f13243b661..c15e77a9248e0 100644 --- a/cpp/src/arrow/array/array_encoded_test.cc +++ b/cpp/src/arrow/array/array_encoded_test.cc @@ -43,6 +43,7 @@ namespace { class TestRunLengthEncodedArray : public ::testing::TestWithParam> { protected: + std::shared_ptr run_ends_type; std::shared_ptr string_values; std::shared_ptr int32_values; std::shared_ptr int16_values; @@ -50,8 +51,7 @@ class TestRunLengthEncodedArray std::shared_ptr size_only_null; virtual void SetUp() override { - std::shared_ptr run_ends_type = GetParam(); - + run_ends_type = GetParam(); string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); int16_values = ArrayFromJSON(int16(), "[10, 20, 30]"); @@ -102,8 +102,8 @@ TEST_P(TestRunLengthEncodedArray, FromRunEndsAndValues) { RunLengthEncodedArray::Make(size_only_null, int32_values, 3)); } -TEST(RunLengthEncodedArray, OffsetLength) { - auto run_ends = ArrayFromJSON(int32(), "[100, 200, 300, 400, 500]"); +TEST_P(TestRunLengthEncodedArray, OffsetLength) { + auto run_ends = ArrayFromJSON(run_ends_type, "[100, 200, 300, 400, 500]"); auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "RLE"])"); ASSERT_OK_AND_ASSIGN(auto rle_array, RunLengthEncodedArray::Make(run_ends, values, 500));