diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index b4c727997ee7e..3b4736327b47c 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -56,6 +56,35 @@ TEST_F(TestArray, TestLength) { ASSERT_EQ(arr->length(), 100); } +ArrayPtr MakeArrayFromValidBytes(const std::vector& v, MemoryPool* pool) { + int32_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0); + std::shared_ptr null_buf = test::bytes_to_null_buffer(v); + + BufferBuilder value_builder(pool); + for (size_t i = 0; i < v.size(); ++i) { + value_builder.Append(0); + } + + ArrayPtr arr(new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf)); + return arr; +} + +TEST_F(TestArray, TestEquality) { + auto array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_); + auto equal_array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_); + auto unequal_array = MakeArrayFromValidBytes({1, 1, 1, 1, 0, 1, 0, 0}, pool_); + + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + EXPECT_TRUE(array->RangeEquals(4, 8, 4, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 4, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 8, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); +} + TEST_F(TestArray, TestIsNull) { // clang-format off std::vector null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index c6b9b1599cdd2..d6b081f315532 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -58,4 +58,11 @@ bool NullArray::Equals(const std::shared_ptr& arr) const { return arr->length() == length_; } +bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, + const std::shared_ptr& arr) const { + if (!arr) { return false; } + if (Type::NA != arr->type_enum()) { return false; } + return true; +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index f98c4c28310f8..76dc0f598141f 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -59,6 +59,12 @@ class Array { bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; + + // Compare if the range of slots specified are equal for the given array and + // this array. end_idx exclusive. This methods does not bounds check. + virtual bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const = 0; + // Determines if the array is internally consistent. Defaults to always // returning Status::OK. This can be an expensive check. virtual Status Validate() const; @@ -85,10 +91,11 @@ class NullArray : public Array { explicit NullArray(int32_t length) : NullArray(std::make_shared(), length) {} bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, + const std::shared_ptr& arr) const override; }; typedef std::shared_ptr ArrayPtr; - } // namespace arrow #endif diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 6a8ad9aa59ead..2e41b4a61caf2 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -86,6 +86,42 @@ class TestListBuilder : public TestBuilder { shared_ptr result_; }; +TEST_F(TestListBuilder, Equality) { + Int32Builder* vb = static_cast(builder_->value_builder().get()); + + ArrayPtr array, equal_array, unequal_array; + vector equal_offsets = {0, 1, 2, 5}; + vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2}; + vector unequal_offsets = {0, 1, 4}; + vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; + + // setup two equal arrays + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + array = builder_->Finish(); + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + equal_array = builder_->Finish(); + // now an unequal one + ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); + ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); + unequal_array = builder_->Finish(); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); + EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array)); +} + TEST_F(TestListBuilder, TestResize) {} TEST_F(TestListBuilder, TestAppendNull) { diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 76e7fe5f4d429..6334054caf84a 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -44,6 +44,32 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + if (!values_->RangeEquals( + begin_offset, end_offset, other_begin_offset, other->values())) { + return false; + } + } + return true; +} + Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index a020b8ad22668..0a3941633eb83 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -72,6 +72,9 @@ class ListArray : public Array { bool EqualsExact(const ListArray& other) const; bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; + protected: std::shared_ptr offset_buf_; const int32_t* offsets_; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 2b4c0879a28f4..87eb0fe3a8bf7 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -304,6 +304,57 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); } +template +Status MakeArray(const vector& valid_bytes, const vector& draws, int size, + Builder* builder, ArrayPtr* out) { + // Append the first 1000 + for (int i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + RETURN_NOT_OK(builder->Append(draws[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + *out = builder->Finish(); + return Status::OK(); +} + +TYPED_TEST(TestPrimitiveBuilder, Equality) { + DECL_T(); + + const int size = 1000; + this->RandomData(size); + vector& draws = this->draws_; + vector& valid_bytes = this->valid_bytes_; + ArrayPtr array, equal_array, unequal_array; + auto builder = this->builder_.get(); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array)); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array)); + + // Make the not equal array by negating the first valid element with itself. + const auto first_valid = std::find_if( + valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; }); + const int first_valid_idx = std::distance(valid_bytes.begin(), first_valid); + // This should be true with a very high probability, but might introduce flakiness + ASSERT_LT(first_valid_idx, size - 1); + draws[first_valid_idx] = ~*reinterpret_cast(&draws[first_valid_idx]); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array)); + + // test normal equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + + // Test range equality + EXPECT_FALSE(array->RangeEquals(0, first_valid_idx + 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(first_valid_idx, size, first_valid_idx, unequal_array)); + EXPECT_TRUE(array->RangeEquals(0, first_valid_idx, 0, unequal_array)); + EXPECT_TRUE( + array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array)); +} + TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 57a3f1e4e150b..8e6c0f809ca44 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -185,10 +185,25 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const { } } -bool BooleanArray::Equals(const std::shared_ptr& arr) const { +bool BooleanArray::Equals(const ArrayPtr& arr) const { if (this == arr.get()) return true; if (Type::BOOL != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } +bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, + int32_t other_start_idx, const ArrayPtr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) { + return false; + } + } + return true; +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index fc45f6c5b0568..9597fc8363138 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -66,6 +66,22 @@ class PrimitiveArray : public Array { return PrimitiveArray::EqualsExact(*static_cast(&other)); \ } \ \ + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, \ + const ArrayPtr& arr) const override { \ + if (this == arr.get()) { return true; } \ + if (!arr) { return false; } \ + if (this->type_enum() != arr->type_enum()) { return false; } \ + const auto other = static_cast(arr.get()); \ + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { \ + const bool is_null = IsNull(i); \ + if (is_null != arr->IsNull(o_i) || \ + (!is_null && Value(i) != other->Value(o_i))) { \ + return false; \ + } \ + } \ + return true; \ + } \ + \ const T* raw_data() const { return reinterpret_cast(raw_data_); } \ \ T Value(int i) const { return raw_data()[i]; } \ @@ -95,8 +111,10 @@ class PrimitiveBuilder : public ArrayBuilder { using ArrayBuilder::Advance; // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length) { + Status AppendNulls(const uint8_t* valid_bytes, int32_t length) { + RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); } Status AppendNull() { @@ -139,9 +157,10 @@ class NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::Reserve; // Scalar append. - void Append(value_type val) { - ArrayBuilder::Reserve(1); + Status Append(value_type val) { + RETURN_NOT_OK(ArrayBuilder::Reserve(1)); UnsafeAppend(val); + return Status::OK(); } // Does not capacity-check; make sure to call Reserve beforehand @@ -248,7 +267,9 @@ class BooleanArray : public PrimitiveArray { int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); bool EqualsExact(const BooleanArray& other) const; - bool Equals(const std::shared_ptr& arr) const override; + bool Equals(const ArrayPtr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } @@ -274,7 +295,8 @@ class BooleanBuilder : public PrimitiveBuilder { using PrimitiveBuilder::Append; // Scalar append - void Append(bool val) { + Status Append(bool val) { + Reserve(1); util::set_bit(null_bitmap_data_, length_); if (val) { util::set_bit(raw_data_, length_); @@ -282,9 +304,10 @@ class BooleanBuilder : public PrimitiveBuilder { util::clear_bit(raw_data_, length_); } ++length_; + return Status::OK(); } - void Append(uint8_t val) { Append(static_cast(val)); } + Status Append(uint8_t val) { return Append(static_cast(val)); } }; } // namespace arrow