Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-206: Expose a C++ api to compare ranges of slots between two arrays #80

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,35 @@ TEST_F(TestArray, TestLength) {
ASSERT_EQ(arr->length(), 100);
}

ArrayPtr MakeArrayFromValidBytes(const std::vector<uint8_t>& v, MemoryPool* pool) {
int32_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0);
std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(v);

BufferBuilder value_builder(pool);
for (size_t i = 0; i < v.size(); ++i) {
value_builder.Append<int32_t>(0);
}

ArrayPtr arr(new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf));
return arr;
}

TEST_F(TestArray, TestEquality) {
auto array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_);
auto equal_array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_);
auto unequal_array = MakeArrayFromValidBytes({1, 1, 1, 1, 0, 1, 0, 0}, pool_);

EXPECT_TRUE(array->Equals(array));
EXPECT_TRUE(array->Equals(equal_array));
EXPECT_TRUE(equal_array->Equals(array));
EXPECT_FALSE(equal_array->Equals(unequal_array));
EXPECT_FALSE(unequal_array->Equals(equal_array));
EXPECT_TRUE(array->RangeEquals(4, 8, 4, unequal_array));
EXPECT_FALSE(array->RangeEquals(0, 4, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(0, 8, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
}

TEST_F(TestArray, TestIsNull) {
// clang-format off
std::vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0,
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,11 @@ bool NullArray::Equals(const std::shared_ptr<Array>& arr) const {
return arr->length() == length_;
}

bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
const std::shared_ptr<Array>& arr) const {
if (!arr) { return false; }
if (Type::NA != arr->type_enum()) { return false; }
return true;
}

} // namespace arrow
9 changes: 8 additions & 1 deletion cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ class Array {

bool EqualsExact(const Array& arr) const;
virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;

// Compare if the range of slots specified are equal for the given array and
// this array. end_idx exclusive. This methods does not bounds check.
virtual bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const std::shared_ptr<Array>& arr) const = 0;

// Determines if the array is internally consistent. Defaults to always
// returning Status::OK. This can be an expensive check.
virtual Status Validate() const;
Expand All @@ -85,10 +91,11 @@ class NullArray : public Array {
explicit NullArray(int32_t length) : NullArray(std::make_shared<NullType>(), length) {}

bool Equals(const std::shared_ptr<Array>& arr) const override;
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
const std::shared_ptr<Array>& arr) const override;
};

typedef std::shared_ptr<Array> ArrayPtr;

} // namespace arrow

#endif
36 changes: 36 additions & 0 deletions cpp/src/arrow/types/list-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,42 @@ class TestListBuilder : public TestBuilder {
shared_ptr<ListArray> result_;
};

TEST_F(TestListBuilder, Equality) {
Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());

ArrayPtr array, equal_array, unequal_array;
vector<int32_t> equal_offsets = {0, 1, 2, 5};
vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2};
vector<int32_t> unequal_offsets = {0, 1, 4};
vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};

// setup two equal arrays
ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size()));
ASSERT_OK(vb->Append(equal_values.data(), equal_values.size()));
array = builder_->Finish();
ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size()));
ASSERT_OK(vb->Append(equal_values.data(), equal_values.size()));
equal_array = builder_->Finish();
// now an unequal one
ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size()));
ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size()));
unequal_array = builder_->Finish();

// Test array equality
EXPECT_TRUE(array->Equals(array));
EXPECT_TRUE(array->Equals(equal_array));
EXPECT_TRUE(equal_array->Equals(array));
EXPECT_FALSE(equal_array->Equals(unequal_array));
EXPECT_FALSE(unequal_array->Equals(equal_array));

// Test range equality
EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array));
}

TEST_F(TestListBuilder, TestResize) {}

TEST_F(TestListBuilder, TestAppendNull) {
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/arrow/types/list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,32 @@ bool ListArray::Equals(const std::shared_ptr<Array>& arr) const {
return EqualsExact(*static_cast<const ListArray*>(arr.get()));
}

bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const std::shared_ptr<Array>& arr) const {
if (this == arr.get()) { return true; }
if (!arr) { return false; }
if (this->type_enum() != arr->type_enum()) { return false; }
Copy link

@fengguangyuan fengguangyuan May 23, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do as your've done the check if (!arr) in other place?

const auto other = static_cast<ListArray*>(arr.get());
for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
const bool is_null = IsNull(i);
if (is_null != arr->IsNull(o_i)) { return false; }
if (is_null) continue;
const int32_t begin_offset = offset(i);
const int32_t end_offset = offset(i + 1);
const int32_t other_begin_offset = other->offset(o_i);
const int32_t other_end_offset = other->offset(o_i + 1);
// Underlying can't be equal if the size isn't equal
if (end_offset - begin_offset != other_end_offset - other_begin_offset) {
return false;
}
if (!values_->RangeEquals(
begin_offset, end_offset, other_begin_offset, other->values())) {
return false;
}
}
return true;
}

Status ListArray::Validate() const {
if (length_ < 0) { return Status::Invalid("Length was negative"); }
if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); }
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/types/list.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class ListArray : public Array {
bool EqualsExact(const ListArray& other) const;
bool Equals(const std::shared_ptr<Array>& arr) const override;

bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const ArrayPtr& arr) const override;

protected:
std::shared_ptr<Buffer> offset_buf_;
const int32_t* offsets_;
Expand Down
51 changes: 51 additions & 0 deletions cpp/src/arrow/types/primitive-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,57 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
}

template <class T, class Builder>
Status MakeArray(const vector<uint8_t>& valid_bytes, const vector<T>& draws, int size,
Builder* builder, ArrayPtr* out) {
// Append the first 1000
for (int i = 0; i < size; ++i) {
if (valid_bytes[i] > 0) {
RETURN_NOT_OK(builder->Append(draws[i]));
} else {
RETURN_NOT_OK(builder->AppendNull());
}
}
*out = builder->Finish();
return Status::OK();
}

TYPED_TEST(TestPrimitiveBuilder, Equality) {
DECL_T();

const int size = 1000;
this->RandomData(size);
vector<T>& draws = this->draws_;
vector<uint8_t>& valid_bytes = this->valid_bytes_;
ArrayPtr array, equal_array, unequal_array;
auto builder = this->builder_.get();
ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array));
ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array));

// Make the not equal array by negating the first valid element with itself.
const auto first_valid = std::find_if(
valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; });
const int first_valid_idx = std::distance(valid_bytes.begin(), first_valid);
// This should be true with a very high probability, but might introduce flakiness
ASSERT_LT(first_valid_idx, size - 1);
draws[first_valid_idx] = ~*reinterpret_cast<int64_t*>(&draws[first_valid_idx]);
ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array));

// test normal equality
EXPECT_TRUE(array->Equals(array));
EXPECT_TRUE(array->Equals(equal_array));
EXPECT_TRUE(equal_array->Equals(array));
EXPECT_FALSE(equal_array->Equals(unequal_array));
EXPECT_FALSE(unequal_array->Equals(equal_array));

// Test range equality
EXPECT_FALSE(array->RangeEquals(0, first_valid_idx + 1, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(first_valid_idx, size, first_valid_idx, unequal_array));
EXPECT_TRUE(array->RangeEquals(0, first_valid_idx, 0, unequal_array));
EXPECT_TRUE(
array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array));
}

TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
DECL_T();

Expand Down
17 changes: 16 additions & 1 deletion cpp/src/arrow/types/primitive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,25 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const {
}
}

bool BooleanArray::Equals(const std::shared_ptr<Array>& arr) const {
bool BooleanArray::Equals(const ArrayPtr& arr) const {
if (this == arr.get()) return true;
if (Type::BOOL != arr->type_enum()) { return false; }
return EqualsExact(*static_cast<const BooleanArray*>(arr.get()));
}

bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx,
int32_t other_start_idx, const ArrayPtr& arr) const {
if (this == arr.get()) { return true; }
if (!arr) { return false; }
if (this->type_enum() != arr->type_enum()) { return false; }
const auto other = static_cast<BooleanArray*>(arr.get());
for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
const bool is_null = IsNull(i);
if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) {
return false;
}
}
return true;
}

} // namespace arrow
35 changes: 29 additions & 6 deletions cpp/src/arrow/types/primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,22 @@ class PrimitiveArray : public Array {
return PrimitiveArray::EqualsExact(*static_cast<const PrimitiveArray*>(&other)); \
} \
\
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, \
const ArrayPtr& arr) const override { \
if (this == arr.get()) { return true; } \
if (!arr) { return false; } \
if (this->type_enum() != arr->type_enum()) { return false; } \
const auto other = static_cast<NAME*>(arr.get()); \
for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { \
const bool is_null = IsNull(i); \
if (is_null != arr->IsNull(o_i) || \
(!is_null && Value(i) != other->Value(o_i))) { \
return false; \
} \
} \
return true; \
} \
\
const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_); } \
\
T Value(int i) const { return raw_data()[i]; } \
Expand Down Expand Up @@ -95,8 +111,10 @@ class PrimitiveBuilder : public ArrayBuilder {
using ArrayBuilder::Advance;

// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
void AppendNulls(const uint8_t* valid_bytes, int32_t length) {
Status AppendNulls(const uint8_t* valid_bytes, int32_t length) {
RETURN_NOT_OK(Reserve(length));
UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}

Status AppendNull() {
Expand Down Expand Up @@ -139,9 +157,10 @@ class NumericBuilder : public PrimitiveBuilder<T> {
using PrimitiveBuilder<T>::Reserve;

// Scalar append.
void Append(value_type val) {
ArrayBuilder::Reserve(1);
Status Append(value_type val) {
RETURN_NOT_OK(ArrayBuilder::Reserve(1));
UnsafeAppend(val);
return Status::OK();
}

// Does not capacity-check; make sure to call Reserve beforehand
Expand Down Expand Up @@ -248,7 +267,9 @@ class BooleanArray : public PrimitiveArray {
int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);

bool EqualsExact(const BooleanArray& other) const;
bool Equals(const std::shared_ptr<Array>& arr) const override;
bool Equals(const ArrayPtr& arr) const override;
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const ArrayPtr& arr) const override;

const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); }

Expand All @@ -274,17 +295,19 @@ class BooleanBuilder : public PrimitiveBuilder<BooleanType> {
using PrimitiveBuilder<BooleanType>::Append;

// Scalar append
void Append(bool val) {
Status Append(bool val) {
Reserve(1);
util::set_bit(null_bitmap_data_, length_);
if (val) {
util::set_bit(raw_data_, length_);
} else {
util::clear_bit(raw_data_, length_);
}
++length_;
return Status::OK();
}

void Append(uint8_t val) { Append(static_cast<bool>(val)); }
Status Append(uint8_t val) { return Append(static_cast<bool>(val)); }
};

} // namespace arrow
Expand Down