Skip to content

Commit

Permalink
PARQUET-809: Add SchemaDescriptor::Equals method
Browse files Browse the repository at this point in the history
To make it simpler to compare file metadata

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes apache#214 from wesm/PARQUET-809 and squashes the following commits:

691e5bc [Wes McKinney] Add SchemaDescriptor::Equals method
  • Loading branch information
wesm committed Sep 2, 2018
1 parent 94bcfa1 commit 7c66f55
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 0 deletions.
20 changes: 20 additions & 0 deletions cpp/src/parquet/schema/descriptor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ void SchemaDescriptor::Init(const NodePtr& schema) {
}
}

// Two schema descriptors are considered equal when they expose the same
// number of columns and the columns at each index compare equal through
// ColumnDescriptor::Equals.
bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
  const int column_count = this->num_columns();
  if (column_count != other.num_columns()) { return false; }

  // Advance while the columns at the current position still match.
  int idx = 0;
  while (idx < column_count && this->Column(idx)->Equals(*other.Column(idx))) {
    ++idx;
  }

  // Reaching the end means no mismatching column was encountered.
  return idx == column_count;
}

void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
int16_t max_rep_level, const NodePtr& base) {
if (node->is_optional()) {
Expand Down Expand Up @@ -82,6 +96,12 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
}

// Column descriptors match when their primitive nodes compare equal and both
// the maximum repetition level and the maximum definition level agree.
bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
  if (!primitive_node_->Equals(other.primitive_node_)) { return false; }
  if (max_repetition_level() != other.max_repetition_level()) { return false; }
  return max_definition_level() == other.max_definition_level();
}

const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
return &leaves_[i];
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/parquet/schema/descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class PARQUET_EXPORT ColumnDescriptor {
// Constructs a column descriptor from a schema node together with its maximum
// definition and repetition levels. schema_descr is optional and defaults to
// nullptr.
// NOTE(review): the definition in descriptor.cc static_casts the node to
// PrimitiveNode, so `node` is expected to be a primitive node -- confirm
// against callers.
ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);

// Returns true when this descriptor and `other` have equal primitive nodes
// and identical maximum repetition and definition levels.
bool Equals(const ColumnDescriptor& other) const;

// Accessor for the maximum definition level held by this descriptor.
int16_t max_definition_level() const {
  return max_definition_level_;
}

// Accessor for the maximum repetition level held by this descriptor.
int16_t max_repetition_level() const {
  return max_repetition_level_;
}
Expand Down Expand Up @@ -97,6 +99,8 @@ class PARQUET_EXPORT SchemaDescriptor {

// Returns a pointer to the descriptor of the i-th column. The definition
// DCHECKs that i lies in [0, number of leaves); the pointer refers to an
// element stored inside this SchemaDescriptor.
const ColumnDescriptor* Column(int i) const;

// Returns true when both descriptors have the same number of columns and
// every column compares equal, index by index, via ColumnDescriptor::Equals.
bool Equals(const SchemaDescriptor& other) const;

// The number of physical columns appearing in the file.
// leaves_.size() yields an unsigned size; the cast makes the narrowing to int
// explicit, matching the conversion SchemaDescriptor::Column applies in its
// bounds check.
int num_columns() const { return static_cast<int>(leaves_.size()); }

Expand Down
54 changes: 54 additions & 0 deletions cpp/src/parquet/schema/schema-descriptor-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,60 @@ TEST_F(TestSchemaDescriptor, InitNonGroup) {
ASSERT_THROW(descr_.Init(node), ParquetException);
}

// Exercises SchemaDescriptor::Equals across schemas that differ in a nested
// group's repetition, in a leaf's name, in the root's name, and in column
// count, then checks ColumnDescriptor::Equals against differing max levels.
TEST_F(TestSchemaDescriptor, Equals) {
  NodePtr inta = Int32("a", Repetition::REQUIRED);
  NodePtr intb = Int64("b", Repetition::OPTIONAL);
  NodePtr intb2 = Int64("b2", Repetition::OPTIONAL);
  NodePtr intc = ByteArray("c", Repetition::REPEATED);

  NodePtr item1 = Int64("item1", Repetition::REQUIRED);
  NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
  NodePtr item3 = Int32("item3", Repetition::REPEATED);
  NodePtr list(GroupNode::Make(
      "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));

  // bag and bag2 wrap the same list and differ only in their repetition.
  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
  NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list}));

  SchemaDescriptor descr1;
  descr1.Init(GroupNode::Make("schema", Repetition::REPEATED,
      {inta, intb, intc, bag}));

  // A descriptor always equals itself.
  ASSERT_TRUE(descr1.Equals(descr1));

  // Nested group with a different repetition
  SchemaDescriptor descr2;
  descr2.Init(GroupNode::Make("schema", Repetition::REPEATED,
      {inta, intb, intc, bag2}));
  ASSERT_FALSE(descr1.Equals(descr2));

  // One leaf replaced by a differently named leaf
  SchemaDescriptor descr3;
  descr3.Init(GroupNode::Make("schema", Repetition::REPEATED,
      {inta, intb2, intc, bag}));
  ASSERT_FALSE(descr1.Equals(descr3));

  // Robust to name of parent node
  SchemaDescriptor descr4;
  descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED,
      {inta, intb, intc, bag}));
  ASSERT_TRUE(descr1.Equals(descr4));

  // Extra trailing field changes the number of columns
  SchemaDescriptor descr5;
  descr5.Init(GroupNode::Make("schema", Repetition::REPEATED,
      {inta, intb, intc, bag, intb2}));
  ASSERT_FALSE(descr1.Equals(descr5));

  // Different max repetition / definition levels
  ColumnDescriptor col1(inta, 5, 1);
  ColumnDescriptor col2(inta, 6, 1);
  ColumnDescriptor col3(inta, 5, 2);

  ASSERT_TRUE(col1.Equals(col1));
  ASSERT_FALSE(col1.Equals(col2));
  ASSERT_FALSE(col1.Equals(col3));
}

TEST_F(TestSchemaDescriptor, BuildTree) {
NodeVector fields;
NodePtr schema;
Expand Down

0 comments on commit 7c66f55

Please sign in to comment.