Skip to content

Commit

Permalink
PARQUET-842: Do not set unnecessary fields in the parquet::SchemaElement
Browse files Browse the repository at this point in the history
The Impala Parquet scanner can reject our data otherwise

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes apache#226 from wesm/PARQUET-842 and squashes the following commits:

0cfa53c [Wes McKinney] Do not set field_id in SchemaElement until we understand why it is causing parquet-mr problems
4220b48 [Wes McKinney] Do not set unnecessary fields in the parquet::SchemaElement

Change-Id: I2a7902765e92f0da386e9e65513dd0954113fa60
  • Loading branch information
wesm authored and xhochy committed Jan 25, 2017
1 parent 7fbc1b1 commit bcaa02e
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 16 deletions.
2 changes: 0 additions & 2 deletions cpp/src/parquet/schema/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@ class SchemaVisitor : public Node::ConstVisitor {
void Visit(const Node* node) override {
format::SchemaElement element;
node->ToParquet(&element);
// Override field_id here as we can get user-generated Nodes without a valid id
element.__set_field_id(elements_->size());
elements_->push_back(element);

if (node->is_group()) {
Expand Down
6 changes: 0 additions & 6 deletions cpp/src/parquet/schema/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ static inline SchemaElement NewPrimitive(const std::string& name,
result.__set_repetition_type(repetition);
result.__set_type(type);
result.__set_num_children(0);
result.__set_field_id(id);
// Set default (non-set) values
result.__set_type_length(-1);
result.__set_precision(-1);
result.__set_scale(-1);

return result;
}
Expand All @@ -57,7 +52,6 @@ static inline SchemaElement NewGroup(const std::string& name,
result.__set_name(name);
result.__set_repetition_type(repetition);
result.__set_num_children(num_children);
result.__set_field_id(id);

return result;
}
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/parquet/schema/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,16 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
physical_type_(type),
type_length_(length) {
std::stringstream ss;
decimal_metadata_.isset = false;

// PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
// set to true, but Impala will raise an incompatible metadata in such cases
memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));

// Check if the physical and logical types match
// Mapping referred from Apache parquet-mr as on 2016-02-22
switch (logical_type) {
case LogicalType::NONE:
// Logical type not set
// Clients should be able to read these values
decimal_metadata_.isset = true;
decimal_metadata_.precision = precision;
decimal_metadata_.scale = scale;
break;
case LogicalType::UTF8:
case LogicalType::JSON:
Expand Down Expand Up @@ -289,7 +289,6 @@ void GroupNode::ToParquet(void* opaque_element) const {
if (logical_type_ != LogicalType::NONE) {
element->__set_converted_type(ToThrift(logical_type_));
}
// FIXME: SchemaFlattener does this for us: element->__set_field_id(id_);
}

void PrimitiveNode::ToParquet(void* opaque_element) const {
Expand All @@ -302,8 +301,9 @@ void PrimitiveNode::ToParquet(void* opaque_element) const {
element->__set_converted_type(ToThrift(logical_type_));
}
element->__set_type(ToThrift(physical_type_));
// FIXME: SchemaFlattener does this for us: element->__set_field_id(id_);
element->__set_type_length(type_length_);
if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
element->__set_type_length(type_length_);
}
if (decimal_metadata_.isset) {
element->__set_precision(decimal_metadata_.precision);
element->__set_scale(decimal_metadata_.scale);
Expand Down

0 comments on commit bcaa02e

Please sign in to comment.