-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-92: Arrow to Parquet Schema conversion #68
Changes from 5 commits
8a0293e
9a6c876
38e68e5
42ed0ea
9c5b085
e3aa261
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,13 +17,16 @@ | |
|
||
#include "arrow/parquet/schema.h" | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
#include "parquet/api/schema.h" | ||
|
||
#include "arrow/types/decimal.h" | ||
#include "arrow/types/string.h" | ||
#include "arrow/util/status.h" | ||
|
||
using parquet::Repetition; | ||
using parquet::schema::Node; | ||
using parquet::schema::NodePtr; | ||
using parquet::schema::GroupNode; | ||
|
@@ -182,6 +185,126 @@ Status FromParquetSchema( | |
return Status::OK(); | ||
} | ||
|
||
Status StructToNode(const std::shared_ptr<StructType>& type, const std::string& name, | ||
bool nullable, NodePtr* out) { | ||
Repetition::type repetition = Repetition::REQUIRED; | ||
if (nullable) { repetition = Repetition::OPTIONAL; } | ||
|
||
std::vector<NodePtr> children(type->num_children()); | ||
for (int i = 0; i < type->num_children(); i++) { | ||
RETURN_NOT_OK(FieldToNode(type->child(i), &children[i])); | ||
} | ||
|
||
*out = GroupNode::Make(name, repetition, children); | ||
return Status::OK(); | ||
} | ||
|
||
Status FieldToNode(const std::shared_ptr<Field>& field, NodePtr* out) { | ||
LogicalType::type logical_type = LogicalType::NONE; | ||
ParquetType::type type; | ||
Repetition::type repetition = Repetition::REQUIRED; | ||
if (field->nullable) { repetition = Repetition::OPTIONAL; } | ||
int length = -1; | ||
|
||
switch (field->type->type) { | ||
// TODO: | ||
// case Type::NA: | ||
// break; | ||
case Type::BOOL: | ||
type = ParquetType::BOOLEAN; | ||
break; | ||
case Type::UINT8: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::UINT_8; | ||
break; | ||
case Type::INT8: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::INT_8; | ||
break; | ||
case Type::UINT16: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::UINT_16; | ||
break; | ||
case Type::INT16: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::INT_16; | ||
break; | ||
case Type::UINT32: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::UINT_32; | ||
break; | ||
case Type::INT32: | ||
type = ParquetType::INT32; | ||
break; | ||
case Type::UINT64: | ||
type = ParquetType::INT64; | ||
logical_type = LogicalType::UINT_64; | ||
break; | ||
case Type::INT64: | ||
type = ParquetType::INT64; | ||
break; | ||
case Type::FLOAT: | ||
type = ParquetType::FLOAT; | ||
break; | ||
case Type::DOUBLE: | ||
type = ParquetType::DOUBLE; | ||
break; | ||
case Type::CHAR: | ||
type = ParquetType::FIXED_LEN_BYTE_ARRAY; | ||
logical_type = LogicalType::UTF8; | ||
length = static_cast<CharType*>(field->type.get())->size; | ||
break; | ||
case Type::STRING: | ||
type = ParquetType::BYTE_ARRAY; | ||
logical_type = LogicalType::UTF8; | ||
break; | ||
case Type::BINARY: | ||
type = ParquetType::BYTE_ARRAY; | ||
break; | ||
case Type::DATE: | ||
type = ParquetType::INT32; | ||
logical_type = LogicalType::DATE; | ||
break; | ||
case Type::TIMESTAMP: | ||
type = ParquetType::INT64; | ||
logical_type = LogicalType::TIMESTAMP_MILLIS; | ||
break; | ||
case Type::TIMESTAMP_DOUBLE: | ||
type = ParquetType::INT64; | ||
// This is specified as seconds since the UNIX epoch | ||
// TODO: Converted type in Parquet? | ||
// logical_type = LogicalType::TIMESTAMP_MILLIS; | ||
break; | ||
case Type::TIME: | ||
type = ParquetType::INT64; | ||
logical_type = LogicalType::TIME_MILLIS; | ||
break; | ||
case Type::STRUCT: { | ||
auto struct_type = std::static_pointer_cast<StructType>(field->type); | ||
return StructToNode(struct_type, field->name, field->nullable, out); | ||
} break; | ||
default: | ||
// TODO: LIST, DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL, DECIMAL_TEXT, VARCHAR | ||
return Status::NotImplemented("unhandled type"); | ||
} | ||
*out = PrimitiveNode::Make(field->name, repetition, type, logical_type, length); | ||
return Status::OK(); | ||
} | ||
|
||
Status ToParquetSchema( | ||
const Schema* arrow_schema, std::shared_ptr<::parquet::SchemaDescriptor>* out) { | ||
std::vector<NodePtr> nodes(arrow_schema->num_fields()); | ||
for (int i = 0; i < arrow_schema->num_fields(); i++) { | ||
RETURN_NOT_OK(FieldToNode(arrow_schema->field(i), &nodes[i])); | ||
} | ||
|
||
NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); | ||
*out = std::make_shared<::parquet::SchemaDescriptor>(); | ||
(*out)->Init(schema); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we cannot distinguish why a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'll probably need to add Status type for "thirdparty exceptions". |
||
|
||
return Status::OK(); | ||
} | ||
|
||
} // namespace parquet | ||
|
||
} // namespace arrow |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aside: we'll need to visit the string encoding question, as logical unicode characters won't map neatly onto a
char(n)
type