Skip to content

Commit

Permalink
Make StringType use OriginalType.UTF8.
Browse files Browse the repository at this point in the history
  • Loading branch information
ueshin committed Jul 11, 2014
1 parent f4f46de commit 616e04a
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ private[sql] object CatalystConverter {
}
}
// All other primitive types use the default converter
case ctype: NativeType => { // note: need the type tag here!
case ctype: PrimitiveType => { // note: need the type tag here!
new CatalystPrimitiveConverter(parent, fieldIndex)
}
case _ => throw new RuntimeException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ private[sql] object ParquetTestData {
"""message myrecord {
optional boolean myboolean;
optional int32 myint;
optional binary mystring;
optional binary mystring (UTF8);
optional int64 mylong;
optional float myfloat;
optional double mydouble;
Expand Down Expand Up @@ -87,7 +87,7 @@ private[sql] object ParquetTestData {
message myrecord {
required boolean myboolean;
required int32 myint;
required binary mystring;
required binary mystring (UTF8);
required int64 mylong;
required float myfloat;
required double mydouble;
Expand Down Expand Up @@ -119,14 +119,14 @@ private[sql] object ParquetTestData {
// so that array types can be translated correctly.
"""
message AddressBook {
required binary owner;
required binary owner (UTF8);
optional group ownerPhoneNumbers {
repeated binary array;
repeated binary array (UTF8);
}
optional group contacts {
repeated group array {
required binary name;
optional binary phoneNumber;
required binary name (UTF8);
optional binary phoneNumber (UTF8);
}
}
}
Expand Down Expand Up @@ -181,16 +181,16 @@ private[sql] object ParquetTestData {
required int32 x;
optional group data1 {
repeated group map {
required binary key;
required binary key (UTF8);
required int32 value;
}
}
required group data2 {
repeated group map {
required binary key;
required binary key (UTF8);
required group value {
required int64 payload1;
optional binary payload2;
optional binary payload2 (UTF8);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,21 @@ private[parquet] object ParquetTypesConverter extends Logging {
def isPrimitiveType(ctype: DataType): Boolean =
classOf[PrimitiveType] isAssignableFrom ctype.getClass

def toPrimitiveDataType(parquetType : ParquetPrimitiveTypeName): DataType = parquetType match {
case ParquetPrimitiveTypeName.BINARY => StringType
case ParquetPrimitiveTypeName.BOOLEAN => BooleanType
case ParquetPrimitiveTypeName.DOUBLE => DoubleType
case ParquetPrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => ArrayType(ByteType)
case ParquetPrimitiveTypeName.FLOAT => FloatType
case ParquetPrimitiveTypeName.INT32 => IntegerType
case ParquetPrimitiveTypeName.INT64 => LongType
case ParquetPrimitiveTypeName.INT96 =>
// TODO: add BigInteger type? TODO(andre) use DecimalType instead????
sys.error("Potential loss of precision: cannot convert INT96")
case _ => sys.error(
s"Unsupported parquet datatype $parquetType")
}
def toPrimitiveDataType(parquetType: ParquetPrimitiveType): DataType =
parquetType.getPrimitiveTypeName match {
case ParquetPrimitiveTypeName.BINARY
if parquetType.getOriginalType == ParquetOriginalType.UTF8 => StringType
case ParquetPrimitiveTypeName.BOOLEAN => BooleanType
case ParquetPrimitiveTypeName.DOUBLE => DoubleType
case ParquetPrimitiveTypeName.FLOAT => FloatType
case ParquetPrimitiveTypeName.INT32 => IntegerType
case ParquetPrimitiveTypeName.INT64 => LongType
case ParquetPrimitiveTypeName.INT96 =>
// TODO: add BigInteger type? TODO(andre) use DecimalType instead????
sys.error("Potential loss of precision: cannot convert INT96")
case _ => sys.error(
s"Unsupported parquet datatype $parquetType")
}

/**
* Converts a given Parquet `Type` into the corresponding
Expand Down Expand Up @@ -104,7 +105,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
}

if (parquetType.isPrimitive) {
toPrimitiveDataType(parquetType.asPrimitiveType.getPrimitiveTypeName)
toPrimitiveDataType(parquetType.asPrimitiveType)
} else {
val groupType = parquetType.asGroupType()
parquetType.getOriginalType match {
Expand Down Expand Up @@ -164,18 +165,16 @@ private[parquet] object ParquetTypesConverter extends Logging {
* @return The name of the corresponding Parquet primitive type
*/
def fromPrimitiveDataType(ctype: DataType):
Option[ParquetPrimitiveTypeName] = ctype match {
case StringType => Some(ParquetPrimitiveTypeName.BINARY)
case BooleanType => Some(ParquetPrimitiveTypeName.BOOLEAN)
case DoubleType => Some(ParquetPrimitiveTypeName.DOUBLE)
case ArrayType(ByteType) =>
Some(ParquetPrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
case FloatType => Some(ParquetPrimitiveTypeName.FLOAT)
case IntegerType => Some(ParquetPrimitiveTypeName.INT32)
Option[(ParquetPrimitiveTypeName, Option[ParquetOriginalType])] = ctype match {
case StringType => Some(ParquetPrimitiveTypeName.BINARY, Some(ParquetOriginalType.UTF8))
case BooleanType => Some(ParquetPrimitiveTypeName.BOOLEAN, None)
case DoubleType => Some(ParquetPrimitiveTypeName.DOUBLE, None)
case FloatType => Some(ParquetPrimitiveTypeName.FLOAT, None)
case IntegerType => Some(ParquetPrimitiveTypeName.INT32, None)
// There is no type for Byte or Short so we promote them to INT32.
case ShortType => Some(ParquetPrimitiveTypeName.INT32)
case ByteType => Some(ParquetPrimitiveTypeName.INT32)
case LongType => Some(ParquetPrimitiveTypeName.INT64)
case ShortType => Some(ParquetPrimitiveTypeName.INT32, None)
case ByteType => Some(ParquetPrimitiveTypeName.INT32, None)
case LongType => Some(ParquetPrimitiveTypeName.INT64, None)
case _ => None
}

Expand Down Expand Up @@ -227,17 +226,18 @@ private[parquet] object ParquetTypesConverter extends Logging {
if (nullable) Repetition.OPTIONAL else Repetition.REQUIRED
}
val primitiveType = fromPrimitiveDataType(ctype)
if (primitiveType.isDefined) {
new ParquetPrimitiveType(repetition, primitiveType.get, name)
} else {
primitiveType.map {
case (primitiveType, originalType) =>
new ParquetPrimitiveType(repetition, primitiveType, name, originalType.orNull)
}.getOrElse {
ctype match {
case ArrayType(elementType) => {
val parquetElementType = fromDataType(
elementType,
CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
nullable = false,
inArray = true)
ConversionPatterns.listType(repetition, name, parquetElementType)
ConversionPatterns.listType(repetition, name, parquetElementType)
}
case StructType(structFields) => {
val fields = structFields.map {
Expand Down

0 comments on commit 616e04a

Please sign in to comment.