From cb3f1bd9756809883e520ba5a09b675fa2963fd5 Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 10:20:35 +0800 Subject: [PATCH 1/7] fix type bug --- schema/gettype.go | 4 +-- types/types.go | 66 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/schema/gettype.go b/schema/gettype.go index 14d2991f..45073493 100644 --- a/schema/gettype.go +++ b/schema/gettype.go @@ -41,10 +41,10 @@ func (self *SchemaHandler) GetTypes() []reflect.Type { if nc == 0 { if *rT != parquet.FieldRepetitionType_REPEATED { - elementTypes[idx] = types.ParquetTypeToGoReflectType(pT, rT) + elementTypes[idx] = types.ParquetTypeToGoReflectType(pT, cT, rT) } else { - elementTypes[idx] = reflect.SliceOf(types.ParquetTypeToGoReflectType(pT, nil)) + elementTypes[idx] = reflect.SliceOf(types.ParquetTypeToGoReflectType(pT, cT, nil)) } } else { diff --git a/types/types.go b/types/types.go index b2310c57..b2ace071 100644 --- a/types/types.go +++ b/types/types.go @@ -69,16 +69,40 @@ func TypeNameToParquetType(name string, baseName string) (*parquet.Type, *parque panic(fmt.Errorf("Unknown data type: '%s'", name)) } -func ParquetTypeToGoReflectType(pT *parquet.Type, rT *parquet.FieldRepetitionType) reflect.Type { +func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT *parquet.FieldRepetitionType) reflect.Type { if rT == nil || *rT != parquet.FieldRepetitionType_OPTIONAL { if *pT == parquet.Type_BOOLEAN { return reflect.TypeOf(true) - } else if *pT == parquet.Type_INT32 { + } else if *pT == parquet.Type_INT32 && cT == nil { return reflect.TypeOf(int32(0)) - } else if *pT == parquet.Type_INT64 { + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { + return reflect.TypeOf(int8(0)) + + }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { + return reflect.TypeOf(int16(0)) + + }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { + return reflect.TypeOf(int32(0)) + + }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { + return reflect.TypeOf(uint8(0)) + + }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { + return reflect.TypeOf(uint16(0)) + + }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { + return reflect.TypeOf(uint32(0)) + + }else if *pT == parquet.Type_INT64 && cT == nil { + return reflect.TypeOf(int64(0)) + + }else if *pT == parquet.Type_INT64 && *cT == parquet.ConvertedType_INT_64 { return reflect.TypeOf(int64(0)) + + }else if *pT == parquet.Type_INT64 && *cT == parquet.ConvertedType_UINT_64 { + return reflect.TypeOf(uint64(0)) } else if *pT == parquet.Type_INT96 { return reflect.TypeOf("") @@ -104,14 +128,46 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, rT *parquet.FieldRepetitionTyp v := true return reflect.TypeOf(&v) - } else if *pT == parquet.Type_INT32 { + } else if *pT == parquet.Type_INT32 && cT == nil{ v := int32(0) return reflect.TypeOf(&v) - } else if *pT == parquet.Type_INT64 { + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { + v := int8(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { + v := int16(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { + v := int32(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { + v := uint8(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { + v := uint16(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { + v := uint32(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT64 && cT == nil { v := int64(0) return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT64 && *cT == parquet.ConvertedType_INT_64 { + v := int64(0) + return reflect.TypeOf(&v) + + } else if *pT == parquet.Type_INT64 && *cT == parquet.ConvertedType_UINT_64 { + v := uint64(0) + return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT96 { v := "" return reflect.TypeOf(&v) From 6ceb0e0110db81f28bb0efe725296e6f76843a7c Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 10:27:53 +0800 Subject: [PATCH 2/7] fix gettypes --- example/column_read.go | 6 +++--- types/types.go | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/example/column_read.go b/example/column_read.go index 6d218791..47071d89 100644 --- a/example/column_read.go +++ b/example/column_read.go @@ -34,8 +34,8 @@ func main() { log.Println("Can't create parquet writer") return } - num := 10 - for i := 0; i < num; i++ { + num := int64(10) + for i := 0; int64(i) < num; i++ { stu := Student{ Name: "StudentName", Age: int32(20 + i%5), @@ -70,7 +70,7 @@ func main() { log.Println("Can't create column reader", err) return } - num = int(pr.GetNumRows()) + num = int64(pr.GetNumRows()) pr.SkipRowsByPath("parquet_go_root.name", 5) //skip the first five rows names, rls, dls, err = pr.ReadColumnByPath("parquet_go_root.name", num) diff --git a/types/types.go b/types/types.go index b2ace071..1ed02b69 100644 --- a/types/types.go +++ b/types/types.go @@ -95,6 +95,9 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { return reflect.TypeOf(uint32(0)) + } else if *pT == parquet.Type_INT32 { + return reflect.TypeOf(int32(0)) + }else if *pT == parquet.Type_INT64 && cT == nil { return reflect.TypeOf(int64(0)) @@ -104,6 +107,9 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT }else if *pT == parquet.Type_INT64 && *cT == parquet.ConvertedType_UINT_64 { return reflect.TypeOf(uint64(0)) + } else if *pT == parquet.Type_INT64 { + return reflect.TypeOf(int64(0)) + } else if *pT == parquet.Type_INT96 { return reflect.TypeOf("") @@ -156,6 +162,10 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT v := uint32(0) return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT32 { + v := int32(0) + return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT64 && cT == nil { v := int64(0) return reflect.TypeOf(&v) @@ -168,6 +178,10 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT v := uint64(0) return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT64 { + v := int64(0) + return reflect.TypeOf(&v) + } else if *pT == parquet.Type_INT96 { v := "" return reflect.TypeOf(&v) From 364f0bd65d27c8dbe28c03f2142f11276bc2c8d5 Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 10:32:59 +0800 Subject: [PATCH 3/7] fix types --- types/types.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/types/types.go b/types/types.go index 1ed02b69..215704f5 100644 --- a/types/types.go +++ b/types/types.go @@ -78,19 +78,19 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(int32(0)) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { - return reflect.TypeOf(int8(0)) + return reflect.TypeOf(int32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { - return reflect.TypeOf(int16(0)) + return reflect.TypeOf(int32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { return reflect.TypeOf(int32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { - return reflect.TypeOf(uint8(0)) + return reflect.TypeOf(uint32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { - return reflect.TypeOf(uint16(0)) + return reflect.TypeOf(uint32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { return reflect.TypeOf(uint32(0)) @@ -139,11 +139,11 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { - v := int8(0) + v := int32(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { - v := int16(0) + v := int32(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { @@ -151,11 +151,11 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { - v := uint8(0) + v := uint32(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { - v := uint16(0) + v := uint32(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { From 02e6e283303fe739fea684892ad7af45b659c9e1 Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 11:03:28 +0800 Subject: [PATCH 4/7] refactor types --- example/type.go | 16 ++++++++-------- types/types.go | 36 ++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/example/type.go b/example/type.go index 0acef95f..7a09904f 100644 --- a/example/type.go +++ b/example/type.go @@ -20,12 +20,12 @@ type TypeList struct { FixedLenByteArray string `parquet:"name=FixedLenByteArray, type=FIXED_LEN_BYTE_ARRAY, length=10"` Utf8 string `parquet:"name=utf8, type=UTF8, encoding=PLAIN_DICTIONARY"` - Int_8 int32 `parquet:"name=int_8, type=INT_8"` - Int_16 int32 `parquet:"name=int_16, type=INT_16"` + Int_8 int8 `parquet:"name=int_8, type=INT_8"` + Int_16 int16 `parquet:"name=int_16, type=INT_16"` Int_32 int32 `parquet:"name=int_32, type=INT_32"` Int_64 int64 `parquet:"name=int_64, type=INT_64"` - Uint_8 uint32 `parquet:"name=uint_8, type=UINT_8"` - Uint_16 uint32 `parquet:"name=uint_16, type=UINT_16"` + Uint_8 uint8 `parquet:"name=uint_8, type=UINT_8"` + Uint_16 uint16 `parquet:"name=uint_16, type=UINT_16"` Uint_32 uint32 `parquet:"name=uint_32, type=UINT_32"` Uint_64 uint64 `parquet:"name=uint_64, type=UINT_64"` Date int32 `parquet:"name=date, type=DATE"` @@ -71,12 +71,12 @@ func main() { FixedLenByteArray: "HelloWorld", Utf8: "utf8", - Int_8: int32(i), - Int_16: int32(i), + Int_8: int8(i), + Int_16: int16(i), Int_32: int32(i), Int_64: int64(i), - Uint_8: uint32(i), - Uint_16: uint32(i), + Uint_8: uint8(i), + Uint_16: uint16(i), Uint_32: uint32(i), Uint_64: uint64(i), Date: int32(i), diff --git a/types/types.go b/types/types.go index 215704f5..5c6a0362 100644 --- a/types/types.go +++ b/types/types.go @@ -78,19 +78,19 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(int32(0)) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { - return reflect.TypeOf(int32(0)) + return reflect.TypeOf(int8(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { - return reflect.TypeOf(int32(0)) + return reflect.TypeOf(int16(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { return reflect.TypeOf(int32(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { - return reflect.TypeOf(uint32(0)) + return reflect.TypeOf(uint8(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { - return reflect.TypeOf(uint32(0)) + return reflect.TypeOf(uint16(0)) }else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { return reflect.TypeOf(uint32(0)) @@ -139,11 +139,11 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_8 { - v := int32(0) + v := int8(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_16 { - v := int32(0) + v := int16(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_INT_32 { @@ -151,11 +151,11 @@ func ParquetTypeToGoReflectType(pT *parquet.Type, cT *parquet.ConvertedType, rT return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_8 { - v := uint32(0) + v := uint8(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_16 { - v := uint32(0) + v := uint16(0) return reflect.TypeOf(&v) } else if *pT == parquet.Type_INT32 && *cT == parquet.ConvertedType_UINT_32 { @@ -216,10 +216,14 @@ func ParquetTypeToGoType(src interface{}, pT *parquet.Type, cT *parquet.Converte return src } - if *cT == parquet.ConvertedType_UINT_8 { - return uint32(src.(int32)) + if *cT == parquet.ConvertedType_INT_8 { + return int8(src.(int32)) + } else if *cT == parquet.ConvertedType_INT_16 { + return int16(src.(int32)) + } else if *cT == parquet.ConvertedType_UINT_8 { + return uint8(src.(int32)) } else if *cT == parquet.ConvertedType_UINT_16 { - return uint32(src.(int32)) + return uint16(src.(int32)) } else if *cT == parquet.ConvertedType_UINT_32 { return uint32(src.(int32)) } else if *cT == parquet.ConvertedType_UINT_64 { @@ -234,10 +238,14 @@ func GoTypeToParquetType(src interface{}, pT *parquet.Type, cT *parquet.Converte return src } - if *cT == parquet.ConvertedType_UINT_8 { - return int32(src.(uint32)) + if *cT == parquet.ConvertedType_INT_8 { + return int32(src.(int8)) + }else if *cT == parquet.ConvertedType_INT_16 { + return int32(src.(int16)) + } else if *cT == parquet.ConvertedType_UINT_8 { + return int32(src.(uint8)) } else if *cT == parquet.ConvertedType_UINT_16 { - return int32(src.(uint32)) + return int32(src.(uint16)) } else if *cT == parquet.ConvertedType_UINT_32 { return int32(src.(uint32)) } else if *cT == parquet.ConvertedType_UINT_64 { From 9a06f95266a18a1c3475594c6cd460e3c7d7c7fe Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 11:11:08 +0800 Subject: [PATCH 5/7] refactor types --- types/types.go | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/types/types.go b/types/types.go index 5c6a0362..bec38c5c 100644 --- a/types/types.go +++ b/types/types.go @@ -299,16 +299,40 @@ func StrToParquetType(s string, pT *parquet.Type, cT *parquet.ConvertedType, len if *cT == parquet.ConvertedType_UTF8 { return s - } else if *cT == parquet.ConvertedType_INT_8 || *cT == parquet.ConvertedType_INT_16 || *cT == parquet.ConvertedType_INT_32 || - *cT == parquet.ConvertedType_DATE || *cT == parquet.ConvertedType_TIME_MILLIS { + } else if *cT == parquet.ConvertedType_INT_8 { + var v int8 + fmt.Sscanf(s, "%d", &v) + return v + + } else if *cT == parquet.ConvertedType_INT_16 { + var v int16 + fmt.Sscanf(s, "%d", &v) + return v + + } else if *cT == parquet.ConvertedType_INT_32 { var v int32 fmt.Sscanf(s, "%d", &v) return v - } else if *cT == parquet.ConvertedType_UINT_8 || *cT == parquet.ConvertedType_UINT_16 || *cT == parquet.ConvertedType_UINT_32 { - var vt uint32 - fmt.Sscanf(s, "%d", &vt) - return int32(vt) + } else if *cT == parquet.ConvertedType_UINT_8 { + var v uint8 + fmt.Sscanf(s, "%d", &v) + return v + + } else if *cT == parquet.ConvertedType_UINT_16 { + var v uint16 + fmt.Sscanf(s, "%d", &v) + return v + + } else if *cT == parquet.ConvertedType_UINT_32 { + var v uint32 + fmt.Sscanf(s, "%d", &v) + return v + + } else if *cT == parquet.ConvertedType_DATE || *cT == parquet.ConvertedType_TIME_MILLIS { + var v int32 + fmt.Sscanf(s, "%d", &v) + return v } else if *cT == parquet.ConvertedType_UINT_64 { var vt uint64 From 009ee10970698e8b44f0535e03b8d00decb70fde Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 11:13:44 +0800 Subject: [PATCH 6/7] refactor --- types/types.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/types/types.go b/types/types.go index bec38c5c..ecb8d128 100644 --- a/types/types.go +++ b/types/types.go @@ -302,37 +302,37 @@ func StrToParquetType(s string, pT *parquet.Type, cT *parquet.ConvertedType, len } else if *cT == parquet.ConvertedType_INT_8 { var v int8 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_INT_16 { var v int16 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_INT_32 { var v int32 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_UINT_8 { var v uint8 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_UINT_16 { var v uint16 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_UINT_32 { var v uint32 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_DATE || *cT == parquet.ConvertedType_TIME_MILLIS { var v int32 fmt.Sscanf(s, "%d", &v) - return v + return int32(v) } else if *cT == parquet.ConvertedType_UINT_64 { var vt uint64 From 818e1c36b2c3272f301778a0c4935c9e528c80b4 Mon Sep 17 00:00:00 2001 From: xitongsys Date: Wed, 29 Jan 2020 11:15:13 +0800 Subject: [PATCH 7/7] update readme --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5aa92cc1..f45451ff 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,12 @@ There are two types in Parquet: Primitive Type and Logical Type. Logical types a |BYTE_ARRAY|BYTE_ARRAY|string| |FIXED_LEN_BYTE_ARRAY|FIXED_LEN_BYTE_ARRAY|string| |UTF8|BYTE_ARRAY|string| -|INT_8|INT32|int32| -|INT_16|INT32|int32| +|INT_8|INT32|int8| +|INT_16|INT32|int16| |INT_32|INT32|int32| |INT_64|INT64|int64| -|UINT_8|INT32|uint32| -|UINT_16|INT32|uint32| +|UINT_8|INT32|uint8| +|UINT_16|INT32|uint16| |UINT_32|INT32|uint32| |UINT_64|INT64|uint64| |DATE|INT32|int32|