diff --git a/bigquery/storage/managedwriter/adapt/protoconversion.go b/bigquery/storage/managedwriter/adapt/protoconversion.go index c1c264dfc978..63b81e5072fd 100644 --- a/bigquery/storage/managedwriter/adapt/protoconversion.go +++ b/bigquery/storage/managedwriter/adapt/protoconversion.go @@ -63,6 +63,27 @@ var bqTypeToFieldTypeMap = map[storagepb.TableFieldSchema_Type]descriptorpb.Fiel storagepb.TableFieldSchema_TIMESTAMP: descriptorpb.FieldDescriptorProto_TYPE_INT64, } +// Primitive types which can leverage packed encoding when repeated/arrays. +// +// Note: many/most of these aren't used when doing schema to proto conversion, but +// are included for completeness. +var packedTypes = []descriptorpb.FieldDescriptorProto_Type{ + descriptorpb.FieldDescriptorProto_TYPE_INT32, + descriptorpb.FieldDescriptorProto_TYPE_INT64, + descriptorpb.FieldDescriptorProto_TYPE_UINT32, + descriptorpb.FieldDescriptorProto_TYPE_UINT64, + descriptorpb.FieldDescriptorProto_TYPE_SINT32, + descriptorpb.FieldDescriptorProto_TYPE_SINT64, + descriptorpb.FieldDescriptorProto_TYPE_FIXED32, + descriptorpb.FieldDescriptorProto_TYPE_FIXED64, + descriptorpb.FieldDescriptorProto_TYPE_SFIXED32, + descriptorpb.FieldDescriptorProto_TYPE_SFIXED64, + descriptorpb.FieldDescriptorProto_TYPE_FLOAT, + descriptorpb.FieldDescriptorProto_TYPE_DOUBLE, + descriptorpb.FieldDescriptorProto_TYPE_BOOL, + descriptorpb.FieldDescriptorProto_TYPE_ENUM, +} + // For TableFieldSchema OPTIONAL mode, we use the wrapper types to allow for the // proper representation of NULL values, as proto3 semantics would just use default value. var bqTypeToWrapperMap = map[storagepb.TableFieldSchema_Type]string{ @@ -85,7 +106,7 @@ var wellKnownTypesWrapperName = "google/protobuf/wrappers.proto" // dependencyCache is used to reduce the number of unique messages we generate by caching based on the tableschema. // -// keys are based on the base64-encoded serialized tableschema value. +// Keys are based on the base64-encoded serialized tableschema value. type dependencyCache map[string]protoreflect.MessageDescriptor func (dm dependencyCache) get(schema *storagepb.TableSchema) protoreflect.MessageDescriptor { @@ -143,7 +164,7 @@ func StorageSchemaToProto3Descriptor(inSchema *storagepb.TableSchema, scope stri return storageSchemaToDescriptorInternal(inSchema, scope, &dc, true) } -// internal implementation of the conversion code. +// Internal implementation of the conversion code. func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope string, cache *dependencyCache, useProto3 bool) (protoreflect.MessageDescriptor, error) { if inSchema == nil { return nil, newConversionError(scope, fmt.Errorf("no input schema was provided")) @@ -170,11 +191,11 @@ func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope st break } } - // if dep is missing, add to current dependencies + // If dep is missing, add to current dependencies. if !haveDep { deps = append(deps, foundDesc.ParentFile()) } - // construct field descriptor for the message + // Construct field descriptor for the message. fdp, err := tableFieldSchemaToFieldDescriptorProto(f, fNumber, string(foundDesc.FullName()), useProto3) if err != nil { return nil, newConversionError(scope, fmt.Errorf("couldn't convert field to FieldDescriptorProto: %w", err)) @@ -277,12 +298,25 @@ func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, i // For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types. if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 { - return &descriptorpb.FieldDescriptorProto{ + outType := bqTypeToFieldTypeMap[field.GetType()] + fdp := &descriptorpb.FieldDescriptorProto{ Name: proto.String(name), Number: proto.Int32(idx), - Type: bqTypeToFieldTypeMap[field.GetType()].Enum(), + Type: outType.Enum(), Label: convertModeToLabel(field.GetMode(), useProto3), - }, nil + } + // Special case: proto2 repeated fields may benefit from using packed annotation. + if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 { + for _, v := range packedTypes { + if outType == v { + fdp.Options = &descriptorpb.FieldOptions{ + Packed: proto.Bool(true), + } + break + } + } + } + return fdp, nil } // For NULLABLE proto3 fields, use a wrapper type. return &descriptorpb.FieldDescriptorProto{ @@ -445,7 +479,7 @@ func normalizeName(in string) string { return strings.Replace(in, ".", "_", -1) } -// these types don't get normalized into the fully-contained structure. +// These types don't get normalized into the fully-contained structure. var normalizationSkipList = []string{ /* TODO: when backend supports resolving well known types, this list should be enabled. diff --git a/bigquery/storage/managedwriter/adapt/protoconversion_test.go b/bigquery/storage/managedwriter/adapt/protoconversion_test.go index aaf6b4cb6630..8444b6f0d38d 100644 --- a/bigquery/storage/managedwriter/adapt/protoconversion_test.go +++ b/bigquery/storage/managedwriter/adapt/protoconversion_test.go @@ -384,6 +384,35 @@ func TestSchemaToProtoConversion(t *testing.T) { }, }, }, + { + description: "repeated w/packed", + bq: &storagepb.TableSchema{ + Fields: []*storagepb.TableFieldSchema{ + {Name: "name", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_NULLABLE}, + {Name: "some_lengths", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_REPEATED}, + {Name: "nicknames", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_REPEATED}, + }}, + wantProto2: &descriptorpb.DescriptorProto{ + Name: proto.String("root"), + Field: []*descriptorpb.FieldDescriptorProto{ + { + Name: proto.String("name"), + Number: proto.Int32(1), + Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()}, + { + Name: proto.String("some_lengths"), + Number: proto.Int32(2), + Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(), + Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum(), + Options: &descriptorpb.FieldOptions{ + Packed: proto.Bool(true), + }, + }, + {Name: proto.String("nicknames"), Number: proto.Int32(3), Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(), Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum()}, + }, + }, + }, } for _, tc := range testCases { // Proto2