From fff9beb4d8f641953c51bac6bc3d725a5c1760c1 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Sun, 5 Dec 2021 16:17:59 +0200 Subject: [PATCH 01/20] add namespace to avro record type --- .../s3/avro/JsonToAvroSchemaConverter.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index e723beacc1c8..ba9ba53f2bb6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -9,6 +9,7 @@ import com.google.common.base.Preconditions; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; + import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -46,6 +47,7 @@ public class JsonToAvroSchemaConverter { .addToSchema(Schema.create(Schema.Type.LONG)); private final Map standardizedNames = new HashMap<>(); + private final List recordFieldNames = new ArrayList<>(); static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { return getTypes(fieldName, fieldDefinition).stream() @@ -150,6 +152,7 @@ public Schema getAvroSchema(final JsonNode jsonSchema, AvroConstants.DOC_KEY_VALUE_DELIMITER, fieldName)); } + assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) .withDefault(null); } @@ -207,13 +210,21 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType String.format("Array field %s has invalid items property: %s", fieldName, items)); } } - case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, null, false); + case OBJECT -> { + final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); + recordFieldNames.add(stdName); + fieldSchema = getAvroSchema(fieldDefinition, fieldName, resovleNamespace(stdName), false); + } default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } return fieldSchema; } + private String resovleNamespace(String fieldName) { + return fieldName.concat("_").concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); + } + List getSchemasFromTypes(final String fieldName, final ArrayNode types) { return MoreIterators.toList(types.elements()) .stream() From 2f259cd39841ce99c6806cc6bf820e603e42f90d Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Mon, 6 Dec 2021 10:29:03 +0200 Subject: [PATCH 02/20] refactoring --- .../destination/s3/avro/AvroConstants.java | 1 + .../destination/s3/avro/AvroNameTransformer.java | 13 +++++++++++++ .../s3/avro/JsonToAvroSchemaConverter.java | 15 +++++++++++---- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java index 50b9012fbbd9..f47e79979bce 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java @@ -14,6 +14,7 @@ public class AvroConstants { public static final String DOC_KEY_ORIGINAL_NAME = "_airbyte_original_name"; public static final String AVRO_EXTRA_PROPS_FIELD = "_airbyte_additional_properties"; + public static final String AVRO_RECORD_NAMESPACE = "_airbyte_avro_namespace"; // This set must include _ab_additional_col in source_s3/source_files_abstract/stream.py public static final Set JSON_EXTRA_PROPS_FIELDS = Set.of("_ab_additional_properties", AVRO_EXTRA_PROPS_FIELD); public static final AvroNameTransformer NAME_TRANSFORMER = new AvroNameTransformer(); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index c1dc15a076d1..d56e217bb403 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -6,6 +6,11 @@ import io.airbyte.integrations.destination.ExtendedNameTransformer; +import java.util.Collections; +import java.util.List; + +import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; + public class AvroNameTransformer extends ExtendedNameTransformer { @Override @@ -26,4 +31,12 @@ private String checkFirsCharInStreamName(final String name) { } } + public String resovleNamespace(String fieldName, List recordFieldNames) { + return AVRO_RECORD_NAMESPACE + .concat(".") + .concat(fieldName) + .concat("_") + .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); + } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index ba9ba53f2bb6..2af9c2a6ed55 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -27,6 +27,8 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; +import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; + /** * The main function of this class is to convert a JsonSchema to Avro schema. It can also * standardize schema names, and keep track of a mapping from the original names to the standardized @@ -213,7 +215,8 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType case OBJECT -> { final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); recordFieldNames.add(stdName); - fieldSchema = getAvroSchema(fieldDefinition, fieldName, resovleNamespace(stdName), false); + fieldSchema = getAvroSchema(fieldDefinition, fieldName, + AvroConstants.NAME_TRANSFORMER.resovleNamespace(stdName, recordFieldNames), false); } default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); @@ -221,9 +224,13 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType return fieldSchema; } - private String resovleNamespace(String fieldName) { - return fieldName.concat("_").concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); - } +// private String resovleNamespace(String fieldName) { +// return AVRO_RECORD_NAMESPACE +// .concat(".") +// .concat(fieldName) +// .concat("_") +// .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); +// } List getSchemasFromTypes(final String fieldName, final ArrayNode types) { return MoreIterators.toList(types.elements()) From 487360d6ff0219979d4f1ed301ee51f0037a8832 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Tue, 7 Dec 2021 11:46:11 +0200 Subject: [PATCH 03/20] Add unit tests --- .../s3/avro/AvroNameTransformer.java | 13 +- .../s3/avro/JsonToAvroSchemaConverter.java | 13 +- .../json_conversion_test_cases.json | 367 +++++++++++++++++- .../type_conversion_test_cases.json | 61 ++- 4 files changed, 407 insertions(+), 47 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index d56e217bb403..e223155e858c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -4,13 +4,12 @@ package io.airbyte.integrations.destination.s3.avro; -import io.airbyte.integrations.destination.ExtendedNameTransformer; +import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; +import io.airbyte.integrations.destination.ExtendedNameTransformer; import java.util.Collections; import java.util.List; -import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; - public class AvroNameTransformer extends ExtendedNameTransformer { @Override @@ -33,10 +32,10 @@ private String checkFirsCharInStreamName(final String name) { public String resovleNamespace(String fieldName, List recordFieldNames) { return AVRO_RECORD_NAMESPACE - .concat(".") - .concat(fieldName) - .concat("_") - .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); + .concat(".") + .concat(fieldName) + .concat("_") + .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 2af9c2a6ed55..342e48332f19 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -9,7 +9,6 @@ import com.google.common.base.Preconditions; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; - import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -27,8 +26,6 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; -import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; - /** * The main function of this class is to convert a JsonSchema to Avro schema. It can also * standardize schema names, and keep track of a mapping from the original names to the standardized @@ -216,7 +213,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); recordFieldNames.add(stdName); fieldSchema = getAvroSchema(fieldDefinition, fieldName, - AvroConstants.NAME_TRANSFORMER.resovleNamespace(stdName, recordFieldNames), false); + AvroConstants.NAME_TRANSFORMER.resovleNamespace(stdName, recordFieldNames), false); } default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); @@ -224,14 +221,6 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType return fieldSchema; } -// private String resovleNamespace(String fieldName) { -// return AVRO_RECORD_NAMESPACE -// .concat(".") -// .concat(fieldName) -// .concat("_") -// .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); -// } - List getSchemasFromTypes(final String fieldName, final ArrayNode types) { return MoreIterators.toList(types.elements()) .stream() diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index b027dae755a4..379b45c06352 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -26,7 +26,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -83,7 +89,7 @@ { "type": "record", "name": "user", - "namespace": "", + "namespace": "_airbyte_avro_namespace.user_1", "fields": [ { "name": "first_name", @@ -97,7 +103,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -107,7 +119,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -165,7 +183,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -206,7 +230,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -243,7 +273,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -289,7 +325,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -346,7 +388,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -390,7 +438,10 @@ "name": "created_at", "type": [ "null", - { "type": "long", "logicalType": "timestamp-micros" }, + { + "type": "long", + "logicalType": "timestamp-micros" + }, "string", "int" ], @@ -398,7 +449,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -452,13 +509,16 @@ { "type": "record", "name": "user", - "namespace": "", + "namespace": "_airbyte_avro_namespace.user_2", "fields": [ { "name": "created_at", "type": [ "null", - { "type": "long", "logicalType": "timestamp-micros" }, + { + "type": "long", + "logicalType": "timestamp-micros" + }, "string", "int" ], @@ -466,7 +526,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -476,7 +542,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -499,7 +571,14 @@ "type": "array", "items": [ { - "oneOf": [{ "type": "integer" }, { "type": "string" }] + "oneOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] }, { "type": "boolean" @@ -529,7 +608,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -567,7 +652,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -612,7 +703,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -661,7 +758,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -694,7 +797,13 @@ "fields": [ { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -706,5 +815,221 @@ "active": "true" } } + }, + { + "schemaName": "schema_with_the_same_object_names", + "namespace": "namespace15", + "appendAirbyteFields": false, + "jsonSchema": { + "type": "object", + "properties": { + "author": { + "type": "object", + "properties": { + "login": { + "type": ["null", "string"] + }, + "id": { + "type": ["null", "integer"] + }, + "node_id": { + "type": ["null", "string"] + } + } + }, + "commit": { + "type": ["null", "object"], + "properties": { + "author": { + "type": ["null", "object"], + "properties": { + "name": { + "type": ["null", "string"] + }, + "email": { + "type": ["null", "string"] + }, + "date": { + "type": ["null", "string"], + "format": "date-time" + } + } + }, + "message": { + "type": ["null", "string"] + } + } + } + } + }, + "jsonObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123" + }, + "commit": { + "message": "test commit message", + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": "2021-01-01T01:01:01+01:00" + } + } + }, + "avroSchema": { + "type": "record", + "name": "schema_with_the_same_object_names", + "namespace": "namespace15", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_1", + "fields": [ + { + "name": "login", + "type": ["null", "string"], + "default": null + }, + { + "name": "id", + "type": ["null", "int"], + "default": null + }, + { + "name": "node_id", + "type": ["null", "string"], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "commit", + "type": [ + "null", + { + "type": "record", + "name": "commit", + "namespace": "_airbyte_avro_namespace.commit_1", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_2", + "fields": [ + { + "name": "name", + "type": ["null", "string"], + "default": null + }, + { + "name": "email", + "type": ["null", "string"], + "default": null + }, + { + "name": "date", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + }, + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "message", + "type": ["null", "string"], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + }, + "avroObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123", + "_airbyte_additional_properties": null + }, + "commit": { + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": 1609459261000000, + "_airbyte_additional_properties": null + }, + "message": "test commit message", + "_airbyte_additional_properties": null + }, + "_airbyte_additional_properties": null + } } ] diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json index 3171888d2734..6acbdaab5a47 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json @@ -89,6 +89,7 @@ { "type": "record", "name": "object_field", + "namespace": "_airbyte_avro_namespace.object_field_1", "fields": [ { "name": "id", @@ -102,7 +103,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -119,10 +126,17 @@ { "type": "record", "name": "object_field_without_properties", + "namespace": "_airbyte_avro_namespace.object_field_without_properties_1", "fields": [ { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -134,33 +148,66 @@ "jsonFieldSchema": { "type": "object" }, - "avroFieldType": ["null", { "type": "map", "values": "string" }] + "avroFieldType": [ + "null", + { + "type": "map", + "values": "string" + } + ] }, { "fieldName": "_ab_additional_properties", "jsonFieldSchema": { "type": "object" }, - "avroFieldType": ["null", { "type": "map", "values": "string" }] + "avroFieldType": [ + "null", + { + "type": "map", + "values": "string" + } + ] }, { "fieldName": "any_of_field", "jsonFieldSchema": { - "anyOf": [{ "type": "string" }, { "type": "integer" }] + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] }, "avroFieldType": ["null", "string", "int"] }, { "fieldName": "all_of_field", "jsonFieldSchema": { - "allOf": [{ "type": "string" }, { "type": "integer" }] + "allOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] }, "avroFieldType": ["null", "string", "int"] }, { "fieldName": "one_of_field", "jsonFieldSchema": { - "oneOf": [{ "type": "string" }, { "type": "integer" }] + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] }, "avroFieldType": ["null", "string", "int"] } From f75c85ee3cd6cf193e171942e3fa3e45b1180f6d Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Tue, 7 Dec 2021 11:56:50 +0200 Subject: [PATCH 04/20] added CHANGELOG --- docs/integrations/destinations/s3.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index cf5d9dbec13c..338ea5eac970 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -224,6 +224,8 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | +| 0.1.16 | 2021-12-07 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types | +| 0.1.15 | 2021-12-03 | [\#9999](https://github.com/airbytehq/airbyte/pull/9999) | Remove excessive logging for Avro and Parquet invalid date strings. | | 0.1.14 | 2021-11-09 | [\#7732](https://github.com/airbytehq/airbyte/pull/7732) | Support timestamp in Avro and Parquet | | 0.1.13 | 2021-11-03 | [\#7288](https://github.com/airbytehq/airbyte/issues/7288) | Support Json `additionalProperties`. | | 0.1.12 | 2021-09-13 | [\#5720](https://github.com/airbytehq/airbyte/issues/5720) | Added configurable block size for stream. Each stream is limited to 10,000 by S3 | From 8ca116542713480effcf676dca4581ce97cf2411 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Wed, 8 Dec 2021 11:31:21 +0200 Subject: [PATCH 05/20] fix typo in method name --- .../integrations/destination/s3/avro/AvroNameTransformer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index e223155e858c..d781913ca3b1 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -30,7 +30,7 @@ private String checkFirsCharInStreamName(final String name) { } } - public String resovleNamespace(String fieldName, List recordFieldNames) { + public String resolveNamespace(String fieldName, List recordFieldNames) { return AVRO_RECORD_NAMESPACE .concat(".") .concat(fieldName) From dcde0620936265af2ec8154a620c44244633d3f1 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Wed, 8 Dec 2021 12:01:48 +0200 Subject: [PATCH 06/20] fix typo in method name --- .../destination/s3/avro/JsonToAvroSchemaConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 342e48332f19..4e1acc18a2c9 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -213,7 +213,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); recordFieldNames.add(stdName); fieldSchema = getAvroSchema(fieldDefinition, fieldName, - AvroConstants.NAME_TRANSFORMER.resovleNamespace(stdName, recordFieldNames), false); + AvroConstants.NAME_TRANSFORMER.resolveNamespace(stdName, recordFieldNames), false); } default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); From 93d53ac478146d7c5c5b7ef62585c90a4021dd83 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Wed, 15 Dec 2021 10:45:04 +0200 Subject: [PATCH 07/20] fix for jdk 17 --- .../integrations/destination/snowflake/SnowflakeDatabase.java | 2 ++ docs/integrations/destinations/snowflake.md | 3 +++ 2 files changed, 5 insertions(+) diff --git a/airbyte-integrations/connectors/destination-snowflake/src/main/java/io/airbyte/integrations/destination/snowflake/SnowflakeDatabase.java b/airbyte-integrations/connectors/destination-snowflake/src/main/java/io/airbyte/integrations/destination/snowflake/SnowflakeDatabase.java index c484a78452af..a1548c162373 100644 --- a/airbyte-integrations/connectors/destination-snowflake/src/main/java/io/airbyte/integrations/destination/snowflake/SnowflakeDatabase.java +++ b/airbyte-integrations/connectors/destination-snowflake/src/main/java/io/airbyte/integrations/destination/snowflake/SnowflakeDatabase.java @@ -42,6 +42,8 @@ public static Connection getConnection(final JsonNode config) throws SQLExceptio // https://docs.snowflake.com/en/user-guide/jdbc-parameters.html#application // identify airbyte traffic to snowflake to enable partnership & optimization opportunities properties.put("application", "airbyte"); + // Needed for JDK17 - see https://stackoverflow.com/questions/67409650/snowflake-jdbc-driver-internal-error-fail-to-retrieve-row-count-for-first-arrow + properties.put("JDBC_QUERY_RESULT_FORMAT", "JSON"); return DriverManager.getConnection(connectUrl, properties); } diff --git a/docs/integrations/destinations/snowflake.md b/docs/integrations/destinations/snowflake.md index b84bbc84f841..74e4db39efbb 100644 --- a/docs/integrations/destinations/snowflake.md +++ b/docs/integrations/destinations/snowflake.md @@ -152,6 +152,8 @@ By default, Airbyte uses batches of `INSERT` commands to add data to a temporary Internal named stages are storage location objects within a Snowflake database/schema. Because they are database objects, the same security permissions apply as with any other database objects. No need to provide additional properties for internal staging +**Operating on a stage also requires the USAGE privilege on the parent database and schema.** + ### AWS S3 For AWS S3, you will need to create a bucket and provide credentials to access the bucket. We recommend creating a bucket that is only used for Airbyte to stage data to Snowflake. Airbyte needs read/write access to interact with this bucket. @@ -194,6 +196,7 @@ Finally, you need to add read/write permissions to your bucket with that email. | Version | Date | Pull Request | Subject | | :------ | :-------- | :----- | :------ | +| 0.3.21 | 2021-12-15 | [#8781](https://github.com/airbytehq/airbyte/pull/8781) | Updated check method to verify permissions to create/drop stage for internal staging; compatibility fix for Java 17 | | 0.3.20 | 2021-12-10 | [#8562](https://github.com/airbytehq/airbyte/pull/8562) | Moving classes around for better dependency management; compatibility fix for Java 17 | | 0.3.19 | 2021-12-06 | [#8528](https://github.com/airbytehq/airbyte/pull/8528) | Set Internal Staging as default choice | | 0.3.18 | 2021-11-26 | [#8253](https://github.com/airbytehq/airbyte/pull/8253) | Snowflake Internal Staging Support | From 8ad0a653e11096b5fa035d5efb17dcf9aeb9e5aa Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Thu, 16 Dec 2021 10:45:57 +0200 Subject: [PATCH 08/20] created recursive keys adding --- .../gcs/writer/ProductionWriterFactory.java | 2 +- .../connectors/destination-s3/Dockerfile | 1 + .../s3/avro/JsonToAvroSchemaConverter.java | 65 +- .../destination/s3/util/AvroRecordHelper.java | 2 +- .../s3/writer/ProductionWriterFactory.java | 2 +- .../s3/avro/JsonToAvroConverterTest.java | 2 +- .../json_conversion_test_cases.json | 1065 +++++++++++++---- 7 files changed, 874 insertions(+), 265 deletions(-) diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java index 45d1e334d2dc..6b9faae5fae9 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/writer/ProductionWriterFactory.java @@ -38,7 +38,7 @@ public S3Writer create(final GcsDestinationConfig config, LOGGER.info("Json schema for stream {}: {}", stream.getName(), stream.getJsonSchema()); final JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true); + final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true, true); LOGGER.info("Avro schema for stream {}: {}", stream.getName(), avroSchema.toString(false)); diff --git a/airbyte-integrations/connectors/destination-s3/Dockerfile b/airbyte-integrations/connectors/destination-s3/Dockerfile index e68bba6018a1..8aa37e692503 100644 --- a/airbyte-integrations/connectors/destination-s3/Dockerfile +++ b/airbyte-integrations/connectors/destination-s3/Dockerfile @@ -2,6 +2,7 @@ FROM airbyte/integration-base-java:dev WORKDIR /airbyte ENV APPLICATION destination-s3 +#ENV JAVA_TOOL_OPTIONS -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005 COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 4e1acc18a2c9..283363368e8b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -6,15 +6,13 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.databind.node.ValueNode; import com.google.common.base.Preconditions; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; + +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; @@ -47,6 +45,7 @@ public class JsonToAvroSchemaConverter { private final Map standardizedNames = new HashMap<>(); private final List recordFieldNames = new ArrayList<>(); + Map jsonNodeStringMap = new HashMap<>(); static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { return getTypes(fieldName, fieldDefinition).stream() @@ -100,9 +99,14 @@ public Map getStandardizedNames() { public Schema getAvroSchema(final JsonNode jsonSchema, final String name, @Nullable final String namespace, - final boolean appendAirbyteFields) { + final boolean appendAirbyteFields, + final boolean rootNode) { final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); RecordBuilder builder = SchemaBuilder.record(stdName); + if(rootNode) { + Map schemaMap = new HashMap<>(); + addKeys("", jsonSchema, schemaMap, jsonNodeStringMap); + } if (!stdName.equals(name)) { standardizedNames.put(name, stdName); LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, @@ -211,9 +215,21 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType } case OBJECT -> { final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); + String s = jsonNodeStringMap.get(fieldDefinition); + String path = null; + if(s!=null){ + path = Arrays.stream(s.split("/")) + .filter(key -> !key.isBlank()) + .filter(key -> !key.equals("items")) + .filter(key -> !key.equals("properties")) + .filter(key -> !key.equals("format")) + .collect(Collectors.joining(".")); +// .collect(Collectors.toList()); + int i =9; + } recordFieldNames.add(stdName); fieldSchema = getAvroSchema(fieldDefinition, fieldName, - AvroConstants.NAME_TRANSFORMER.resolveNamespace(stdName, recordFieldNames), false); + path, false,false); } default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); @@ -273,4 +289,37 @@ Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinit } } + private static void addKeys(String currentPath, JsonNode jsonNode, Map map, Map jsonNodeStringMap) { + if (jsonNode.isObject()) { + ObjectNode objectNode = (ObjectNode) jsonNode; + Iterator> iter = objectNode.fields(); + String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; + jsonNodeStringMap.put(jsonNode,pathPrefix); + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, jsonNodeStringMap); + } + } else if (jsonNode.isArray()) { + ArrayNode arrayNode = (ArrayNode) jsonNode; + + for (int i = 0; i < arrayNode.size(); i++) { + String arrayPath = currentPath + "/" + i; + addKeys(arrayPath, arrayNode.get(i), map,jsonNodeStringMap); + } + + } else if (jsonNode.isValueNode()) { + ValueNode valueNode = (ValueNode) jsonNode; +// if (jsonSchema) { +// if (schemaContainsProperties(currentPath, valueNode, "format", List.of("date", "date-time", "time"))) { + map.put("/" + currentPath, valueNode.asText()); +// } +// } else { +// String value = valueNode.asText(); +// if (!value.equals("null") && !value.isBlank() && !Boolean.parseBoolean(value)) { +// map.put("/" + currentPath, value); +// } + } +// } + } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java index ad2e42efc64b..33d32ed742f3 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java @@ -18,7 +18,7 @@ public class AvroRecordHelper { public static JsonFieldNameUpdater getFieldNameUpdater(final String streamName, final String namespace, final JsonNode streamSchema) { final JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true); + schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true, true); return new JsonFieldNameUpdater(schemaConverter.getStandardizedNames()); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java index 39041093271c..55969edba75c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java @@ -37,7 +37,7 @@ public S3Writer create(final S3DestinationConfig config, LOGGER.info("Json schema for stream {}: {}", stream.getName(), stream.getJsonSchema()); final JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true); + final Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true, true); LOGGER.info("Avro schema for stream {}: {}", stream.getName(), avroSchema.toString(false)); diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java index 2a90a03c2582..2f5584e404ae 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroConverterTest.java @@ -110,7 +110,7 @@ public void testJsonAvroConversion(final String schemaName, final JsonNode avroSchema, final JsonNode avroObject) throws Exception { - final Schema actualAvroSchema = SCHEMA_CONVERTER.getAvroSchema(jsonSchema, schemaName, namespace, appendAirbyteFields); + final Schema actualAvroSchema = SCHEMA_CONVERTER.getAvroSchema(jsonSchema, schemaName, namespace, appendAirbyteFields, true); assertEquals( avroSchema, Jsons.deserialize(actualAvroSchema.toString()), diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index 9a53a42a257e..d23ed51743b9 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -1,4 +1,284 @@ -[ +[{ + "schemaName": "schema_with_the_same_object_names_and_inner", + "namespace": "namespace17", + "appendAirbyteFields": false, + "jsonSchema": { + "type": "object", + "properties": { + "author": { + "type": "object", + "properties": { + "login": { + "type": [ + "null", + "string" + ] + }, + "id": { + "type": [ + "null", + "integer" + ] + }, + "node_id": { + "type": [ + "null", + "string" + ] + } + } + }, + "commit": { + "type": [ + "null", + "object" + ], + "properties": { + "author": { + "type": [ + "null", + "object" + ], + "properties": { + "name": { + "type": [ + "null", + "string" + ] + }, + "email": { + "type": [ + "null", + "string" + ] + }, + "date": { + "type": [ + "null", + "string" + ], + "format": "date-time" + }, + "pr": { + "type": [ + "null", + "object" + ], + "properties": { + "test1": { + "type": [ + "null", + "string" + ] + }, + "test2": { + "type": [ + "null", + "string" + ] + } + } + } + } + }, + "message": { + "type": [ + "null", + "string" + ] + } + } + } + } + }, + "jsonObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123" + }, + "commit": { + "message": "test commit message", + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": "2021-01-01T01:01:01+01:00" + } + } + }, + "avroSchema": { + "type": "record", + "name": "schema_with_the_same_object_names", + "namespace": "namespace16", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_1", + "fields": [ + { + "name": "login", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "id", + "type": [ + "null", + "int" + ], + "default": null + }, + { + "name": "node_id", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "commit", + "type": [ + "null", + { + "type": "record", + "name": "commit", + "namespace": "_airbyte_avro_namespace.commit_1", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_2", + "fields": [ + { + "name": "name", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "email", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "date", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + }, + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "message", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + }, + "avroObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123", + "_airbyte_additional_properties": null + }, + "commit": { + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": 1609459261000000, + "_airbyte_additional_properties": null + }, + "message": "test commit message", + "_airbyte_additional_properties": null + }, + "_airbyte_additional_properties": null + } +}, { "schemaName": "simple_schema", "namespace": "namespace1", @@ -7,7 +287,10 @@ "type": "object", "properties": { "node_id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] } } }, @@ -21,12 +304,21 @@ "fields": [ { "name": "node_id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -44,10 +336,16 @@ "type": "object", "properties": { "node_id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] }, "user": { - "type": ["null", "object"], + "type": [ + "null", + "object" + ], "properties": { "first_name": { "type": "string" @@ -73,7 +371,10 @@ "fields": [ { "name": "node_id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { @@ -87,17 +388,29 @@ "fields": [ { "name": "first_name", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "last_name", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -107,7 +420,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -130,7 +449,10 @@ "type": "object", "properties": { "node_id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] } } }, @@ -160,12 +482,21 @@ }, { "name": "node_id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -185,7 +516,10 @@ "type": "object", "properties": { "node:id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] } } }, @@ -201,12 +535,21 @@ { "name": "node_id", "doc": "_airbyte_original_name:node:id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -224,7 +567,11 @@ "type": "object", "properties": { "identifier": { - "type": ["null", "number", "string"] + "type": [ + "null", + "number", + "string" + ] } } }, @@ -238,12 +585,22 @@ "fields": [ { "name": "identifier", - "type": ["null", "double", "string"], + "type": [ + "null", + "double", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -269,7 +626,10 @@ } }, "jsonObject": { - "identifier": ["151", "152"] + "identifier": [ + "151", + "152" + ] }, "avroSchema": { "type": "record", @@ -282,20 +642,32 @@ "null", { "type": "array", - "items": ["null", "string"] + "items": [ + "null", + "string" + ] } ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] }, "avroObject": { - "identifier": ["151", "152"], + "identifier": [ + "151", + "152" + ], "_airbyte_additional_properties": null } }, @@ -326,7 +698,12 @@ } }, "jsonObject": { - "identifiers": ["151", 152, true, false] + "identifiers": [ + "151", + 152, + true, + false + ] }, "avroSchema": { "type": "record", @@ -339,20 +716,36 @@ "null", { "type": "array", - "items": ["null", "string", "int", "boolean"] + "items": [ + "null", + "string", + "int", + "boolean" + ] } ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] }, "avroObject": { - "identifiers": ["151", 152, true, false], + "identifiers": [ + "151", + 152, + true, + false + ], "_airbyte_additional_properties": null } }, @@ -369,7 +762,10 @@ "format": "date-time" }, { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] }, { "type": "integer" @@ -390,7 +786,10 @@ "name": "created_at", "type": [ "null", - { "type": "long", "logicalType": "timestamp-micros" }, + { + "type": "long", + "logicalType": "timestamp-micros" + }, "string", "int" ], @@ -398,7 +797,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -424,7 +829,10 @@ "format": "date-time" }, { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] }, { "type": "integer" @@ -458,7 +866,10 @@ "name": "created_at", "type": [ "null", - { "type": "long", "logicalType": "timestamp-micros" }, + { + "type": "long", + "logicalType": "timestamp-micros" + }, "string", "int" ], @@ -466,7 +877,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -476,7 +893,13 @@ }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -499,7 +922,14 @@ "type": "array", "items": [ { - "oneOf": [{ "type": "integer" }, { "type": "string" }] + "oneOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] }, { "type": "boolean" @@ -509,7 +939,13 @@ } }, "jsonObject": { - "identifiers": [151, 152, "153", true, false] + "identifiers": [ + 151, + 152, + "153", + true, + false + ] }, "avroSchema": { "type": "record", @@ -522,20 +958,37 @@ "null", { "type": "array", - "items": ["null", "int", "string", "boolean"] + "items": [ + "null", + "int", + "string", + "boolean" + ] } ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] }, "avroObject": { - "identifiers": [151, 152, "153", true, false], + "identifiers": [ + 151, + 152, + "153", + true, + false + ], "_airbyte_additional_properties": null } }, @@ -547,7 +1000,10 @@ "type": "object", "properties": { "5field_name": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] } } }, @@ -561,13 +1017,22 @@ "fields": [ { "name": "_5field_name", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "doc": "_airbyte_original_name:5field_name", "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -585,7 +1050,10 @@ "type": "object", "properties": { "node_id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] }, "_airbyte_additional_properties": { "type": "object" @@ -607,12 +1075,21 @@ "fields": [ { "name": "node_id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -634,7 +1111,10 @@ "type": "object", "properties": { "node_id": { - "type": ["null", "string"] + "type": [ + "null", + "string" + ] }, "_ab_additional_properties": { "type": "object" @@ -656,12 +1136,21 @@ "fields": [ { "name": "node_id", - "type": ["null", "string"], + "type": [ + "null", + "string" + ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -694,7 +1183,13 @@ "fields": [ { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -742,28 +1237,47 @@ "name": "date_time_field", "type": [ "null", - { "type": "long", "logicalType": "timestamp-micros" }, + { + "type": "long", + "logicalType": "timestamp-micros" + }, "string" ], "default": null }, { "name": "date_field", - "type": ["null", { "type": "int", "logicalType": "date" }, "string"], + "type": [ + "null", + { + "type": "int", + "logicalType": "date" + }, + "string" + ], "default": null }, { "name": "time_field", "type": [ "null", - { "type": "long", "logicalType": "time-micros" }, + { + "type": "long", + "logicalType": "time-micros" + }, "string" ], "default": null }, { "name": "_airbyte_additional_properties", - "type": ["null", { "type": "map", "values": "string" }], + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], "default": null } ] @@ -776,219 +1290,264 @@ } }, { - "schemaName": "schema_with_the_same_object_names", - "namespace": "namespace16", - "appendAirbyteFields": false, - "jsonSchema": { - "type": "object", - "properties": { - "author": { - "type": "object", - "properties": { - "login": { - "type": ["null", "string"] - }, - "id": { - "type": ["null", "integer"] - }, - "node_id": { - "type": ["null", "string"] + "schemaName": "schema_with_the_same_object_names", + "namespace": "namespace16", + "appendAirbyteFields": false, + "jsonSchema": { + "type": "object", + "properties": { + "author": { + "type": "object", + "properties": { + "login": { + "type": [ + "null", + "string" + ] + }, + "id": { + "type": [ + "null", + "integer" + ] + }, + "node_id": { + "type": [ + "null", + "string" + ] + } } - } - }, - "commit": { - "type": ["null", "object"], - "properties": { - "author": { - "type": ["null", "object"], - "properties": { - "name": { - "type": ["null", "string"] - }, - "email": { - "type": ["null", "string"] - }, - "date": { - "type": ["null", "string"], - "format": "date-time" + }, + "commit": { + "type": [ + "null", + "object" + ], + "properties": { + "author": { + "type": [ + "null", + "object" + ], + "properties": { + "name": { + "type": [ + "null", + "string" + ] + }, + "email": { + "type": [ + "null", + "string" + ] + }, + "date": { + "type": [ + "null", + "string" + ], + "format": "date-time" + } } + }, + "message": { + "type": [ + "null", + "string" + ] } - }, - "message": { - "type": ["null", "string"] } } } - } - }, - "jsonObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123" }, - "commit": { - "message": "test commit message", + "jsonObject": { "author": { - "name": "Test Author", - "email": "test@example.com", - "date": "2021-01-01T01:01:01+01:00" - } - } - }, - "avroSchema": { - "type": "record", - "name": "schema_with_the_same_object_names", - "namespace": "namespace16", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "_airbyte_avro_namespace.author_1", - "fields": [ - { - "name": "login", - "type": ["null", "string"], - "default": null - }, - { - "name": "id", - "type": ["null", "int"], - "default": null - }, - { - "name": "node_id", - "type": ["null", "string"], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null + "login": "test", + "id": 12345, + "node_id": "abc123" }, - { - "name": "commit", - "type": [ - "null", - { - "type": "record", - "name": "commit", - "namespace": "_airbyte_avro_namespace.commit_1", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "_airbyte_avro_namespace.author_2", - "fields": [ - { - "name": "name", - "type": ["null", "string"], - "default": null - }, - { - "name": "email", - "type": ["null", "string"], - "default": null - }, - { - "name": "date", - "type": [ - "null", - { - "type": "long", - "logicalType": "timestamp-micros" - }, - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "message", - "type": ["null", "string"], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null + "commit": { + "message": "test commit message", + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": "2021-01-01T01:01:01+01:00" + } } - ] - }, - "avroObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123", - "_airbyte_additional_properties": null }, - "commit": { + "avroSchema": { + "type": "record", + "name": "schema_with_the_same_object_names", + "namespace": "namespace16", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_1", + "fields": [ + { + "name": "login", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "id", + "type": [ + "null", + "int" + ], + "default": null + }, + { + "name": "node_id", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "commit", + "type": [ + "null", + { + "type": "record", + "name": "commit", + "namespace": "_airbyte_avro_namespace.commit_1", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "_airbyte_avro_namespace.author_2", + "fields": [ + { + "name": "name", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "email", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "date", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + }, + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "message", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + }, + "avroObject": { "author": { - "name": "Test Author", - "email": "test@example.com", - "date": 1609459261000000, + "login": "test", + "id": 12345, + "node_id": "abc123", + "_airbyte_additional_properties": null + }, + "commit": { + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": 1609459261000000, + "_airbyte_additional_properties": null + }, + "message": "test commit message", "_airbyte_additional_properties": null }, - "message": "test commit message", "_airbyte_additional_properties": null - }, - "_airbyte_additional_properties": null + } } -} ] From 1aed97431c393bed16304792b85934058700f16b Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Thu, 16 Dec 2021 14:48:38 +0200 Subject: [PATCH 09/20] refactoring --- .../destination/s3/avro/AvroConstants.java | 1 - .../s3/avro/AvroNameTransformer.java | 20 +- .../s3/avro/JsonToAvroSchemaConverter.java | 496 +++++++------- .../destination/s3/util/AvroRecordHelper.java | 24 + .../json_conversion_test_cases.json | 623 ++++++++++-------- .../type_conversion_test_cases.json | 61 +- 6 files changed, 603 insertions(+), 622 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java index f47e79979bce..50b9012fbbd9 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroConstants.java @@ -14,7 +14,6 @@ public class AvroConstants { public static final String DOC_KEY_ORIGINAL_NAME = "_airbyte_original_name"; public static final String AVRO_EXTRA_PROPS_FIELD = "_airbyte_additional_properties"; - public static final String AVRO_RECORD_NAMESPACE = "_airbyte_avro_namespace"; // This set must include _ab_additional_col in source_s3/source_files_abstract/stream.py public static final Set JSON_EXTRA_PROPS_FIELDS = Set.of("_ab_additional_properties", AVRO_EXTRA_PROPS_FIELD); public static final AvroNameTransformer NAME_TRANSFORMER = new AvroNameTransformer(); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index d781913ca3b1..354d971733bb 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -4,11 +4,10 @@ package io.airbyte.integrations.destination.s3.avro; -import static io.airbyte.integrations.destination.s3.avro.AvroConstants.AVRO_RECORD_NAMESPACE; - import io.airbyte.integrations.destination.ExtendedNameTransformer; -import java.util.Collections; -import java.util.List; + +import java.util.Arrays; +import java.util.stream.Collectors; public class AvroNameTransformer extends ExtendedNameTransformer { @@ -30,12 +29,13 @@ private String checkFirsCharInStreamName(final String name) { } } - public String resolveNamespace(String fieldName, List recordFieldNames) { - return AVRO_RECORD_NAMESPACE - .concat(".") - .concat(fieldName) - .concat("_") - .concat(String.valueOf(Collections.frequency(recordFieldNames, fieldName))); + public static String resolveNamespace(String fullPathToNode) { + return fullPathToNode==null ? null : Arrays.stream(fullPathToNode.split("/")) + .filter(key -> !key.isBlank()) + .filter(key -> !key.equals("items")) + .filter(key -> !key.equals("properties")) + .filter(key -> !key.equals("format")) + .collect(Collectors.joining(".")); } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 283363368e8b..3ad1f525692a 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -6,16 +6,9 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.fasterxml.jackson.databind.node.ValueNode; import com.google.common.base.Preconditions; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; - -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import javax.annotation.Nullable; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; @@ -24,6 +17,19 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; +import javax.annotation.Nullable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static io.airbyte.integrations.destination.s3.avro.AvroNameTransformer.resolveNamespace; +import static io.airbyte.integrations.destination.s3.util.AvroRecordHelper.obtainPaths; + /** * The main function of this class is to convert a JsonSchema to Avro schema. It can also * standardize schema names, and keep track of a mapping from the original names to the standardized @@ -35,291 +41,239 @@ */ public class JsonToAvroSchemaConverter { - private static final Schema UUID_SCHEMA = LogicalTypes.uuid() - .addToSchema(Schema.create(Schema.Type.STRING)); - private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL); - private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); - private static final Logger LOGGER = LoggerFactory.getLogger(JsonToAvroSchemaConverter.class); - private static final Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() - .addToSchema(Schema.create(Schema.Type.LONG)); - - private final Map standardizedNames = new HashMap<>(); - private final List recordFieldNames = new ArrayList<>(); - Map jsonNodeStringMap = new HashMap<>(); - - static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { - return getTypes(fieldName, fieldDefinition).stream() - .filter(type -> type != JsonSchemaType.NULL).collect(Collectors.toList()); - } - - static List getTypes(final String fieldName, final JsonNode fieldDefinition) { - final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); - if (combinedRestriction.isPresent()) { - return Collections.singletonList(JsonSchemaType.COMBINED); - } + private static final Schema UUID_SCHEMA = LogicalTypes.uuid() + .addToSchema(Schema.create(Schema.Type.STRING)); + private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL); + private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); + private static final Logger LOGGER = LoggerFactory.getLogger(JsonToAvroSchemaConverter.class); + private static final Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() + .addToSchema(Schema.create(Schema.Type.LONG)); - final JsonNode typeProperty = fieldDefinition.get("type"); - if (typeProperty == null || typeProperty.isNull()) { - throw new IllegalStateException(String.format("Field %s has no type", fieldName)); - } + private final Map standardizedNames = new HashMap<>(); + private final Map jsonNodePathMap = new HashMap<>(); - if (typeProperty.isArray()) { - return MoreIterators.toList(typeProperty.elements()).stream() - .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) - .collect(Collectors.toList()); + static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { + return getTypes(fieldName, fieldDefinition).stream() + .filter(type -> type != JsonSchemaType.NULL).collect(Collectors.toList()); } - if (typeProperty.isTextual()) { - return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText())); - } + static List getTypes(final String fieldName, final JsonNode fieldDefinition) { + final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); + if (combinedRestriction.isPresent()) { + return Collections.singletonList(JsonSchemaType.COMBINED); + } - throw new IllegalStateException("Unexpected type: " + typeProperty); - } + final JsonNode typeProperty = fieldDefinition.get("type"); + if (typeProperty == null || typeProperty.isNull()) { + throw new IllegalStateException(String.format("Field %s has no type", fieldName)); + } - static Optional getCombinedRestriction(final JsonNode fieldDefinition) { - if (fieldDefinition.has("anyOf")) { - return Optional.of(fieldDefinition.get("anyOf")); - } - if (fieldDefinition.has("allOf")) { - return Optional.of(fieldDefinition.get("allOf")); - } - if (fieldDefinition.has("oneOf")) { - return Optional.of(fieldDefinition.get("oneOf")); - } - return Optional.empty(); - } - - public Map getStandardizedNames() { - return standardizedNames; - } - - /** - * @return - Avro schema based on the input {@code jsonSchema}. - */ - public Schema getAvroSchema(final JsonNode jsonSchema, - final String name, - @Nullable final String namespace, - final boolean appendAirbyteFields, - final boolean rootNode) { - final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); - RecordBuilder builder = SchemaBuilder.record(stdName); - if(rootNode) { - Map schemaMap = new HashMap<>(); - addKeys("", jsonSchema, schemaMap, jsonNodeStringMap); - } - if (!stdName.equals(name)) { - standardizedNames.put(name, stdName); - LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, - stdName); - builder = builder.doc( - String.format("%s%s%s", - AvroConstants.DOC_KEY_ORIGINAL_NAME, - AvroConstants.DOC_KEY_VALUE_DELIMITER, - name)); - } - if (namespace != null) { - builder = builder.namespace(namespace); - } + if (typeProperty.isArray()) { + return MoreIterators.toList(typeProperty.elements()).stream() + .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) + .collect(Collectors.toList()); + } - final JsonNode properties = jsonSchema.get("properties"); - // object field with no "properties" will be handled by the default additional properties - // field during object conversion; so it is fine if there is no "properties" - final List fieldNames = properties == null - ? Collections.emptyList() - : new ArrayList<>(MoreIterators.toList(properties.fieldNames())); + if (typeProperty.isTextual()) { + return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText())); + } - SchemaBuilder.FieldAssembler assembler = builder.fields(); + throw new IllegalStateException("Unexpected type: " + typeProperty); + } - if (appendAirbyteFields) { - assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_AB_ID).type(UUID_SCHEMA).noDefault(); - assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_EMITTED_AT) - .type(TIMESTAMP_MILLIS_SCHEMA).noDefault(); + static Optional getCombinedRestriction(final JsonNode fieldDefinition) { + if (fieldDefinition.has("anyOf")) { + return Optional.of(fieldDefinition.get("anyOf")); + } + if (fieldDefinition.has("allOf")) { + return Optional.of(fieldDefinition.get("allOf")); + } + if (fieldDefinition.has("oneOf")) { + return Optional.of(fieldDefinition.get("oneOf")); + } + return Optional.empty(); } - for (final String fieldName : fieldNames) { - // ignore additional properties fields, which will be consolidated - // into one field at the end - if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { - continue; - } - - final String stdFieldName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); - final JsonNode fieldDefinition = properties.get(fieldName); - SchemaBuilder.FieldBuilder fieldBuilder = assembler.name(stdFieldName); - if (!stdFieldName.equals(fieldName)) { - standardizedNames.put(fieldName, stdFieldName); - LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}", - fieldName, stdFieldName); - fieldBuilder = fieldBuilder.doc(String.format("%s%s%s", - AvroConstants.DOC_KEY_ORIGINAL_NAME, - AvroConstants.DOC_KEY_VALUE_DELIMITER, - fieldName)); - } - - assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) - .withDefault(null); + public Map getStandardizedNames() { + return standardizedNames; } - // support additional properties in one field - assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) - .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); + /** + * @return - Avro schema based on the input {@code jsonSchema}. + */ + public Schema getAvroSchema(final JsonNode jsonSchema, + final String name, + @Nullable final String namespace, + final boolean appendAirbyteFields, + final boolean rootNode) { + final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); + RecordBuilder builder = SchemaBuilder.record(stdName); + if (rootNode) { + obtainPaths("", jsonSchema, jsonNodePathMap); + } + if (!stdName.equals(name)) { + standardizedNames.put(name, stdName); + LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, + stdName); + builder = builder.doc( + String.format("%s%s%s", + AvroConstants.DOC_KEY_ORIGINAL_NAME, + AvroConstants.DOC_KEY_VALUE_DELIMITER, + name)); + } + if (namespace != null) { + builder = builder.namespace(namespace); + } - return assembler.endRecord(); - } + final JsonNode properties = jsonSchema.get("properties"); + // object field with no "properties" will be handled by the default additional properties + // field during object conversion; so it is fine if there is no "properties" + final List fieldNames = properties == null + ? Collections.emptyList() + : new ArrayList<>(MoreIterators.toList(properties.fieldNames())); - Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition) { - Preconditions - .checkState(fieldType != JsonSchemaType.NULL, "Null types should have been filtered out"); + SchemaBuilder.FieldAssembler assembler = builder.fields(); - // the additional properties fields are filtered out and never passed into this method; - // but this method is able to handle them for completeness - if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { - return AdditionalPropertyField.FIELD_SCHEMA; - } + if (appendAirbyteFields) { + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_AB_ID).type(UUID_SCHEMA).noDefault(); + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_EMITTED_AT) + .type(TIMESTAMP_MILLIS_SCHEMA).noDefault(); + } - final Schema fieldSchema; - switch (fieldType) { - case NUMBER, INTEGER, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType()); - case STRING -> { - if (fieldDefinition.has("format")) { - String format = fieldDefinition.get("format").asText(); - fieldSchema = switch (format) { - case "date-time" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); - case "date" -> LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); - case "time" -> LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); - default -> Schema.create(fieldType.getAvroType()); - }; - } else { - fieldSchema = Schema.create(fieldType.getAvroType()); + for (final String fieldName : fieldNames) { + // ignore additional properties fields, which will be consolidated + // into one field at the end + if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { + continue; + } + + final String stdFieldName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); + final JsonNode fieldDefinition = properties.get(fieldName); + SchemaBuilder.FieldBuilder fieldBuilder = assembler.name(stdFieldName); + if (!stdFieldName.equals(fieldName)) { + standardizedNames.put(fieldName, stdFieldName); + LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}", + fieldName, stdFieldName); + fieldBuilder = fieldBuilder.doc(String.format("%s%s%s", + AvroConstants.DOC_KEY_ORIGINAL_NAME, + AvroConstants.DOC_KEY_VALUE_DELIMITER, + fieldName)); + } + + assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) + .withDefault(null); } - } - case COMBINED -> { - final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); - final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get()); - fieldSchema = Schema.createUnion(unionTypes); - } - case ARRAY -> { - final JsonNode items = fieldDefinition.get("items"); - Preconditions.checkNotNull(items, "Array field %s misses the items property.", fieldName); - - if (items.isObject()) { - fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items)); - } else if (items.isArray()) { - final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items); - arrayElementTypes.add(0, NULL_SCHEMA); - fieldSchema = Schema.createArray(Schema.createUnion(arrayElementTypes)); - } else { - throw new IllegalStateException( - String.format("Array field %s has invalid items property: %s", fieldName, items)); + + // support additional properties in one field + assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) + .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); + + return assembler.endRecord(); + } + + Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition) { + Preconditions + .checkState(fieldType != JsonSchemaType.NULL, "Null types should have been filtered out"); + + // the additional properties fields are filtered out and never passed into this method; + // but this method is able to handle them for completeness + if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { + return AdditionalPropertyField.FIELD_SCHEMA; } - } - case OBJECT -> { - final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); - String s = jsonNodeStringMap.get(fieldDefinition); - String path = null; - if(s!=null){ - path = Arrays.stream(s.split("/")) - .filter(key -> !key.isBlank()) - .filter(key -> !key.equals("items")) - .filter(key -> !key.equals("properties")) - .filter(key -> !key.equals("format")) - .collect(Collectors.joining(".")); -// .collect(Collectors.toList()); - int i =9; + + final Schema fieldSchema; + switch (fieldType) { + case NUMBER, INTEGER, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType()); + case STRING -> { + if (fieldDefinition.has("format")) { + String format = fieldDefinition.get("format").asText(); + fieldSchema = switch (format) { + case "date-time" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + case "date" -> LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); + case "time" -> LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); + default -> Schema.create(fieldType.getAvroType()); + }; + } else { + fieldSchema = Schema.create(fieldType.getAvroType()); + } + } + case COMBINED -> { + final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); + final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get()); + fieldSchema = Schema.createUnion(unionTypes); + } + case ARRAY -> { + final JsonNode items = fieldDefinition.get("items"); + Preconditions.checkNotNull(items, "Array field %s misses the items property.", fieldName); + + if (items.isObject()) { + fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items)); + } else if (items.isArray()) { + final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items); + arrayElementTypes.add(0, NULL_SCHEMA); + fieldSchema = Schema.createArray(Schema.createUnion(arrayElementTypes)); + } else { + throw new IllegalStateException( + String.format("Array field %s has invalid items property: %s", fieldName, items)); + } + } + case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, resolveNamespace(jsonNodePathMap.get(fieldDefinition)), false, false); + default -> throw new IllegalStateException( + String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } - recordFieldNames.add(stdName); - fieldSchema = getAvroSchema(fieldDefinition, fieldName, - path, false,false); - } - default -> throw new IllegalStateException( - String.format("Unexpected type for field %s: %s", fieldName, fieldType)); + return fieldSchema; } - return fieldSchema; - } - - List getSchemasFromTypes(final String fieldName, final ArrayNode types) { - return MoreIterators.toList(types.elements()) - .stream() - .flatMap(definition -> getNonNullTypes(fieldName, definition).stream().flatMap(type -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition); - if (singleFieldSchema.isUnion()) { - return singleFieldSchema.getTypes().stream(); - } else { - return Stream.of(singleFieldSchema); - } - })) - .distinct() - .collect(Collectors.toList()); - } - - /** - * @param fieldDefinition - Json schema field definition. E.g. { type: "number" }. - */ - Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition) { - // Filter out null types, which will be added back in the end. - final List nonNullFieldTypes = getNonNullTypes(fieldName, fieldDefinition) - .stream() - .flatMap(fieldType -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition); - if (singleFieldSchema.isUnion()) { - return singleFieldSchema.getTypes().stream(); - } else { - return Stream.of(singleFieldSchema); - } - }) - .distinct() - .collect(Collectors.toList()); - - if (nonNullFieldTypes.isEmpty()) { - return Schema.create(Schema.Type.NULL); - } else { - // Mark every field as nullable to prevent missing value exceptions from Avro / Parquet. - if (!nonNullFieldTypes.contains(NULL_SCHEMA)) { - nonNullFieldTypes.add(0, NULL_SCHEMA); - } - // Logical types are converted to a union of logical type itself and string. The purpose is to - // default the logical type field to a string, if the value of the logical type field is invalid and - // cannot be properly processed. - if ((nonNullFieldTypes - .stream().anyMatch(schema -> schema.getLogicalType() != null)) && - (!nonNullFieldTypes.contains(STRING_SCHEMA))) { - nonNullFieldTypes.add(STRING_SCHEMA); - } - return Schema.createUnion(nonNullFieldTypes); + + List getSchemasFromTypes(final String fieldName, final ArrayNode types) { + return MoreIterators.toList(types.elements()) + .stream() + .flatMap(definition -> getNonNullTypes(fieldName, definition).stream().flatMap(type -> { + final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition); + if (singleFieldSchema.isUnion()) { + return singleFieldSchema.getTypes().stream(); + } else { + return Stream.of(singleFieldSchema); + } + })) + .distinct() + .collect(Collectors.toList()); } - } - - private static void addKeys(String currentPath, JsonNode jsonNode, Map map, Map jsonNodeStringMap) { - if (jsonNode.isObject()) { - ObjectNode objectNode = (ObjectNode) jsonNode; - Iterator> iter = objectNode.fields(); - String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; - jsonNodeStringMap.put(jsonNode,pathPrefix); - while (iter.hasNext()) { - Map.Entry entry = iter.next(); - addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, jsonNodeStringMap); - } - } else if (jsonNode.isArray()) { - ArrayNode arrayNode = (ArrayNode) jsonNode; - - for (int i = 0; i < arrayNode.size(); i++) { - String arrayPath = currentPath + "/" + i; - addKeys(arrayPath, arrayNode.get(i), map,jsonNodeStringMap); - } - - } else if (jsonNode.isValueNode()) { - ValueNode valueNode = (ValueNode) jsonNode; -// if (jsonSchema) { -// if (schemaContainsProperties(currentPath, valueNode, "format", List.of("date", "date-time", "time"))) { - map.put("/" + currentPath, valueNode.asText()); -// } -// } else { -// String value = valueNode.asText(); -// if (!value.equals("null") && !value.isBlank() && !Boolean.parseBoolean(value)) { -// map.put("/" + currentPath, value); -// } - } -// } - } + /** + * @param fieldDefinition - Json schema field definition. E.g. { type: "number" }. + */ + Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition) { + // Filter out null types, which will be added back in the end. + final List nonNullFieldTypes = getNonNullTypes(fieldName, fieldDefinition) + .stream() + .flatMap(fieldType -> { + final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition); + if (singleFieldSchema.isUnion()) { + return singleFieldSchema.getTypes().stream(); + } else { + return Stream.of(singleFieldSchema); + } + }) + .distinct() + .collect(Collectors.toList()); + + if (nonNullFieldTypes.isEmpty()) { + return Schema.create(Schema.Type.NULL); + } else { + // Mark every field as nullable to prevent missing value exceptions from Avro / Parquet. + if (!nonNullFieldTypes.contains(NULL_SCHEMA)) { + nonNullFieldTypes.add(0, NULL_SCHEMA); + } + // Logical types are converted to a union of logical type itself and string. The purpose is to + // default the logical type field to a string, if the value of the logical type field is invalid and + // cannot be properly processed. + if ((nonNullFieldTypes + .stream().anyMatch(schema -> schema.getLogicalType() != null)) && + (!nonNullFieldTypes.contains(STRING_SCHEMA))) { + nonNullFieldTypes.add(STRING_SCHEMA); + } + return Schema.createUnion(nonNullFieldTypes); + } + } } + diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java index 33d32ed742f3..0affef6cd3a0 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java @@ -5,12 +5,16 @@ package io.airbyte.integrations.destination.s3.util; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; +import java.util.Iterator; +import java.util.Map; + /** * Helper methods for unit tests. This is needed by multiple modules, so it is in the src directory. */ @@ -47,4 +51,24 @@ public static JsonNode pruneAirbyteJson(final JsonNode input) { return output; } + public static void obtainPaths(String currentPath, JsonNode jsonNode, Map jsonNodeStringMap) { + if (jsonNode.isObject()) { + ObjectNode objectNode = (ObjectNode) jsonNode; + Iterator> iter = objectNode.fields(); + String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; + jsonNodeStringMap.put(jsonNode,pathPrefix); + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + obtainPaths(pathPrefix + entry.getKey(), entry.getValue(), jsonNodeStringMap); + } + } else if (jsonNode.isArray()) { + ArrayNode arrayNode = (ArrayNode) jsonNode; + + for (int i = 0; i < arrayNode.size(); i++) { + String arrayPath = currentPath + "/" + i; + obtainPaths(arrayPath, arrayNode.get(i),jsonNodeStringMap); + } + } + } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index d23ed51743b9..e5ba714240ab 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -1,284 +1,4 @@ -[{ - "schemaName": "schema_with_the_same_object_names_and_inner", - "namespace": "namespace17", - "appendAirbyteFields": false, - "jsonSchema": { - "type": "object", - "properties": { - "author": { - "type": "object", - "properties": { - "login": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - } - } - }, - "commit": { - "type": [ - "null", - "object" - ], - "properties": { - "author": { - "type": [ - "null", - "object" - ], - "properties": { - "name": { - "type": [ - "null", - "string" - ] - }, - "email": { - "type": [ - "null", - "string" - ] - }, - "date": { - "type": [ - "null", - "string" - ], - "format": "date-time" - }, - "pr": { - "type": [ - "null", - "object" - ], - "properties": { - "test1": { - "type": [ - "null", - "string" - ] - }, - "test2": { - "type": [ - "null", - "string" - ] - } - } - } - } - }, - "message": { - "type": [ - "null", - "string" - ] - } - } - } - } - }, - "jsonObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123" - }, - "commit": { - "message": "test commit message", - "author": { - "name": "Test Author", - "email": "test@example.com", - "date": "2021-01-01T01:01:01+01:00" - } - } - }, - "avroSchema": { - "type": "record", - "name": "schema_with_the_same_object_names", - "namespace": "namespace16", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "_airbyte_avro_namespace.author_1", - "fields": [ - { - "name": "login", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "id", - "type": [ - "null", - "int" - ], - "default": null - }, - { - "name": "node_id", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "commit", - "type": [ - "null", - { - "type": "record", - "name": "commit", - "namespace": "_airbyte_avro_namespace.commit_1", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "_airbyte_avro_namespace.author_2", - "fields": [ - { - "name": "name", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "email", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "date", - "type": [ - "null", - { - "type": "long", - "logicalType": "timestamp-micros" - }, - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "message", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - }, - "avroObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123", - "_airbyte_additional_properties": null - }, - "commit": { - "author": { - "name": "Test Author", - "email": "test@example.com", - "date": 1609459261000000, - "_airbyte_additional_properties": null - }, - "message": "test commit message", - "_airbyte_additional_properties": null - }, - "_airbyte_additional_properties": null - } -}, +[ { "schemaName": "simple_schema", "namespace": "namespace1", @@ -384,7 +104,7 @@ { "type": "record", "name": "user", - "namespace": "_airbyte_avro_namespace.user_1", + "namespace": "user", "fields": [ { "name": "first_name", @@ -860,7 +580,7 @@ { "type": "record", "name": "user", - "namespace": "_airbyte_avro_namespace.user_2", + "namespace": "user", "fields": [ { "name": "created_at", @@ -1389,7 +1109,7 @@ { "type": "record", "name": "author", - "namespace": "_airbyte_avro_namespace.author_1", + "namespace": "author", "fields": [ { "name": "login", @@ -1438,7 +1158,7 @@ { "type": "record", "name": "commit", - "namespace": "_airbyte_avro_namespace.commit_1", + "namespace": "commit", "fields": [ { "name": "author", @@ -1447,7 +1167,7 @@ { "type": "record", "name": "author", - "namespace": "_airbyte_avro_namespace.author_2", + "namespace": "commit.author", "fields": [ { "name": "name", @@ -1549,5 +1269,336 @@ }, "_airbyte_additional_properties": null } + }, + { + "schemaName": "schema_with_the_same_object_names_and_inner_object", + "namespace": "namespace17", + "appendAirbyteFields": false, + "jsonSchema": { + "type": "object", + "properties": { + "author": { + "type": "object", + "properties": { + "login": { + "type": [ + "null", + "string" + ] + }, + "id": { + "type": [ + "null", + "integer" + ] + }, + "node_id": { + "type": [ + "null", + "string" + ] + } + } + }, + "commit": { + "type": [ + "null", + "object" + ], + "properties": { + "author": { + "type": [ + "null", + "object" + ], + "properties": { + "name": { + "type": [ + "null", + "string" + ] + }, + "email": { + "type": [ + "null", + "string" + ] + }, + "date": { + "type": [ + "null", + "string" + ], + "format": "date-time" + }, + "pr": { + "type": [ + "null", + "object" + ], + "properties": { + "id": { + "type": [ + "null", + "string" + ] + }, + "title": { + "type": [ + "null", + "string" + ] + } + } + } + } + }, + "message": { + "type": [ + "null", + "string" + ] + } + } + } + } + }, + "jsonObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123" + }, + "commit": { + "message": "test commit message", + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": "2021-01-01T01:01:01+01:00", + "pr": { + "id": "random id", + "title": "Conversion test" + } + } + } + }, + "avroSchema": { + "type": "record", + "name": "schema_with_the_same_object_names_and_inner_object", + "namespace": "namespace17", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "author", + "fields": [ + { + "name": "login", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "id", + "type": [ + "null", + "int" + ], + "default": null + }, + { + "name": "node_id", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "commit", + "type": [ + "null", + { + "type": "record", + "name": "commit", + "namespace": "commit", + "fields": [ + { + "name": "author", + "type": [ + "null", + { + "type": "record", + "name": "author", + "namespace": "commit.author", + "fields": [ + { + "name": "name", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "email", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "date", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + }, + "string" + ], + "default": null + }, + { + "name": "pr", + "type": [ + "null", + { + "type": "record", + "name": "pr", + "namespace": "commit.author.pr", + "fields": [ + { + "name": "id", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "title", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "message", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "_airbyte_additional_properties", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ], + "default": null + } + ] + }, + "avroObject": { + "author": { + "login": "test", + "id": 12345, + "node_id": "abc123", + "_airbyte_additional_properties": null + }, + "commit": { + "author": { + "name": "Test Author", + "email": "test@example.com", + "date": 1609459261000000, + "pr": { + "id": "random id", + "title": "Conversion test", + "_airbyte_additional_properties": null + }, + "_airbyte_additional_properties": null + }, + "message": "test commit message", + "_airbyte_additional_properties": null + }, + "_airbyte_additional_properties": null + } } ] diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json index 1673d8951c4d..5881a5332c22 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/type_conversion_test_cases.json @@ -89,7 +89,6 @@ { "type": "record", "name": "object_field", - "namespace": "_airbyte_avro_namespace.object_field_1", "fields": [ { "name": "id", @@ -103,13 +102,7 @@ }, { "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], + "type": ["null", { "type": "map", "values": "string" }], "default": null } ] @@ -126,17 +119,10 @@ { "type": "record", "name": "object_field_without_properties", - "namespace": "_airbyte_avro_namespace.object_field_without_properties_1", "fields": [ { "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], + "type": ["null", { "type": "map", "values": "string" }], "default": null } ] @@ -148,66 +134,33 @@ "jsonFieldSchema": { "type": "object" }, - "avroFieldType": [ - "null", - { - "type": "map", - "values": "string" - } - ] + "avroFieldType": ["null", { "type": "map", "values": "string" }] }, { "fieldName": "_ab_additional_properties", "jsonFieldSchema": { "type": "object" }, - "avroFieldType": [ - "null", - { - "type": "map", - "values": "string" - } - ] + "avroFieldType": ["null", { "type": "map", "values": "string" }] }, { "fieldName": "any_of_field", "jsonFieldSchema": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] + "anyOf": [{ "type": "string" }, { "type": "integer" }] }, "avroFieldType": ["null", "string", "int"] }, { "fieldName": "all_of_field", "jsonFieldSchema": { - "allOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] + "allOf": [{ "type": "string" }, { "type": "integer" }] }, "avroFieldType": ["null", "string", "int"] }, { "fieldName": "one_of_field", "jsonFieldSchema": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - } - ] + "oneOf": [{ "type": "string" }, { "type": "integer" }] }, "avroFieldType": ["null", "string", "int"] }, From 5c5bbb119b816128aa9bcc1abba259968a8e8901 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Thu, 16 Dec 2021 14:57:56 +0200 Subject: [PATCH 10/20] format code --- .../s3/avro/JsonToAvroSchemaConverter.java | 430 +++++++++--------- 1 file changed, 214 insertions(+), 216 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 3ad1f525692a..00bf8f93d9cb 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -9,15 +9,6 @@ import com.google.common.base.Preconditions; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.SchemaBuilder.RecordBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; - -import javax.annotation.Nullable; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -26,6 +17,14 @@ import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; +import javax.annotation.Nullable; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaBuilder.RecordBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; import static io.airbyte.integrations.destination.s3.avro.AvroNameTransformer.resolveNamespace; import static io.airbyte.integrations.destination.s3.util.AvroRecordHelper.obtainPaths; @@ -41,239 +40,238 @@ */ public class JsonToAvroSchemaConverter { - private static final Schema UUID_SCHEMA = LogicalTypes.uuid() - .addToSchema(Schema.create(Schema.Type.STRING)); - private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL); - private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); - private static final Logger LOGGER = LoggerFactory.getLogger(JsonToAvroSchemaConverter.class); - private static final Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() - .addToSchema(Schema.create(Schema.Type.LONG)); + private static final Schema UUID_SCHEMA = LogicalTypes.uuid() + .addToSchema(Schema.create(Schema.Type.STRING)); + private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL); + private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); + private static final Logger LOGGER = LoggerFactory.getLogger(JsonToAvroSchemaConverter.class); + private static final Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() + .addToSchema(Schema.create(Schema.Type.LONG)); private final Map standardizedNames = new HashMap<>(); private final Map jsonNodePathMap = new HashMap<>(); - static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { - return getTypes(fieldName, fieldDefinition).stream() - .filter(type -> type != JsonSchemaType.NULL).collect(Collectors.toList()); + static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { + return getTypes(fieldName, fieldDefinition).stream() + .filter(type -> type != JsonSchemaType.NULL).collect(Collectors.toList()); + } + + static List getTypes(final String fieldName, final JsonNode fieldDefinition) { + final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); + if (combinedRestriction.isPresent()) { + return Collections.singletonList(JsonSchemaType.COMBINED); } - static List getTypes(final String fieldName, final JsonNode fieldDefinition) { - final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); - if (combinedRestriction.isPresent()) { - return Collections.singletonList(JsonSchemaType.COMBINED); - } + final JsonNode typeProperty = fieldDefinition.get("type"); + if (typeProperty == null || typeProperty.isNull()) { + throw new IllegalStateException(String.format("Field %s has no type", fieldName)); + } - final JsonNode typeProperty = fieldDefinition.get("type"); - if (typeProperty == null || typeProperty.isNull()) { - throw new IllegalStateException(String.format("Field %s has no type", fieldName)); - } + if (typeProperty.isArray()) { + return MoreIterators.toList(typeProperty.elements()).stream() + .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) + .collect(Collectors.toList()); + } - if (typeProperty.isArray()) { - return MoreIterators.toList(typeProperty.elements()).stream() - .map(s -> JsonSchemaType.fromJsonSchemaType(s.asText())) - .collect(Collectors.toList()); - } + if (typeProperty.isTextual()) { + return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText())); + } - if (typeProperty.isTextual()) { - return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText())); - } + throw new IllegalStateException("Unexpected type: " + typeProperty); + } - throw new IllegalStateException("Unexpected type: " + typeProperty); + static Optional getCombinedRestriction(final JsonNode fieldDefinition) { + if (fieldDefinition.has("anyOf")) { + return Optional.of(fieldDefinition.get("anyOf")); } - - static Optional getCombinedRestriction(final JsonNode fieldDefinition) { - if (fieldDefinition.has("anyOf")) { - return Optional.of(fieldDefinition.get("anyOf")); - } - if (fieldDefinition.has("allOf")) { - return Optional.of(fieldDefinition.get("allOf")); - } - if (fieldDefinition.has("oneOf")) { - return Optional.of(fieldDefinition.get("oneOf")); - } - return Optional.empty(); + if (fieldDefinition.has("allOf")) { + return Optional.of(fieldDefinition.get("allOf")); } - - public Map getStandardizedNames() { - return standardizedNames; + if (fieldDefinition.has("oneOf")) { + return Optional.of(fieldDefinition.get("oneOf")); + } + return Optional.empty(); + } + + public Map getStandardizedNames() { + return standardizedNames; + } + + /** + * @return - Avro schema based on the input {@code jsonSchema}. + */ + public Schema getAvroSchema(final JsonNode jsonSchema, + final String name, + @Nullable final String namespace, + final boolean appendAirbyteFields, + final boolean isRootNode) { + final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); + RecordBuilder builder = SchemaBuilder.record(stdName); + if (isRootNode) { + obtainPaths("", jsonSchema, jsonNodePathMap); + } + if (!stdName.equals(name)) { + standardizedNames.put(name, stdName); + LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, + stdName); + builder = builder.doc( + String.format("%s%s%s", + AvroConstants.DOC_KEY_ORIGINAL_NAME, + AvroConstants.DOC_KEY_VALUE_DELIMITER, + name)); + } + if (namespace != null) { + builder = builder.namespace(namespace); } - /** - * @return - Avro schema based on the input {@code jsonSchema}. - */ - public Schema getAvroSchema(final JsonNode jsonSchema, - final String name, - @Nullable final String namespace, - final boolean appendAirbyteFields, - final boolean rootNode) { - final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); - RecordBuilder builder = SchemaBuilder.record(stdName); - if (rootNode) { - obtainPaths("", jsonSchema, jsonNodePathMap); - } - if (!stdName.equals(name)) { - standardizedNames.put(name, stdName); - LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, - stdName); - builder = builder.doc( - String.format("%s%s%s", - AvroConstants.DOC_KEY_ORIGINAL_NAME, - AvroConstants.DOC_KEY_VALUE_DELIMITER, - name)); - } - if (namespace != null) { - builder = builder.namespace(namespace); - } - - final JsonNode properties = jsonSchema.get("properties"); - // object field with no "properties" will be handled by the default additional properties - // field during object conversion; so it is fine if there is no "properties" - final List fieldNames = properties == null - ? Collections.emptyList() - : new ArrayList<>(MoreIterators.toList(properties.fieldNames())); + final JsonNode properties = jsonSchema.get("properties"); + // object field with no "properties" will be handled by the default additional properties + // field during object conversion; so it is fine if there is no "properties" + final List fieldNames = properties == null + ? Collections.emptyList() + : new ArrayList<>(MoreIterators.toList(properties.fieldNames())); - SchemaBuilder.FieldAssembler assembler = builder.fields(); + SchemaBuilder.FieldAssembler assembler = builder.fields(); - if (appendAirbyteFields) { - assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_AB_ID).type(UUID_SCHEMA).noDefault(); - assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_EMITTED_AT) - .type(TIMESTAMP_MILLIS_SCHEMA).noDefault(); - } + if (appendAirbyteFields) { + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_AB_ID).type(UUID_SCHEMA).noDefault(); + assembler = assembler.name(JavaBaseConstants.COLUMN_NAME_EMITTED_AT) + .type(TIMESTAMP_MILLIS_SCHEMA).noDefault(); + } - for (final String fieldName : fieldNames) { - // ignore additional properties fields, which will be consolidated - // into one field at the end - if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { - continue; - } + for (final String fieldName : fieldNames) { + // ignore additional properties fields, which will be consolidated + // into one field at the end + if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { + continue; + } + + final String stdFieldName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); + final JsonNode fieldDefinition = properties.get(fieldName); + SchemaBuilder.FieldBuilder fieldBuilder = assembler.name(stdFieldName); + if (!stdFieldName.equals(fieldName)) { + standardizedNames.put(fieldName, stdFieldName); + LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}", + fieldName, stdFieldName); + fieldBuilder = fieldBuilder.doc(String.format("%s%s%s", + AvroConstants.DOC_KEY_ORIGINAL_NAME, + AvroConstants.DOC_KEY_VALUE_DELIMITER, + fieldName)); + } + assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) + .withDefault(null); + } - final String stdFieldName = AvroConstants.NAME_TRANSFORMER.getIdentifier(fieldName); - final JsonNode fieldDefinition = properties.get(fieldName); - SchemaBuilder.FieldBuilder fieldBuilder = assembler.name(stdFieldName); - if (!stdFieldName.equals(fieldName)) { - standardizedNames.put(fieldName, stdFieldName); - LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}", - fieldName, stdFieldName); - fieldBuilder = fieldBuilder.doc(String.format("%s%s%s", - AvroConstants.DOC_KEY_ORIGINAL_NAME, - AvroConstants.DOC_KEY_VALUE_DELIMITER, - fieldName)); - } + // support additional properties in one field + assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) + .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); - assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) - .withDefault(null); - } + return assembler.endRecord(); + } - // support additional properties in one field - assembler = assembler.name(AvroConstants.AVRO_EXTRA_PROPS_FIELD) - .type(AdditionalPropertyField.FIELD_SCHEMA).withDefault(null); + Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition) { + Preconditions + .checkState(fieldType != JsonSchemaType.NULL, "Null types should have been filtered out"); - return assembler.endRecord(); + // the additional properties fields are filtered out and never passed into this method; + // but this method is able to handle them for completeness + if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { + return AdditionalPropertyField.FIELD_SCHEMA; } - Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType, final JsonNode fieldDefinition) { - Preconditions - .checkState(fieldType != JsonSchemaType.NULL, "Null types should have been filtered out"); - - // the additional properties fields are filtered out and never passed into this method; - // but this method is able to handle them for completeness - if (AvroConstants.JSON_EXTRA_PROPS_FIELDS.contains(fieldName)) { - return AdditionalPropertyField.FIELD_SCHEMA; + final Schema fieldSchema; + switch (fieldType) { + case NUMBER, INTEGER, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType()); + case STRING -> { + if (fieldDefinition.has("format")) { + String format = fieldDefinition.get("format").asText(); + fieldSchema = switch (format) { + case "date-time" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + case "date" -> LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); + case "time" -> LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); + default -> Schema.create(fieldType.getAvroType()); + }; + } else { + fieldSchema = Schema.create(fieldType.getAvroType()); } - - final Schema fieldSchema; - switch (fieldType) { - case NUMBER, INTEGER, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType()); - case STRING -> { - if (fieldDefinition.has("format")) { - String format = fieldDefinition.get("format").asText(); - fieldSchema = switch (format) { - case "date-time" -> LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); - case "date" -> LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); - case "time" -> LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); - default -> Schema.create(fieldType.getAvroType()); - }; - } else { - fieldSchema = Schema.create(fieldType.getAvroType()); - } - } - case COMBINED -> { - final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); - final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get()); - fieldSchema = Schema.createUnion(unionTypes); - } - case ARRAY -> { - final JsonNode items = fieldDefinition.get("items"); - Preconditions.checkNotNull(items, "Array field %s misses the items property.", fieldName); - - if (items.isObject()) { - fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items)); - } else if (items.isArray()) { - final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items); - arrayElementTypes.add(0, NULL_SCHEMA); - fieldSchema = Schema.createArray(Schema.createUnion(arrayElementTypes)); - } else { - throw new IllegalStateException( - String.format("Array field %s has invalid items property: %s", fieldName, items)); - } - } - case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, resolveNamespace(jsonNodePathMap.get(fieldDefinition)), false, false); - default -> throw new IllegalStateException( - String.format("Unexpected type for field %s: %s", fieldName, fieldType)); + } + case COMBINED -> { + final Optional combinedRestriction = getCombinedRestriction(fieldDefinition); + final List unionTypes = getSchemasFromTypes(fieldName, (ArrayNode) combinedRestriction.get()); + fieldSchema = Schema.createUnion(unionTypes); + } + case ARRAY -> { + final JsonNode items = fieldDefinition.get("items"); + Preconditions.checkNotNull(items, "Array field %s misses the items property.", fieldName); + + if (items.isObject()) { + fieldSchema = Schema.createArray(getNullableFieldTypes(String.format("%s.items", fieldName), items)); + } else if (items.isArray()) { + final List arrayElementTypes = getSchemasFromTypes(fieldName, (ArrayNode) items); + arrayElementTypes.add(0, NULL_SCHEMA); + fieldSchema = Schema.createArray(Schema.createUnion(arrayElementTypes)); + } else { + throw new IllegalStateException( + String.format("Array field %s has invalid items property: %s", fieldName, items)); } - return fieldSchema; + } + case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, resolveNamespace(jsonNodePathMap.get(fieldDefinition)), false, false); + default -> throw new IllegalStateException( + String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } - - List getSchemasFromTypes(final String fieldName, final ArrayNode types) { - return MoreIterators.toList(types.elements()) - .stream() - .flatMap(definition -> getNonNullTypes(fieldName, definition).stream().flatMap(type -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition); - if (singleFieldSchema.isUnion()) { - return singleFieldSchema.getTypes().stream(); - } else { - return Stream.of(singleFieldSchema); - } - })) - .distinct() - .collect(Collectors.toList()); + return fieldSchema; + } + + List getSchemasFromTypes(final String fieldName, final ArrayNode types) { + return MoreIterators.toList(types.elements()) + .stream() + .flatMap(definition -> getNonNullTypes(fieldName, definition).stream().flatMap(type -> { + final Schema singleFieldSchema = getSingleFieldType(fieldName, type, definition); + if (singleFieldSchema.isUnion()) { + return singleFieldSchema.getTypes().stream(); + } else { + return Stream.of(singleFieldSchema); + } + })) + .distinct() + .collect(Collectors.toList()); + } + + /** + * @param fieldDefinition - Json schema field definition. E.g. { type: "number" }. + */ + Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition) { + // Filter out null types, which will be added back in the end. + final List nonNullFieldTypes = getNonNullTypes(fieldName, fieldDefinition) + .stream() + .flatMap(fieldType -> { + final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition); + if (singleFieldSchema.isUnion()) { + return singleFieldSchema.getTypes().stream(); + } else { + return Stream.of(singleFieldSchema); + } + }) + .distinct() + .collect(Collectors.toList()); + + if (nonNullFieldTypes.isEmpty()) { + return Schema.create(Schema.Type.NULL); + } else { + // Mark every field as nullable to prevent missing value exceptions from Avro / Parquet. + if (!nonNullFieldTypes.contains(NULL_SCHEMA)) { + nonNullFieldTypes.add(0, NULL_SCHEMA); + } + // Logical types are converted to a union of logical type itself and string. The purpose is to + // default the logical type field to a string, if the value of the logical type field is invalid and + // cannot be properly processed. + if ((nonNullFieldTypes + .stream().anyMatch(schema -> schema.getLogicalType() != null)) && + (!nonNullFieldTypes.contains(STRING_SCHEMA))) { + nonNullFieldTypes.add(STRING_SCHEMA); + } + return Schema.createUnion(nonNullFieldTypes); } + } - /** - * @param fieldDefinition - Json schema field definition. E.g. { type: "number" }. - */ - Schema getNullableFieldTypes(final String fieldName, final JsonNode fieldDefinition) { - // Filter out null types, which will be added back in the end. - final List nonNullFieldTypes = getNonNullTypes(fieldName, fieldDefinition) - .stream() - .flatMap(fieldType -> { - final Schema singleFieldSchema = getSingleFieldType(fieldName, fieldType, fieldDefinition); - if (singleFieldSchema.isUnion()) { - return singleFieldSchema.getTypes().stream(); - } else { - return Stream.of(singleFieldSchema); - } - }) - .distinct() - .collect(Collectors.toList()); - - if (nonNullFieldTypes.isEmpty()) { - return Schema.create(Schema.Type.NULL); - } else { - // Mark every field as nullable to prevent missing value exceptions from Avro / Parquet. - if (!nonNullFieldTypes.contains(NULL_SCHEMA)) { - nonNullFieldTypes.add(0, NULL_SCHEMA); - } - // Logical types are converted to a union of logical type itself and string. The purpose is to - // default the logical type field to a string, if the value of the logical type field is invalid and - // cannot be properly processed. - if ((nonNullFieldTypes - .stream().anyMatch(schema -> schema.getLogicalType() != null)) && - (!nonNullFieldTypes.contains(STRING_SCHEMA))) { - nonNullFieldTypes.add(STRING_SCHEMA); - } - return Schema.createUnion(nonNullFieldTypes); - } - } } - From 2575dcf53c979c06c654db1c33ea80a737f9ebcf Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Thu, 16 Dec 2021 15:01:14 +0200 Subject: [PATCH 11/20] cleanup Dockerfile --- airbyte-integrations/connectors/destination-s3/Dockerfile | 1 - .../destination/s3/util/AvroRecordHelper.java | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/Dockerfile b/airbyte-integrations/connectors/destination-s3/Dockerfile index 8aa37e692503..e68bba6018a1 100644 --- a/airbyte-integrations/connectors/destination-s3/Dockerfile +++ b/airbyte-integrations/connectors/destination-s3/Dockerfile @@ -2,7 +2,6 @@ FROM airbyte/integration-base-java:dev WORKDIR /airbyte ENV APPLICATION destination-s3 -#ENV JAVA_TOOL_OPTIONS -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005 COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java index 0affef6cd3a0..a053b0d75b1b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java @@ -51,22 +51,22 @@ public static JsonNode pruneAirbyteJson(final JsonNode input) { return output; } - public static void obtainPaths(String currentPath, JsonNode jsonNode, Map jsonNodeStringMap) { + public static void obtainPaths(String currentPath, JsonNode jsonNode, Map jsonNodePathMap) { if (jsonNode.isObject()) { ObjectNode objectNode = (ObjectNode) jsonNode; Iterator> iter = objectNode.fields(); String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; - jsonNodeStringMap.put(jsonNode,pathPrefix); + jsonNodePathMap.put(jsonNode,pathPrefix); while (iter.hasNext()) { Map.Entry entry = iter.next(); - obtainPaths(pathPrefix + entry.getKey(), entry.getValue(), jsonNodeStringMap); + obtainPaths(pathPrefix + entry.getKey(), entry.getValue(), jsonNodePathMap); } } else if (jsonNode.isArray()) { ArrayNode arrayNode = (ArrayNode) jsonNode; for (int i = 0; i < arrayNode.size(); i++) { String arrayPath = currentPath + "/" + i; - obtainPaths(arrayPath, arrayNode.get(i),jsonNodeStringMap); + obtainPaths(arrayPath, arrayNode.get(i),jsonNodePathMap); } } } From 44094054bd8bf0a50f3d26008420634e1a801107 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Thu, 16 Dec 2021 15:19:24 +0200 Subject: [PATCH 12/20] refactoring --- .../integrations/destination/s3/avro/AvroNameTransformer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index 354d971733bb..76ec494763e6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -34,7 +34,6 @@ public static String resolveNamespace(String fullPathToNode) { .filter(key -> !key.isBlank()) .filter(key -> !key.equals("items")) .filter(key -> !key.equals("properties")) - .filter(key -> !key.equals("format")) .collect(Collectors.joining(".")); } From e2816e533013ae7bbf48422114c5f8b1102c1fbc Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Mon, 20 Dec 2021 11:20:45 +0200 Subject: [PATCH 13/20] removed unneded tests case --- .../json_conversion_test_cases.json | 265 +----------------- 1 file changed, 2 insertions(+), 263 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index e5ba714240ab..51eddff67d44 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -1013,267 +1013,6 @@ "schemaName": "schema_with_the_same_object_names", "namespace": "namespace16", "appendAirbyteFields": false, - "jsonSchema": { - "type": "object", - "properties": { - "author": { - "type": "object", - "properties": { - "login": { - "type": [ - "null", - "string" - ] - }, - "id": { - "type": [ - "null", - "integer" - ] - }, - "node_id": { - "type": [ - "null", - "string" - ] - } - } - }, - "commit": { - "type": [ - "null", - "object" - ], - "properties": { - "author": { - "type": [ - "null", - "object" - ], - "properties": { - "name": { - "type": [ - "null", - "string" - ] - }, - "email": { - "type": [ - "null", - "string" - ] - }, - "date": { - "type": [ - "null", - "string" - ], - "format": "date-time" - } - } - }, - "message": { - "type": [ - "null", - "string" - ] - } - } - } - } - }, - "jsonObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123" - }, - "commit": { - "message": "test commit message", - "author": { - "name": "Test Author", - "email": "test@example.com", - "date": "2021-01-01T01:01:01+01:00" - } - } - }, - "avroSchema": { - "type": "record", - "name": "schema_with_the_same_object_names", - "namespace": "namespace16", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "author", - "fields": [ - { - "name": "login", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "id", - "type": [ - "null", - "int" - ], - "default": null - }, - { - "name": "node_id", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "commit", - "type": [ - "null", - { - "type": "record", - "name": "commit", - "namespace": "commit", - "fields": [ - { - "name": "author", - "type": [ - "null", - { - "type": "record", - "name": "author", - "namespace": "commit.author", - "fields": [ - { - "name": "name", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "email", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "date", - "type": [ - "null", - { - "type": "long", - "logicalType": "timestamp-micros" - }, - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "message", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "_airbyte_additional_properties", - "type": [ - "null", - { - "type": "map", - "values": "string" - } - ], - "default": null - } - ] - }, - "avroObject": { - "author": { - "login": "test", - "id": 12345, - "node_id": "abc123", - "_airbyte_additional_properties": null - }, - "commit": { - "author": { - "name": "Test Author", - "email": "test@example.com", - "date": 1609459261000000, - "_airbyte_additional_properties": null - }, - "message": "test commit message", - "_airbyte_additional_properties": null - }, - "_airbyte_additional_properties": null - } - }, - { - "schemaName": "schema_with_the_same_object_names_and_inner_object", - "namespace": "namespace17", - "appendAirbyteFields": false, "jsonSchema": { "type": "object", "properties": { @@ -1384,8 +1123,8 @@ }, "avroSchema": { "type": "record", - "name": "schema_with_the_same_object_names_and_inner_object", - "namespace": "namespace17", + "name": "schema_with_the_same_object_names", + "namespace": "namespace16", "fields": [ { "name": "author", From f35e1e38aa0ce125668c34eaea509090e578ecd3 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Mon, 20 Dec 2021 11:54:03 +0200 Subject: [PATCH 14/20] updated namespace generation --- .../s3/avro/JsonToAvroSchemaConverter.java | 2 +- .../destination/s3/util/AvroRecordHelper.java | 12 +++++++++++- .../json_conversion_test_cases.json | 12 ++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 00bf8f93d9cb..768b5599642f 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -215,7 +215,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType String.format("Array field %s has invalid items property: %s", fieldName, items)); } } - case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, resolveNamespace(jsonNodePathMap.get(fieldDefinition)), false, false); + case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, jsonNodePathMap.get(fieldDefinition), false, false); default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java index a053b0d75b1b..e415b234586e 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java @@ -12,8 +12,10 @@ import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; +import java.util.Arrays; import java.util.Iterator; import java.util.Map; +import java.util.stream.Collectors; /** * Helper methods for unit tests. This is needed by multiple modules, so it is in the src directory. @@ -56,7 +58,15 @@ public static void obtainPaths(String currentPath, JsonNode jsonNode, Map> iter = objectNode.fields(); String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; - jsonNodePathMap.put(jsonNode,pathPrefix); + String[] pathFieldsArray = currentPath.split("/"); + String parent = Arrays.stream(pathFieldsArray) + .filter(x -> !x.equals("items")) + .filter(x -> !x.equals("properties")) + .filter(x -> !x.equals(pathFieldsArray[pathFieldsArray.length - 1])) + .collect(Collectors.joining(".")); + if (!parent.isEmpty()){ + jsonNodePathMap.put(jsonNode, parent); + } while (iter.hasNext()) { Map.Entry entry = iter.next(); obtainPaths(pathPrefix + entry.getKey(), entry.getValue(), jsonNodePathMap); diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index 51eddff67d44..d02c4ece2d2c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -104,7 +104,7 @@ { "type": "record", "name": "user", - "namespace": "user", + "namespace": "", "fields": [ { "name": "first_name", @@ -580,7 +580,7 @@ { "type": "record", "name": "user", - "namespace": "user", + "namespace": "", "fields": [ { "name": "created_at", @@ -1133,7 +1133,7 @@ { "type": "record", "name": "author", - "namespace": "author", + "namespace": "", "fields": [ { "name": "login", @@ -1182,7 +1182,7 @@ { "type": "record", "name": "commit", - "namespace": "commit", + "namespace": "", "fields": [ { "name": "author", @@ -1191,7 +1191,7 @@ { "type": "record", "name": "author", - "namespace": "commit.author", + "namespace": "commit", "fields": [ { "name": "name", @@ -1228,7 +1228,7 @@ { "type": "record", "name": "pr", - "namespace": "commit.author.pr", + "namespace": "commit.author", "fields": [ { "name": "id", From 21895ff45ab9e039f14df98f9f45f4e1acf3167d Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Mon, 20 Dec 2021 11:55:49 +0200 Subject: [PATCH 15/20] removed unneeded method from AvroNameTransformer --- .../destination/s3/avro/AvroNameTransformer.java | 9 --------- .../destination/s3/avro/JsonToAvroSchemaConverter.java | 1 - 2 files changed, 10 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index 76ec494763e6..a1a7c176238d 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -28,13 +28,4 @@ private String checkFirsCharInStreamName(final String name) { return "_" + name; } } - - public static String resolveNamespace(String fullPathToNode) { - return fullPathToNode==null ? null : Arrays.stream(fullPathToNode.split("/")) - .filter(key -> !key.isBlank()) - .filter(key -> !key.equals("items")) - .filter(key -> !key.equals("properties")) - .collect(Collectors.joining(".")); - } - } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 768b5599642f..dd0000cb81be 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -26,7 +26,6 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; -import static io.airbyte.integrations.destination.s3.avro.AvroNameTransformer.resolveNamespace; import static io.airbyte.integrations.destination.s3.util.AvroRecordHelper.obtainPaths; /** From 6f0d91f8ce4a332d84a8453771cffb9c6e951955 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Tue, 21 Dec 2021 16:17:29 +0200 Subject: [PATCH 16/20] resolved merge conflicts --- .../integrations/destination/gcs/avro/GcsAvroWriter.java | 2 +- .../destination/s3/avro/JsonToAvroSchemaConverter.java | 5 +++-- .../json_schema_converter/json_conversion_test_cases.json | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java index d8452e9a9b2b..89d5c2cc75f3 100644 --- a/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java +++ b/airbyte-integrations/connectors/destination-gcs/src/main/java/io/airbyte/integrations/destination/gcs/avro/GcsAvroWriter.java @@ -64,7 +64,7 @@ public GcsAvroWriter(final GcsDestinationConfig config, Schema schema = (airbyteSchema == null ? GcsUtils.getDefaultAvroSchema(stream.getName(), stream.getNamespace(), true) : new JsonToAvroSchemaConverter().getAvroSchema(airbyteSchema, stream.getName(), - stream.getNamespace(), true, false, false)); + stream.getNamespace(), true, false, false,true)); LOGGER.info("Avro schema : {}", schema); final String outputFilename = BaseGcsWriter.getOutputFilename(uploadTimestamp, S3Format.AVRO); objectKey = String.join("/", outputPrefix, outputFilename); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index d108637fa320..bdb5b21ccd5b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -99,8 +99,9 @@ public Map getStandardizedNames() { public Schema getAvroSchema(final JsonNode jsonSchema, final String name, @Nullable final String namespace, - final boolean appendAirbyteFields) { - return getAvroSchema(jsonSchema, name, namespace, appendAirbyteFields, true, true); + final boolean appendAirbyteFields, + final boolean isRootNode) { + return getAvroSchema(jsonSchema, name, namespace, appendAirbyteFields, true, true, isRootNode); } /** diff --git a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json index 3bce38d156bd..60844150212b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json +++ b/airbyte-integrations/connectors/destination-s3/src/test/resources/parquet/json_schema_converter/json_conversion_test_cases.json @@ -1011,7 +1011,7 @@ }, { "schemaName": "schema_with_the_same_object_names", - "namespace": "namespace16", + "namespace": "namespace17", "appendAirbyteFields": false, "jsonSchema": { "type": "object", From 7879fd2b912786b47cacb1d8ab4b0a232ccdfb9f Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Tue, 21 Dec 2021 16:22:04 +0200 Subject: [PATCH 17/20] removed unused imports --- .../integrations/destination/s3/avro/AvroNameTransformer.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index a1a7c176238d..c1dc15a076d1 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -6,9 +6,6 @@ import io.airbyte.integrations.destination.ExtendedNameTransformer; -import java.util.Arrays; -import java.util.stream.Collectors; - public class AvroNameTransformer extends ExtendedNameTransformer { @Override @@ -28,4 +25,5 @@ private String checkFirsCharInStreamName(final String name) { return "_" + name; } } + } From 1e16cb297024a12410f74f854df4e0ef3d327626 Mon Sep 17 00:00:00 2001 From: Oleksandr Sheheda Date: Tue, 21 Dec 2021 16:38:23 +0200 Subject: [PATCH 18/20] reformat the code --- .../s3/avro/AvroNameTransformer.java | 4 +- .../s3/avro/JsonToAvroSchemaConverter.java | 16 +- .../destination/s3/util/AvroRecordHelper.java | 15 +- .../json_conversion_test_cases.json | 266 ++++-------------- .../snowflake/SnowflakeDatabase.java | 3 +- .../SnowflakeInternalStagingDestination.java | 25 +- 6 files changed, 82 insertions(+), 247 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java index a1a7c176238d..c1dc15a076d1 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroNameTransformer.java @@ -6,9 +6,6 @@ import io.airbyte.integrations.destination.ExtendedNameTransformer; -import java.util.Arrays; -import java.util.stream.Collectors; - public class AvroNameTransformer extends ExtendedNameTransformer { @Override @@ -28,4 +25,5 @@ private String checkFirsCharInStreamName(final String name) { return "_" + name; } } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index dd0000cb81be..3a43aba1fc68 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -4,6 +4,8 @@ package io.airbyte.integrations.destination.s3.avro; +import static io.airbyte.integrations.destination.s3.util.AvroRecordHelper.obtainPaths; + import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ArrayNode; import com.google.common.base.Preconditions; @@ -26,8 +28,6 @@ import org.slf4j.LoggerFactory; import tech.allegro.schema.json2avro.converter.AdditionalPropertyField; -import static io.airbyte.integrations.destination.s3.util.AvroRecordHelper.obtainPaths; - /** * The main function of this class is to convert a JsonSchema to Avro schema. It can also * standardize schema names, and keep track of a mapping from the original names to the standardized @@ -47,8 +47,8 @@ public class JsonToAvroSchemaConverter { private static final Schema TIMESTAMP_MILLIS_SCHEMA = LogicalTypes.timestampMillis() .addToSchema(Schema.create(Schema.Type.LONG)); - private final Map standardizedNames = new HashMap<>(); - private final Map jsonNodePathMap = new HashMap<>(); + private final Map standardizedNames = new HashMap<>(); + private final Map jsonNodePathMap = new HashMap<>(); static List getNonNullTypes(final String fieldName, final JsonNode fieldDefinition) { return getTypes(fieldName, fieldDefinition).stream() @@ -106,9 +106,9 @@ public Schema getAvroSchema(final JsonNode jsonSchema, final boolean isRootNode) { final String stdName = AvroConstants.NAME_TRANSFORMER.getIdentifier(name); RecordBuilder builder = SchemaBuilder.record(stdName); - if (isRootNode) { - obtainPaths("", jsonSchema, jsonNodePathMap); - } + if (isRootNode) { + obtainPaths("", jsonSchema, jsonNodePathMap); + } if (!stdName.equals(name)) { standardizedNames.put(name, stdName); LOGGER.warn("Schema name contains illegal character(s) and is standardized: {} -> {}", name, @@ -214,7 +214,7 @@ Schema getSingleFieldType(final String fieldName, final JsonSchemaType fieldType String.format("Array field %s has invalid items property: %s", fieldName, items)); } } - case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, jsonNodePathMap.get(fieldDefinition), false, false); + case OBJECT -> fieldSchema = getAvroSchema(fieldDefinition, fieldName, jsonNodePathMap.get(fieldDefinition), false, false); default -> throw new IllegalStateException( String.format("Unexpected type for field %s: %s", fieldName, fieldType)); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java index e415b234586e..059d08176d8e 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/AvroRecordHelper.java @@ -11,7 +11,6 @@ import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; - import java.util.Arrays; import java.util.Iterator; import java.util.Map; @@ -53,18 +52,18 @@ public static JsonNode pruneAirbyteJson(final JsonNode input) { return output; } - public static void obtainPaths(String currentPath, JsonNode jsonNode, Map jsonNodePathMap) { + public static void obtainPaths(String currentPath, JsonNode jsonNode, Map jsonNodePathMap) { if (jsonNode.isObject()) { ObjectNode objectNode = (ObjectNode) jsonNode; Iterator> iter = objectNode.fields(); String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; String[] pathFieldsArray = currentPath.split("/"); String parent = Arrays.stream(pathFieldsArray) - .filter(x -> !x.equals("items")) - .filter(x -> !x.equals("properties")) - .filter(x -> !x.equals(pathFieldsArray[pathFieldsArray.length - 1])) - .collect(Collectors.joining(".")); - if (!parent.isEmpty()){ + .filter(x -> !x.equals("items")) + .filter(x -> !x.equals("properties")) + .filter(x -> !x.equals(pathFieldsArray[pathFieldsArray.length - 1])) + .collect(Collectors.joining(".")); + if (!parent.isEmpty()) { jsonNodePathMap.put(jsonNode, parent); } while (iter.hasNext()) { @@ -76,7 +75,7 @@ public static void obtainPaths(String currentPath, JsonNode jsonNode, Map Date: Tue, 21 Dec 2021 17:09:41 +0200 Subject: [PATCH 19/20] bump version --- .../4816b78f-1489-44c1-9060-4b19d5fa9362.json | 2 +- .../ca8f6566-e555-4b40-943a-545bf123117a.json | 2 +- .../init/src/main/resources/seed/destination_definitions.yaml | 4 ++-- airbyte-integrations/connectors/destination-gcs/Dockerfile | 2 +- airbyte-integrations/connectors/destination-s3/Dockerfile | 2 +- docs/integrations/destinations/gcs.md | 1 + 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json index 3cea3a60802a..a7e817b4dba7 100644 --- a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json +++ b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json @@ -2,7 +2,7 @@ "destinationDefinitionId": "4816b78f-1489-44c1-9060-4b19d5fa9362", "name": "S3", "dockerRepository": "airbyte/destination-s3", - "dockerImageTag": "0.1.14", + "dockerImageTag": "0.2.2", "documentationUrl": "https://docs.airbyte.io/integrations/destinations/s3", "icon": "s3.svg" } diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/ca8f6566-e555-4b40-943a-545bf123117a.json b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/ca8f6566-e555-4b40-943a-545bf123117a.json index 4f278dac67c9..32253be7ffb9 100644 --- a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/ca8f6566-e555-4b40-943a-545bf123117a.json +++ b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/ca8f6566-e555-4b40-943a-545bf123117a.json @@ -2,7 +2,7 @@ "destinationDefinitionId": "ca8f6566-e555-4b40-943a-545bf123117a", "name": "Google Cloud Storage (GCS)", "dockerRepository": "airbyte/destination-gcs", - "dockerImageTag": "0.1.14", + "dockerImageTag": "0.1.17", "documentationUrl": "https://docs.airbyte.io/integrations/destinations/gcs", "icon": "googlecloudstorage.svg" } diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index 7f57e644d87b..a8580225ff02 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -60,7 +60,7 @@ - name: Google Cloud Storage (GCS) destinationDefinitionId: ca8f6566-e555-4b40-943a-545bf123117a dockerRepository: airbyte/destination-gcs - dockerImageTag: 0.1.16 + dockerImageTag: 0.1.17 documentationUrl: https://docs.airbyte.io/integrations/destinations/gcs icon: googlecloudstorage.svg - name: Google PubSub @@ -167,7 +167,7 @@ - name: S3 destinationDefinitionId: 4816b78f-1489-44c1-9060-4b19d5fa9362 dockerRepository: airbyte/destination-s3 - dockerImageTag: 0.2.1 + dockerImageTag: 0.2.2 documentationUrl: https://docs.airbyte.io/integrations/destinations/s3 icon: s3.svg - name: SFTP-JSON diff --git a/airbyte-integrations/connectors/destination-gcs/Dockerfile b/airbyte-integrations/connectors/destination-gcs/Dockerfile index 4fda6a25539d..0fccc8ce8c3b 100644 --- a/airbyte-integrations/connectors/destination-gcs/Dockerfile +++ b/airbyte-integrations/connectors/destination-gcs/Dockerfile @@ -7,5 +7,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar RUN tar xf ${APPLICATION}.tar --strip-components=1 -LABEL io.airbyte.version=0.1.16 +LABEL io.airbyte.version=0.1.17 LABEL io.airbyte.name=airbyte/destination-gcs diff --git a/airbyte-integrations/connectors/destination-s3/Dockerfile b/airbyte-integrations/connectors/destination-s3/Dockerfile index 7c5d3ed0db24..b3aac87385d6 100644 --- a/airbyte-integrations/connectors/destination-s3/Dockerfile +++ b/airbyte-integrations/connectors/destination-s3/Dockerfile @@ -7,5 +7,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar RUN tar xf ${APPLICATION}.tar --strip-components=1 -LABEL io.airbyte.version=0.2.1 +LABEL io.airbyte.version=0.2.2 LABEL io.airbyte.name=airbyte/destination-s3 diff --git a/docs/integrations/destinations/gcs.md b/docs/integrations/destinations/gcs.md index 81a52aee7eb3..1157c2aed658 100644 --- a/docs/integrations/destinations/gcs.md +++ b/docs/integrations/destinations/gcs.md @@ -222,6 +222,7 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | +| 0.1.17 | 2021-12-21 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types | | 0.1.16 | 2021-12-20 | [\#8974](https://github.com/airbytehq/airbyte/pull/8974) | Release a new version to ensure there is no excessive logging. | | 0.1.15 | 2021-12-03 | [\#8386](https://github.com/airbytehq/airbyte/pull/8386) | Add new GCP regions | | 0.1.14 | 2021-12-01 | [\#7732](https://github.com/airbytehq/airbyte/pull/7732) | Support timestamp in Avro and Parquet | From affce39e3901b97e51786391427449e2eec53dd8 Mon Sep 17 00:00:00 2001 From: vmaltsev Date: Tue, 21 Dec 2021 17:27:50 +0200 Subject: [PATCH 20/20] bump Bigquery Denormalized version --- .../079d5540-f236-4294-ba7c-ade8fd918496.json | 2 +- .../init/src/main/resources/seed/destination_definitions.yaml | 2 +- .../connectors/destination-bigquery-denormalized/Dockerfile | 2 +- docs/integrations/destinations/bigquery.md | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/079d5540-f236-4294-ba7c-ade8fd918496.json b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/079d5540-f236-4294-ba7c-ade8fd918496.json index b29180b3354c..8daf1cf6cd2e 100644 --- a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/079d5540-f236-4294-ba7c-ade8fd918496.json +++ b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/079d5540-f236-4294-ba7c-ade8fd918496.json @@ -2,7 +2,7 @@ "destinationDefinitionId": "079d5540-f236-4294-ba7c-ade8fd918496", "name": "BigQuery (denormalized typed struct)", "dockerRepository": "airbyte/destination-bigquery-denormalized", - "dockerImageTag": "0.1.8", + "dockerImageTag": "0.2.1", "documentationUrl": "https://docs.airbyte.io/integrations/destinations/bigquery", "icon": "bigquery.svg" } diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index a8580225ff02..6b781d0614de 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -19,7 +19,7 @@ - name: BigQuery (denormalized typed struct) destinationDefinitionId: 079d5540-f236-4294-ba7c-ade8fd918496 dockerRepository: airbyte/destination-bigquery-denormalized - dockerImageTag: 0.1.11 + dockerImageTag: 0.2.1 documentationUrl: https://docs.airbyte.io/integrations/destinations/bigquery icon: bigquery.svg - name: Cassandra diff --git a/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile b/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile index 078057b09805..ca6f9213a062 100644 --- a/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile +++ b/airbyte-integrations/connectors/destination-bigquery-denormalized/Dockerfile @@ -8,5 +8,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar RUN tar xf ${APPLICATION}.tar --strip-components=1 -LABEL io.airbyte.version=0.2.0 +LABEL io.airbyte.version=0.2.1 LABEL io.airbyte.name=airbyte/destination-bigquery-denormalized diff --git a/docs/integrations/destinations/bigquery.md b/docs/integrations/destinations/bigquery.md index 0d1c66fef477..a074d16d274e 100644 --- a/docs/integrations/destinations/bigquery.md +++ b/docs/integrations/destinations/bigquery.md @@ -171,6 +171,7 @@ Therefore, Airbyte BigQuery destination will convert any invalid characters into | Version | Date | Pull Request | Subject | |:--------| :--- | :--- | :--- | +| 0.2.1 | 2021-12-21 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types | | 0.2.0 | 2021-12-17 | [\#8788](https://github.com/airbytehq/airbyte/pull/8788) | BigQuery/BiqQuery denorm Destinations : Add possibility to use different types of GCS files | | 0.1.11 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations | | 0.1.10 | 2021-11-09 | [\#7804](https://github.com/airbytehq/airbyte/pull/7804) | handle null values in fields described by a $ref definition |