From 4431e911f00db6a4fb686bf6a2b17565263be557 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 11:51:23 -0700 Subject: [PATCH 01/17] Add jsonl format to spec.json --- .../destination-s3/src/main/resources/spec.json | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json index 5843614d2587..93f086deb4b9 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json @@ -65,7 +65,9 @@ "eu-west-2", "eu-west-3", "sa-east-1", - "me-south-1" + "me-south-1", + "us-gov-east-1", + "us-gov-west-1" ] }, "access_key_id": { @@ -105,6 +107,17 @@ } } }, + { + "title": "JSON Lines: newline-delimited JSON", + "required": ["format_type"], + "properties": { + "format_type": { + "type": "string", + "enum": ["JSONL"], + "default": "JSONL" + } + } + }, { "title": "Parquet: Columnar Storage", "required": ["format_type"], From a3614408e545ce56cdac78c45c58188efeec85b3 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 13:10:09 -0700 Subject: [PATCH 02/17] Implement jsonl writer --- .../destination/s3/S3Consumer.java | 3 +- .../integrations/destination/s3/S3Format.java | 3 +- .../destination/s3/S3FormatConfigs.java | 4 + .../destination/s3/csv/S3CsvConstants.java | 39 --------- .../destination/s3/csv/S3CsvWriter.java | 13 +-- .../s3/jsonl/S3JsonlFormatConfig.java | 13 +++ .../destination/s3/jsonl/S3JsonlWriter.java | 80 +++++++++++++++++++ .../s3/parquet/S3ParquetWriter.java | 2 +- .../util/S3StreamTransferManagerHelper.java | 30 +++++++ .../s3/writer/ProductionWriterFactory.java | 4 + 10 files changed, 137 insertions(+), 54 deletions(-) delete mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvConstants.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Consumer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Consumer.java index 4be48614655c..5fedf2406c5b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Consumer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Consumer.java @@ -130,8 +130,7 @@ protected void acceptTracked(AirbyteMessage airbyteMessage) throws Exception { Jsons.serialize(configuredCatalog), Jsons.serialize(recordMessage))); } - UUID id = UUID.randomUUID(); - streamNameAndNamespaceToWriters.get(pair).write(id, recordMessage); + streamNameAndNamespaceToWriters.get(pair).write(UUID.randomUUID(), recordMessage); } @Override diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java 
b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java index 07c6ec854cf2..5cfc54acb02b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java @@ -27,7 +27,8 @@ public enum S3Format { CSV("csv"), - PARQUET("parquet"); + PARQUET("parquet"), + JSONL("jsonl"); private final String fileExtension; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java index 063f29e8f9ee..30c3400c8968 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java @@ -27,6 +27,7 @@ import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig; +import io.airbyte.integrations.destination.s3.jsonl.S3JsonlFormatConfig; import io.airbyte.integrations.destination.s3.parquet.S3ParquetFormatConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,6 +48,9 @@ public static S3FormatConfig getS3FormatConfig(JsonNode config) { case PARQUET -> { return new S3ParquetFormatConfig(formatConfig); } + case JSONL -> { + return new S3JsonlFormatConfig(); + } default -> { throw new RuntimeException("Unexpected output format: " + Jsons.serialize(config)); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvConstants.java deleted file mode 100644 index 3419501eef4c..000000000000 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvConstants.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2020 Airbyte - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package io.airbyte.integrations.destination.s3.csv; - -public class S3CsvConstants { - - // These parameters are used by {@link StreamTransferManager}. - // See this doc about how they affect memory usage: - // https://alexmojaki.github.io/s3-stream-upload/javadoc/apidocs/alex/mojaki/s3upload/StreamTransferManager.html - // Total memory = (numUploadThreads + queueCapacity) * partSize + numStreams * (partSize + 6MB) - // = 31 MB at current configurations - public static final int DEFAULT_UPLOAD_THREADS = 2; - public static final int DEFAULT_QUEUE_CAPACITY = 2; - public static final int DEFAULT_PART_SIZE_MB = 5; - public static final int DEFAULT_NUM_STREAMS = 1; - -} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java index 4c7f431a5594..3451c94ec95e 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java @@ -29,6 +29,7 @@ import com.amazonaws.services.s3.AmazonS3; import io.airbyte.integrations.destination.s3.S3DestinationConfig; import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.util.S3StreamTransferManagerHelper; import io.airbyte.integrations.destination.s3.writer.BaseS3Writer; import io.airbyte.integrations.destination.s3.writer.S3Writer; import io.airbyte.protocol.models.AirbyteRecordMessage; @@ -70,17 +71,7 @@ public S3CsvWriter(S3DestinationConfig config, LOGGER.info("Full S3 path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); - // The stream transfer manager lets us greedily stream into S3. The native AWS SDK does not - // have support for streaming multipart uploads. The alternative is first writing the entire - // output to disk before loading into S3. This is not feasible with large input. - // Data is chunked into parts during the upload. A part is sent off to a queue to be uploaded - // once it has reached it's configured part size. - // See {@link S3DestinationConstants} for memory usage calculation. - this.uploadManager = new StreamTransferManager(config.getBucketName(), objectKey, s3Client) - .numStreams(S3CsvConstants.DEFAULT_NUM_STREAMS) - .queueCapacity(S3CsvConstants.DEFAULT_QUEUE_CAPACITY) - .numUploadThreads(S3CsvConstants.DEFAULT_UPLOAD_THREADS) - .partSize(S3CsvConstants.DEFAULT_PART_SIZE_MB); + this.uploadManager = S3StreamTransferManagerHelper.getDefault(config.getBucketName(), objectKey, s3Client); // We only need one output stream as we only have one input stream. This is reasonably performant. 
this.outputStream = uploadManager.getMultiPartOutputStreams().get(0); this.csvPrinter = new CSVPrinter(new PrintWriter(outputStream, true, StandardCharsets.UTF_8), diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java new file mode 100644 index 000000000000..bb57f4c01094 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java @@ -0,0 +1,13 @@ +package io.airbyte.integrations.destination.s3.jsonl; + +import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.S3FormatConfig; + +public class S3JsonlFormatConfig implements S3FormatConfig { + + @Override + public S3Format getFormat() { + return S3Format.JSONL; + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java new file mode 100644 index 000000000000..0853ea01da04 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java @@ -0,0 +1,80 @@ +package io.airbyte.integrations.destination.s3.jsonl; + +import alex.mojaki.s3upload.MultiPartOutputStream; +import alex.mojaki.s3upload.StreamTransferManager; +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.jackson.MoreMappers; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.s3.S3DestinationConfig; +import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.util.S3StreamTransferManagerHelper; +import io.airbyte.integrations.destination.s3.writer.BaseS3Writer; +import io.airbyte.integrations.destination.s3.writer.S3Writer; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.sql.Timestamp; +import java.util.UUID; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class S3JsonlWriter extends BaseS3Writer implements S3Writer { + + protected static final Logger LOGGER = LoggerFactory.getLogger(S3JsonlWriter.class); + + private static final ObjectMapper MAPPER = MoreMappers.initMapper(); + private static final ObjectWriter WRITER = MAPPER.writer(); + + private final StreamTransferManager uploadManager; + private final MultiPartOutputStream outputStream; + private final PrintWriter printWriter; + + public S3JsonlWriter(S3DestinationConfig config, + AmazonS3 s3Client, + ConfiguredAirbyteStream configuredStream, + Timestamp uploadTimestamp) { + super(config, s3Client, configuredStream); + + String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.JSONL); + String objectKey = String.join("/", outputPrefix, outputFilename); + + LOGGER.info("Full S3 path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), + objectKey); + + this.uploadManager = 
S3StreamTransferManagerHelper.getDefault(config.getBucketName(), objectKey, s3Client); + // We only need one output stream as we only have one input stream. This is reasonably performant. + this.outputStream = uploadManager.getMultiPartOutputStreams().get(0); + this.printWriter = new PrintWriter(outputStream, true, StandardCharsets.UTF_8); + } + + @Override + public void write(UUID id, AirbyteRecordMessage recordMessage) { + ObjectNode json = MAPPER.createObjectNode(); + json.put(JavaBaseConstants.COLUMN_NAME_AB_ID, id.toString()); + json.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, recordMessage.getEmittedAt()); + json.set(JavaBaseConstants.COLUMN_NAME_DATA, recordMessage.getData()); + printWriter.println(Jsons.serialize(json)); + } + + @Override + public void close(boolean hasFailed) { + printWriter.close(); + outputStream.close(); + + if (hasFailed) { + LOGGER.warn("Failure detected. Aborting upload of stream '{}'...", stream.getName()); + uploadManager.abort(); + LOGGER.warn("Upload of stream '{}' aborted.", stream.getName()); + } else { + LOGGER.info("Uploading remaining data for stream '{}'.", stream.getName()); + uploadManager.complete(); + LOGGER.info("Upload completed for stream '{}'.", stream.getName()); + } + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java index 0d096b23f1a5..a624f73db99a 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java @@ -117,7 +117,7 @@ public void write(UUID id, AirbyteRecordMessage recordMessage) throws IOExceptio inputData = nameUpdater.getJsonWithStandardizedFieldNames(inputData); ObjectNode jsonRecord = MAPPER.createObjectNode(); - jsonRecord.put(JavaBaseConstants.COLUMN_NAME_AB_ID, UUID.randomUUID().toString()); + jsonRecord.put(JavaBaseConstants.COLUMN_NAME_AB_ID, id.toString()); jsonRecord.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, recordMessage.getEmittedAt()); jsonRecord.setAll((ObjectNode) inputData); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java new file mode 100644 index 000000000000..3f9ac5dc4fb9 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java @@ -0,0 +1,30 @@ +package io.airbyte.integrations.destination.s3.util; + +import alex.mojaki.s3upload.StreamTransferManager; +import com.amazonaws.services.s3.AmazonS3; + +public class S3StreamTransferManagerHelper { + + // See this doc about how they affect memory usage: + // https://alexmojaki.github.io/s3-stream-upload/javadoc/apidocs/alex/mojaki/s3upload/StreamTransferManager.html + // Total memory = (numUploadThreads + queueCapacity) * partSize + numStreams * (partSize + 6MB) + // = 31 MB at current configurations + public static final int DEFAULT_UPLOAD_THREADS = 2; + public static final int DEFAULT_QUEUE_CAPACITY = 2; + public static final int DEFAULT_PART_SIZE_MB = 5; + public static final 
int DEFAULT_NUM_STREAMS = 1; + + public static StreamTransferManager getDefault(String bucketName, String objectKey, AmazonS3 s3Client) { + // The stream transfer manager lets us greedily stream into S3. The native AWS SDK does not + // have support for streaming multipart uploads. The alternative is first writing the entire + // output to disk before loading into S3. This is not feasible with large input. + // Data is chunked into parts during the upload. A part is sent off to a queue to be uploaded + // once it has reached it's configured part size. + return new StreamTransferManager(bucketName, objectKey, s3Client) + .numStreams(DEFAULT_NUM_STREAMS) + .queueCapacity(DEFAULT_QUEUE_CAPACITY) + .numUploadThreads(DEFAULT_UPLOAD_THREADS) + .partSize(DEFAULT_PART_SIZE_MB); + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java index bdc7549b542e..78a8b2e3d690 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java @@ -28,6 +28,7 @@ import io.airbyte.integrations.destination.s3.S3DestinationConfig; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.csv.S3CsvWriter; +import io.airbyte.integrations.destination.s3.jsonl.S3JsonlWriter; import io.airbyte.integrations.destination.s3.parquet.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.parquet.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; @@ -65,6 +66,9 @@ public S3Writer create(S3DestinationConfig config, return new S3ParquetWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, nameUpdater); } + if (format == S3Format.JSONL) { + return new S3JsonlWriter(config, s3Client, configuredStream, uploadTimestamp); + } throw new RuntimeException("Unexpected S3 destination format: " + format); } From 37fcc6015000f59a712183a2ea01490fd099a6ca Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 13:55:21 -0700 Subject: [PATCH 03/17] Add documentation --- docs/integrations/destinations/s3.md | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index c2ad2d321814..8ec3b7220619 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -76,7 +76,7 @@ Each CSV file includes at least two Airbyte metadata columns. Depending on the ` | `_airbyte_data` | When no flattening is needed, all data reside under this column as a json blob. | | root level fields| When root level flattening is selected, the root level fields are expanded. | - For example, given the following json object from a source: +For example, given the following json object from a source: ```json { @@ -100,6 +100,46 @@ With root level flattening, the output CSV is: | :--- | :--- | :--- | :--- | | `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | +### JSON Lines (JSONL) + +[Json Lines](https://jsonlines.org/) is a text format with one JSON per line. 
Each line has a structure as follows: + +```json +{ + "_airbyte_ab_id": "", + "_airbyte_emitted_at": "", + "_airbyte_data": "" +} +``` + +For example, given the following json object from a source: + +```json +{ + "user_id": 123, + "name": { + "first": "John", + "last": "Doe" + } +} +``` + +It will become this JSON line: + +```json +{ + "_airbyte_ab_id": "26d73cde-7eb1-4e1e-b7db-a4c03b4cf206", + "_airbyte_emitted_at": "1622135805000", + "_airbyte_data": { + "user_id": 123, + "name": { + "first": "John", + "last": "Doe" + } + } +} +``` + ### Parquet #### Configuration From f85c89866078f307504e1cc1d592ef4855067938 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 14:28:44 -0700 Subject: [PATCH 04/17] Add acceptance test --- .../s3/S3CsvDestinationAcceptanceTest.java | 15 +++--- .../s3/S3DestinationAcceptanceTest.java | 8 +++ .../s3/S3JsonlDestinationAcceptanceTest.java | 50 +++++++++++++++++++ .../S3ParquetDestinationAcceptanceTest.java | 21 ++++---- 4 files changed, 77 insertions(+), 17 deletions(-) create mode 100644 airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3CsvDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3CsvDestinationAcceptanceTest.java index 734ee29e07c0..50daa31b8dd6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3CsvDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3CsvDestinationAcceptanceTest.java @@ -114,13 +114,14 @@ protected List retrieveRecords(TestDestinationEnv testEnv, for (S3ObjectSummary objectSummary : objectSummaries) { S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey()); - Reader in = new InputStreamReader(object.getObjectContent(), StandardCharsets.UTF_8); - Iterable records = CSVFormat.DEFAULT - .withQuoteMode(QuoteMode.NON_NUMERIC) - .withFirstRecordAsHeader() - .parse(in); - StreamSupport.stream(records.spliterator(), false) - .forEach(r -> jsonRecords.add(getJsonNode(r.toMap(), fieldTypes))); + try (Reader in = new InputStreamReader(object.getObjectContent(), StandardCharsets.UTF_8)) { + Iterable records = CSVFormat.DEFAULT + .withQuoteMode(QuoteMode.NON_NUMERIC) + .withFirstRecordAsHeader() + .parse(in); + StreamSupport.stream(records.spliterator(), false) + .forEach(r -> jsonRecords.add(getJsonNode(r.toMap(), fieldTypes))); + } } return jsonRecords; diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3DestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3DestinationAcceptanceTest.java index db1147ed1e5a..12eb55522f06 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3DestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3DestinationAcceptanceTest.java @@ -122,6 +122,11 @@ protected List getAllSyncedObjects(String streamName, String na protected abstract JsonNode 
getFormatConfig(); + /** + * This method does the following: + *
<li>Construct the S3 destination config.</li>
+ * <li>Construct the S3 client.</li>
  • + */ @Override protected void setup(TestDestinationEnv testEnv) { JsonNode baseConfigJson = getBaseConfigJson(); @@ -161,6 +166,9 @@ protected void setup(TestDestinationEnv testEnv) { } } + /** + * Remove all the S3 output from the tests. + */ @Override protected void tearDown(TestDestinationEnv testEnv) { List keysToDelete = new LinkedList<>(); diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java new file mode 100644 index 000000000000..7b6cac948b33 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java @@ -0,0 +1,50 @@ +package io.airbyte.integrations.destination.s3; + +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.JavaBaseConstants; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.LinkedList; +import java.util.List; + +public class S3JsonlDestinationAcceptanceTest extends S3DestinationAcceptanceTest { + + protected S3JsonlDestinationAcceptanceTest() { + super(S3Format.JSONL); + } + + @Override + protected JsonNode getFormatConfig() { + return Jsons.deserialize("{\n" + + " \"format_type\": \"JSONL\"\n" + + "}"); + } + + @Override + protected List retrieveRecords(TestDestinationEnv testEnv, + String streamName, + String namespace, + JsonNode streamSchema) + throws IOException { + List objectSummaries = getAllSyncedObjects(streamName, namespace); + List jsonRecords = new LinkedList<>(); + + for (S3ObjectSummary objectSummary : objectSummaries) { + S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey()); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(object.getObjectContent(), StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + jsonRecords.add(Jsons.deserialize(line).get(JavaBaseConstants.COLUMN_NAME_DATA)); + } + } + } + + return jsonRecords; + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java index 6964ddca3654..a47400a7b8a0 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java @@ -80,17 +80,18 @@ protected List retrieveRecords(TestDestinationEnv testEnv, URI uri = new URI(String.format("s3a://%s/%s", object.getBucketName(), object.getKey())); var path = new org.apache.hadoop.fs.Path(uri); Configuration hadoopConfig = S3ParquetWriter.getHadoopConfig(config); - ParquetReader parquetReader = ParquetReader.builder(new AvroReadSupport<>(), path) + + try (ParquetReader parquetReader = 
ParquetReader.builder(new AvroReadSupport<>(), path) .withConf(hadoopConfig) - .build(); - - ObjectReader jsonReader = MAPPER.reader(); - GenericData.Record record; - while ((record = parquetReader.read()) != null) { - byte[] jsonBytes = converter.convertToJson(record); - JsonNode jsonRecord = jsonReader.readTree(jsonBytes); - jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); - jsonRecords.add(pruneAirbyteJson(jsonRecord)); + .build()) { + ObjectReader jsonReader = MAPPER.reader(); + GenericData.Record record; + while ((record = parquetReader.read()) != null) { + byte[] jsonBytes = converter.convertToJson(record); + JsonNode jsonRecord = jsonReader.readTree(jsonBytes); + jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); + jsonRecords.add(pruneAirbyteJson(jsonRecord)); + } } } From 7c8af86e98dd31810ccc84a135c9dce2e7edf815 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 15:14:01 -0700 Subject: [PATCH 05/17] Update document --- airbyte-integrations/builds.md | 3 ++- docs/integrations/destinations/s3.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/builds.md b/airbyte-integrations/builds.md index 31c4b576abb5..bef43d60c718 100644 --- a/airbyte-integrations/builds.md +++ b/airbyte-integrations/builds.md @@ -124,5 +124,6 @@ Redshift [![destination-redshift](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-redshift%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-redshift) - Snowflake [![destination-snowflake](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-snowflake%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-snowflake) + S3 [![destination-s3](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-s3%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-s3) + Snowflake [![destination-snowflake](https://img.shields.io/endpoint?url=https%3A%2F%2Fstatus-api.airbyte.io%2Ftests%2Fsummary%2Fdestination-snowflake%2Fbadge.json)](https://status-api.airbyte.io/tests/summary/destination-snowflake) diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 8ec3b7220619..8501a66563ee 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -321,6 +321,7 @@ Its corresponding Avro / Parquet schema will be: | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | +| 0.1.7 | 2021-06-20 | [#4227](https://github.com/airbytehq/airbyte/pull/4227) | Added JSONL output. | | 0.1.6 | 2021-06-16 | [#4130](https://github.com/airbytehq/airbyte/pull/4130) | Patched the check to verify prefix access instead of full-bucket access. | | 0.1.5 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Fixed default `max_padding_size_mb` in `spec.json`. | | 0.1.4 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Added Parquet output. 
| From 21aeeb4bcbdfe1ffd6e5e3af5d1bd122ca1a99c0 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 15:17:39 -0700 Subject: [PATCH 06/17] Bump version --- .../4816b78f-1489-44c1-9060-4b19d5fa9362.json | 2 +- .../init/src/main/resources/seed/destination_definitions.yaml | 2 +- airbyte-integrations/connectors/destination-s3/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json index 4791b97c32be..7a7973a48c81 100644 --- a/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json +++ b/airbyte-config/init/src/main/resources/config/STANDARD_DESTINATION_DEFINITION/4816b78f-1489-44c1-9060-4b19d5fa9362.json @@ -2,6 +2,6 @@ "destinationDefinitionId": "4816b78f-1489-44c1-9060-4b19d5fa9362", "name": "S3", "dockerRepository": "airbyte/destination-s3", - "dockerImageTag": "0.1.6", + "dockerImageTag": "0.1.7", "documentationUrl": "https://docs.airbyte.io/integrations/destinations/s3" } diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index 689218d03b2d..e8119989fd02 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -27,7 +27,7 @@ - destinationDefinitionId: 4816b78f-1489-44c1-9060-4b19d5fa9362 name: S3 dockerRepository: airbyte/destination-s3 - dockerImageTag: 0.1.6 + dockerImageTag: 0.1.7 documentationUrl: https://docs.airbyte.io/integrations/destinations/s3 - destinationDefinitionId: f7a7d195-377f-cf5b-70a5-be6b819019dc name: Redshift diff --git a/airbyte-integrations/connectors/destination-s3/Dockerfile b/airbyte-integrations/connectors/destination-s3/Dockerfile index 004206f33dc1..aea3084c0b4e 100644 --- a/airbyte-integrations/connectors/destination-s3/Dockerfile +++ b/airbyte-integrations/connectors/destination-s3/Dockerfile @@ -7,5 +7,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar RUN tar xf ${APPLICATION}.tar --strip-components=1 -LABEL io.airbyte.version=0.1.6 +LABEL io.airbyte.version=0.1.7 LABEL io.airbyte.name=airbyte/destination-s3 From f0ec64e809ac949cf9cd189bf364e280655870a7 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sat, 19 Jun 2021 15:22:17 -0700 Subject: [PATCH 07/17] Update document example --- docs/integrations/destinations/s3.md | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 8501a66563ee..cd68ace33c56 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -112,32 +112,32 @@ With root level flattening, the output CSV is: } ``` -For example, given the following json object from a source: +For example, given the following two json objects from a source: ```json -{ - "user_id": 123, - "name": { - "first": "John", - "last": "Doe" - } -} -``` - -It will become this JSON line: - -```json -{ - "_airbyte_ab_id": "26d73cde-7eb1-4e1e-b7db-a4c03b4cf206", - "_airbyte_emitted_at": "1622135805000", - "_airbyte_data": { +[ + { "user_id": 123, "name": { "first": "John", "last": "Doe" } + }, + { + "user_id": 456, + "name": { + "first": "Jane", + 
"last": "Roe" + } } -} +] +``` + +They will be like this in the output file: + +```jsonl +{ "_airbyte_ab_id": "26d73cde-7eb1-4e1e-b7db-a4c03b4cf206", "_airbyte_emitted_at": "1622135805000", "_airbyte_data": { "user_id": 123, "name": { "first": "John", "last": "Doe" } } } +{ "_airbyte_ab_id": "0a61de1b-9cdd-4455-a739-93572c9a5f20", "_airbyte_emitted_at": "1631948170000", "_airbyte_data": { "user_id": 456, "name": { "first": "Jane", "last": "Roe" } } } ``` ### Parquet From 5051d0f5b4608b837c1f658581a1e1b0f459093a Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 00:51:17 -0700 Subject: [PATCH 08/17] Implement avro writer --- .../integrations/destination/s3/S3Format.java | 5 +- .../destination/s3/S3FormatConfigs.java | 10 ++- .../s3/avro/AvroRecordFactory.java | 42 +++++++++++ .../JsonFieldNameUpdater.java | 2 +- .../s3/{parquet => avro}/JsonSchemaType.java | 2 +- .../JsonToAvroSchemaConverter.java | 14 ++-- .../destination/s3/avro/S3AvroConstants.java | 9 +++ .../s3/avro/S3AvroFormatConfig.java | 17 +++++ .../destination/s3/avro/S3AvroWriter.java | 74 +++++++++++++++++++ .../destination/s3/csv/S3CsvWriter.java | 18 ++--- .../destination/s3/jsonl/S3JsonlWriter.java | 18 ++--- .../s3/parquet/S3ParquetConstants.java | 4 - .../s3/parquet/S3ParquetWriter.java | 44 +++-------- .../destination/s3/writer/BaseS3Writer.java | 32 ++++++++ .../s3/writer/ProductionWriterFactory.java | 22 ++++-- .../S3ParquetDestinationAcceptanceTest.java | 4 +- .../JsonFieldNameUpdaterTest.java | 3 +- .../JsonToAvroSchemaConverterTest.java | 4 +- 18 files changed, 242 insertions(+), 82 deletions(-) create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java rename airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/{parquet => avro}/JsonFieldNameUpdater.java (98%) rename airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/{parquet => avro}/JsonSchemaType.java (97%) rename airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/{parquet => avro}/JsonToAvroSchemaConverter.java (95%) create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java rename airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/{parquet => avro}/JsonFieldNameUpdaterTest.java (94%) rename airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/{parquet => avro}/JsonToAvroSchemaConverterTest.java (95%) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java index 5cfc54acb02b..e834f11b11cc 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3Format.java @@ -26,9 +26,10 @@ public enum S3Format { + 
AVRO("avro"), CSV("csv"), - PARQUET("parquet"), - JSONL("jsonl"); + JSONL("jsonl"), + PARQUET("parquet"); private final String fileExtension; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java index 30c3400c8968..d969846100c6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/S3FormatConfigs.java @@ -26,6 +26,7 @@ import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.destination.s3.avro.S3AvroFormatConfig; import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig; import io.airbyte.integrations.destination.s3.jsonl.S3JsonlFormatConfig; import io.airbyte.integrations.destination.s3.parquet.S3ParquetFormatConfig; @@ -42,15 +43,18 @@ public static S3FormatConfig getS3FormatConfig(JsonNode config) { S3Format formatType = S3Format.valueOf(formatConfig.get("format_type").asText().toUpperCase()); switch (formatType) { + case AVRO -> { + return new S3AvroFormatConfig(formatConfig); + } case CSV -> { return new S3CsvFormatConfig(formatConfig); } - case PARQUET -> { - return new S3ParquetFormatConfig(formatConfig); - } case JSONL -> { return new S3JsonlFormatConfig(); } + case PARQUET -> { + return new S3ParquetFormatConfig(formatConfig); + } default -> { throw new RuntimeException("Unexpected output format: " + Jsons.serialize(config)); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java new file mode 100644 index 000000000000..9978a7d3d300 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java @@ -0,0 +1,42 @@ +package io.airbyte.integrations.destination.s3.avro; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.jackson.MoreMappers; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import java.util.UUID; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import tech.allegro.schema.json2avro.converter.JsonAvroConverter; + +public class AvroRecordFactory { + + private static final ObjectMapper MAPPER = MoreMappers.initMapper(); + private static final ObjectWriter WRITER = MAPPER.writer(); + + private final Schema schema; + private final JsonFieldNameUpdater nameUpdater; + private final JsonAvroConverter converter = new JsonAvroConverter(); + + public AvroRecordFactory(Schema schema, JsonFieldNameUpdater nameUpdater) { + this.schema = schema; + this.nameUpdater = nameUpdater; + } + + public GenericData.Record getAvroRecord(UUID id, AirbyteRecordMessage recordMessage) throws JsonProcessingException { + JsonNode inputData = recordMessage.getData(); + inputData = 
nameUpdater.getJsonWithStandardizedFieldNames(inputData); + + ObjectNode jsonRecord = MAPPER.createObjectNode(); + jsonRecord.put(JavaBaseConstants.COLUMN_NAME_AB_ID, id.toString()); + jsonRecord.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, recordMessage.getEmittedAt()); + jsonRecord.setAll((ObjectNode) inputData); + + return converter.convertToGenericDataRecord(WRITER.writeValueAsBytes(jsonRecord), schema); + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdater.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdater.java similarity index 98% rename from airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdater.java rename to airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdater.java index 0e94acbdef1f..01c209d637da 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdater.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdater.java @@ -22,7 +22,7 @@ * SOFTWARE. */ -package io.airbyte.integrations.destination.s3.parquet; +package io.airbyte.integrations.destination.s3.avro; import com.fasterxml.jackson.databind.JsonNode; import com.google.common.collect.ImmutableMap; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonSchemaType.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonSchemaType.java similarity index 97% rename from airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonSchemaType.java rename to airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonSchemaType.java index bb129b0b9cb3..93d4c5633ced 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonSchemaType.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonSchemaType.java @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package io.airbyte.integrations.destination.s3.parquet; +package io.airbyte.integrations.destination.s3.avro; import org.apache.avro.Schema; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java similarity index 95% rename from airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverter.java rename to airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java index 16b2a9ffe879..e53f318f58c1 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverter.java @@ -22,7 +22,7 @@ * SOFTWARE. */ -package io.airbyte.integrations.destination.s3.parquet; +package io.airbyte.integrations.destination.s3.avro; import com.fasterxml.jackson.databind.JsonNode; import com.google.common.base.Preconditions; @@ -51,7 +51,7 @@ * ones. *

    * For limitations of this converter, see the README of this connector: - * https://docs.airbyte.io/integrations/destinations/s3#parquet + * https://docs.airbyte.io/integrations/destinations/s3#avro */ public class JsonToAvroSchemaConverter { @@ -102,8 +102,8 @@ public Schema getAvroSchema(JsonNode jsonSchema, stdName); builder = builder.doc( String.format("%s%s%s", - S3ParquetConstants.DOC_KEY_ORIGINAL_NAME, - S3ParquetConstants.DOC_KEY_VALUE_DELIMITER, + S3AvroConstants.DOC_KEY_ORIGINAL_NAME, + S3AvroConstants.DOC_KEY_VALUE_DELIMITER, name)); } if (namespace != null) { @@ -130,8 +130,8 @@ public Schema getAvroSchema(JsonNode jsonSchema, LOGGER.warn("Field name contains illegal character(s) and is standardized: {} -> {}", fieldName, stdFieldName); fieldBuilder = fieldBuilder.doc(String.format("%s%s%s", - S3ParquetConstants.DOC_KEY_ORIGINAL_NAME, - S3ParquetConstants.DOC_KEY_VALUE_DELIMITER, + S3AvroConstants.DOC_KEY_ORIGINAL_NAME, + S3AvroConstants.DOC_KEY_VALUE_DELIMITER, fieldName)); } assembler = fieldBuilder.type(getNullableFieldTypes(fieldName, fieldDefinition)) @@ -202,7 +202,7 @@ Schema getNullableFieldTypes(String fieldName, JsonNode fieldDefinition) { if (nonNullFieldTypes.isEmpty()) { return Schema.create(Schema.Type.NULL); } else { - // Mark every field as nullable to prevent missing value exceptions from Parquet. + // Mark every field as nullable to prevent missing value exceptions from Avro / Parquet. nonNullFieldTypes.add(0, Schema.create(Schema.Type.NULL)); return Schema.createUnion(nonNullFieldTypes); } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java new file mode 100644 index 000000000000..61caa40d62e3 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java @@ -0,0 +1,9 @@ +package io.airbyte.integrations.destination.s3.avro; + +public class S3AvroConstants { + + // Field name with special character + public static final String DOC_KEY_VALUE_DELIMITER = ":"; + public static final String DOC_KEY_ORIGINAL_NAME = "_airbyte_original_name"; + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java new file mode 100644 index 000000000000..3f6273ae05ba --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java @@ -0,0 +1,17 @@ +package io.airbyte.integrations.destination.s3.avro; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.S3FormatConfig; + +public class S3AvroFormatConfig implements S3FormatConfig { + + public S3AvroFormatConfig(JsonNode formatConfig) { + } + + @Override + public S3Format getFormat() { + return S3Format.AVRO; + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java new file mode 100644 index 000000000000..c11a233eacc9 --- /dev/null 
+++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java @@ -0,0 +1,74 @@ +package io.airbyte.integrations.destination.s3.avro; + +import alex.mojaki.s3upload.MultiPartOutputStream; +import alex.mojaki.s3upload.StreamTransferManager; +import com.amazonaws.services.s3.AmazonS3; +import io.airbyte.integrations.destination.s3.S3DestinationConfig; +import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.util.S3StreamTransferManagerHelper; +import io.airbyte.integrations.destination.s3.writer.BaseS3Writer; +import io.airbyte.integrations.destination.s3.writer.S3Writer; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.UUID; +import org.apache.avro.Schema; +import org.apache.avro.file.CodecFactory; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericData.Record; +import org.apache.avro.generic.GenericDatumWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class S3AvroWriter extends BaseS3Writer implements S3Writer { + + protected static final Logger LOGGER = LoggerFactory.getLogger(S3AvroWriter.class); + + private final StreamTransferManager uploadManager; + private final MultiPartOutputStream outputStream; + private final DataFileWriter dataFileWriter; + private final AvroRecordFactory avroRecordFactory; + + public S3AvroWriter(S3DestinationConfig config, + AmazonS3 s3Client, + ConfiguredAirbyteStream configuredStream, + Timestamp uploadTimestamp, + Schema schema, + JsonFieldNameUpdater nameUpdater) throws IOException { + super(config, s3Client, configuredStream); + + String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.AVRO); + String objectKey = String.join("/", outputPrefix, outputFilename); + + LOGGER.info("Full S3 path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), + objectKey); + + this.uploadManager = S3StreamTransferManagerHelper.getDefault(config.getBucketName(), objectKey, s3Client); + // We only need one output stream as we only have one input stream. This is reasonably performant. 
+ this.outputStream = uploadManager.getMultiPartOutputStreams().get(0); + this.dataFileWriter = new DataFileWriter<>(new GenericDatumWriter()).create(schema, outputStream); + this.avroRecordFactory = new AvroRecordFactory(schema, nameUpdater); + } + + @Override + public void write(UUID id, AirbyteRecordMessage recordMessage) throws IOException { + dataFileWriter.append(avroRecordFactory.getAvroRecord(id, recordMessage)); + } + + @Override + protected void closeWhenSucceed() throws IOException { + dataFileWriter.close(); + outputStream.close(); + uploadManager.complete(); + } + + @Override + protected void closeWhenFail() throws IOException { + dataFileWriter.close(); + outputStream.close(); + uploadManager.abort(); + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java index 3451c94ec95e..de0c53ed922b 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/csv/S3CsvWriter.java @@ -85,19 +85,17 @@ public void write(UUID id, AirbyteRecordMessage recordMessage) throws IOExceptio } @Override - public void close(boolean hasFailed) throws IOException { + protected void closeWhenSucceed() throws IOException { csvPrinter.close(); outputStream.close(); + uploadManager.complete(); + } - if (hasFailed) { - LOGGER.warn("Failure detected. Aborting upload of stream '{}'...", stream.getName()); - uploadManager.abort(); - LOGGER.warn("Upload of stream '{}' aborted.", stream.getName()); - } else { - LOGGER.info("Uploading remaining data for stream '{}'.", stream.getName()); - uploadManager.complete(); - LOGGER.info("Upload completed for stream '{}'.", stream.getName()); - } + @Override + protected void closeWhenFail() throws IOException { + csvPrinter.close(); + outputStream.close(); + uploadManager.abort(); } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java index 0853ea01da04..a14bf00382bd 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java @@ -62,19 +62,17 @@ public void write(UUID id, AirbyteRecordMessage recordMessage) { } @Override - public void close(boolean hasFailed) { + protected void closeWhenSucceed() { printWriter.close(); outputStream.close(); + uploadManager.complete(); + } - if (hasFailed) { - LOGGER.warn("Failure detected. 
Aborting upload of stream '{}'...", stream.getName()); - uploadManager.abort(); - LOGGER.warn("Upload of stream '{}' aborted.", stream.getName()); - } else { - LOGGER.info("Uploading remaining data for stream '{}'.", stream.getName()); - uploadManager.complete(); - LOGGER.info("Upload completed for stream '{}'.", stream.getName()); - } + @Override + protected void closeWhenFail() { + printWriter.close(); + outputStream.close(); + uploadManager.abort(); } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetConstants.java index b833dceffb7d..05c6b324e22e 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetConstants.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetConstants.java @@ -36,8 +36,4 @@ public class S3ParquetConstants { public static final int DEFAULT_DICTIONARY_PAGE_SIZE_KB = 1024; public static final boolean DEFAULT_DICTIONARY_ENCODING = true; - // Field name with special character - public static final String DOC_KEY_VALUE_DELIMITER = ":"; - public static final String DOC_KEY_ORIGINAL_NAME = "_airbyte_original_name"; - } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java index a624f73db99a..36454de9735d 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/parquet/S3ParquetWriter.java @@ -25,14 +25,10 @@ package io.airbyte.integrations.destination.s3.parquet; import com.amazonaws.services.s3.AmazonS3; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectWriter; -import com.fasterxml.jackson.databind.node.ObjectNode; -import io.airbyte.commons.jackson.MoreMappers; -import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.integrations.destination.s3.S3DestinationConfig; import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.avro.AvroRecordFactory; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.writer.BaseS3Writer; import io.airbyte.integrations.destination.s3.writer.S3Writer; import io.airbyte.protocol.models.AirbyteRecordMessage; @@ -53,18 +49,13 @@ import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import tech.allegro.schema.json2avro.converter.JsonAvroConverter; public class S3ParquetWriter extends BaseS3Writer implements S3Writer { private static final Logger LOGGER = LoggerFactory.getLogger(S3ParquetWriter.class); - private static final ObjectMapper MAPPER = MoreMappers.initMapper(); - private static final ObjectWriter WRITER = MAPPER.writer(); - private final Schema schema; - private final JsonFieldNameUpdater nameUpdater; private final ParquetWriter parquetWriter; - private final JsonAvroConverter converter = new 
JsonAvroConverter(); + private final AvroRecordFactory avroRecordFactory; public S3ParquetWriter(S3DestinationConfig config, AmazonS3 s3Client, @@ -74,8 +65,6 @@ public S3ParquetWriter(S3DestinationConfig config, JsonFieldNameUpdater nameUpdater) throws URISyntaxException, IOException { super(config, s3Client, configuredStream); - this.schema = schema; - this.nameUpdater = nameUpdater; String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.PARQUET); String objectKey = String.join("/", outputPrefix, outputFilename); @@ -98,6 +87,7 @@ public S3ParquetWriter(S3DestinationConfig config, .withDictionaryPageSize(formatConfig.getDictionaryPageSize()) .withDictionaryEncoding(formatConfig.isDictionaryEncoding()) .build(); + this.avroRecordFactory = new AvroRecordFactory(schema, nameUpdater); } public static Configuration getHadoopConfig(S3DestinationConfig config) { @@ -113,29 +103,17 @@ public static Configuration getHadoopConfig(S3DestinationConfig config) { @Override public void write(UUID id, AirbyteRecordMessage recordMessage) throws IOException { - JsonNode inputData = recordMessage.getData(); - inputData = nameUpdater.getJsonWithStandardizedFieldNames(inputData); - - ObjectNode jsonRecord = MAPPER.createObjectNode(); - jsonRecord.put(JavaBaseConstants.COLUMN_NAME_AB_ID, id.toString()); - jsonRecord.put(JavaBaseConstants.COLUMN_NAME_EMITTED_AT, recordMessage.getEmittedAt()); - jsonRecord.setAll((ObjectNode) inputData); + parquetWriter.write(avroRecordFactory.getAvroRecord(id, recordMessage)); + } - GenericData.Record avroRecord = converter.convertToGenericDataRecord(WRITER.writeValueAsBytes(jsonRecord), schema); - parquetWriter.write(avroRecord); + @Override + protected void closeWhenSucceed() throws IOException { + parquetWriter.close(); } @Override - public void close(boolean hasFailed) throws IOException { - if (hasFailed) { - LOGGER.warn("Failure detected. Aborting upload of stream '{}'...", stream.getName()); - parquetWriter.close(); - LOGGER.warn("Upload of stream '{}' aborted.", stream.getName()); - } else { - LOGGER.info("Uploading remaining data for stream '{}'.", stream.getName()); - parquetWriter.close(); - LOGGER.info("Upload completed for stream '{}'.", stream.getName()); - } + protected void closeWhenFail() throws IOException { + parquetWriter.close(); } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/BaseS3Writer.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/BaseS3Writer.java index 25704aa2e637..cf2b2aecb5b8 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/BaseS3Writer.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/BaseS3Writer.java @@ -36,6 +36,7 @@ import io.airbyte.protocol.models.AirbyteStream; import io.airbyte.protocol.models.ConfiguredAirbyteStream; import io.airbyte.protocol.models.DestinationSyncMode; +import java.io.IOException; import java.sql.Timestamp; import java.text.DateFormat; import java.text.SimpleDateFormat; @@ -49,6 +50,7 @@ * The base implementation takes care of the following: *

 * <li>Create shared instance variables.</li>
 * <li>Create the bucket and prepare the bucket path.</li>
+ * <li>Log and close the write.</li>
  • */ public abstract class BaseS3Writer implements S3Writer { @@ -103,6 +105,36 @@ public void initialize() { } } + /** + * Log and close the write. + */ + @Override + public void close(boolean hasFailed) throws IOException { + if (hasFailed) { + LOGGER.warn("Failure detected. Aborting upload of stream '{}'...", stream.getName()); + closeWhenFail(); + LOGGER.warn("Upload of stream '{}' aborted.", stream.getName()); + } else { + LOGGER.info("Uploading remaining data for stream '{}'.", stream.getName()); + closeWhenSucceed(); + LOGGER.info("Upload completed for stream '{}'.", stream.getName()); + } + } + + /** + * Operations that will run when the write succeeds. + */ + protected void closeWhenSucceed() throws IOException { + // Do nothing by default + } + + /** + * Operations that will run when the write fails. + */ + protected void closeWhenFail() throws IOException { + // Do nothing by default + } + // Filename: __0. public static String getOutputFilename(Timestamp timestamp, S3Format format) { DateFormat formatter = new SimpleDateFormat(S3DestinationConstants.YYYY_MM_DD_FORMAT_STRING); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java index 78a8b2e3d690..b18cc54c7417 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java @@ -27,10 +27,11 @@ import com.amazonaws.services.s3.AmazonS3; import io.airbyte.integrations.destination.s3.S3DestinationConfig; import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.avro.S3AvroWriter; import io.airbyte.integrations.destination.s3.csv.S3CsvWriter; import io.airbyte.integrations.destination.s3.jsonl.S3JsonlWriter; -import io.airbyte.integrations.destination.s3.parquet.JsonFieldNameUpdater; -import io.airbyte.integrations.destination.s3.parquet.JsonToAvroSchemaConverter; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; import io.airbyte.protocol.models.AirbyteStream; import io.airbyte.protocol.models.ConfiguredAirbyteStream; @@ -50,10 +51,8 @@ public S3Writer create(S3DestinationConfig config, Timestamp uploadTimestamp) throws Exception { S3Format format = config.getFormatConfig().getFormat(); - if (format == S3Format.CSV) { - return new S3CsvWriter(config, s3Client, configuredStream, uploadTimestamp); - } - if (format == S3Format.PARQUET) { + + if (format == S3Format.AVRO || format == S3Format.PARQUET) { AirbyteStream stream = configuredStream.getStream(); JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); Schema avroSchema = schemaConverter.getAvroSchema(stream.getJsonSchema(), stream.getName(), stream.getNamespace(), true); @@ -64,8 +63,17 @@ public S3Writer create(S3DestinationConfig config, LOGGER.info("The following field names will be standardized: {}", nameUpdater); } - return new S3ParquetWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, nameUpdater); + if (format == S3Format.AVRO) { + return new 
S3AvroWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, nameUpdater); + } else { + return new S3ParquetWriter(config, s3Client, configuredStream, uploadTimestamp, avroSchema, nameUpdater); + } + } + + if (format == S3Format.CSV) { + return new S3CsvWriter(config, s3Client, configuredStream, uploadTimestamp); } + if (format == S3Format.JSONL) { return new S3JsonlWriter(config, s3Client, configuredStream, uploadTimestamp); } diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java index a47400a7b8a0..1aa4e3825382 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java @@ -32,8 +32,8 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; -import io.airbyte.integrations.destination.s3.parquet.JsonFieldNameUpdater; -import io.airbyte.integrations.destination.s3.parquet.JsonToAvroSchemaConverter; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; import java.io.IOException; import java.net.URI; diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdaterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java similarity index 94% rename from airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdaterTest.java rename to airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java index e05e2be3c1b2..858ed470d3ab 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonFieldNameUpdaterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package io.airbyte.integrations.destination.s3.parquet; +package io.airbyte.integrations.destination.s3.avro; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -30,6 +30,7 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.resources.MoreResources; import io.airbyte.commons.util.MoreIterators; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import java.io.IOException; import java.util.Map.Entry; import java.util.stream.Collectors; diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java similarity index 95% rename from airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverterTest.java rename to airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java index cc3111b66738..dafd8ec56059 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/parquet/JsonToAvroSchemaConverterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java @@ -22,7 +22,7 @@ * SOFTWARE. */ -package io.airbyte.integrations.destination.s3.parquet; +package io.airbyte.integrations.destination.s3.avro; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -31,6 +31,8 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.resources.MoreResources; import io.airbyte.commons.util.MoreIterators; +import io.airbyte.integrations.destination.s3.avro.JsonSchemaType; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import java.util.Collections; import java.util.stream.Stream; import org.junit.jupiter.api.Test; From a9d219b50f1f90613bbb30ad7f62c6dac2008e30 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 02:32:06 -0700 Subject: [PATCH 09/17] Implement compression codec --- .../s3/avro/S3AvroFormatConfig.java | 93 ++++++++++++++ .../destination/s3/avro/S3AvroWriter.java | 21 ++-- .../src/main/resources/spec.json | 116 +++++++++++++++++- .../s3/avro/S3AvroFormatConfigTest.java | 74 +++++++++++ 4 files changed, 294 insertions(+), 10 deletions(-) create mode 100644 airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java index 3f6273ae05ba..6831f90f52a2 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java @@ -3,10 +3,18 @@ import com.fasterxml.jackson.databind.JsonNode; import io.airbyte.integrations.destination.s3.S3Format; import io.airbyte.integrations.destination.s3.S3FormatConfig; +import org.apache.avro.file.CodecFactory; public class S3AvroFormatConfig implements S3FormatConfig { + private final 
CodecFactory codecFactory; + public S3AvroFormatConfig(JsonNode formatConfig) { + this.codecFactory = parseCodecConfig(formatConfig.get("compression_codec")); + } + + public CodecFactory getCodecFactory() { + return codecFactory; } @Override @@ -14,4 +22,89 @@ public S3Format getFormat() { return S3Format.AVRO; } + public enum CompressionCodec { + NULL("no compression"), + DEFLATE("deflate"), + BZIP2("bzip2"), + XZ("xz"), + ZSTANDARD("zstandard"), + SNAPPY("snappy"); + + private final String configValue; + + CompressionCodec(String configValue) { + this.configValue = configValue; + } + + public static CompressionCodec fromConfigValue(String configValue) { + for (CompressionCodec codec : values()) { + if (configValue.equalsIgnoreCase(codec.configValue)) { + return codec; + } + } + throw new IllegalArgumentException("Unknown codec config value: " + configValue); + } + } + + public static CodecFactory parseCodecConfig(JsonNode compressionCodecConfig) { + if (compressionCodecConfig == null || compressionCodecConfig.isNull()) { + return CodecFactory.nullCodec(); + } + + JsonNode codecConfig = compressionCodecConfig.get("codec"); + if (codecConfig == null || codecConfig.isNull() || !codecConfig.isTextual()) { + return CodecFactory.nullCodec(); + } + String codecType = codecConfig.asText(); + CompressionCodec codec = CompressionCodec.fromConfigValue(codecConfig.asText()); + switch (codec) { + case NULL -> { + return CodecFactory.nullCodec(); + } + case DEFLATE -> { + int compressionLevel = getCompressionLevel(compressionCodecConfig, 0, 0, 9); + return CodecFactory.deflateCodec(compressionLevel); + } + case BZIP2 -> { + return CodecFactory.bzip2Codec(); + } + case XZ -> { + int compressionLevel = getCompressionLevel(compressionCodecConfig, 6, 0, 9); + return CodecFactory.xzCodec(compressionLevel); + } + case ZSTANDARD -> { + int compressionLevel = getCompressionLevel(compressionCodecConfig, 3, -5,22); + boolean includeChecksum = getIncludeChecksum(compressionCodecConfig, false); + return CodecFactory.zstandardCodec(compressionLevel, includeChecksum); + } + case SNAPPY -> { + return CodecFactory.snappyCodec(); + } + default -> { + throw new IllegalArgumentException("Unsupported compression codec: " + codecType); + } + } + } + + public static int getCompressionLevel(JsonNode compressionCodecConfig, int defaultLevel, int minLevel, int maxLevel) { + JsonNode levelConfig = compressionCodecConfig.get("compression_level"); + if (levelConfig == null || levelConfig.isNull() || !levelConfig.isIntegralNumber()) { + return defaultLevel; + } + int level = levelConfig.asInt(); + if (level < minLevel || level > maxLevel) { + throw new IllegalArgumentException( + String.format("Invalid compression level: %d, expected an integer in range [%d, %d]", level, minLevel, maxLevel)); + } + return level; + } + + public static boolean getIncludeChecksum(JsonNode compressionCodecConfig, boolean defaultValue) { + JsonNode checksumConfig = compressionCodecConfig.get("include_checksum"); + if (checksumConfig == null || checksumConfig.isNumber() || !checksumConfig.isBoolean()) { + return defaultValue; + } + return checksumConfig.asBoolean(); + } + } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java index c11a233eacc9..2a17e2854a5f 100644 --- 
a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java @@ -14,7 +14,6 @@ import java.sql.Timestamp; import java.util.UUID; import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; @@ -26,17 +25,17 @@ public class S3AvroWriter extends BaseS3Writer implements S3Writer { protected static final Logger LOGGER = LoggerFactory.getLogger(S3AvroWriter.class); + private final AvroRecordFactory avroRecordFactory; private final StreamTransferManager uploadManager; private final MultiPartOutputStream outputStream; private final DataFileWriter dataFileWriter; - private final AvroRecordFactory avroRecordFactory; public S3AvroWriter(S3DestinationConfig config, - AmazonS3 s3Client, - ConfiguredAirbyteStream configuredStream, - Timestamp uploadTimestamp, - Schema schema, - JsonFieldNameUpdater nameUpdater) throws IOException { + AmazonS3 s3Client, + ConfiguredAirbyteStream configuredStream, + Timestamp uploadTimestamp, + Schema schema, + JsonFieldNameUpdater nameUpdater) throws IOException { super(config, s3Client, configuredStream); String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.AVRO); @@ -45,11 +44,15 @@ public S3AvroWriter(S3DestinationConfig config, LOGGER.info("Full S3 path for stream '{}': {}/{}", stream.getName(), config.getBucketName(), objectKey); + this.avroRecordFactory = new AvroRecordFactory(schema, nameUpdater); this.uploadManager = S3StreamTransferManagerHelper.getDefault(config.getBucketName(), objectKey, s3Client); // We only need one output stream as we only have one input stream. This is reasonably performant. this.outputStream = uploadManager.getMultiPartOutputStreams().get(0); - this.dataFileWriter = new DataFileWriter<>(new GenericDatumWriter()).create(schema, outputStream); - this.avroRecordFactory = new AvroRecordFactory(schema, nameUpdater); + + S3AvroFormatConfig formatConfig = (S3AvroFormatConfig) config.getFormatConfig(); + this.dataFileWriter = new DataFileWriter<>(new GenericDatumWriter()) + .create(schema, outputStream) + .setCodec(formatConfig.getCodecFactory()); } @Override diff --git a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json index 93f086deb4b9..bac4f324444d 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json @@ -89,6 +89,120 @@ "type": "object", "description": "Output data format", "oneOf": [ + { + "title": "Avro: Apache Avro", + "required": ["format_type", "compression_codec"], + "properties": { + "format_type": { + "type": "string", + "enum": ["Avro"], + "default": "Avro" + }, + "compression_codec": { + "title": "Compression Codec", + "description": "The compression algorithm used to compress data. 
Default to no compression.", + "type": "object", + "oneOf": [ + { + "title": "no compression", + "required": ["codec"], + "properties": { + "codec": { + "type": "string", + "enum": ["no compression"], + "default": "no compression" + } + } + }, + { + "title": "Deflate", + "required": ["codec", "compression_level"], + "properties": { + "codec": { + "type": "string", + "enum": ["Deflate"], + "default": "Deflate" + }, + "compression_level": { + "title": "Deflate level", + "description": "0: no compression fastest, 9: best compression, slowest.", + "type": "integer", + "default": 0, + "minimum": 0, + "maximum": 9 + } + } + }, + { + "title": "bzip2", + "required": ["codec"], + "properties": { + "codec": { + "type": "string", + "enum": ["bzip2"], + "default": "bzip2" + } + } + }, + { + "title": "xz", + "required": ["codec", "compression_level"], + "properties": { + "codec": { + "type": "string", + "enum": ["xz"], + "default": "xz" + }, + "compression_level": { + "title": "Compression level", + "description": "See here for details.", + "type": "integer", + "default": 6, + "minimum": 0, + "maximum": 9 + } + } + }, + { + "title": "zstandard", + "required": ["codec", "compression_level"], + "properties": { + "codec": { + "type": "string", + "enum": ["zstandard"], + "default": "zstandard" + }, + "compression_level": { + "title": "Compression level", + "description": "Negative levels are 'fast' modes akin to lz4 or snappy, levels above 9 are generally for archival purposes, and levels above 18 use a lot of memory.", + "type": "integer", + "default": 3, + "minimum": -5, + "maximum": 22 + }, + "include_checksum": { + "title": "Include checksum", + "description": "If true, include a checksum with each data block.", + "type": "boolean", + "default": false + } + } + }, + { + "title": "snappy", + "required": ["codec"], + "properties": { + "codec": { + "type": "string", + "enum": ["snappy"], + "default": "snappy" + } + } + } + ] + } + } + }, { "title": "CSV: Comma-Separated Values", "required": ["format_type", "flattening"], @@ -128,7 +242,7 @@ "default": "Parquet" }, "compression_codec": { - "title": "Compression Algorithm", + "title": "Compression Codec", "description": "The compression algorithm used to compress data pages.", "type": "string", "enum": [ diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java new file mode 100644 index 000000000000..f96788ff821f --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java @@ -0,0 +1,74 @@ +package io.airbyte.integrations.destination.s3.avro; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.Lists; +import io.airbyte.commons.json.Jsons; +import java.util.List; +import org.apache.avro.file.CodecFactory; +import org.apache.avro.file.DataFileConstants; +import org.junit.jupiter.api.Test; + +class S3AvroFormatConfigTest { + + @Test + public void testParseCodecConfigNull() { + List nullConfigs = Lists.newArrayList("{}", "{ \"codec\": \"no compression\" }"); + for (String nullConfig : nullConfigs) { + assertEquals( + DataFileConstants.NULL_CODEC, + 
S3AvroFormatConfig.parseCodecConfig(Jsons.deserialize(nullConfig)).toString() + ); + } + } + + @Test + public void testParseCodecConfigDeflate() { + JsonNode deflateConfig = Jsons.deserialize("{ \"codec\": \"deflate\", \"compression_level\": 5 }"); + CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(deflateConfig); + assertEquals("deflate-5", codecFactory.toString()); + } + + @Test + public void testParseCodecConfigBzip2() { + JsonNode bzip2Config = Jsons.deserialize("{ \"codec\": \"bzip2\" }"); + CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(bzip2Config); + assertEquals(DataFileConstants.BZIP2_CODEC, codecFactory.toString()); + } + + @Test + public void testParseCodecConfigXz() { + JsonNode xzConfig = Jsons.deserialize("{ \"codec\": \"xz\", \"compression_level\": 7 }"); + CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(xzConfig); + assertEquals("xz-7", codecFactory.toString()); + } + + @Test + public void testParseCodecConfigZstandard() { + JsonNode zstandardConfig = Jsons.deserialize("{ \"codec\": \"zstandard\", \"compression_level\": 20, \"include_checksum\": true }"); + CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(zstandardConfig); + // There is no way to verify the checksum; all relevant methods are private or protected... + assertEquals("zstandard[20]", codecFactory.toString()); + } + + @Test + public void testParseCodecConfigSnappy() { + JsonNode snappyConfig = Jsons.deserialize("{ \"codec\": \"snappy\" }"); + CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(snappyConfig); + assertEquals(DataFileConstants.SNAPPY_CODEC, codecFactory.toString()); + } + + @Test + public void testParseCodecConfigInvalid() { + try { + JsonNode invalidConfig = Jsons.deserialize("{ \"codec\": \"bi-directional-bfs\" }"); + S3AvroFormatConfig.parseCodecConfig(invalidConfig); + fail(); + } catch (IllegalArgumentException e) { + // expected + } + } + +} From a09c196a1dacddb4840e1eaf00242c5cfb2efaaf Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 02:55:24 -0700 Subject: [PATCH 10/17] Update documentation --- .../src/main/resources/spec.json | 2 +- docs/integrations/destinations/s3.md | 243 ++++++++++-------- 2 files changed, 140 insertions(+), 105 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json index bac4f324444d..c97d8df79e4f 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json @@ -125,7 +125,7 @@ }, "compression_level": { "title": "Deflate level", - "description": "0: no compression fastest, 9: best compression, slowest.", + "description": "0: no compression & fastest, 9: best compression & slowest.", "type": "integer", "default": 0, "minimum": 0, diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index cd68ace33c56..0f12861466b2 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -65,118 +65,57 @@ Each stream will be outputted to its dedicated directory according to the config - Under Full Refresh Sync mode, old output files will be purged before new files are created. - Under Incremental - Append Sync mode, new output files will be added that only contain the new data. -### CSV - -Each CSV file includes at least two Airbyte metadata columns. 
Depending on the `flattening` config, the data may reside in one column (`_airbyte_data`) when there is no flattening, or multiple columns with root level flattening. - -| Column | Condition | Description | -| :--- | :--- | :--- | -| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | -| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | -| `_airbyte_data` | When no flattening is needed, all data reside under this column as a json blob. | -| root level fields| When root level flattening is selected, the root level fields are expanded. | - -For example, given the following json object from a source: - -```json -{ - "user_id": 123, - "name": { - "first": "John", - "last": "Doe" - } -} -``` - -With no flattening, the output CSV is: - -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | -| :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | - -With root level flattening, the output CSV is: - -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | -| :--- | :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | +### Avro -### JSON Lines (JSONL) - -[Json Lines](https://jsonlines.org/) is a text format with one JSON per line. Each line has a structure as follows: - -```json -{ - "_airbyte_ab_id": "", - "_airbyte_emitted_at": "", - "_airbyte_data": "" -} -``` - -For example, given the following two json objects from a source: - -```json -[ - { - "user_id": 123, - "name": { - "first": "John", - "last": "Doe" - } - }, - { - "user_id": 456, - "name": { - "first": "Jane", - "last": "Roe" - } - } -] -``` - -They will be like this in the output file: - -```jsonl -{ "_airbyte_ab_id": "26d73cde-7eb1-4e1e-b7db-a4c03b4cf206", "_airbyte_emitted_at": "1622135805000", "_airbyte_data": { "user_id": 123, "name": { "first": "John", "last": "Doe" } } } -{ "_airbyte_ab_id": "0a61de1b-9cdd-4455-a739-93572c9a5f20", "_airbyte_emitted_at": "1631948170000", "_airbyte_data": { "user_id": 456, "name": { "first": "Jane", "last": "Roe" } } } -``` - -### Parquet +[Apache Avro](https://avro.apache.org/) serializes data in a compact binary format. #### Configuration -The following configuration is available to configure the Parquet output: - -| Parameter | Type | Default | Description | -| :--- | :---: | :---: | :--- | -| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. | -| `block_size_mb` | integer | 128 (MB) | **Block size (row group size)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | -| `max_padding_size_mb` | integer | 8 (MB) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | -| `page_size_kb` | integer | 1024 (KB) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | -| `dictionary_page_size_kb` | integer | 1024 (KB) | **Dictionary Page Size** in KB. 
There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | -| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | - -These parameters are related to the `ParquetOutputFormat`. See the [Java doc](https://www.javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.12.0/org/apache/parquet/hadoop/ParquetOutputFormat.html) for more details. Also see [Parquet documentation](https://parquet.apache.org/documentation/latest/#configurations) for their recommended configurations (512 - 1024 MB block size, 8 KB page size). +Here is the available compression codecs: + +- No compression +- `deflate` + - Compression level + - Range `[0, 9]`. Default to 0. + - Level 0: no compression & fastest. + - Level 9: best compression & slowest. +- `bzip2` +- `xz` + - Compression level + - Range `[0, 9]`. Default to 6. + - Level 0-3 are fast with medium compression. + - Level 4-6 are fairly slow with high compression. + - Level 7-9 are like level 6 but use bigger dictionaries and have higher memory requirements. Unless the uncompressed size of the file exceeds 8 MiB, 16 MiB, or 32 MiB, it is waste of memory to use the presets 7, 8, or 9, respectively. +- `zstandard` + - Compression level + - Range `[-5, 22]`. Default to 3. + - Negative levels are 'fast' modes akin to `lz4` or `snappy`. + - Levels above 9 are generally for archival purposes. + - Levels above 18 use a lot of memory. + - Include checksum + - If set to `true`, a checksum will be included in each data block. +- `snappy` #### Data schema -Under the hood, an Airbyte data stream in Json schema is first converted to Avro schema, and then written to Parquet format. Because the data stream can come from any data source, the Parquet S3 destination connector has the following arbitrary rules. +Under the hood, an Airbyte data stream in Json schema is converted to an Avro schema, and then the Json object is converted to an Avro record based on the Avro schema. Because the data stream can come from any data source, the Avro S3 destination connector has the following arbitrary rules. -1. Json schema type is mapped to Avro / Parquet type as follows: +1. Json schema types are mapped to Avro typea as follows: - | Json Data Type | Avro / Parquet Data Type | +| Json Data Type | Avro Data Type | | :---: | :---: | - | string | string | - | number | double | - | integer | int | - | boolean | boolean | - | null | null | - | object | record | - | array | array | +| string | string | +| number | double | +| integer | int | +| boolean | boolean | +| null | null | +| object | record | +| array | array | 2. Built-in Json schema formats are not mapped to Avro logical types at this moment. 2. Json schema compositions ("allOf", "anyOf", and "oneOf") are not supported at this moment. 3. Only alphanumeric characters and underscores (`/a-zA-Z0-9_/`) are allowed in a stream or field name. Any special character will be converted to an alphabet or underscore. For example, `spécial:character_names` will become `special_character_names`. The original names will be stored in the `doc` property in this format: `_airbyte_original_name:`. -4. All field will be nullable. For example, a `string` Json field will be typed as `["null", "string"]` in Parquet. This is necessary because the incoming data stream may have optional fields. +4. All field will be nullable. 
For example, a `string` Json field will be typed as `["null", "string"]` in Avro. This is necessary because the incoming data stream may have optional fields. 5. For array fields in Json schema, when the `items` property is an array, it means that each element in the array should follow its own schema sequentially. For example, the following specification means the first item in the array should be a string, and the second a number. ```json @@ -191,7 +130,7 @@ Under the hood, an Airbyte data stream in Json schema is first converted to Avro } ``` - This is not supported in Avro schema. As a compromise, the converter creates a union, ["string", "number"], which is less stringent: +This is not supported in Avro schema. As a compromise, the converter creates a union, ["string", "number"], which is less stringent: ```json { @@ -207,12 +146,12 @@ Under the hood, an Airbyte data stream in Json schema is first converted to Avro } ``` -6. Two Airbyte specific fields will be added to each Parquet record: +6. Two Airbyte specific fields will be added to each Avro record: - | Field | Schema | Document | +| Field | Schema | Document | | :--- | :--- | :---: | - | `_airbyte_ab_id` | `UUID` | [link](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid) - | `_airbyte_emitted_at` | `TimestampType (isAdjustedToUTC = true, unit = MILLIS)` | [link](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#instant-semantics-timestamps-normalized-to-utc) | +| `_airbyte_ab_id` | `uuid` | [link](http://avro.apache.org/docs/current/spec.html#UUID) +| `_airbyte_emitted_at` | `timestamp-millis` | [link](http://avro.apache.org/docs/current/spec.html#Timestamp+%28millisecond+precision%29) | 7. Currently `additionalProperties` is not supported. This means if the source is schemaless (e.g. Mongo), or has flexible fields, they will be ignored. We will have a solution soon. Feel free to submit a new issue if this is blocking for you. @@ -245,7 +184,7 @@ For example, given the following Json schema: } ``` -Its corresponding Avro / Parquet schema will be: +Its corresponding Avro schema will be: ```json { @@ -292,6 +231,102 @@ Its corresponding Avro / Parquet schema will be: } ``` +### CSV + +Each CSV file includes at least two Airbyte metadata columns. Depending on the `flattening` config, the data may reside in one column (`_airbyte_data`) when there is no flattening, or multiple columns with root level flattening. + +| Column | Condition | Description | +| :--- | :--- | :--- | +| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | +| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | +| `_airbyte_data` | When no flattening is needed, all data reside under this column as a json blob. | +| root level fields| When root level flattening is selected, the root level fields are expanded. 
| + +For example, given the following json object from a source: + +```json +{ + "user_id": 123, + "name": { + "first": "John", + "last": "Doe" + } +} +``` + +With no flattening, the output CSV is: + +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | +| :--- | :--- | :--- | +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | + +With root level flattening, the output CSV is: + +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | +| :--- | :--- | :--- | :--- | +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | + +### JSON Lines (JSONL) + +[Json Lines](https://jsonlines.org/) is a text format with one JSON per line. Each line has a structure as follows: + +```json +{ + "_airbyte_ab_id": "", + "_airbyte_emitted_at": "", + "_airbyte_data": "" +} +``` + +For example, given the following two json objects from a source: + +```json +[ + { + "user_id": 123, + "name": { + "first": "John", + "last": "Doe" + } + }, + { + "user_id": 456, + "name": { + "first": "Jane", + "last": "Roe" + } + } +] +``` + +They will be like this in the output file: + +```jsonl +{ "_airbyte_ab_id": "26d73cde-7eb1-4e1e-b7db-a4c03b4cf206", "_airbyte_emitted_at": "1622135805000", "_airbyte_data": { "user_id": 123, "name": { "first": "John", "last": "Doe" } } } +{ "_airbyte_ab_id": "0a61de1b-9cdd-4455-a739-93572c9a5f20", "_airbyte_emitted_at": "1631948170000", "_airbyte_data": { "user_id": 456, "name": { "first": "Jane", "last": "Roe" } } } +``` + +### Parquet + +#### Configuration + +The following configuration is available to configure the Parquet output: + +| Parameter | Type | Default | Description | +| :--- | :---: | :---: | :--- | +| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. | +| `block_size_mb` | integer | 128 (MB) | **Block size (row group size)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | +| `max_padding_size_mb` | integer | 8 (MB) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | +| `page_size_kb` | integer | 1024 (KB) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | +| `dictionary_page_size_kb` | integer | 1024 (KB) | **Dictionary Page Size** in KB. There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | +| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | + +These parameters are related to the `ParquetOutputFormat`. See the [Java doc](https://www.javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.12.0/org/apache/parquet/hadoop/ParquetOutputFormat.html) for more details. Also see [Parquet documentation](https://parquet.apache.org/documentation/latest/#configurations) for their recommended configurations (512 - 1024 MB block size, 8 KB page size). 
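+For illustration, the `format` section of a destination config that selects Parquet output might look like the sketch below. The parameter names are the ones listed in the table above, and the values are simply the documented defaults with `SNAPPY` compression; treat this as an example rather than a tuning recommendation:
+
+```json
+{
+  "format_type": "Parquet",
+  "compression_codec": "SNAPPY",
+  "block_size_mb": 128,
+  "max_padding_size_mb": 8,
+  "page_size_kb": 1024,
+  "dictionary_page_size_kb": 1024,
+  "dictionary_encoding": true
+}
+```
+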
+ +#### Data schema + +Under the hood, an Airbyte data stream in Json schema is first converted to an Avro schema, then the Json object is converted to an Avro record, and finally the Avro record is outputted to the Parquet format. See the `Data schema` section from the [Avro output](#avro) for rules and limitations. + ## Getting started ### Requirements From c63704e25b3d83c8de9bc3d3e934d782269ddbc2 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 03:02:24 -0700 Subject: [PATCH 11/17] Revise documentation --- .../s3/avro/S3AvroFormatConfig.java | 68 +++++++++---------- docs/integrations/destinations/s3.md | 26 +++---- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java index 6831f90f52a2..e2d80935693c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java @@ -13,39 +13,6 @@ public S3AvroFormatConfig(JsonNode formatConfig) { this.codecFactory = parseCodecConfig(formatConfig.get("compression_codec")); } - public CodecFactory getCodecFactory() { - return codecFactory; - } - - @Override - public S3Format getFormat() { - return S3Format.AVRO; - } - - public enum CompressionCodec { - NULL("no compression"), - DEFLATE("deflate"), - BZIP2("bzip2"), - XZ("xz"), - ZSTANDARD("zstandard"), - SNAPPY("snappy"); - - private final String configValue; - - CompressionCodec(String configValue) { - this.configValue = configValue; - } - - public static CompressionCodec fromConfigValue(String configValue) { - for (CompressionCodec codec : values()) { - if (configValue.equalsIgnoreCase(codec.configValue)) { - return codec; - } - } - throw new IllegalArgumentException("Unknown codec config value: " + configValue); - } - } - public static CodecFactory parseCodecConfig(JsonNode compressionCodecConfig) { if (compressionCodecConfig == null || compressionCodecConfig.isNull()) { return CodecFactory.nullCodec(); @@ -73,7 +40,7 @@ public static CodecFactory parseCodecConfig(JsonNode compressionCodecConfig) { return CodecFactory.xzCodec(compressionLevel); } case ZSTANDARD -> { - int compressionLevel = getCompressionLevel(compressionCodecConfig, 3, -5,22); + int compressionLevel = getCompressionLevel(compressionCodecConfig, 3, -5, 22); boolean includeChecksum = getIncludeChecksum(compressionCodecConfig, false); return CodecFactory.zstandardCodec(compressionLevel, includeChecksum); } @@ -107,4 +74,37 @@ public static boolean getIncludeChecksum(JsonNode compressionCodecConfig, boolea return checksumConfig.asBoolean(); } + public CodecFactory getCodecFactory() { + return codecFactory; + } + + @Override + public S3Format getFormat() { + return S3Format.AVRO; + } + + public enum CompressionCodec { + NULL("no compression"), + DEFLATE("deflate"), + BZIP2("bzip2"), + XZ("xz"), + ZSTANDARD("zstandard"), + SNAPPY("snappy"); + + private final String configValue; + + CompressionCodec(String configValue) { + this.configValue = configValue; + } + + public static CompressionCodec fromConfigValue(String configValue) { + for (CompressionCodec codec : values()) { + if (configValue.equalsIgnoreCase(codec.configValue)) { + return codec; + } + } 
+ throw new IllegalArgumentException("Unknown codec config value: " + configValue); + } + } + } diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 0f12861466b2..71136d48b008 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -102,15 +102,15 @@ Under the hood, an Airbyte data stream in Json schema is converted to an Avro sc 1. Json schema types are mapped to Avro typea as follows: -| Json Data Type | Avro Data Type | + | Json Data Type | Avro Data Type | | :---: | :---: | -| string | string | -| number | double | -| integer | int | -| boolean | boolean | -| null | null | -| object | record | -| array | array | + | string | string | + | number | double | + | integer | int | + | boolean | boolean | + | null | null | + | object | record | + | array | array | 2. Built-in Json schema formats are not mapped to Avro logical types at this moment. 2. Json schema compositions ("allOf", "anyOf", and "oneOf") are not supported at this moment. @@ -130,7 +130,7 @@ Under the hood, an Airbyte data stream in Json schema is converted to an Avro sc } ``` -This is not supported in Avro schema. As a compromise, the converter creates a union, ["string", "number"], which is less stringent: + This is not supported in Avro schema. As a compromise, the converter creates a union, ["string", "number"], which is less stringent: ```json { @@ -148,10 +148,10 @@ This is not supported in Avro schema. As a compromise, the converter creates a u 6. Two Airbyte specific fields will be added to each Avro record: -| Field | Schema | Document | + | Field | Schema | Document | | :--- | :--- | :---: | -| `_airbyte_ab_id` | `uuid` | [link](http://avro.apache.org/docs/current/spec.html#UUID) -| `_airbyte_emitted_at` | `timestamp-millis` | [link](http://avro.apache.org/docs/current/spec.html#Timestamp+%28millisecond+precision%29) | + | `_airbyte_ab_id` | `uuid` | [link](http://avro.apache.org/docs/current/spec.html#UUID) + | `_airbyte_emitted_at` | `timestamp-millis` | [link](http://avro.apache.org/docs/current/spec.html#Timestamp+%28millisecond+precision%29) | 7. Currently `additionalProperties` is not supported. This means if the source is schemaless (e.g. Mongo), or has flexible fields, they will be ignored. We will have a solution soon. Feel free to submit a new issue if this is blocking for you. @@ -356,7 +356,7 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | -| 0.1.7 | 2021-06-20 | [#4227](https://github.com/airbytehq/airbyte/pull/4227) | Added JSONL output. | +| 0.1.7 | 2021-06-21 | [#4227](https://github.com/airbytehq/airbyte/pull/4227) | Added Avro and JSONL output. | | 0.1.6 | 2021-06-16 | [#4130](https://github.com/airbytehq/airbyte/pull/4130) | Patched the check to verify prefix access instead of full-bucket access. | | 0.1.5 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Fixed default `max_padding_size_mb` in `spec.json`. | | 0.1.4 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Added Parquet output. 
| From 29845ee8d779b2a3e5fcfe3c538ff8fbb073a687 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 03:11:31 -0700 Subject: [PATCH 12/17] Add more tests --- .../s3/avro/S3AvroFormatConfigTest.java | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java index f96788ff821f..0c3bbfb3716c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java @@ -26,9 +26,15 @@ public void testParseCodecConfigNull() { @Test public void testParseCodecConfigDeflate() { - JsonNode deflateConfig = Jsons.deserialize("{ \"codec\": \"deflate\", \"compression_level\": 5 }"); - CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(deflateConfig); - assertEquals("deflate-5", codecFactory.toString()); + // default compression level 0 + CodecFactory codecFactory1 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"deflate\" }")); + assertEquals("deflate-0", codecFactory1.toString()); + + // compression level 5 + CodecFactory codecFactory2 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"deflate\", \"compression_level\": 5 }")); + assertEquals("deflate-5", codecFactory2.toString()); } @Test @@ -40,17 +46,30 @@ public void testParseCodecConfigBzip2() { @Test public void testParseCodecConfigXz() { - JsonNode xzConfig = Jsons.deserialize("{ \"codec\": \"xz\", \"compression_level\": 7 }"); - CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(xzConfig); - assertEquals("xz-7", codecFactory.toString()); + // default compression level 6 + CodecFactory codecFactory1 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"xz\" }")); + assertEquals("xz-6", codecFactory1.toString()); + + // compression level 7 + CodecFactory codecFactory2 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"xz\", \"compression_level\": 7 }")); + assertEquals("xz-7", codecFactory2.toString()); } @Test public void testParseCodecConfigZstandard() { - JsonNode zstandardConfig = Jsons.deserialize("{ \"codec\": \"zstandard\", \"compression_level\": 20, \"include_checksum\": true }"); - CodecFactory codecFactory = S3AvroFormatConfig.parseCodecConfig(zstandardConfig); + // default compression level 3 + CodecFactory codecFactory1 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"zstandard\" }")); + // There is no way to verify the checksum; all relevant methods are private or protected... + assertEquals("zstandard[3]", codecFactory1.toString()); + + // compression level 20 + CodecFactory codecFactory2 = S3AvroFormatConfig.parseCodecConfig( + Jsons.deserialize("{ \"codec\": \"zstandard\", \"compression_level\": 20, \"include_checksum\": true }")); // There is no way to verify the checksum; all relevant methods are private or protected... 
- assertEquals("zstandard[20]", codecFactory.toString()); + assertEquals("zstandard[20]", codecFactory2.toString()); } @Test From 2db0ec3461765cac212d2a02f6fb906202debe24 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 03:55:28 -0700 Subject: [PATCH 13/17] Add acceptance test --- .../destination/s3/avro/S3AvroWriter.java | 6 +- .../destination/s3/AvroRecordHelper.java | 36 ++++++++++ .../s3/S3AvroDestinationAcceptanceTest.java | 67 +++++++++++++++++++ .../S3ParquetDestinationAcceptanceTest.java | 25 +------ docs/integrations/destinations/s3.md | 2 +- 5 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java create mode 100644 airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java index 2a17e2854a5f..884fe72044a7 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java @@ -50,9 +50,11 @@ public S3AvroWriter(S3DestinationConfig config, this.outputStream = uploadManager.getMultiPartOutputStreams().get(0); S3AvroFormatConfig formatConfig = (S3AvroFormatConfig) config.getFormatConfig(); + // The DataFileWriter always uses binary encoding. + // If json encoding is needed in the future, use the GenericDatumWriter directly. this.dataFileWriter = new DataFileWriter<>(new GenericDatumWriter()) - .create(schema, outputStream) - .setCodec(formatConfig.getCodecFactory()); + .setCodec(formatConfig.getCodecFactory()) + .create(schema, outputStream); } @Override diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java new file mode 100644 index 000000000000..24c393c0bec5 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java @@ -0,0 +1,36 @@ +package io.airbyte.integrations.destination.s3; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.jackson.MoreMappers; +import io.airbyte.commons.util.MoreIterators; +import io.airbyte.integrations.base.JavaBaseConstants; +import org.apache.avro.generic.GenericData; + +public class AvroRecordHelper { + + /** + * Convert an Airbyte JsonNode from Avro / Parquet Record to a plain one. + *
+ * <li>Remove the airbyte id and emission timestamp fields.</li>
+ * <li>Remove null fields that must exist in Parquet but does not in original Json.</li>
  • This + * function mutates the input Json. + */ + public static JsonNode pruneAirbyteJson(JsonNode input) { + ObjectNode output = (ObjectNode) input; + + // Remove Airbyte columns. + output.remove(JavaBaseConstants.COLUMN_NAME_AB_ID); + output.remove(JavaBaseConstants.COLUMN_NAME_EMITTED_AT); + + // Fields with null values does not exist in the original Json but only in Parquet. + for (String field : MoreIterators.toList(output.fieldNames())) { + if (output.get(field) == null || output.get(field).isNull()) { + output.remove(field); + } + } + + return output; + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java new file mode 100644 index 000000000000..dd2d4a2aa7b5 --- /dev/null +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java @@ -0,0 +1,67 @@ +package io.airbyte.integrations.destination.s3; + +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectReader; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; +import java.util.LinkedList; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.SeekableByteArrayInput; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericData.Record; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DecoderFactory; +import tech.allegro.schema.json2avro.converter.JsonAvroConverter; + +public class S3AvroDestinationAcceptanceTest extends S3DestinationAcceptanceTest { + + private final JsonAvroConverter converter = new JsonAvroConverter(); + + protected S3AvroDestinationAcceptanceTest() { + super(S3Format.AVRO); + } + + @Override + protected JsonNode getFormatConfig() { + return Jsons.deserialize("{\n" + + " \"format_type\": \"Avro\",\n" + + " \"compression_codec\": { \"codec\": \"no compression\", \"compression_level\": 5, \"include_checksum\": true }\n" + + "}"); + } + + @Override + protected List retrieveRecords(TestDestinationEnv testEnv, String streamName, String namespace, JsonNode streamSchema) throws Exception { + JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); + schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true); + JsonFieldNameUpdater nameUpdater = new JsonFieldNameUpdater(schemaConverter.getStandardizedNames()); + + List objectSummaries = getAllSyncedObjects(streamName, namespace); + List jsonRecords = new LinkedList<>(); + + for (S3ObjectSummary objectSummary : objectSummaries) { + S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey()); + DataFileReader dataFileReader = new DataFileReader<>( + new SeekableByteArrayInput(object.getObjectContent().readAllBytes()), + new GenericDatumReader<>() + ); + + ObjectReader jsonReader = MAPPER.reader(); + while (dataFileReader.hasNext()) { + GenericData.Record record = dataFileReader.next(); + byte[] 
jsonBytes = converter.convertToJson(record); + JsonNode jsonRecord = jsonReader.readTree(jsonBytes); + jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); + jsonRecords.add(AvroRecordHelper.pruneAirbyteJson(jsonRecord)); + } + } + + return jsonRecords; + } + +} diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java index 1aa4e3825382..7444b06e1ab6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java @@ -90,7 +90,7 @@ protected List retrieveRecords(TestDestinationEnv testEnv, byte[] jsonBytes = converter.convertToJson(record); JsonNode jsonRecord = jsonReader.readTree(jsonBytes); jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); - jsonRecords.add(pruneAirbyteJson(jsonRecord)); + jsonRecords.add(AvroRecordHelper.pruneAirbyteJson(jsonRecord)); } } } @@ -98,27 +98,4 @@ protected List retrieveRecords(TestDestinationEnv testEnv, return jsonRecords; } - /** - * Convert an Airbyte JsonNode from Parquet to a plain one. - *
  • Remove the airbyte id and emission timestamp fields.
  • Remove null fields that must exist in Parquet but does not in original Json.
  • This - * function mutates the input Json. - */ - private static JsonNode pruneAirbyteJson(JsonNode input) { - ObjectNode output = (ObjectNode) input; - - // Remove Airbyte columns. - output.remove(JavaBaseConstants.COLUMN_NAME_AB_ID); - output.remove(JavaBaseConstants.COLUMN_NAME_EMITTED_AT); - - // Fields with null values does not exist in the original Json but only in Parquet. - for (String field : MoreIterators.toList(output.fieldNames())) { - if (output.get(field) == null || output.get(field).isNull()) { - output.remove(field); - } - } - - return output; - } - } diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 71136d48b008..12c991548996 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -67,7 +67,7 @@ Each stream will be outputted to its dedicated directory according to the config ### Avro -[Apache Avro](https://avro.apache.org/) serializes data in a compact binary format. +[Apache Avro](https://avro.apache.org/) serializes data in a compact binary format. Currently, the Airbyte S3 Avro connector always uses the [binary encoding](http://avro.apache.org/docs/current/spec.html#binary_encoding), and assumes that all data records follow the same schema. #### Configuration From 7a9cc7ccca483811338cdb803e00567d950f5cb4 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 04:18:25 -0700 Subject: [PATCH 14/17] Format code --- .../s3/avro/AvroRecordFactory.java | 24 +++++++++++++++ .../destination/s3/avro/S3AvroConstants.java | 24 +++++++++++++++ .../s3/avro/S3AvroFormatConfig.java | 26 ++++++++++++++++ .../destination/s3/avro/S3AvroWriter.java | 27 ++++++++++++++++- .../s3/jsonl/S3JsonlFormatConfig.java | 24 +++++++++++++++ .../destination/s3/jsonl/S3JsonlWriter.java | 30 +++++++++++++++++-- .../util/S3StreamTransferManagerHelper.java | 24 +++++++++++++++ .../s3/writer/ProductionWriterFactory.java | 4 +-- .../destination/s3/AvroRecordHelper.java | 27 +++++++++++++++-- .../s3/S3AvroDestinationAcceptanceTest.java | 30 +++++++++++++++---- .../s3/S3JsonlDestinationAcceptanceTest.java | 24 +++++++++++++++ .../S3ParquetDestinationAcceptanceTest.java | 3 -- .../s3/avro/JsonFieldNameUpdaterTest.java | 1 - .../avro/JsonToAvroSchemaConverterTest.java | 2 -- .../s3/avro/S3AvroFormatConfigTest.java | 27 +++++++++++++++-- 15 files changed, 275 insertions(+), 22 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java index 9978a7d3d300..f05a5d955f62 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/AvroRecordFactory.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and 
this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + package io.airbyte.integrations.destination.s3.avro; import com.fasterxml.jackson.core.JsonProcessingException; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java index 61caa40d62e3..ce10b67f9b92 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroConstants.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3.avro; public class S3AvroConstants { diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java index e2d80935693c..95a68020fa86 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfig.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + package io.airbyte.integrations.destination.s3.avro; import com.fasterxml.jackson.databind.JsonNode; @@ -84,6 +108,7 @@ public S3Format getFormat() { } public enum CompressionCodec { + NULL("no compression"), DEFLATE("deflate"), BZIP2("bzip2"), @@ -105,6 +130,7 @@ public static CompressionCodec fromConfigValue(String configValue) { } throw new IllegalArgumentException("Unknown codec config value: " + configValue); } + } } diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java index 884fe72044a7..0eb575671031 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/avro/S3AvroWriter.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + package io.airbyte.integrations.destination.s3.avro; import alex.mojaki.s3upload.MultiPartOutputStream; @@ -35,7 +59,8 @@ public S3AvroWriter(S3DestinationConfig config, ConfiguredAirbyteStream configuredStream, Timestamp uploadTimestamp, Schema schema, - JsonFieldNameUpdater nameUpdater) throws IOException { + JsonFieldNameUpdater nameUpdater) + throws IOException { super(config, s3Client, configuredStream); String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.AVRO); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java index bb57f4c01094..15ec7b5684d3 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlFormatConfig.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3.jsonl; import io.airbyte.integrations.destination.s3.S3Format; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java index a14bf00382bd..6c28d0a590c2 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/jsonl/S3JsonlWriter.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3.jsonl; import alex.mojaki.s3upload.MultiPartOutputStream; @@ -35,9 +59,9 @@ public class S3JsonlWriter extends BaseS3Writer implements S3Writer { private final PrintWriter printWriter; public S3JsonlWriter(S3DestinationConfig config, - AmazonS3 s3Client, - ConfiguredAirbyteStream configuredStream, - Timestamp uploadTimestamp) { + AmazonS3 s3Client, + ConfiguredAirbyteStream configuredStream, + Timestamp uploadTimestamp) { super(config, s3Client, configuredStream); String outputFilename = BaseS3Writer.getOutputFilename(uploadTimestamp, S3Format.JSONL); diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java index 3f9ac5dc4fb9..dc188536113c 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/util/S3StreamTransferManagerHelper.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3.util; import alex.mojaki.s3upload.StreamTransferManager; diff --git a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java index b18cc54c7417..bd27869017ae 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java +++ b/airbyte-integrations/connectors/destination-s3/src/main/java/io/airbyte/integrations/destination/s3/writer/ProductionWriterFactory.java @@ -27,11 +27,11 @@ import com.amazonaws.services.s3.AmazonS3; import io.airbyte.integrations.destination.s3.S3DestinationConfig; import io.airbyte.integrations.destination.s3.S3Format; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.avro.S3AvroWriter; import io.airbyte.integrations.destination.s3.csv.S3CsvWriter; import io.airbyte.integrations.destination.s3.jsonl.S3JsonlWriter; -import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; -import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; import io.airbyte.protocol.models.AirbyteStream; import io.airbyte.protocol.models.ConfiguredAirbyteStream; diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java index 24c393c0bec5..6caeecffe896 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java @@ -1,12 +1,33 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; -import io.airbyte.commons.jackson.MoreMappers; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; -import org.apache.avro.generic.GenericData; public class AvroRecordHelper { diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java index dd2d4a2aa7b5..b0bc18d0b6d4 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3; import com.amazonaws.services.s3.model.S3Object; @@ -9,14 +33,11 @@ import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import java.util.LinkedList; import java.util.List; -import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.SeekableByteArrayInput; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.DecoderFactory; import tech.allegro.schema.json2avro.converter.JsonAvroConverter; public class S3AvroDestinationAcceptanceTest extends S3DestinationAcceptanceTest { @@ -48,8 +69,7 @@ protected List retrieveRecords(TestDestinationEnv testEnv, String stre S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey()); DataFileReader dataFileReader = new DataFileReader<>( new SeekableByteArrayInput(object.getObjectContent().readAllBytes()), - new GenericDatumReader<>() - ); + new GenericDatumReader<>()); ObjectReader jsonReader = MAPPER.reader(); while (dataFileReader.hasNext()) { diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java index 7b6cac948b33..7c0b27b51bc3 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3JsonlDestinationAcceptanceTest.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package io.airbyte.integrations.destination.s3; import com.amazonaws.services.s3.model.S3Object; diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java index 7444b06e1ab6..d0e5d4542d16 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java @@ -28,10 +28,7 @@ import com.amazonaws.services.s3.model.S3ObjectSummary; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.node.ObjectNode; import io.airbyte.commons.json.Jsons; -import io.airbyte.commons.util.MoreIterators; -import io.airbyte.integrations.base.JavaBaseConstants; import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java index 858ed470d3ab..40fad9c61cfd 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonFieldNameUpdaterTest.java @@ -30,7 +30,6 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.resources.MoreResources; import io.airbyte.commons.util.MoreIterators; -import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; import java.io.IOException; import java.util.Map.Entry; import java.util.stream.Collectors; diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java index dafd8ec56059..4a501396804d 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/JsonToAvroSchemaConverterTest.java @@ -31,8 +31,6 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.resources.MoreResources; import io.airbyte.commons.util.MoreIterators; -import io.airbyte.integrations.destination.s3.avro.JsonSchemaType; -import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import java.util.Collections; import java.util.stream.Stream; import org.junit.jupiter.api.Test; diff --git a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java 
b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java index 0c3bbfb3716c..b0fa3f564ec2 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test/java/io/airbyte/integrations/destination/s3/avro/S3AvroFormatConfigTest.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + package io.airbyte.integrations.destination.s3.avro; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -19,8 +43,7 @@ public void testParseCodecConfigNull() { for (String nullConfig : nullConfigs) { assertEquals( DataFileConstants.NULL_CODEC, - S3AvroFormatConfig.parseCodecConfig(Jsons.deserialize(nullConfig)).toString() - ); + S3AvroFormatConfig.parseCodecConfig(Jsons.deserialize(nullConfig)).toString()); } } From 89ddaa9e740e562e15ed2d02d75164c14d695d5f Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Sun, 20 Jun 2021 04:23:07 -0700 Subject: [PATCH 15/17] Create helper method for name updater --- .../destination/s3/AvroRecordHelper.java | 8 ++++++ .../s3/S3AvroDestinationAcceptanceTest.java | 25 ++++++++----------- .../S3ParquetDestinationAcceptanceTest.java | 5 +--- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java index 6caeecffe896..83b3d5134a97 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/AvroRecordHelper.java @@ -28,9 +28,17 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import io.airbyte.commons.util.MoreIterators; import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; +import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; public class AvroRecordHelper { + public static JsonFieldNameUpdater getFieldNameUpdater(String streamName, String namespace, JsonNode streamSchema) { + 
JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); + schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true); + return new JsonFieldNameUpdater(schemaConverter.getStandardizedNames()); + } + /** * Convert an Airbyte JsonNode from Avro / Parquet Record to a plain one. *
  • Remove the airbyte id and emission timestamp fields.
diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java index b0bc18d0b6d4..db7ca2343784 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3AvroDestinationAcceptanceTest.java @@ -30,7 +30,6 @@ import com.fasterxml.jackson.databind.ObjectReader; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; -import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import java.util.LinkedList; import java.util.List; import org.apache.avro.file.DataFileReader; @@ -58,26 +57,24 @@ protected JsonNode getFormatConfig() { @Override protected List retrieveRecords(TestDestinationEnv testEnv, String streamName, String namespace, JsonNode streamSchema) throws Exception { - JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true); - JsonFieldNameUpdater nameUpdater = new JsonFieldNameUpdater(schemaConverter.getStandardizedNames()); + JsonFieldNameUpdater nameUpdater = AvroRecordHelper.getFieldNameUpdater(streamName, namespace, streamSchema); List objectSummaries = getAllSyncedObjects(streamName, namespace); List jsonRecords = new LinkedList<>(); for (S3ObjectSummary objectSummary : objectSummaries) { S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey()); - DataFileReader dataFileReader = new DataFileReader<>( + try (DataFileReader dataFileReader = new DataFileReader<>( new SeekableByteArrayInput(object.getObjectContent().readAllBytes()), - new GenericDatumReader<>()); - - ObjectReader jsonReader = MAPPER.reader(); - while (dataFileReader.hasNext()) { - GenericData.Record record = dataFileReader.next(); - byte[] jsonBytes = converter.convertToJson(record); - JsonNode jsonRecord = jsonReader.readTree(jsonBytes); - jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); - jsonRecords.add(AvroRecordHelper.pruneAirbyteJson(jsonRecord)); + new GenericDatumReader<>())) { + ObjectReader jsonReader = MAPPER.reader(); + while (dataFileReader.hasNext()) { + GenericData.Record record = dataFileReader.next(); + byte[] jsonBytes = converter.convertToJson(record); + JsonNode jsonRecord = jsonReader.readTree(jsonBytes); + jsonRecord = nameUpdater.getJsonWithOriginalFieldNames(jsonRecord); + jsonRecords.add(AvroRecordHelper.pruneAirbyteJson(jsonRecord)); + } } } diff --git a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java index d0e5d4542d16..6166a8869bd6 100644 --- a/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java +++ b/airbyte-integrations/connectors/destination-s3/src/test-integration/java/io/airbyte/integrations/destination/s3/S3ParquetDestinationAcceptanceTest.java
@@ -30,7 +30,6 @@ import com.fasterxml.jackson.databind.ObjectReader; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater; -import io.airbyte.integrations.destination.s3.avro.JsonToAvroSchemaConverter; import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter; import java.io.IOException; import java.net.URI; @@ -65,9 +64,7 @@ protected List retrieveRecords(TestDestinationEnv testEnv, String namespace, JsonNode streamSchema) throws IOException, URISyntaxException { - JsonToAvroSchemaConverter schemaConverter = new JsonToAvroSchemaConverter(); - schemaConverter.getAvroSchema(streamSchema, streamName, namespace, true); - JsonFieldNameUpdater nameUpdater = new JsonFieldNameUpdater(schemaConverter.getStandardizedNames()); + JsonFieldNameUpdater nameUpdater = AvroRecordHelper.getFieldNameUpdater(streamName, namespace, streamSchema); List objectSummaries = getAllSyncedObjects(streamName, namespace); List jsonRecords = new LinkedList<>(); From 38ecd878ab15704ff86c272ab6fd941299770a39 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Wed, 23 Jun 2021 14:28:42 -0700 Subject: [PATCH 16/17] Update csv doc with normalization --- .../destination-s3/src/main/resources/spec.json | 4 ++-- docs/integrations/destinations/s3.md | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json index c97d8df79e4f..f3ae3a1c9452 100644 --- a/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/destination-s3/src/main/resources/spec.json @@ -214,8 +214,8 @@ }, "flattening": { "type": "string", - "title": "Flattening", - "description": "Whether the input json data should be flattened in the output CSV. Please refer to docs for details.", + "title": "Normalization (Flattening)", + "description": "Whether the input json data should be normalized (flattened) in the output CSV. Please refer to docs for details.", "default": "No flattening", "enum": ["No flattening", "Root level flattening"] } diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 12c991548996..a2ffbe58ac43 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -233,14 +233,14 @@ Its corresponding Avro schema will be: ### CSV -Each CSV file includes at least two Airbyte metadata columns. Depending on the `flattening` config, the data may reside in one column (`_airbyte_data`) when there is no flattening, or multiple columns with root level flattening. +Like most of the other Airbyte destination connectors, usually the output has three columns: a UUID, an emission timestamp, and the data blob. With the CSV output, it is possible to normalize (flatten) the data blob to multiple columns. | Column | Condition | Description | | :--- | :--- | :--- | | `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | | `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | -| `_airbyte_data` | When no flattening is needed, all data reside under this column as a json blob. | -| root level fields| When root level flattening is selected, the root level fields are expanded. | +| `_airbyte_data` | When no normalization (flattening) is needed, all data reside under this column as a json blob. 
| +| root level fields | When root level normalization (flattening) is selected, the root level fields are expanded. | For example, given the following json object from a source: @@ -254,13 +254,13 @@ For example, given the following json object from a source: } ``` -With no flattening, the output CSV is: +With no normalization, the output CSV is: | `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | | :--- | :--- | :--- | | `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | -With root level flattening, the output CSV is: +With root level normalization, the output CSV is: | `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | | :--- | :--- | :--- | :--- | From 035ae1c9ccac2122e0f49bee0ac50b04384d2a44 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Wed, 23 Jun 2021 14:32:45 -0700 Subject: [PATCH 17/17] Update version date --- docs/integrations/destinations/s3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index a2ffbe58ac43..d642378a6688 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -356,7 +356,7 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | -| 0.1.7 | 2021-06-21 | [#4227](https://github.com/airbytehq/airbyte/pull/4227) | Added Avro and JSONL output. | +| 0.1.7 | 2021-06-23 | [#4227](https://github.com/airbytehq/airbyte/pull/4227) | Added Avro and JSONL output. | | 0.1.6 | 2021-06-16 | [#4130](https://github.com/airbytehq/airbyte/pull/4130) | Patched the check to verify prefix access instead of full-bucket access. | | 0.1.5 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Fixed default `max_padding_size_mb` in `spec.json`. | | 0.1.4 | 2021-06-14 | [#3908](https://github.com/airbytehq/airbyte/pull/3908) | Added Parquet output. |
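
To make the JSONL output added in this series easier to evaluate, here is a short, self-contained sketch of what a single output line could look like. It is illustrative only and not part of the patch series: it assumes the JSONL writer emits the same `_airbyte_ab_id`, `_airbyte_emitted_at`, and `_airbyte_data` columns that the CSV section documents above, and the `toJsonLine` helper is a hypothetical name.

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.util.UUID;

// Illustrative sketch only (not part of the patch series): render one record as a
// newline-delimited JSON line with the standard Airbyte columns.
public class JsonlLineExample {

  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Hypothetical helper: wraps the raw record data with the Airbyte id and timestamp columns.
  static String toJsonLine(ObjectNode data, long emittedAt) throws Exception {
    ObjectNode line = MAPPER.createObjectNode();
    line.put("_airbyte_ab_id", UUID.randomUUID().toString()); // uuid per processed record
    line.put("_airbyte_emitted_at", emittedAt);               // epoch millis from the source
    line.set("_airbyte_data", data);                          // the record payload as a json blob
    return MAPPER.writeValueAsString(line);                   // one record per line
  }

  public static void main(String[] args) throws Exception {
    ObjectNode data = MAPPER.createObjectNode();
    data.put("user_id", 123);
    System.out.println(toJsonLine(data, 1622135805000L));
    // e.g. {"_airbyte_ab_id":"26d73cde-...","_airbyte_emitted_at":1622135805000,"_airbyte_data":{"user_id":123}}
  }
}
```

Under that assumption, a JSONL file behaves like the "no flattening" CSV mode: each line is one self-describing JSON object, so downstream tools can stream it record by record without parsing the whole file.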