🐛 Destination Snowflake | BigQuery: add part_size config to UI (#9039)
* add part_size to Snowflake-s3, BigQuery-gcs

* add part size for bigquery-denormalized

* update changelog md

* add limits for part size

* update bigquery version
yurii-bidiuk authored Dec 23, 2021
1 parent 389efbd commit 14c3536
Showing 17 changed files with 90 additions and 14 deletions.
@@ -2,7 +2,7 @@
"destinationDefinitionId": "079d5540-f236-4294-ba7c-ade8fd918496",
"name": "BigQuery (denormalized typed struct)",
"dockerRepository": "airbyte/destination-bigquery-denormalized",
"dockerImageTag": "0.2.1",
"dockerImageTag": "0.2.2",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/bigquery",
"icon": "bigquery.svg"
}
@@ -2,7 +2,7 @@
"destinationDefinitionId": "22f6c74f-5699-40ff-833c-4a879ea40133",
"name": "BigQuery",
"dockerRepository": "airbyte/destination-bigquery",
"dockerImageTag": "0.4.1",
"dockerImageTag": "0.6.1",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/bigquery",
"icon": "bigquery.svg"
}
@@ -2,7 +2,7 @@
"destinationDefinitionId": "424892c4-daac-4491-b35d-c6688ba547ba",
"name": "Snowflake",
"dockerRepository": "airbyte/destination-snowflake",
"dockerImageTag": "0.3.22",
"dockerImageTag": "0.3.23",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/snowflake",
"icon": "snowflake.svg"
}
@@ -13,13 +13,13 @@
- name: BigQuery
destinationDefinitionId: 22f6c74f-5699-40ff-833c-4a879ea40133
dockerRepository: airbyte/destination-bigquery
dockerImageTag: 0.5.1
dockerImageTag: 0.6.1
documentationUrl: https://docs.airbyte.io/integrations/destinations/bigquery
icon: bigquery.svg
- name: BigQuery (denormalized typed struct)
destinationDefinitionId: 079d5540-f236-4294-ba7c-ade8fd918496
dockerRepository: airbyte/destination-bigquery-denormalized
dockerImageTag: 0.2.1
dockerImageTag: 0.2.2
documentationUrl: https://docs.airbyte.io/integrations/destinations/bigquery
icon: bigquery.svg
- name: Cassandra
@@ -179,7 +179,7 @@
- name: Snowflake
destinationDefinitionId: 424892c4-daac-4491-b35d-c6688ba547ba
dockerRepository: airbyte/destination-snowflake
dockerImageTag: 0.3.22
dockerImageTag: 0.3.23
documentationUrl: https://docs.airbyte.io/integrations/destinations/snowflake
icon: snowflake.svg
- name: MariaDB ColumnStore
45 changes: 42 additions & 3 deletions airbyte-config/init/src/main/resources/seed/destination_specs.yaml
@@ -176,7 +176,7 @@
supportsDBT: false
supported_destination_sync_modes:
- "append"
- dockerImage: "airbyte/destination-bigquery:0.5.1"
- dockerImage: "airbyte/destination-bigquery:0.6.1"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/destinations/bigquery"
connectionSpecification:
@@ -307,6 +307,16 @@
type: "string"
examples:
- "data_sync/test"
part_size_mb:
title: "Block Size (MB) for GCS multipart upload"
description: "This is the size of a \"Part\" being buffered in memory.\
\ It limits the memory usage when writing. Larger values will allow\
\ to upload a bigger files and improve the speed, but consumes\
\ more memory. Allowed values: min=5MB, max=525MB Default: 5MB."
type: "integer"
default: 5
examples:
- 5
keep_files_in_gcs-bucket:
type: "string"
description: "This upload method is supposed to temporary store records\
@@ -354,7 +364,7 @@
- "overwrite"
- "append"
- "append_dedup"
- dockerImage: "airbyte/destination-bigquery-denormalized:0.2.1"
- dockerImage: "airbyte/destination-bigquery-denormalized:0.2.2"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/destinations/bigquery"
connectionSpecification:
@@ -474,6 +484,18 @@
type: "string"
examples:
- "data_sync/test"
part_size_mb:
title: "Block Size (MB) for GCS multipart upload"
description: "This is the size of a \"Part\" being buffered in memory.\
\ It limits the memory usage when writing. Larger values will allow\
\ to upload a bigger files and improve the speed, but consumes\
\ more memory. Allowed values: min=5MB, max=525MB Default: 5MB."
type: "integer"
default: 5
minimum: 5
maximum: 525
examples:
- 5
keep_files_in_gcs-bucket:
type: "string"
description: "This upload method is supposed to temporary store records\
@@ -3549,6 +3571,8 @@
\ more memory. Allowed values: min=5MB, max=525MB Default: 5MB."
type: "integer"
default: 5
minimum: 5
maximum: 525
examples:
- 5
- title: "CSV: Comma-Separated Values"
@@ -3722,7 +3746,7 @@
supported_destination_sync_modes:
- "overwrite"
- "append"
- dockerImage: "airbyte/destination-snowflake:0.3.22"
- dockerImage: "airbyte/destination-snowflake:0.3.23"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/destinations/snowflake"
connectionSpecification:
@@ -3895,6 +3919,21 @@
title: "S3 Access Key"
airbyte_secret: true
order: 4
part_size:
type: "integer"
default: 5
examples:
- 5
description: "Optional. Increase this if syncing tables larger than\
\ 100GB. Only relevant for COPY. Files are streamed to S3 in parts.\
\ This determines the size of each part, in MBs. As S3 has a limit\
\ of 10,000 parts per file, part size affects the table size. This\
\ is 10MB by default, resulting in a default limit of 100GB tables.\
\ Note, a larger part size will result in larger memory requirements.\
\ A rule of thumb is to multiply the part size by 10 to get the\
\ memory requirement. Modify this with care."
title: "Stream Part Size"
order: 5
- title: "GCS Staging"
additionalProperties: false
description: "Writes large batches of records to a file, uploads the file\
@@ -8,5 +8,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar

RUN tar xf ${APPLICATION}.tar --strip-components=1

LABEL io.airbyte.version=0.2.1
LABEL io.airbyte.version=0.2.2
LABEL io.airbyte.name=airbyte/destination-bigquery-denormalized
@@ -118,6 +118,15 @@
"type": "string",
"examples": ["data_sync/test"]
},
"part_size_mb": {
"title": "Block Size (MB) for GCS multipart upload",
"description": "This is the size of a \"Part\" being buffered in memory. It limits the memory usage when writing. Larger values will allow to upload a bigger files and improve the speed, but consumes more memory. Allowed values: min=5MB, max=525MB Default: 5MB.",
"type": "integer",
"default": 5,
"minimum": 5,
"maximum": 525,
"examples": [5]
},
"keep_files_in_gcs-bucket": {
"type": "string",
"description": "This upload method is supposed to temporary store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished?",
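To make the new spec entry concrete: below is a minimal, illustrative sketch of a GCS-staging `loading_method` block carrying `part_size_mb`, and of reading the field with its declared default of 5 MB. The method label, bucket name and path are placeholder values inferred from the constants and examples in this commit, and Jackson is assumed for JSON handling; this is not connector code.

```java
// Illustrative sketch only — not connector code. Shows where the new
// part_size_mb field sits in a GCS-staging loading_method block and how a
// consumer could read it with the spec's default of 5 MB. The method label,
// bucket name and path are placeholders; Jackson is assumed on the classpath.
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class PartSizeConfigSketch {

  public static void main(String[] args) throws Exception {
    String loadingMethodJson = """
        {
          "method": "GCS Staging",
          "gcs_bucket_name": "my-staging-bucket",
          "gcs_bucket_path": "data_sync/test",
          "part_size_mb": 25,
          "keep_files_in_gcs-bucket": "Keep all tmp files in GCS"
        }
        """;

    JsonNode loadingMethod = new ObjectMapper().readTree(loadingMethodJson);

    // part_size_mb is optional: fall back to the default of 5 when it is absent.
    int partSizeMb = loadingMethod.path("part_size_mb").asInt(5);
    System.out.println("GCS multipart block size: " + partSizeMb + " MB");
  }
}
```

Omitting the field simply falls back to the 5 MB default declared in the spec.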
@@ -202,6 +202,7 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception {
.put(BigQueryConsts.METHOD, BigQueryConsts.GCS_STAGING)
.put(BigQueryConsts.GCS_BUCKET_NAME, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_NAME))
.put(BigQueryConsts.GCS_BUCKET_PATH, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_PATH).asText() + System.currentTimeMillis())
.put(BigQueryConsts.PART_SIZE, gcsConfigFromSecretFile.get(BigQueryConsts.PART_SIZE))
.put(BigQueryConsts.CREDENTIAL, credential)
.build());

@@ -8,5 +8,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar

RUN tar xf ${APPLICATION}.tar --strip-components=1

LABEL io.airbyte.version=0.6.0-rc1
LABEL io.airbyte.version=0.6.1
LABEL io.airbyte.name=airbyte/destination-bigquery
@@ -23,6 +23,7 @@ public class BigQueryConsts {
public static final String FORMAT = "format";
public static final String KEEP_GCS_FILES = "keep_files_in_gcs-bucket";
public static final String KEEP_GCS_FILES_VAL = "Keep all tmp files in GCS";
public static final String PART_SIZE = "part_size_mb";

// tests
public static final String BIGQUERY_BASIC_CONFIG = "basic_bigquery_config";
@@ -135,7 +135,8 @@ public static JsonNode getGcsJsonNodeConfig(final JsonNode config) {
.put(BigQueryConsts.CREDENTIAL, loadingMethod.get(BigQueryConsts.CREDENTIAL))
.put(BigQueryConsts.FORMAT, Jsons.deserialize("{\n"
+ " \"format_type\": \"CSV\",\n"
+ " \"flattening\": \"No flattening\"\n"
+ " \"flattening\": \"No flattening\",\n"
+ " \"part_size_mb\": \"" + loadingMethod.get(BigQueryConsts.PART_SIZE) + "\"\n"
+ "}"))
.build());

@@ -152,7 +153,8 @@ public static JsonNode getGcsAvroJsonNodeConfig(final JsonNode config) {
.put(BigQueryConsts.CREDENTIAL, loadingMethod.get(BigQueryConsts.CREDENTIAL))
.put(BigQueryConsts.FORMAT, Jsons.deserialize("{\n"
+ " \"format_type\": \"AVRO\",\n"
+ " \"flattening\": \"No flattening\"\n"
+ " \"flattening\": \"No flattening\",\n"
+ " \"part_size_mb\": \"" + loadingMethod.get(BigQueryConsts.PART_SIZE) + "\"\n"
+ "}"))
.build());

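The two hunks above thread `part_size_mb` from the `loading_method` block into the CSV and Avro format configs handed to the GCS writer. As a stand-alone illustration of the same idea — using plain Jackson rather than Airbyte's `Jsons`/`ImmutableMap` helpers, and defaulting to 5 MB when the optional field is unset — a sketch:

```java
// A stand-alone rendering of the idea in getGcsJsonNodeConfig above, using
// plain Jackson instead of Airbyte's Jsons/ImmutableMap helpers: copy
// part_size_mb from the loading_method block into the staging format config,
// defaulting to 5 MB when the UI left the optional field unset.
// Illustrative only — not the connector's actual implementation.
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class GcsFormatConfigSketch {

  private static final ObjectMapper MAPPER = new ObjectMapper();

  static JsonNode buildCsvFormat(JsonNode loadingMethod) {
    ObjectNode format = MAPPER.createObjectNode();
    format.put("format_type", "CSV");
    format.put("flattening", "No flattening");
    format.put("part_size_mb", loadingMethod.path("part_size_mb").asInt(5));
    return format;
  }

  public static void main(String[] args) throws Exception {
    JsonNode loadingMethod = MAPPER.readTree("{\"part_size_mb\": 25}");
    System.out.println(buildCsvFormat(loadingMethod));
    // prints: {"format_type":"CSV","flattening":"No flattening","part_size_mb":25}
  }
}
```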
@@ -125,6 +125,15 @@
"type": "string",
"examples": ["data_sync/test"]
},
"part_size_mb": {
"title": "Block Size (MB) for GCS multipart upload",
"description": "This is the size of a \"Part\" being buffered in memory. It limits the memory usage when writing. Larger values will allow to upload a bigger files and improve the speed, but consumes more memory. Allowed values: min=5MB, max=525MB Default: 5MB.",
"type": "integer",
"default": 5,
"minimum": 5,
"maximum": 525,
"examples": [5]
},
"keep_files_in_gcs-bucket": {
"type": "string",
"description": "This upload method is supposed to temporary store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished?",
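Per the commit note about adding limits for part size, the spec now declares `minimum: 5` and `maximum: 525` for `part_size_mb`. The sketch below spells out what those bounds amount to; in practice they would normally be enforced by JSON-schema validation of the submitted config rather than by code like this, so treat it purely as an illustration:

```java
// The spec above declares "minimum": 5 and "maximum": 525 for part_size_mb.
// A small illustration of what those bounds amount to; normally JSON-schema
// validation of the config enforces them, not code like this.
public class PartSizeBoundsSketch {

  private static final int MIN_PART_SIZE_MB = 5;      // from "minimum": 5
  private static final int MAX_PART_SIZE_MB = 525;    // from "maximum": 525
  private static final int DEFAULT_PART_SIZE_MB = 5;  // from "default": 5

  static int resolvePartSize(Integer requestedMb) {
    if (requestedMb == null) {
      return DEFAULT_PART_SIZE_MB; // optional field: use the default
    }
    if (requestedMb < MIN_PART_SIZE_MB || requestedMb > MAX_PART_SIZE_MB) {
      throw new IllegalArgumentException(
          "part_size_mb must be between " + MIN_PART_SIZE_MB + " and "
              + MAX_PART_SIZE_MB + " MB, got " + requestedMb);
    }
    return requestedMb;
  }

  public static void main(String[] args) {
    System.out.println(resolvePartSize(null)); // 5
    System.out.println(resolvePartSize(25));   // 25
    try {
      resolvePartSize(1000);
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());      // out-of-range value rejected
    }
  }
}
```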
@@ -45,6 +45,7 @@ protected void setup(final TestDestinationEnv testEnv) throws Exception {
.put(BigQueryConsts.METHOD, BigQueryConsts.GCS_STAGING)
.put(BigQueryConsts.GCS_BUCKET_NAME, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_NAME))
.put(BigQueryConsts.GCS_BUCKET_PATH, gcsConfigFromSecretFile.get(BigQueryConsts.GCS_BUCKET_PATH).asText() + System.currentTimeMillis())
.put(BigQueryConsts.PART_SIZE, gcsConfigFromSecretFile.get(BigQueryConsts.PART_SIZE))
.put(BigQueryConsts.CREDENTIAL, credential)
.build());

@@ -18,5 +18,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar

RUN tar xf ${APPLICATION}.tar --strip-components=1

LABEL io.airbyte.version=0.3.22
LABEL io.airbyte.version=0.3.23
LABEL io.airbyte.name=airbyte/destination-snowflake
@@ -170,6 +170,16 @@
"title": "S3 Access Key",
"airbyte_secret": true,
"order": 4
},
"part_size": {
"type": "integer",
"default": 5,
"examples": [
5
],
"description": "Optional. Increase this if syncing tables larger than 100GB. Only relevant for COPY. Files are streamed to S3 in parts. This determines the size of each part, in MBs. As S3 has a limit of 10,000 parts per file, part size affects the table size. This is 10MB by default, resulting in a default limit of 100GB tables. Note, a larger part size will result in larger memory requirements. A rule of thumb is to multiply the part size by 10 to get the memory requirement. Modify this with care.",
"title": "Stream Part Size",
"order": 5
}
}
},
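The "Stream Part Size" description above encodes two relationships worth spelling out: S3 multipart uploads allow at most 10,000 parts per object, so the part size caps the size of each staged file, and the stated rule of thumb puts memory use at roughly ten times the part size. A back-of-the-envelope sketch with illustrative values:

```java
// Back-of-the-envelope helper for the "Stream Part Size" description above:
// S3 multipart uploads allow at most 10,000 parts per object, so the part size
// caps the size of each staged file, and the description's rule of thumb puts
// memory use at roughly ten times the part size. Illustrative only — the exact
// buffering behaviour is up to the connector.
public class SnowflakePartSizeMath {

  private static final int MAX_S3_PARTS_PER_OBJECT = 10_000;

  static long maxStagedFileGb(int partSizeMb) {
    // decimal GB, to match the ~100 GB figure quoted for the 10 MB part size
    return (long) partSizeMb * MAX_S3_PARTS_PER_OBJECT / 1000;
  }

  static long approxMemoryMb(int partSizeMb) {
    return (long) partSizeMb * 10; // "multiply the part size by 10" rule of thumb
  }

  public static void main(String[] args) {
    for (int partSizeMb : new int[] {5, 10, 25}) {
      System.out.printf("part_size=%d MB -> max staged file ~%d GB, est. memory ~%d MB%n",
          partSizeMb, maxStagedFileGb(partSizeMb), approxMemoryMb(partSizeMb));
    }
  }
}
```

With the 10 MB part size mentioned in the description, this reproduces its ~100 GB per-table ceiling.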
3 changes: 3 additions & 0 deletions docs/integrations/destinations/bigquery.md
@@ -109,6 +109,7 @@ There are 2 available options to upload data to BigQuery `Standard` and `GCS Staging
This is the recommended configuration for uploading data to BigQuery. It works by first uploading all the data to a [GCS](https://cloud.google.com/storage) bucket, then ingesting the data to BigQuery. To configure GCS Staging, you'll need the following parameters:
* **GCS Bucket Name**
* **GCS Bucket Path**
* **Block Size (MB) for GCS multipart upload**
* **GCS Bucket Keep files after migration**
* See [this](https://cloud.google.com/storage/docs/creating-buckets) for instructions on how to create a GCS bucket.
* **HMAC Key Access ID**
@@ -145,6 +146,7 @@ Therefore, Airbyte BigQuery destination will convert any invalid characters into

| Version | Date | Pull Request | Subject |
|:--------| :--- | :--- | :--- |
| 0.6.1 | 2021-12-22 | [\#9039](https://github.com/airbytehq/airbyte/pull/9039) | Added part_size configuration to UI for GCS staging |
| 0.6.0 | 2021-12-17 | [\#8788](https://github.com/airbytehq/airbyte/issues/8788) | BigQuery/BigQuery denorm Destinations : Add possibility to use different types of GCS files |
| 0.5.1 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations |
| 0.5.0 | 2021-10-26 | [\#7240](https://github.com/airbytehq/airbyte/issues/7240) | Output partitioned/clustered tables |
@@ -161,6 +163,7 @@ Therefore, Airbyte BigQuery destination will convert any invalid characters into

| Version | Date | Pull Request | Subject |
|:--------| :--- | :--- | :--- |
| 0.2.2 | 2021-12-22 | [\#9039](https://github.com/airbytehq/airbyte/pull/9039) | Added part_size configuration to UI for GCS staging |
| 0.2.1 | 2021-12-21 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types |
| 0.2.0 | 2021-12-17 | [\#8788](https://github.com/airbytehq/airbyte/pull/8788) | BigQuery/BigQuery denorm Destinations : Add possibility to use different types of GCS files |
| 0.1.11 | 2021-12-16 | [\#8816](https://github.com/airbytehq/airbyte/issues/8816) | Update dataset locations |
1 change: 1 addition & 0 deletions docs/integrations/destinations/snowflake.md
@@ -196,6 +196,7 @@ Finally, you need to add read/write permissions to your bucket with that email.

| Version | Date | Pull Request | Subject |
| :------ | :-------- | :----- | :------ |
| 0.3.23 | 2021-12-22 | [#9039](https://github.com/airbytehq/airbyte/pull/9039) | Added part_size configuration in UI for S3 loading method |
| 0.3.22 | 2021-12-21 | [#9006](https://github.com/airbytehq/airbyte/pull/9006) | Updated jdbc schema naming to follow Snowflake Naming Conventions |
| 0.3.21 | 2021-12-15 | [#8781](https://github.com/airbytehq/airbyte/pull/8781) | Updated check method to verify permissions to create/drop stage for internal staging; compatibility fix for Java 17 |
| 0.3.20 | 2021-12-10 | [#8562](https://github.com/airbytehq/airbyte/pull/8562) | Moving classes around for better dependency management; compatibility fix for Java 17 |
