From 448dd2869842210bbd240edb02198af69d1d9055 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 29 Jun 2023 15:46:06 -0700 Subject: [PATCH 01/46] initial implementation --- .../base-typing-deduping-test/build.gradle | 14 + .../BaseTypingDedupingTest.java | 335 ++++++++++++++++++ .../src/main/resources/schema.json | 23 ++ ...drecords_fullrefresh_overwrite_final.jsonl | 2 + ...tedrecords_fullrefresh_overwrite_raw.jsonl | 3 + .../src/main/resources/sync1_messages.jsonl | 5 + .../destination-bigquery/build.gradle | 1 + .../bigquery/BigQueryDestination.java | 2 +- .../BigQuerySqlGeneratorIntegrationTest.java | 34 +- ...ueryStandardInsertsTypingDedupingTest.java | 82 +++++ settings.gradle | 1 + 11 files changed, 487 insertions(+), 15 deletions(-) create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/build.gradle create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl create mode 100644 airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java diff --git a/airbyte-integrations/bases/base-typing-deduping-test/build.gradle b/airbyte-integrations/bases/base-typing-deduping-test/build.gradle new file mode 100644 index 000000000000..cfbf0a72d513 --- /dev/null +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/build.gradle @@ -0,0 +1,14 @@ +plugins { + id 'java-library' +} + +dependencies { + implementation project(':airbyte-config-oss:config-models-oss') + implementation project(':airbyte-connector-test-harnesses:acceptance-test-harness') + implementation libs.airbyte.protocol + + implementation(enforcedPlatform('org.junit:junit-bom:5.8.2')) + implementation 'org.junit.jupiter:junit-jupiter-api' + implementation 'org.junit.jupiter:junit-jupiter-params' + implementation 'org.mockito:mockito-core:4.6.1' +} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java new file mode 100644 index 000000000000..62bd9bcaabfc --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -0,0 +1,335 @@ +package io.airbyte.integrations.base.destination.typing_deduping; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.Streams; +import io.airbyte.commons.features.EnvVariableFeatureFlags; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.lang.Exceptions; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.commons.string.Strings; +import io.airbyte.configoss.WorkerDestinationConfig; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import 
io.airbyte.protocol.models.v0.SyncMode; +import io.airbyte.workers.internal.AirbyteDestination; +import io.airbyte.workers.internal.DefaultAirbyteDestination; +import io.airbyte.workers.process.AirbyteIntegrationLauncher; +import io.airbyte.workers.process.DockerProcessFactory; +import io.airbyte.workers.process.ProcessFactory; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.UUID; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The sync-running code is copy-pasted from there. + *

+ * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each test case constructs a + * ConfiguredAirbyteCatalog dynamically. + *

+ * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For sync modes which use a + * cursor, the stream provides an updated_at field. The stream also has an _ab_cdc_deleted_at field. + */ +public abstract class BaseTypingDedupingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); + private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator + .comparingLong((JsonNode record) -> asInt(record.get("_airbyte_data").get("id1"))) + .thenComparingLong(record -> asInt(record.get("_airbyte_data").get("id2"))) + .thenComparing(record -> asTimestamp(record.get("_airbyte_data").get("updated_at"))) + .thenComparing(record -> asTimestamp(record.get("_airbyte_extracted_at"))); + private static final Comparator RAW_RECORD_SORT_COMPARATOR = RAW_RECORD_IDENTITY_COMPARATOR + .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + private static final Comparator FINAL_RECORD_IDENTITY_COMPARATOR = Comparator + .comparingLong((JsonNode record) -> asInt(record.get("id1"))) + .thenComparingLong(record -> asInt(record.get("id2"))) + .thenComparing(record -> asTimestamp(record.get("updated_at"))) + .thenComparing(record -> asTimestamp(record.get("_airbyte_extracted_at"))); + private static final Comparator FINAL_RECORD_SORT_COMPARATOR = FINAL_RECORD_IDENTITY_COMPARATOR + .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + private static ProcessFactory processFactory; + + /** + * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this field. + *

+ * That method should also start testcontainer(s), if you're using them. That test container will be used for all + * tests. This is safe because each test uses a randomized stream namespace+name. + */ + protected static JsonNode config; + + private String streamNamespace; + private String streamName; + + /** + * @return the docker image to run, e.g. {@code "airbyte/destination-bigquery:dev"}. + */ + protected abstract String getImageName(); + + /** + * For a given stream, return the records that exist in the destination's raw table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + * The {@code _airbyte_data} column MUST be an {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + */ + protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; + + /** + * For a given stream, return the records that exist in the destination's final table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + */ + protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; + + /** + * Create raw+final tables in the destinations as though a previous sync had loaded {@code initialRecords}. This method + * exists so that we don't need to run a sync just to load initial state, because that's both slow and error-prone. + */ + protected abstract void loadInitialRecords(String streamNamespace, String streamName, List initialRecords) throws Exception; + + /** + * Delete any resources in the destination associated with this stream AND its namespace. We need this because we write + * raw tables to a shared {@code airbyte} namespace, which we can't drop wholesale. + *

+ * In general, this should resemble {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. + */ + protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; + + @BeforeEach + public void setup() { + streamNamespace = Strings.addRandomSuffix("typing_deduping_test_namespace", "_", 5); + streamName = Strings.addRandomSuffix("test_stream", "_", 5); + LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); + } + + @AfterEach + public void teardown() throws Exception { + teardownStreamAndNamespace(streamNamespace, streamName); + } + + /** + * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the records are written to + * the destination table. + */ + @Test + public void initialFullRefreshOverwrite() throws Exception { + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace) + .withName(streamName) + .withJsonSchema(getSchema())))); + List messages = readMessages("sync1_messages.jsonl"); + + runSync(catalog, messages); + + List expectedRawRecords = readRecords("sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl"); + List expectedFinalRecords = readRecords("sync1_expectedrecords_fullrefresh_overwrite_final.jsonl"); + verifySyncResult(expectedRawRecords, expectedFinalRecords); + } + + private static JsonNode getSchema() throws IOException { + return Jsons.deserialize(MoreResources.readResource("schema.json")); + } + + private List readMessages(String filename) throws IOException { + return MoreResources.readResource(filename).lines() + .filter(line -> !line.startsWith("//")) + .map(jsonString -> Jsons.deserialize(jsonString, AirbyteMessage.class)) + .peek(message -> { + 
message.getRecord().setNamespace(streamNamespace); + message.getRecord().setStream(streamName); + }).toList(); + } + + private List readRecords(String filename) throws IOException { + return MoreResources.readResource(filename).lines() + .filter(line -> !line.startsWith("//")) + .map(Jsons::deserialize) + .toList(); + } + + private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords) throws Exception { + List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); + String rawDiff = diffRawTableRecords(expectedRawRecords, actualRawRecords); + List actualFinalRecords = dumpFinalTableRecords(streamNamespace, streamName); + String finalDiff = diffFinalTableRecords(expectedFinalRecords, actualFinalRecords); + + assertAll( + () -> assertTrue(rawDiff.isEmpty(), "Raw table was incorrect.\n" + rawDiff), + () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff) + ); + } + + private static String diffRawTableRecords(List expectedRecords, List actualRecords) { + return diffRecords(expectedRecords, actualRecords, RAW_RECORD_IDENTITY_COMPARATOR, RAW_RECORD_SORT_COMPARATOR); + } + + private static String diffFinalTableRecords(List expectedRecords, List actualRecords) { + return diffRecords(expectedRecords, actualRecords, FINAL_RECORD_IDENTITY_COMPARATOR, FINAL_RECORD_SORT_COMPARATOR); + } + + /** + * Generate a human-readable diff between the two lists. Only checks the keys specified in expectedRecords. + * + * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, breaks that tie using _airbyte_raw_id + * @return The diff, or empty string if there were no differences + */ + private static String diffRecords( + List originalExpectedRecords, + List originalActualRecords, + Comparator identityComparator, Comparator sortComparator) { + List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); + List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); + + // Iterate through both lists in parallel and compare each record. + // Build up an error message listing any incorrect, missing, or unexpected records. + // Not a true diff, but close enough. + String message = ""; + int expectedRecordIndex = 0; + int actualRecordIndex = 0; + while (expectedRecordIndex < expectedRecords.size() && actualRecordIndex < actualRecords.size()) { + JsonNode expectedRecord = expectedRecords.get(expectedRecordIndex); + JsonNode actualRecord = actualRecords.get(actualRecordIndex); + int compare = identityComparator.compare(expectedRecord, actualRecord); + if (compare == 0) { + // These records should be the same. Find the specific fields that are different. + boolean foundMismatch = false; + String mismatchedRecordMessage = "Row had incorrect data:\n"; + for (String key : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { + JsonNode expectedValue = expectedRecord.get(key); + JsonNode actualValue = actualRecord.get(key); + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. + if (!Objects.equals(expectedValue, actualValue) + // Objects.equals expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. 
+ && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { + mismatchedRecordMessage += " For key " + key + ", expected " + expectedValue + " but got " + actualValue + "\n"; + foundMismatch = true; + } + } + if (foundMismatch) { + message += mismatchedRecordMessage; + } + + expectedRecordIndex++; + actualRecordIndex++; + } else if (compare < 0) { + // The expected record is missing from the actual records. Print it and move on to the next expected record. + message += "Row was expected but missing: " + expectedRecord + "\n"; + expectedRecordIndex++; + } else { + // There's an actual record which isn't present in the expected records. Print it and move on to the next actual record. + message += "Row was not expected but present: " + actualRecord + "\n"; + actualRecordIndex++; + } + } + // Tail loops in case we reached the end of one list before the other. 
+ while (expectedRecordIndex < expectedRecords.size()) { + message += "Row was expected but missing: " + expectedRecords.get(expectedRecordIndex) + "\n"; + expectedRecordIndex++; + } + while (actualRecordIndex < actualRecords.size()) { + message += "Row was not expected but present: " + actualRecords.get(actualRecordIndex) + "\n"; + actualRecordIndex++; + } + + return message; + } + + private static long asInt(JsonNode node) { + if (node == null || !node.isIntegralNumber()) { + return Integer.MIN_VALUE; + } else { + return node.longValue(); + } + } + + private static String asString(JsonNode node) { + if (node == null || node.isNull()) { + return ""; + } else if (node.isTextual()) { + return node.asText(); + } else { + return Jsons.serialize(node); + } + } + + private static Instant asTimestamp(JsonNode node) { + if (node == null || !node.isTextual()) { + return Instant.ofEpochMilli(Long.MIN_VALUE); + } else { + return Instant.parse(node.asText()); + } + } + + /* !!!!!! WARNING !!!!!! + * The code below was mostly copypasted from DestinationAcceptanceTest. If you make edits here, you probably want to also edit there. + * !!!!!!!!!!!!!!!!!!!!! 
+ */ + + private static Path jobRoot; + + @BeforeAll + public static void globalSetup() throws IOException { + final Path testDir = Path.of("/tmp/airbyte_tests/"); + Files.createDirectories(testDir); + final Path workspaceRoot = Files.createTempDirectory(testDir, "test"); + jobRoot = Files.createDirectories(Path.of(workspaceRoot.toString(), "job")); + Path localRoot = Files.createTempDirectory(testDir, "output"); + processFactory = new DockerProcessFactory( + workspaceRoot, + workspaceRoot.toString(), + localRoot.toString(), + "host", + Collections.emptyMap()); + } + + private void runSync(ConfiguredAirbyteCatalog catalog, List messages) throws Exception { + final WorkerDestinationConfig destinationConfig = new WorkerDestinationConfig() + .withConnectionId(UUID.randomUUID()) + .withCatalog(convertProtocolObject(catalog, io.airbyte.protocol.models.ConfiguredAirbyteCatalog.class)) + .withDestinationConnectionConfiguration(config); + + final AirbyteDestination destination = new DefaultAirbyteDestination(new AirbyteIntegrationLauncher( + "0", + 0, + getImageName(), + processFactory, + null, + null, + false, + new EnvVariableFeatureFlags())); + + destination.start(destinationConfig, jobRoot, Collections.emptyMap()); + messages.forEach(message -> Exceptions.toRuntime(() -> + destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); + destination.notifyEndOfInput(); + + while (!destination.isFinished()) { + destination.attemptRead(); + } + + destination.close(); + } + + private static V0 convertProtocolObject(final V1 v1, final Class klass) { + return Jsons.object(Jsons.jsonNode(v1), klass); + } + +} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json new file mode 100644 index 000000000000..cc196c91f5e5 --- /dev/null +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json @@ -0,0 +1,23 @@ +{ + "type": "object", + "properties": { + "id1": { "type": "integer" }, + "id2": { "type": "integer" }, + "updated_at": { + "type": "string", + "airbyte_type": "timestamp_with_timezone" + }, + "_ab_cdc_deleted_at": { + "type": "string", + "airbyte_type": "timestamp_with_timezone" + }, + "name": { "type": "string" }, + "address": { + "type": "object", + "properties": { + "city": { "type": "string" }, + "state": { "type": "string" } + } + } + } +} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl new file mode 100644 index 000000000000..0b68fdcc802f --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl @@ -0,0 +1,2 @@ +{"_airbyte_extracted_at":"1970-01-01T00:00:01Z","_airbyte_meta":{"errors":[]},"id1":1,"id2":200,"updated_at":"2000-01-01T00:00:00Z","_ab_cdc_deleted_at":null,"name":"Alice","address":{"city":"San Francisco","state":"CA"}} +{"_airbyte_extracted_at":"1970-01-01T00:00:01Z","_airbyte_meta":{"errors":[]},"id1":1,"id2":200,"updated_at":"2000-01-01T00:01:00Z","_ab_cdc_deleted_at":null,"name":"Alice","address":{"city":"Los Angeles","state":"CA"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl new file mode 100644 index 000000000000..3010e4b5d73d --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl @@ -0,0 +1,3 @@ +{"_airbyte_extracted_at": 
"1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}} +// Note the duplicate record. In this sync mode, we don't dedup anything. +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl new file mode 100644 index 000000000000..629cea3e4da8 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl @@ -0,0 +1,5 @@ +// emitted_at:1000 is equal to 1970-01-01 00:00:01Z. This obviously makes no sense in relation to updated_at being in the year 2000 +// but that's OK because (from destinations POV) updated_at has no relation to emitted_at. +{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} +// Emit a second record for id=(1,200). This generally doesn't happen in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter (i.e. both records should be written to the final table). 
+{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} diff --git a/airbyte-integrations/connectors/destination-bigquery/build.gradle b/airbyte-integrations/connectors/destination-bigquery/build.gradle index b78504011245..3867ce22c210 100644 --- a/airbyte-integrations/connectors/destination-bigquery/build.gradle +++ b/airbyte-integrations/connectors/destination-bigquery/build.gradle @@ -28,6 +28,7 @@ dependencies { implementation ('com.github.airbytehq:json-avro-converter:1.1.0') { exclude group: 'ch.qos.logback', module: 'logback-classic'} testImplementation project(':airbyte-integrations:bases:standard-destination-test') + testImplementation project(':airbyte-integrations:bases:base-typing-deduping-test') integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-destination-test') integrationTestJavaImplementation project(':airbyte-integrations:connectors:destination-bigquery') diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java index ffba1f03cdcf..b4e7ba499f9b 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java @@ -168,7 +168,7 @@ private AirbyteConnectionStatus checkGcsPermission(final JsonNode config) { } } - protected BigQuery getBigQuery(final JsonNode config) { + public static BigQuery getBigQuery(final JsonNode config) { final String projectId = config.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(); try { 
diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index 3be1cecb0875..e6117f0a7813 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -9,7 +9,6 @@ import static org.junit.jupiter.api.Assertions.*; import com.fasterxml.jackson.databind.JsonNode; -import com.google.auth.oauth2.GoogleCredentials; import com.google.cloud.bigquery.*; import com.google.cloud.bigquery.Field.Mode; import com.google.common.collect.ImmutableMap; @@ -22,7 +21,6 @@ import io.airbyte.integrations.base.destination.typing_deduping.SqlGenerator.ColumnId; import io.airbyte.integrations.base.destination.typing_deduping.SqlGenerator.StreamId; import io.airbyte.integrations.destination.bigquery.BigQueryDestination; -import io.airbyte.integrations.destination.bigquery.BigQueryUtils; import io.airbyte.protocol.models.v0.DestinationSyncMode; import io.airbyte.protocol.models.v0.SyncMode; import java.math.BigDecimal; @@ -119,14 +117,7 @@ public static void setup() throws Exception { String rawConfig = Files.readString(Path.of("secrets/credentials-gcs-staging.json")); JsonNode config = Jsons.deserialize(rawConfig); - final BigQueryOptions.Builder bigQueryBuilder = BigQueryOptions.newBuilder(); - final GoogleCredentials credentials = BigQueryDestination.getServiceAccountCredentials(config); - bq = bigQueryBuilder - 
.setProjectId(config.get("project_id").asText()) - .setCredentials(credentials) - .setHeaderProvider(BigQueryUtils.getHeaderProvider()) - .build() - .getService(); + bq = BigQueryDestination.getBigQuery(config); } @BeforeEach @@ -903,8 +894,12 @@ private static void logAndExecute(final String sql) throws InterruptedException bq.query(QueryJobConfiguration.newBuilder(sql).build()); } - private Map toMap(Schema schema, FieldValueList row) { - final Map map = new HashMap<>(); + /** + * FieldValueList stores everything internally as string (I think?) but provides conversions to more useful types. + * This method does that conversion, using the schema to determine which type is most appropriate. + */ + private static LinkedHashMap toMap(Schema schema, FieldValueList row) { + final LinkedHashMap map = new LinkedHashMap<>(); for (int i = 0; i < schema.getFields().size(); i++) { final Field field = schema.getFields().get(i); final FieldValue value = row.get(i); @@ -939,13 +934,13 @@ private Map toMap(Schema schema, FieldValueList row) { * logs. */ private void assertQueryResult(final List>> expectedRows, final TableResult result) { - List> actualRows = result.streamAll().map(row -> toMap(result.getSchema(), row)).toList(); + List> actualRows = toMaps(result); List>> missingRows = new ArrayList<>(); Set> matchedRows = new HashSet<>(); boolean foundMultiMatch = false; // For each expected row, iterate through all actual rows to find a match. for (Map> expectedRow : expectedRows) { - final List> matchingRows = actualRows.stream().filter(actualRow -> { + final List> matchingRows = actualRows.stream().filter(actualRow -> { // We only want to check the fields that are specified in the expected row. // E.g.we shouldn't assert against randomized UUIDs. 
for (Entry> expectedEntry : expectedRow.entrySet()) { @@ -984,6 +979,17 @@ private void assertQueryResult(final List>> expecte } } + /** + * TableResult contains records in a somewhat nonintuitive format (and it avoids loading them all into memory). + * That's annoying for us since we're working with small test data, so pull everything into a list, and convert them + * into maps of column name -> value. + *

+ * Note that the values have reasonable types; see {@link #toMap(Schema, FieldValueList)} for details. + */ + public static List> toMaps(TableResult result) { + return result.streamAll().map(row -> toMap(result.getSchema(), row)).toList(); + } + private static String sortedToString(Map record) { return sortedToString(record, Function.identity()); } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java new file mode 100644 index 000000000000..03faa47ac72c --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -0,0 +1,82 @@ +package io.airbyte.integrations.destination.bigquery.typing_deduping; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.DatasetId; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableResult; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; +import io.airbyte.integrations.destination.bigquery.BigQueryDestination; +import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; +import java.io.IOException; +import java.nio.file.Path; +import java.time.Instant; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.BeforeAll; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +public class BigQueryStandardInsertsTypingDedupingTest extends BaseTypingDedupingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryStandardInsertsTypingDedupingTest.class); + + private static BigQuery bq; + + // Note that this is not an @Override, because it's a static method. I would love suggestions on how to do this better :) + @BeforeAll + public static void buildConfig() throws IOException { + final String datasetId = Strings.addRandomSuffix("typing_deduping_default_dataset", "_", 5); + LOGGER.info("Setting default dataset to {}", datasetId); + config = BigQueryDestinationTestUtils.createConfig(Path.of("secrets/credentials-1s1t-standard.json"), datasetId); + bq = BigQueryDestination.getBigQuery(config); + } + + @Override + protected String getImageName() { + return "airbyte/destination-bigquery:dev"; + } + + @Override + protected List dumpRawTableRecords(String streamNamespace, String streamName) throws InterruptedException { + TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM airbyte." + streamNamespace + "_" + streamName)); + List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); + return rowsAsMaps.stream().map(BigQueryStandardInsertsTypingDedupingTest::toJson).toList(); + } + + @Override + protected List dumpFinalTableRecords(String streamNamespace, String streamName) throws InterruptedException { + TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM " + streamNamespace + "." 
+ streamName)); + List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); + return rowsAsMaps.stream().map(BigQueryStandardInsertsTypingDedupingTest::toJson).toList(); + } + + @Override + protected void loadInitialRecords(String streamNamespace, String streamName, List initialRecords) { + // TODO + } + + @Override + protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { + bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); + bq.delete(DatasetId.of(streamNamespace), BigQuery.DatasetDeleteOption.deleteContents()); + } + + private static JsonNode toJson(LinkedHashMap map) { + ObjectNode o = (ObjectNode) Jsons.emptyObject(); + for (Map.Entry entry : map.entrySet()) { + Object value = entry.getValue(); + if (value instanceof Instant i) { + // naively serializing an Instant returns a DecimalNode with the unix epoch, so manually dump the string here. + o.set(entry.getKey(), Jsons.jsonNode(i.toString())); + } else { + o.set(entry.getKey(), Jsons.jsonNode(value)); + } + } + return o; + } +} diff --git a/settings.gradle b/settings.gradle index fe9e61d097f4..ecbf49bb4a6b 100644 --- a/settings.gradle +++ b/settings.gradle @@ -105,6 +105,7 @@ if (!System.getenv().containsKey("SUB_BUILD") || System.getenv().get("SUB_BUILD" include ':airbyte-integrations:bases:base-java-s3' include ':airbyte-integrations:bases:base-normalization' include ':airbyte-integrations:bases:base-typing-deduping' + include ':airbyte-integrations:bases:base-typing-deduping-test' include ':airbyte-integrations:bases:bases-destination-jdbc' // needs to be lexicographically after base-java and base-normalization to avoid race condition include ':airbyte-integrations:bases:base-standard-source-test-file' include ':airbyte-integrations:bases:connector-acceptance-test' From 639f77a0d7ddf40bb7e94de5c8951adbbc95eef9 Mon Sep 17 00:00:00 2001 From: edgao Date: Thu, 29 Jun 2023 22:53:21 +0000 Subject: [PATCH 02/46] Automated Commit - Formatting Changes 
--- .../BaseTypingDedupingTest.java | 84 +++++++++++-------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 62bd9bcaabfc..6e553e6a0d4b 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + package io.airbyte.integrations.base.destination.typing_deduping; import static org.junit.jupiter.api.Assertions.assertAll; @@ -39,15 +43,18 @@ import org.slf4j.LoggerFactory; /** - * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The sync-running code is copy-pasted from there. + * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The + * sync-running code is copy-pasted from there. *

- * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each test case constructs a - * ConfiguredAirbyteCatalog dynamically. + * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each + * test case constructs a ConfiguredAirbyteCatalog dynamically. *

- * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For sync modes which use a - * cursor, the stream provides an updated_at field. The stream also has an _ab_cdc_deleted_at field. + * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For + * sync modes which use a cursor, the stream provides an updated_at field. The stream also has an + * _ab_cdc_deleted_at field. */ public abstract class BaseTypingDedupingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator .comparingLong((JsonNode record) -> asInt(record.get("_airbyte_data").get("id1"))) @@ -66,10 +73,11 @@ public abstract class BaseTypingDedupingTest { private static ProcessFactory processFactory; /** - * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this field. + * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this + * field. *

- * That method should also start testcontainer(s), if you're using them. That test container will be used for all - * tests. This is safe because each test uses a randomized stream namespace+name. + * That method should also start testcontainer(s), if you're using them. That test container will be + * used for all tests. This is safe because each test uses a randomized stream namespace+name. */ protected static JsonNode config; @@ -82,27 +90,32 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * For a given stream, return the records that exist in the destination's raw table. This _should_ include metadata columns (e.g. _airbyte_raw_id). - * The {@code _airbyte_data} column MUST be an {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + * For a given stream, return the records that exist in the destination's raw table. This _should_ + * include metadata columns (e.g. _airbyte_raw_id). The {@code _airbyte_data} column MUST be an + * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; /** - * For a given stream, return the records that exist in the destination's final table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + * For a given stream, return the records that exist in the destination's final table. This _should_ + * include metadata columns (e.g. _airbyte_raw_id). */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; /** - * Create raw+final tables in the destinations as though a previous sync had loaded {@code initialRecords}. This method - * exists so that we don't need to run a sync just to load initial state, because that's both slow and error-prone. 
+ * Create raw+final tables in the destinations as though a previous sync had loaded + * {@code initialRecords}. This method exists so that we don't need to run a sync just to load + * initial state, because that's both slow and error-prone. */ protected abstract void loadInitialRecords(String streamNamespace, String streamName, List initialRecords) throws Exception; /** - * Delete any resources in the destination associated with this stream AND its namespace. We need this because we write - * raw tables to a shared {@code airbyte} namespace, which we can't drop wholesale. + * Delete any resources in the destination associated with this stream AND its namespace. We need + * this because we write raw tables to a shared {@code airbyte} namespace, which we can't drop + * wholesale. *

- * In general, this should resemble {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. + * In general, this should resemble + * {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; @@ -119,8 +132,8 @@ public void teardown() throws Exception { } /** - * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the records are written to - * the destination table. + * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the + * records are written to the destination table. */ @Test public void initialFullRefreshOverwrite() throws Exception { @@ -170,8 +183,7 @@ private void verifySyncResult(List expectedRawRecords, List assertAll( () -> assertTrue(rawDiff.isEmpty(), "Raw table was incorrect.\n" + rawDiff), - () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff) - ); + () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff)); } private static String diffRawTableRecords(List expectedRecords, List actualRecords) { @@ -183,16 +195,20 @@ private static String diffFinalTableRecords(List expectedRecords, List } /** - * Generate a human-readable diff between the two lists. Only checks the keys specified in expectedRecords. + * Generate a human-readable diff between the two lists. Only checks the keys specified in + * expectedRecords. * - * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, breaks that tie using _airbyte_raw_id + * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same + * PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, + * breaks that tie using _airbyte_raw_id * @return The diff, or empty string if there were no differences */ private static String diffRecords( - List originalExpectedRecords, - List originalActualRecords, - Comparator identityComparator, Comparator sortComparator) { + List originalExpectedRecords, + List originalActualRecords, + Comparator identityComparator, + Comparator sortComparator) { List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); @@ -230,11 +246,13 @@ private static String diffRecords( expectedRecordIndex++; actualRecordIndex++; } else if (compare < 0) { - // The expected record is missing from the actual records. Print it and move on to the next expected record. + // The expected record is missing from the actual records. Print it and move on to the next expected + // record. message += "Row was expected but missing: " + expectedRecord + "\n"; expectedRecordIndex++; } else { - // There's an actual record which isn't present in the expected records. Print it and move on to the next actual record. + // There's an actual record which isn't present in the expected records. Print it and move on to the + // next actual record. message += "Row was not expected but present: " + actualRecord + "\n"; actualRecordIndex++; } @@ -278,9 +296,9 @@ private static Instant asTimestamp(JsonNode node) { } } - /* !!!!!! WARNING !!!!!! - * The code below was mostly copypasted from DestinationAcceptanceTest. If you make edits here, you probably want to also edit there. - * !!!!!!!!!!!!!!!!!!!!! + /* + * !!!!!! WARNING !!!!!! The code below was mostly copypasted from DestinationAcceptanceTest. If you + * make edits here, you probably want to also edit there. !!!!!!!!!!!!!!!!!!!!! 
*/ private static Path jobRoot; @@ -317,8 +335,8 @@ private void runSync(ConfiguredAirbyteCatalog catalog, List mess new EnvVariableFeatureFlags())); destination.start(destinationConfig, jobRoot, Collections.emptyMap()); - messages.forEach(message -> Exceptions.toRuntime(() -> - destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); + messages.forEach( + message -> Exceptions.toRuntime(() -> destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); destination.notifyEndOfInput(); while (!destination.isFinished()) { From 3653da448fc5d132fea87ebd81d28d3041dfd914 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 29 Jun 2023 19:33:43 -0700 Subject: [PATCH 03/46] add second sync to test --- .../base-typing-deduping-test/build.gradle | 1 + .../BaseTypingDedupingTest.java | 184 +++++++++++------- .../src/main/resources/sync1_messages.jsonl | 5 +- ...drecords_fullrefresh_overwrite_final.jsonl | 2 + ...tedrecords_fullrefresh_overwrite_raw.jsonl | 2 + .../src/main/resources/sync2_messages.jsonl | 2 + .../typing_deduping/BigQuerySqlGenerator.java | 4 +- ...ueryStandardInsertsTypingDedupingTest.java | 13 +- 8 files changed, 134 insertions(+), 79 deletions(-) create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl diff --git a/airbyte-integrations/bases/base-typing-deduping-test/build.gradle b/airbyte-integrations/bases/base-typing-deduping-test/build.gradle index cfbf0a72d513..5c786c2f79c0 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/build.gradle +++ b/airbyte-integrations/bases/base-typing-deduping-test/build.gradle @@ 
-5,6 +5,7 @@ plugins { dependencies { implementation project(':airbyte-config-oss:config-models-oss') implementation project(':airbyte-connector-test-harnesses:acceptance-test-harness') + implementation project(':airbyte-integrations:bases:base-typing-deduping') implementation libs.airbyte.protocol implementation(enforcedPlatform('org.junit:junit-bom:5.8.2')) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 6e553e6a0d4b..059f29375dcc 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.Objects; import java.util.UUID; +import java.util.function.Function; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -43,18 +44,15 @@ import org.slf4j.LoggerFactory; /** - * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The - * sync-running code is copy-pasted from there. + * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The sync-running code is copy-pasted from there. *

- * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each - * test case constructs a ConfiguredAirbyteCatalog dynamically. + * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each test case constructs a + * ConfiguredAirbyteCatalog dynamically. *

- * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For - * sync modes which use a cursor, the stream provides an updated_at field. The stream also has an - * _ab_cdc_deleted_at field. + * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For sync modes which use a + * cursor, the stream provides an updated_at field. The stream also has an _ab_cdc_deleted_at field. */ public abstract class BaseTypingDedupingTest { - private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator .comparingLong((JsonNode record) -> asInt(record.get("_airbyte_data").get("id1"))) @@ -73,11 +71,10 @@ public abstract class BaseTypingDedupingTest { private static ProcessFactory processFactory; /** - * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this - * field. + * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this field. *

- * That method should also start testcontainer(s), if you're using them. That test container will be - * used for all tests. This is safe because each test uses a randomized stream namespace+name. + * That method should also start testcontainer(s), if you're using them. That test container will be used for all + * tests. This is safe because each test uses a randomized stream namespace+name. */ protected static JsonNode config; @@ -90,32 +87,25 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * For a given stream, return the records that exist in the destination's raw table. This _should_ - * include metadata columns (e.g. _airbyte_raw_id). The {@code _airbyte_data} column MUST be an - * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + * For a given stream, return the records that exist in the destination's raw table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + * The {@code _airbyte_data} column MUST be an {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + *

+ * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_loaded_at": "..", "_airbyte_data": {fields...}}. */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; /** - * For a given stream, return the records that exist in the destination's final table. This _should_ - * include metadata columns (e.g. _airbyte_raw_id). + * For a given stream, return the records that exist in the destination's final table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + *

+ * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": "..", "field1": ..., "field2": ..., ...}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; /** - * Create raw+final tables in the destinations as though a previous sync had loaded - * {@code initialRecords}. This method exists so that we don't need to run a sync just to load - * initial state, because that's both slow and error-prone. - */ - protected abstract void loadInitialRecords(String streamNamespace, String streamName, List initialRecords) throws Exception; - - /** - * Delete any resources in the destination associated with this stream AND its namespace. We need - * this because we write raw tables to a shared {@code airbyte} namespace, which we can't drop - * wholesale. + * Delete any resources in the destination associated with this stream AND its namespace. We need this because we write + * raw tables to a shared {@code airbyte} namespace, which we can't drop wholesale. *

- * In general, this should resemble - * {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. + * In general, this should resemble {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; @@ -132,11 +122,11 @@ public void teardown() throws Exception { } /** - * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the - * records are written to the destination table. + * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the records are written to + * the destination table. Then run a second sync, and verify that the records are overwritten. */ @Test - public void initialFullRefreshOverwrite() throws Exception { + public void fullRefreshOverwrite() throws Exception { ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( new ConfiguredAirbyteStream() .withSyncMode(SyncMode.FULL_REFRESH) @@ -145,13 +135,24 @@ public void initialFullRefreshOverwrite() throws Exception { .withNamespace(streamNamespace) .withName(streamName) .withJsonSchema(getSchema())))); - List messages = readMessages("sync1_messages.jsonl"); - runSync(catalog, messages); + // First sync + List messages1 = readMessages("sync1_messages.jsonl"); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_fullrefresh_overwrite_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1); - List expectedRawRecords = readRecords("sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl"); - List expectedFinalRecords = readRecords("sync1_expectedrecords_fullrefresh_overwrite_final.jsonl"); - verifySyncResult(expectedRawRecords, expectedFinalRecords); + // Second sync + List messages2 = 
readMessages("sync2_messages.jsonl"); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_fullrefresh_overwrite_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2); } private static JsonNode getSchema() throws IOException { @@ -183,32 +184,59 @@ private void verifySyncResult(List expectedRawRecords, List assertAll( () -> assertTrue(rawDiff.isEmpty(), "Raw table was incorrect.\n" + rawDiff), - () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff)); + () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff) + ); } private static String diffRawTableRecords(List expectedRecords, List actualRecords) { - return diffRecords(expectedRecords, actualRecords, RAW_RECORD_IDENTITY_COMPARATOR, RAW_RECORD_SORT_COMPARATOR); + return diffRecords( + expectedRecords, + actualRecords, + RAW_RECORD_IDENTITY_COMPARATOR, + RAW_RECORD_SORT_COMPARATOR, + record -> getFieldIfPresent(record.get("_airbyte_data"), "id1") + + getFieldIfPresent(record.get("_airbyte_data"), "id2") + + getFieldIfPresent(record.get("_airbyte_data"), "updated_at") + + getFieldIfPresent(record, "_airbyte_extracted_at"), + true); } private static String diffFinalTableRecords(List expectedRecords, List actualRecords) { - return diffRecords(expectedRecords, actualRecords, FINAL_RECORD_IDENTITY_COMPARATOR, FINAL_RECORD_SORT_COMPARATOR); + return diffRecords( + expectedRecords, + actualRecords, + FINAL_RECORD_IDENTITY_COMPARATOR, + FINAL_RECORD_SORT_COMPARATOR, + record -> getFieldIfPresent(record, "id1") + + getFieldIfPresent(record, "id2") + + getFieldIfPresent(record, "updated_at") + + getFieldIfPresent(record, "_airbyte_extracted_at"), + false); + } + + private static String getFieldIfPresent(JsonNode record, String field) { + if (record.has(field)) { + return field + "=" + record.get(field) + 
"; "; + } else { + return ""; + } } /** - * Generate a human-readable diff between the two lists. Only checks the keys specified in - * expectedRecords. + * Generate a human-readable diff between the two lists. Only checks the keys specified in expectedRecords. + * Assumes (in general) that two records with the same PK, cursor, and extracted_at are the same record. * - * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same - * PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, - * breaks that tie using _airbyte_raw_id + * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, breaks that tie using _airbyte_raw_id * @return The diff, or empty string if there were no differences */ private static String diffRecords( - List originalExpectedRecords, - List originalActualRecords, - Comparator identityComparator, - Comparator sortComparator) { + List originalExpectedRecords, + List originalActualRecords, + Comparator identityComparator, + Comparator sortComparator, + Function recordIdExtractor, + boolean extractRawData) { List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); @@ -225,18 +253,36 @@ private static String diffRecords( if (compare == 0) { // These records should be the same. Find the specific fields that are different. 
boolean foundMismatch = false; - String mismatchedRecordMessage = "Row had incorrect data:\n"; + String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; for (String key : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { - JsonNode expectedValue = expectedRecord.get(key); - JsonNode actualValue = actualRecord.get(key); - // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - if (!Objects.equals(expectedValue, actualValue) - // Objects.equals expects the two values to be the same class. - // We need to handle comparisons between e.g. LongNode and IntNode. - && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { - mismatchedRecordMessage += " For key " + key + ", expected " + expectedValue + " but got " + actualValue + "\n"; - foundMismatch = true; + if (extractRawData && "_airbyte_data".equals(key)) { + JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); + JsonNode actualRawData = actualRecord.get("_airbyte_data"); + for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { + JsonNode expectedValue = expectedRawData.get(field); + JsonNode actualValue = actualRawData.get(field); + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. + if (!Objects.equals(expectedValue, actualValue) + // Objects.equals expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. + && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { + mismatchedRecordMessage += " For _airbyte_data." 
+ field + ", expected " + expectedValue + " but got " + actualValue + "\n"; + foundMismatch = true; + } + } + } else { + JsonNode expectedValue = expectedRecord.get(key); + JsonNode actualValue = actualRecord.get(key); + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. + if (!Objects.equals(expectedValue, actualValue) + // Objects.equals expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. + && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { + mismatchedRecordMessage += " For key " + key + ", expected " + expectedValue + " but got " + actualValue + "\n"; + foundMismatch = true; + } } } if (foundMismatch) { @@ -246,13 +292,11 @@ private static String diffRecords( expectedRecordIndex++; actualRecordIndex++; } else if (compare < 0) { - // The expected record is missing from the actual records. Print it and move on to the next expected - // record. + // The expected record is missing from the actual records. Print it and move on to the next expected record. message += "Row was expected but missing: " + expectedRecord + "\n"; expectedRecordIndex++; } else { - // There's an actual record which isn't present in the expected records. Print it and move on to the - // next actual record. + // There's an actual record which isn't present in the expected records. Print it and move on to the next actual record. message += "Row was not expected but present: " + actualRecord + "\n"; actualRecordIndex++; } @@ -296,9 +340,9 @@ private static Instant asTimestamp(JsonNode node) { } } - /* - * !!!!!! WARNING !!!!!! The code below was mostly copypasted from DestinationAcceptanceTest. If you - * make edits here, you probably want to also edit there. !!!!!!!!!!!!!!!!!!!!! + /* !!!!!! WARNING !!!!!! 
+ * The code below was mostly copypasted from DestinationAcceptanceTest. If you make edits here, you probably want to also edit there. + * !!!!!!!!!!!!!!!!!!!!! */ private static Path jobRoot; @@ -335,8 +379,8 @@ private void runSync(ConfiguredAirbyteCatalog catalog, List mess new EnvVariableFeatureFlags())); destination.start(destinationConfig, jobRoot, Collections.emptyMap()); - messages.forEach( - message -> Exceptions.toRuntime(() -> destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); + messages.forEach(message -> Exceptions.toRuntime(() -> + destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); destination.notifyEndOfInput(); while (!destination.isFinished()) { diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl index 629cea3e4da8..b2fc2a1ea173 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl @@ -1,5 +1,6 @@ -// emitted_at:1000 is equal to 1970-01-01 00:00:01Z. This obviously makes no sense in relation to updated_at being in the year 2000 -// but that's OK because (from destinations POV) updated_at has no relation to emitted_at. +// emitted_at:1000 is equal to 1970-01-01 00:00:01Z, which is what you'll see in the expected records. +// This obviously makes no sense in relation to updated_at being in the year 2000, but that's OK +// because (from destinations POV) updated_at has no relation to emitted_at. {"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} // Emit a second record for id=(1,200). 
This generally doesn't happen in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter (i.e. both records should be written to the final table). {"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl new file mode 100644 index 000000000000..80df5e903881 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl @@ -0,0 +1,2 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl new file mode 100644 index 000000000000..6f48c9630b3d --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl @@ -0,0 +1,2 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", 
"address": {"city": "Seattle", "state": "WA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl new file mode 100644 index 000000000000..49d8f5a605eb --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl @@ -0,0 +1,2 @@ +{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}}} +{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}}} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java index f27305c84dfc..d32ecec9456b 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java @@ -69,7 +69,7 @@ public ColumnId buildColumnId(final String name) { return new ColumnId(nameTransformer.getIdentifier(quotedName), name, canonicalized); } - public StandardSQLTypeName toDialectType(final AirbyteType type) { + public 
static StandardSQLTypeName toDialectType(final AirbyteType type) { // switch pattern-matching is still in preview at language level 17 :( if (type instanceof final AirbyteProtocolType p) { return toDialectType(p); @@ -137,7 +137,7 @@ ELSE JSON_QUERY(`_airbyte_data`, '$.${column_name}') } } - public StandardSQLTypeName toDialectType(final AirbyteProtocolType airbyteProtocolType) { + public static StandardSQLTypeName toDialectType(final AirbyteProtocolType airbyteProtocolType) { return switch (airbyteProtocolType) { // TODO doublecheck these case STRING -> StandardSQLTypeName.STRING; diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 03faa47ac72c..1e75b3431a65 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -1,5 +1,7 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; +import static java.util.stream.Collectors.joining; + import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.cloud.bigquery.BigQuery; @@ -9,15 +11,21 @@ import com.google.cloud.bigquery.TableResult; import io.airbyte.commons.json.Jsons; import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType; +import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.Struct; 
import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; import io.airbyte.integrations.destination.bigquery.BigQueryDestination; import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import io.airbyte.protocol.models.v0.SyncMode; import java.io.IOException; import java.nio.file.Path; import java.time.Instant; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.text.StringSubstitutor; import org.junit.jupiter.api.BeforeAll; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,11 +63,6 @@ protected List dumpFinalTableRecords(String streamNamespace, String st return rowsAsMaps.stream().map(BigQueryStandardInsertsTypingDedupingTest::toJson).toList(); } - @Override - protected void loadInitialRecords(String streamNamespace, String streamName, List initialRecords) { - // TODO - } - @Override protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); From a4ba44a30fa0033a807b250a51467269c80dc097 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 29 Jun 2023 19:55:07 -0700 Subject: [PATCH 04/46] do concurrent things --- .../typing_deduping/BaseTypingDedupingTest.java | 15 ++++++++++----- ...BigQueryStandardInsertsTypingDedupingTest.java | 10 +--------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 059f29375dcc..5514be9862d7 100644 --- 
a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -37,9 +37,10 @@ import java.util.UUID; import java.util.function.Function; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,6 +53,9 @@ * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For sync modes which use a * cursor, the stream provides an updated_at field. The stream also has an _ab_cdc_deleted_at field. */ +// Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's build.gradle. +// See destination-bigquery for an example. +@Execution(ExecutionMode.CONCURRENT) public abstract class BaseTypingDedupingTest { private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator @@ -68,7 +72,6 @@ public abstract class BaseTypingDedupingTest { .thenComparing(record -> asTimestamp(record.get("_airbyte_extracted_at"))); private static final Comparator FINAL_RECORD_SORT_COMPARATOR = FINAL_RECORD_IDENTITY_COMPARATOR .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - private static ProcessFactory processFactory; /** * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this field. @@ -345,10 +348,12 @@ private static Instant asTimestamp(JsonNode node) { * !!!!!!!!!!!!!!!!!!!!! 
*/ - private static Path jobRoot; + private Path jobRoot; + // This contains some state, so it needs to be instanced per test (i.e. cannot be static) + private ProcessFactory processFactory; - @BeforeAll - public static void globalSetup() throws IOException { + @BeforeEach + public void setupProcessFactory() throws IOException { final Path testDir = Path.of("/tmp/airbyte_tests/"); Files.createDirectories(testDir); final Path workspaceRoot = Files.createTempDirectory(testDir, "test"); diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 1e75b3431a65..e5bc2fa51a73 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -1,7 +1,5 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; -import static java.util.stream.Collectors.joining; - import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.cloud.bigquery.BigQuery; @@ -11,21 +9,15 @@ import com.google.cloud.bigquery.TableResult; import io.airbyte.commons.json.Jsons; import io.airbyte.commons.string.Strings; -import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType; -import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.Struct; import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; import 
io.airbyte.integrations.destination.bigquery.BigQueryDestination; import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; -import io.airbyte.protocol.models.v0.DestinationSyncMode; -import io.airbyte.protocol.models.v0.SyncMode; import java.io.IOException; import java.nio.file.Path; import java.time.Instant; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; -import org.apache.commons.text.StringSubstitutor; import org.junit.jupiter.api.BeforeAll; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -74,7 +66,7 @@ private static JsonNode toJson(LinkedHashMap map) { for (Map.Entry entry : map.entrySet()) { Object value = entry.getValue(); if (value instanceof Instant i) { - // naively serializing an Instant returns a DecimalNode with the unix epoch, so manually dump the string here. + // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it o.set(entry.getKey(), Jsons.jsonNode(i.toString())); } else { o.set(entry.getKey(), Jsons.jsonNode(value)); From 8298bff2912d766a3dbf3c19f2013815817a7c40 Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 02:59:22 +0000 Subject: [PATCH 05/46] Automated Commit - Formatting Changes --- .../BaseTypingDedupingTest.java | 92 +++++++++++-------- 1 file changed, 54 insertions(+), 38 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 5514be9862d7..d80b98e481af 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -45,18 +45,22 @@ import org.slf4j.LoggerFactory; /** - * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The sync-running code is copy-pasted from there. + * This is loosely based on standard-destination-tests's DestinationAcceptanceTest class. The + * sync-running code is copy-pasted from there. *

- * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each test case constructs a - * ConfiguredAirbyteCatalog dynamically. + * All tests use a single stream, whose schema is defined in {@code resources/schema.json}. Each + * test case constructs a ConfiguredAirbyteCatalog dynamically. *

- * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For sync modes which use a - * cursor, the stream provides an updated_at field. The stream also has an _ab_cdc_deleted_at field. + * For sync modes which use a primary key, the stream provides a composite key of (id1, id2). For + * sync modes which use a cursor, the stream provides an updated_at field. The stream also has an + * _ab_cdc_deleted_at field. */ -// Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's build.gradle. +// Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's +// build.gradle. // See destination-bigquery for an example. @Execution(ExecutionMode.CONCURRENT) public abstract class BaseTypingDedupingTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator .comparingLong((JsonNode record) -> asInt(record.get("_airbyte_data").get("id1"))) @@ -74,10 +78,11 @@ public abstract class BaseTypingDedupingTest { .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); /** - * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this field. + * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this + * field. *

- * That method should also start testcontainer(s), if you're using them. That test container will be used for all - * tests. This is safe because each test uses a randomized stream namespace+name. + * That method should also start testcontainer(s), if you're using them. That test container will be + * used for all tests. This is safe because each test uses a randomized stream namespace+name. */ protected static JsonNode config; @@ -90,25 +95,31 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * For a given stream, return the records that exist in the destination's raw table. This _should_ include metadata columns (e.g. _airbyte_raw_id). - * The {@code _airbyte_data} column MUST be an {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + * For a given stream, return the records that exist in the destination's raw table. This _should_ + * include metadata columns (e.g. _airbyte_raw_id). The {@code _airbyte_data} column MUST be an + * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). *

- * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_loaded_at": "..", "_airbyte_data": {fields...}}. + * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", + * "_airbyte_loaded_at": "..", "_airbyte_data": {fields...}}. */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; /** - * For a given stream, return the records that exist in the destination's final table. This _should_ include metadata columns (e.g. _airbyte_raw_id). + * For a given stream, return the records that exist in the destination's final table. This _should_ + * include metadata columns (e.g. _airbyte_raw_id). *

- * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": "..", "field1": ..., "field2": ..., ...}. + * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", + * "_airbyte_meta": "..", "field1": ..., "field2": ..., ...}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; /** - * Delete any resources in the destination associated with this stream AND its namespace. We need this because we write - * raw tables to a shared {@code airbyte} namespace, which we can't drop wholesale. + * Delete any resources in the destination associated with this stream AND its namespace. We need + * this because we write raw tables to a shared {@code airbyte} namespace, which we can't drop + * wholesale. *

- * In general, this should resemble {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. + * In general, this should resemble + * {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; @@ -125,8 +136,9 @@ public void teardown() throws Exception { } /** - * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the records are written to - * the destination table. Then run a second sync, and verify that the records are overwritten. + * Starting with an empty destination, execute a full refresh overwrite sync. Verify that the + * records are written to the destination table. Then run a second sync, and verify that the records + * are overwritten. */ @Test public void fullRefreshOverwrite() throws Exception { @@ -187,8 +199,7 @@ private void verifySyncResult(List expectedRawRecords, List assertAll( () -> assertTrue(rawDiff.isEmpty(), "Raw table was incorrect.\n" + rawDiff), - () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff) - ); + () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff)); } private static String diffRawTableRecords(List expectedRecords, List actualRecords) { @@ -226,20 +237,23 @@ private static String getFieldIfPresent(JsonNode record, String field) { } /** - * Generate a human-readable diff between the two lists. Only checks the keys specified in expectedRecords. - * Assumes (in general) that two records with the same PK, cursor, and extracted_at are the same record. + * Generate a human-readable diff between the two lists. Only checks the keys specified in + * expectedRecords. Assumes (in general) that two records with the same PK, cursor, and extracted_at + * are the same record. * - * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, breaks that tie using _airbyte_raw_id + * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same + * PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, + * breaks that tie using _airbyte_raw_id * @return The diff, or empty string if there were no differences */ private static String diffRecords( - List originalExpectedRecords, - List originalActualRecords, - Comparator identityComparator, - Comparator sortComparator, - Function recordIdExtractor, - boolean extractRawData) { + List originalExpectedRecords, + List originalActualRecords, + Comparator identityComparator, + Comparator sortComparator, + Function recordIdExtractor, + boolean extractRawData) { List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); @@ -295,11 +309,13 @@ private static String diffRecords( expectedRecordIndex++; actualRecordIndex++; } else if (compare < 0) { - // The expected record is missing from the actual records. Print it and move on to the next expected record. + // The expected record is missing from the actual records. Print it and move on to the next expected + // record. message += "Row was expected but missing: " + expectedRecord + "\n"; expectedRecordIndex++; } else { - // There's an actual record which isn't present in the expected records. Print it and move on to the next actual record. + // There's an actual record which isn't present in the expected records. Print it and move on to the + // next actual record. message += "Row was not expected but present: " + actualRecord + "\n"; actualRecordIndex++; } @@ -343,9 +359,9 @@ private static Instant asTimestamp(JsonNode node) { } } - /* !!!!!! WARNING !!!!!! 
- * The code below was mostly copypasted from DestinationAcceptanceTest. If you make edits here, you probably want to also edit there. - * !!!!!!!!!!!!!!!!!!!!! + /* + * !!!!!! WARNING !!!!!! The code below was mostly copypasted from DestinationAcceptanceTest. If you + * make edits here, you probably want to also edit there. !!!!!!!!!!!!!!!!!!!!! */ private Path jobRoot; @@ -384,8 +400,8 @@ private void runSync(ConfiguredAirbyteCatalog catalog, List mess new EnvVariableFeatureFlags())); destination.start(destinationConfig, jobRoot, Collections.emptyMap()); - messages.forEach(message -> Exceptions.toRuntime(() -> - destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); + messages.forEach( + message -> Exceptions.toRuntime(() -> destination.accept(convertProtocolObject(message, io.airbyte.protocol.models.AirbyteMessage.class)))); destination.notifyEndOfInput(); while (!destination.isFinished()) { From 24d0ca475d8daa7500be29017a062ea172b6330d Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 29 Jun 2023 20:21:38 -0700 Subject: [PATCH 06/46] clarify comment --- .../src/main/resources/sync1_messages.jsonl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl index b2fc2a1ea173..0bb48b2cb1cf 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl @@ -2,5 +2,5 @@ // This obviously makes no sense in relation to updated_at being in the year 2000, but that's OK // because (from destinations POV) updated_at has no relation to emitted_at. 
{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} -// Emit a second record for id=(1,200). This generally doesn't happen in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter (i.e. both records should be written to the final table). +// Emit a second record for id=(1,200) with a different updated_at. This generally doesn't happen in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter (i.e. both records should be written to the final table). {"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} From d23ba3f607fd998430a16dea96dd73a4fb380f50 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 29 Jun 2023 20:28:11 -0700 Subject: [PATCH 07/46] minor tweaks --- .../destination/typing_deduping/BaseTypingDedupingTest.java | 5 +++-- .../src/main/resources/sync1_messages.jsonl | 4 +++- .../destination/bigquery/BigQueryDestinationTestUtils.java | 3 +++ .../BigQueryStandardInsertsTypingDedupingTest.java | 4 ---- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index d80b98e481af..d0cf73831955 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -125,8 +125,9 @@ public abstract class BaseTypingDedupingTest { @BeforeEach public void setup() { - streamNamespace = Strings.addRandomSuffix("typing_deduping_test_namespace", "_", 5); - streamName = Strings.addRandomSuffix("test_stream", "_", 5); + streamNamespace = Strings.addRandomSuffix("typing_deduping_test", "_", 5); + // we don't randomize this, because randomizing the namespace is sufficient. + streamName = "test_stream"; LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); } diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl index 0bb48b2cb1cf..1e0bee4bcd5a 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl @@ -2,5 +2,7 @@ // This obviously makes no sense in relation to updated_at being in the year 2000, but that's OK // because (from destinations POV) updated_at has no relation to emitted_at. {"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} -// Emit a second record for id=(1,200) with a different updated_at. This generally doesn't happen in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter (i.e. both records should be written to the final table). +// Emit a second record for id=(1,200) with a different updated_at. This generally doesn't happen +// in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter +// (i.e. 
both records should be written to the final table). {"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationTestUtils.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationTestUtils.java index cdc5e042078f..cac72e263a43 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationTestUtils.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/BigQueryDestinationTestUtils.java @@ -21,8 +21,10 @@ import java.util.LinkedList; import java.util.List; import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class BigQueryDestinationTestUtils { + private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDestinationTestUtils.class); /** * Parse the config file and replace dataset with datasetId randomly generated by the test @@ -33,6 +35,7 @@ public class BigQueryDestinationTestUtils { * @throws IOException */ public static JsonNode createConfig(Path configFile, String datasetId) throws IOException { + LOGGER.info("Setting default dataset to {}", datasetId); final String tmpConfigAsString = Files.readString(configFile); final JsonNode tmpConfigJson = Jsons.deserialize(tmpConfigAsString); return Jsons.jsonNode(((ObjectNode) tmpConfigJson).put(BigQueryConsts.CONFIG_DATASET_ID, datasetId)); diff --git 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index e5bc2fa51a73..1fd4d0eaba13 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -19,11 +19,8 @@ import java.util.List; import java.util.Map; import org.junit.jupiter.api.BeforeAll; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class BigQueryStandardInsertsTypingDedupingTest extends BaseTypingDedupingTest { - private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryStandardInsertsTypingDedupingTest.class); private static BigQuery bq; @@ -31,7 +28,6 @@ public class BigQueryStandardInsertsTypingDedupingTest extends BaseTypingDedupin @BeforeAll public static void buildConfig() throws IOException { final String datasetId = Strings.addRandomSuffix("typing_deduping_default_dataset", "_", 5); - LOGGER.info("Setting default dataset to {}", datasetId); config = BigQueryDestinationTestUtils.createConfig(Path.of("secrets/credentials-1s1t-standard.json"), datasetId); bq = BigQueryDestination.getBigQuery(config); } From 665fb3af7c56fd78f7847a796dafb6b40b7fad11 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 07:23:25 -0700 Subject: [PATCH 08/46] more stuff --- .../BaseTypingDedupingTest.java | 18 +++++++----------- ...QueryStandardInsertsTypingDedupingTest.java | 10 +++++----- 2 files changed, 12 insertions(+), 16 
deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index d0cf73831955..16a346b7aa0e 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -95,21 +95,17 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * For a given stream, return the records that exist in the destination's raw table. This _should_ - * include metadata columns (e.g. _airbyte_raw_id). The {@code _airbyte_data} column MUST be an - * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + * For a given stream, return the records that exist in the destination's raw table. Each record must be in the format + * {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_loaded_at": "...", "_airbyte_data": {fields...}}. *

- * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", - * "_airbyte_loaded_at": "..", "_airbyte_data": {fields...}}. + * The {@code _airbyte_data} column must be an {@link com.fasterxml.jackson.databind.node.ObjectNode} + * (i.e. it cannot be a string value). */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; /** - * For a given stream, return the records that exist in the destination's final table. This _should_ - * include metadata columns (e.g. _airbyte_raw_id). - *

- * Each record should be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", - * "_airbyte_meta": "..", "field1": ..., "field2": ..., ...}. + * For a given stream, return the records that exist in the destination's final table. Each record must be in the + * format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": {...}, "field1": ..., "field2": ..., ...}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; @@ -119,7 +115,7 @@ public abstract class BaseTypingDedupingTest { * wholesale. *

* In general, this should resemble - * {@code DROP TABLE airbyte.namespace_name; DROP SCHEMA namespace}. + * {@code DROP TABLE IF EXISTS airbyte.namespace_name; DROP SCHEMA IF EXISTS namespace}. */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 1fd4d0eaba13..4cb909e382d8 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -53,21 +53,21 @@ protected List dumpFinalTableRecords(String streamNamespace, String st @Override protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { + // bq.delete simply returns false if the table/schema doesn't exist (e.g. 
if the connector failed to create it) bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); bq.delete(DatasetId.of(streamNamespace), BigQuery.DatasetDeleteOption.deleteContents()); } private static JsonNode toJson(LinkedHashMap map) { ObjectNode o = (ObjectNode) Jsons.emptyObject(); - for (Map.Entry entry : map.entrySet()) { - Object value = entry.getValue(); + map.forEach((key, value) -> { if (value instanceof Instant i) { // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it - o.set(entry.getKey(), Jsons.jsonNode(i.toString())); + o.set(key, Jsons.jsonNode(i.toString())); } else { - o.set(entry.getKey(), Jsons.jsonNode(value)); + o.set(key, Jsons.jsonNode(value)); } - } + }); return o; } } From bd61d1331b741f709b941412e4ef53a714dbe232 Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 14:28:28 +0000 Subject: [PATCH 09/46] Automated Commit - Formatting Changes --- .../typing_deduping/BaseTypingDedupingTest.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 16a346b7aa0e..7dde276118fa 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -95,17 +95,19 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * For a given stream, return the records that exist in the destination's raw table. 
Each record must be in the format - * {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_loaded_at": "...", "_airbyte_data": {fields...}}. + * For a given stream, return the records that exist in the destination's raw table. Each record + * must be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", + * "_airbyte_loaded_at": "...", "_airbyte_data": {fields...}}. *

- * The {@code _airbyte_data} column must be an {@link com.fasterxml.jackson.databind.node.ObjectNode} - * (i.e. it cannot be a string value). + * The {@code _airbyte_data} column must be an + * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; /** - * For a given stream, return the records that exist in the destination's final table. Each record must be in the - * format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": {...}, "field1": ..., "field2": ..., ...}. + * For a given stream, return the records that exist in the destination's final table. Each record + * must be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": + * {...}, "field1": ..., "field2": ..., ...}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; From 9911b5b31b15d2ab89dafe9aade3e9793d2b1988 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 07:46:34 -0700 Subject: [PATCH 10/46] minor cleanup --- .../BaseTypingDedupingTest.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 7dde276118fa..4ddce2a28d4d 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -173,23 +173,24 @@ private static JsonNode 
getSchema() throws IOException { return Jsons.deserialize(MoreResources.readResource("schema.json")); } - private List readMessages(String filename) throws IOException { + private List readRecords(String filename) throws IOException { return MoreResources.readResource(filename).lines() + .map(String::trim) + .filter(line -> !line.isEmpty()) .filter(line -> !line.startsWith("//")) - .map(jsonString -> Jsons.deserialize(jsonString, AirbyteMessage.class)) + .map(Jsons::deserialize) + .toList(); + } + + private List readMessages(String filename) throws IOException { + return readRecords(filename).stream() + .map(record -> Jsons.convertValue(record, AirbyteMessage.class)) .peek(message -> { message.getRecord().setNamespace(streamNamespace); message.getRecord().setStream(streamName); }).toList(); } - private List readRecords(String filename) throws IOException { - return MoreResources.readResource(filename).lines() - .filter(line -> !line.startsWith("//")) - .map(Jsons::deserialize) - .toList(); - } - private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords) throws Exception { List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); String rawDiff = diffRawTableRecords(expectedRawRecords, actualRawRecords); From f06815ede564dcd647befa59da0d231881e771e4 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 09:57:00 -0700 Subject: [PATCH 11/46] lots of fixes * handle sql vs json null better * verify extra columns * only check deleted_at if in DEDUP mode and the column exists * add full refresh append test case --- .../BaseTypingDedupingTest.java | 117 ++++++++++++++---- .../src/main/resources/schema.json | 6 + ...drecords_fullrefresh_overwrite_final.jsonl | 2 - ...tedrecords_fullrefresh_overwrite_raw.jsonl | 3 - ...sync1_expectedrecords_nondedup_final.jsonl | 5 + .../sync1_expectedrecords_nondedup_raw.jsonl | 6 + .../src/main/resources/sync1_messages.jsonl | 8 +- ...ctedrecords_fullrefresh_append_final.jsonl | 8 ++ 
...pectedrecords_fullrefresh_append_raw.jsonl | 9 ++ ...drecords_fullrefresh_overwrite_final.jsonl | 5 +- ...tedrecords_fullrefresh_overwrite_raw.jsonl | 5 +- .../src/main/resources/sync2_messages.jsonl | 7 +- .../typing_deduping/BigQuerySqlGenerator.java | 23 ++-- .../BigQuerySqlGeneratorIntegrationTest.java | 6 +- ...ueryStandardInsertsTypingDedupingTest.java | 4 +- 15 files changed, 169 insertions(+), 45 deletions(-) delete mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl delete mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_final.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_raw.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_final.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_raw.jsonl diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 4ddce2a28d4d..948c5d18b613 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -32,7 +32,9 @@ import java.time.Instant; import 
java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.UUID; import java.util.function.Function; @@ -108,6 +110,12 @@ public abstract class BaseTypingDedupingTest { * For a given stream, return the records that exist in the destination's final table. Each record * must be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": * {...}, "field1": ..., "field2": ..., ...}. + *

+ * For JSON-valued columns, there is some nuance: a SQL null should be represented as a missing entry, whereas a JSON + * null should be represented as a {@link com.fasterxml.jackson.databind.node.NullNode}. For example, in the JSON blob + * {"name": null}, the `name` field is a JSON null, and the `address` field is a SQL null. + *

+ * The corresponding SQL looks like {@code INSERT INTO ... (name, address) VALUES ('null' :: jsonb, NULL)}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; @@ -124,8 +132,7 @@ public abstract class BaseTypingDedupingTest { @BeforeEach public void setup() { streamNamespace = Strings.addRandomSuffix("typing_deduping_test", "_", 5); - // we don't randomize this, because randomizing the namespace is sufficient. - streamName = "test_stream"; + streamName = Strings.addRandomSuffix("test_stream", "_", 5); LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); } @@ -155,8 +162,8 @@ public void fullRefreshOverwrite() throws Exception { runSync(catalog, messages1); - List expectedRawRecords1 = readRecords("sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl"); - List expectedFinalRecords1 = readRecords("sync1_expectedrecords_fullrefresh_overwrite_final.jsonl"); + List expectedRawRecords1 = readRecords("sync1_expectedrecords_nondedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_nondedup_final.jsonl"); verifySyncResult(expectedRawRecords1, expectedFinalRecords1); // Second sync @@ -169,6 +176,41 @@ public void fullRefreshOverwrite() throws Exception { verifySyncResult(expectedRawRecords2, expectedFinalRecords2); } + /** + * Starting with an empty destination, execute a full refresh append sync. Verify that the + * records are written to the destination table. Then run a second sync, and verify that the old and new records + * are all present. 
+ */ + @Test + public void fullRefreshAppend() throws Exception { + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withDestinationSyncMode(DestinationSyncMode.APPEND) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace) + .withName(streamName) + .withJsonSchema(getSchema())))); + + // First sync + List messages1 = readMessages("sync1_messages.jsonl"); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_nondedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_nondedup_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1); + + // Second sync + List messages2 = readMessages("sync2_messages.jsonl"); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_fullrefresh_append_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_fullrefresh_append_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2); + } + private static JsonNode getSchema() throws IOException { return Jsons.deserialize(MoreResources.readResource("schema.json")); } @@ -271,37 +313,42 @@ private static String diffRecords( // These records should be the same. Find the specific fields that are different. boolean foundMismatch = false; String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; - for (String key : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { - if (extractRawData && "_airbyte_data".equals(key)) { + // Iterate through each field in the expected record and compare it to the actual record's value. 
+ for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { + if (extractRawData && "_airbyte_data".equals(column)) { JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); JsonNode actualRawData = actualRecord.get("_airbyte_data"); for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { JsonNode expectedValue = expectedRawData.get(field); JsonNode actualValue = actualRawData.get(field); - // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - if (!Objects.equals(expectedValue, actualValue) - // Objects.equals expects the two values to be the same class. - // We need to handle comparisons between e.g. LongNode and IntNode. - && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { - mismatchedRecordMessage += " For _airbyte_data." + field + ", expected " + expectedValue + " but got " + actualValue + "\n"; + if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); + foundMismatch = true; + } + } + LinkedHashMap extraColumns = checkForExtraFields(expectedRawData, actualRawData); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); foundMismatch = true; } } } else { - JsonNode expectedValue = expectedRecord.get(key); - JsonNode actualValue = actualRecord.get(key); - // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - if (!Objects.equals(expectedValue, actualValue) - // Objects.equals expects the two values to be the same class. - // We need to handle comparisons between e.g. 
LongNode and IntNode. - && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble())) { - mismatchedRecordMessage += " For key " + key + ", expected " + expectedValue + " but got " + actualValue + "\n"; + JsonNode expectedValue = expectedRecord.get(column); + JsonNode actualValue = actualRecord.get(column); + if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); foundMismatch = true; } } } + LinkedHashMap extraColumns = checkForExtraFields(expectedRecord, actualRecord); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); + foundMismatch = true; + } + } if (foundMismatch) { message += mismatchedRecordMessage; } @@ -333,6 +380,32 @@ private static String diffRecords( return message; } + private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode actualValue) { + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. + return !Objects.equals(expectedValue, actualValue) + // Objects.equals expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. 
+ && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); + } + + private static LinkedHashMap checkForExtraFields(JsonNode expectedRecord, JsonNode actualRecord) { + LinkedHashMap extraFields = new LinkedHashMap<>(); + for (String column : Streams.stream(actualRecord.fieldNames()).sorted().toList()) { + // loaded_at and raw_id are generated dynamically, so we just ignore them. + if (!"_airbyte_loaded_at".equals(column) && !"_airbyte_raw_id".equals(column) && !expectedRecord.has(column)) { + extraFields.put(column, actualRecord.get(column)); + } + } + return extraFields; + } + + private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { + String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); + String actualString = actualValue == null ? "SQL NULL (i.e. 
no value)" : actualValue.toString(); + return " For " + fieldname + ", expected " + expectedString + " but got " + actualString + "\n"; + } + private static long asInt(JsonNode node) { if (node == null || !node.isIntegralNumber()) { return Integer.MIN_VALUE; diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json index cc196c91f5e5..e391324deaf7 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/schema.json @@ -18,6 +18,12 @@ "city": { "type": "string" }, "state": { "type": "string" } } + }, + "age": { "type": "integer" }, + "registration_date": { + "type": "string", + "format": "date", + "airbyte_type": "date" } } } diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl deleted file mode 100644 index 0b68fdcc802f..000000000000 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_final.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"_airbyte_extracted_at":"1970-01-01T00:00:01Z","_airbyte_meta":{"errors":[]},"id1":1,"id2":200,"updated_at":"2000-01-01T00:00:00Z","_ab_cdc_deleted_at":null,"name":"Alice","address":{"city":"San Francisco","state":"CA"}} -{"_airbyte_extracted_at":"1970-01-01T00:00:01Z","_airbyte_meta":{"errors":[]},"id1":1,"id2":200,"updated_at":"2000-01-01T00:01:00Z","_ab_cdc_deleted_at":null,"name":"Alice","address":{"city":"Los Angeles","state":"CA"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl deleted file mode 100644 index 3010e4b5d73d..000000000000 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_fullrefresh_overwrite_raw.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}} -// Note the duplicate record. In this sync mode, we don't dedup anything. -{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_final.jsonl new file mode 100644 index 000000000000..623527f41e75 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_final.jsonl @@ -0,0 +1,5 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}} +// Invalid columns are nulled out (i.e. 
SQL null, not JSON null) +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":["Problem with `age`", "Problem with `registration_date`"]}, "id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie"} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_raw.jsonl new file mode 100644 index 000000000000..4b4db08115e5 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_nondedup_raw.jsonl @@ -0,0 +1,6 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}} +// Note the duplicate record. In this sync mode, we don't dedup anything. +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}}} +// Invalid data is still allowed in the raw table. 
+{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": "this is not an integer", "registration_date": "this is not a date"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl index 1e0bee4bcd5a..4c5dec1a24ea 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_messages.jsonl @@ -1,8 +1,12 @@ // emitted_at:1000 is equal to 1970-01-01 00:00:01Z, which is what you'll see in the expected records. // This obviously makes no sense in relation to updated_at being in the year 2000, but that's OK // because (from destinations POV) updated_at has no relation to emitted_at. -{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} +{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}}} // Emit a second record for id=(1,200) with a different updated_at. This generally doesn't happen // in full refresh syncs - but if T+D is implemented correctly, it shouldn't matter // (i.e. both records should be written to the final table). 
-{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} +{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}}} +// Emit a record with no _ab_cdc_deleted_at field. CDC sources typically emit an explicit null, but we should handle both cases. +{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}}}} +// Emit a record with an invalid age. +{"type": "RECORD", "record": {"emitted_at": 1000, "data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": "this is not an integer", "registration_date": "this is not a date"}}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_final.jsonl new file mode 100644 index 000000000000..2e935f18f357 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_final.jsonl @@ -0,0 +1,8 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 
1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":["Problem with `age`", "Problem with `registration_date`"]}, "id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie"} + +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "name": "Bob", "address": {"city": "New York", "state": "NY"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_raw.jsonl new file mode 100644 index 000000000000..5cf2a7f389ce --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_append_raw.jsonl @@ -0,0 +1,9 @@ +// We keep the records from the first sync +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 201, 
"updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": "this is not an integer", "registration_date": "this is not a date"}} +// And append the records from the second sync +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl index 80df5e903881..0c06d6b00117 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_final.jsonl @@ -1,2 +1,3 @@ -{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} -{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": 
"Bob", "address": {"city": "New York", "state": "NY"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "name": "Bob", "address": {"city": "New York", "state": "NY"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl index 6f48c9630b3d..79554272b9a6 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_fullrefresh_overwrite_raw.jsonl @@ -1,2 +1,3 @@ -{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}} -{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, 
"updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl index 49d8f5a605eb..1f828f31f5d3 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_messages.jsonl @@ -1,2 +1,5 @@ -{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}}} -{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}}} +{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}}} +{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}}} +// Set deleted_at to something non-null. Again, T+D doesn't check the actual _value_ of deleted_at (i.e. the fact that it's in the past is irrelevant). +// It only cares whether deleted_at is non-null. So this should delete Bob from the final table (in dedup mode). 
+{"type": "RECORD", "record": {"emitted_at": 2000, "data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"}}} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java index d32ecec9456b..c4c3be8a91af 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java @@ -221,7 +221,7 @@ public String updateTable(final String finalSuffix, final StreamConfig stream) { if (stream.destinationSyncMode() == DestinationSyncMode.APPEND_DEDUP) { validatePrimaryKeys = validatePrimaryKeys(stream.id(), stream.primaryKey(), stream.columns()); } - final String insertNewRecords = insertNewRecords(stream.id(), finalSuffix, stream.columns()); + final String insertNewRecords = insertNewRecords(stream.id(), finalSuffix, stream.columns(), stream.destinationSyncMode()); String dedupFinalTable = ""; String dedupRawTable = ""; if (stream.destinationSyncMode() == DestinationSyncMode.APPEND_DEDUP) { @@ -283,7 +283,7 @@ SELECT COUNT(1) } @VisibleForTesting - String insertNewRecords(final StreamId id, final String finalSuffix, final LinkedHashMap streamColumns) { + String insertNewRecords(final StreamId id, final String finalSuffix, final LinkedHashMap streamColumns, DestinationSyncMode destinationSyncMode) { final String columnCasts = streamColumns.entrySet().stream().map( col -> extractAndCast(col.getKey(), col.getValue()) + " as " + col.getKey().name(QUOTE) + ",") .collect(joining("\n")); @@ -302,6 +302,17 @@ String 
insertNewRecords(final StreamId id, final String finalSuffix, final Linke END""")) .collect(joining(",\n")); final String columnList = streamColumns.keySet().stream().map(quotedColumnId -> quotedColumnId.name(QUOTE) + ",").collect(joining("\n")); + final String deletionClause; + if (destinationSyncMode == DestinationSyncMode.APPEND_DEDUP && streamColumns.keySet().stream().anyMatch(col -> "_ab_cdc_deleted_at".equals(col.originalName()))) { + deletionClause = """ + AND ( + JSON_QUERY(`_airbyte_data`, '$._ab_cdc_deleted_at') IS NULL + OR JSON_TYPE(JSON_QUERY(`_airbyte_data`, '$._ab_cdc_deleted_at')) = 'null' + ) + """; + } else { + deletionClause = ""; + } // Note that we intentionally excluded deleted records from this insert. See dedupRawRecords for an // explanation of how CDC deletes work. @@ -310,7 +321,8 @@ String insertNewRecords(final StreamId id, final String finalSuffix, final Linke "final_table_id", id.finalTableId(finalSuffix, QUOTE), "column_casts", columnCasts, "column_errors", columnErrors, - "column_list", columnList)).replace( + "column_list", columnList, + "deletion_clause", deletionClause)).replace( """ INSERT INTO ${final_table_id} ( @@ -330,10 +342,7 @@ WITH intermediate_data AS ( FROM ${raw_table_id} WHERE _airbyte_loaded_at IS NULL - AND ( - JSON_QUERY(`_airbyte_data`, '$._ab_cdc_deleted_at') IS NULL - OR JSON_TYPE(JSON_QUERY(`_airbyte_data`, '$._ab_cdc_deleted_at')) = 'null' - ) + ${deletion_clause} ) SELECT ${column_list} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index e6117f0a7813..ac4f3bf71a32 100644 --- 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -216,7 +216,7 @@ public void testInsertNewRecordsIncremental() throws InterruptedException { """)) .build()); - final String sql = GENERATOR.insertNewRecords(streamId, "", COLUMNS); + final String sql = GENERATOR.insertNewRecords(streamId, "", COLUMNS, DestinationSyncMode.OVERWRITE); logAndExecute(sql); final TableResult result = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.finalTableId(QUOTE)).build()); @@ -897,6 +897,8 @@ private static void logAndExecute(final String sql) throws InterruptedException /** * FieldValueList stores everything internally as string (I think?) but provides conversions to more useful types. * This method does that conversion, using the schema to determine which type is most appropriate. + *

+ * SQL nulls are represented as explicit null values. JSON nulls are represented as {@link com.fasterxml.jackson.databind.node.NullNode}. */ private static LinkedHashMap toMap(Schema schema, FieldValueList row) { final LinkedHashMap map = new LinkedHashMap<>(); @@ -904,7 +906,7 @@ private static LinkedHashMap toMap(Schema schema, FieldValueList final Field field = schema.getFields().get(i); final FieldValue value = row.get(i); Object typedValue; - if (value.getValue() == null) { + if (value.isNull()) { typedValue = null; } else { typedValue = switch (field.getType().getStandardType()) { diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 4cb909e382d8..0da1f2945c65 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -61,7 +61,9 @@ protected void teardownStreamAndNamespace(String streamNamespace, String streamN private static JsonNode toJson(LinkedHashMap map) { ObjectNode o = (ObjectNode) Jsons.emptyObject(); map.forEach((key, value) -> { - if (value instanceof Instant i) { + if (value == null) { + // If the value is null, do nothing. We don't want to insert it into the json at all. 
+ } else if (value instanceof Instant i) { // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it o.set(key, Jsons.jsonNode(i.toString())); } else { From d05365a89c09df4e042d7e6570758b0be2a1186a Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 17:00:58 +0000 Subject: [PATCH 12/46] Automated Commit - Formatting Changes --- .../typing_deduping/BaseTypingDedupingTest.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 948c5d18b613..861ac7e34c08 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -111,11 +111,13 @@ public abstract class BaseTypingDedupingTest { * must be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", "_airbyte_meta": * {...}, "field1": ..., "field2": ..., ...}. *

- * For JSON-valued columns, there is some nuance: a SQL null should be represented as a missing entry, whereas a JSON - * null should be represented as a {@link com.fasterxml.jackson.databind.node.NullNode}. For example, in the JSON blob - * {"name": null}, the `name` field is a JSON null, and the `address` field is a SQL null. + * For JSON-valued columns, there is some nuance: a SQL null should be represented as a missing + * entry, whereas a JSON null should be represented as a + * {@link com.fasterxml.jackson.databind.node.NullNode}. For example, in the JSON blob {"name": + * null}, the `name` field is a JSON null, and the `address` field is a SQL null. *

- * The corresponding SQL looks like {@code INSERT INTO ... (name, address) VALUES ('null' :: jsonb, NULL)}. + * The corresponding SQL looks like + * {@code INSERT INTO ... (name, address) VALUES ('null' :: jsonb, NULL)}. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; @@ -177,9 +179,9 @@ public void fullRefreshOverwrite() throws Exception { } /** - * Starting with an empty destination, execute a full refresh append sync. Verify that the - * records are written to the destination table. Then run a second sync, and verify that the old and new records - * are all present. + * Starting with an empty destination, execute a full refresh append sync. Verify that the records + * are written to the destination table. Then run a second sync, and verify that the old and new + * records are all present. */ @Test public void fullRefreshAppend() throws Exception { From 55887680a4c0319135b930942d4035744f438841 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 11:44:12 -0700 Subject: [PATCH 13/46] add tests for the remaining sync modes --- .../BaseTypingDedupingTest.java | 75 +++++++++++++++++++ .../sync1_expectedrecords_dedup_final.jsonl | 4 + .../sync1_expectedrecords_dedup_raw.jsonl | 4 + ...ectedrecords_incremental_dedup_final.jsonl | 3 + ...xpectedrecords_incremental_dedup_raw.jsonl | 5 ++ .../typing_deduping/BigQuerySqlGenerator.java | 2 +- 6 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_final.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_raw.jsonl create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_final.jsonl create mode 100644 
airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_raw.jsonl diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 861ac7e34c08..db43e67bab3f 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -213,6 +213,81 @@ public void fullRefreshAppend() throws Exception { verifySyncResult(expectedRawRecords2, expectedFinalRecords2); } + /** + * Starting with an empty destination, execute an incremental append sync. + *

+ * This is (not so secretly) identical to {@link #fullRefreshAppend()}, and uses the same set of expected records. + * Incremental as a concept only exists in the source. From the destination's perspective, we only care about the + * destination sync mode. + */ + @Test + public void incrementalAppend() throws Exception { + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + // These two lines are literally the only difference between this test and fullRefreshAppend + .withSyncMode(SyncMode.INCREMENTAL) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.APPEND) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace) + .withName(streamName) + .withJsonSchema(getSchema())))); + + // First sync + List messages1 = readMessages("sync1_messages.jsonl"); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_nondedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_nondedup_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1); + + // Second sync + List messages2 = readMessages("sync2_messages.jsonl"); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_fullrefresh_append_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_fullrefresh_append_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2); + } + + /** + * Starting with an empty destination, execute an incremental dedup sync. Verify that the records are written to the + * destination table. Then run a second sync, and verify that the raw/final tables contain the correct records. 
+ */ + @Test + public void incrementalDedup() throws Exception { + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.INCREMENTAL) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.APPEND_DEDUP) + .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace) + .withName(streamName) + .withJsonSchema(getSchema())))); + + // First sync + List messages1 = readMessages("sync1_messages.jsonl"); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_dedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1); + + // Second sync + List messages2 = readMessages("sync2_messages.jsonl"); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2); + } + private static JsonNode getSchema() throws IOException { return Jsons.deserialize(MoreResources.readResource("schema.json")); } diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_final.jsonl new file mode 100644 index 000000000000..e456f48d443a --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_final.jsonl @@ -0,0 +1,4 @@ +// Keep the Alice record with more recent updated_at +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", 
"name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":[]}, "id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":["Problem with `age`", "Problem with `registration_date`"]}, "id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie"} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_raw.jsonl new file mode 100644 index 000000000000..88411c9e4de3 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync1_expectedrecords_dedup_raw.jsonl @@ -0,0 +1,4 @@ +// Keep the Alice record with more recent updated_at +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": "this is not an integer", "registration_date": "this is not a date"}} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_final.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_final.jsonl new file mode 100644 index 000000000000..10cd001e22f6 --- /dev/null +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_final.jsonl @@ -0,0 +1,3 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_meta":{"errors":[]}, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +// Delete Bob, keep Charlie +{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_meta": {"errors":["Problem with `age`", "Problem with `registration_date`"]}, "id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie"} diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_raw.jsonl b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_raw.jsonl new file mode 100644 index 000000000000..bd79da0ea871 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sync2_expectedrecords_incremental_dedup_raw.jsonl @@ -0,0 +1,5 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}} +// Keep the record that deleted Bob, but delete the other records associated with id=(1, 201) +{"_airbyte_extracted_at": "1970-01-01T00:00:02Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01:00:00:00Z"}} +// And keep Charlie's record, even though it wasn't reemitted in sync2. 
+{"_airbyte_extracted_at": "1970-01-01T00:00:01Z", "_airbyte_data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": "this is not an integer", "registration_date": "this is not a date"}} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java index c4c3be8a91af..93f21b472f89 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java @@ -386,7 +386,7 @@ String dedupFinalTable(final StreamId id, WHERE row_number != 1 ) OR ( - ${pk_list} IN ( + (${pk_list}) IN ( SELECT ( ${pk_cast_list} ) From 80f8d9046a7a23f97a295e4a12847d629c53edb7 Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 18:49:35 +0000 Subject: [PATCH 14/46] Automated Commit - Formatting Changes --- .../typing_deduping/BaseTypingDedupingTest.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index db43e67bab3f..5a8873440423 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -216,9 +216,9 @@ public void fullRefreshAppend() throws Exception { /** * Starting with an empty destination, execute an incremental append sync. *

- * This is (not so secretly) identical to {@link #fullRefreshAppend()}, and uses the same set of expected records. - * Incremental as a concept only exists in the source. From the destination's perspective, we only care about the - * destination sync mode. + * This is (not so secretly) identical to {@link #fullRefreshAppend()}, and uses the same set of + * expected records. Incremental as a concept only exists in the source. From the destination's + * perspective, we only care about the destination sync mode. */ @Test public void incrementalAppend() throws Exception { @@ -253,8 +253,9 @@ public void incrementalAppend() throws Exception { } /** - * Starting with an empty destination, execute an incremental dedup sync. Verify that the records are written to the - * destination table. Then run a second sync, and verify that the raw/final tables contain the correct records. + * Starting with an empty destination, execute an incremental dedup sync. Verify that the records + * are written to the destination table. Then run a second sync, and verify that the raw/final + * tables contain the correct records. 
*/ @Test public void incrementalDedup() throws Exception { From 73b9e9014bb560ac38c9d6df878b98e091541b0e Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 13:04:23 -0700 Subject: [PATCH 15/46] readability stuff --- .../BaseTypingDedupingTest.java | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 5a8873440423..3db76a63de18 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -365,10 +365,11 @@ private static String getFieldIfPresent(JsonNode record, String field) { * PK+cursor+extracted_at) * @param sortComparator Behaves identically to identityComparator, but if two records are the same, * breaks that tie using _airbyte_raw_id + * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string + * @param extractRawData Whether to look inside the _airbyte_data column and diff its subfields * @return The diff, or empty string if there were no differences */ - private static String diffRecords( - List originalExpectedRecords, + private static String diffRecords(List originalExpectedRecords, List originalActualRecords, Comparator identityComparator, Comparator sortComparator, @@ -379,7 +380,6 @@ private static String diffRecords( // Iterate through both lists in parallel and compare each record. // Build up an error message listing any incorrect, missing, or unexpected records. 
- // Not a true diff, but close enough. String message = ""; int expectedRecordIndex = 0; int actualRecordIndex = 0; @@ -391,11 +391,13 @@ private static String diffRecords( // These records should be the same. Find the specific fields that are different. boolean foundMismatch = false; String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; - // Iterate through each field in the expected record and compare it to the actual record's value. + // Iterate through each column in the expected record and compare it to the actual record's value. for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { if (extractRawData && "_airbyte_data".equals(column)) { + // For the raw data in particular, we should also diff the fields inside _airbyte_data. JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); JsonNode actualRawData = actualRecord.get("_airbyte_data"); + // Iterate through all the subfields of the expected raw data and check that they match the actual record... for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { JsonNode expectedValue = expectedRawData.get(field); JsonNode actualValue = actualRawData.get(field); @@ -404,7 +406,8 @@ private static String diffRecords( foundMismatch = true; } } - LinkedHashMap extraColumns = checkForExtraFields(expectedRawData, actualRawData); + // ... and then check the actual raw data for any subfields that we weren't expecting. + LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); if (extraColumns.size() > 0) { for (Map.Entry extraColumn : extraColumns.entrySet()) { mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); @@ -412,6 +415,7 @@ private static String diffRecords( } } } else { + // For all other columns, we can just compare their values directly. 
JsonNode expectedValue = expectedRecord.get(column); JsonNode actualValue = actualRecord.get(column); if (jsonNodesNotEquivalent(expectedValue, actualValue)) { @@ -420,7 +424,8 @@ private static String diffRecords( } } } - LinkedHashMap extraColumns = checkForExtraFields(expectedRecord, actualRecord); + // Then check the entire actual record for any columns that we weren't expecting. + LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRecord, actualRecord); if (extraColumns.size() > 0) { for (Map.Entry extraColumn : extraColumns.entrySet()) { mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); @@ -467,7 +472,15 @@ private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode a && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); } - private static LinkedHashMap checkForExtraFields(JsonNode expectedRecord, JsonNode actualRecord) { + /** + * Verify that all fields in the actual record are present in the expected record. This is primarily relevant for + * detecting fields that we expected to be null, but actually were not. See {@link #dumpFinalTableRecords(String, String)} + * for an explanation of how SQL/JSON nulls are represented in the expected record. + *

+ * This has the side benefit of detecting completely unexpected columns, which would be a very weird bug but is + * probably still useful to catch. + */ + private static LinkedHashMap checkForExtraOrNonNullFields(JsonNode expectedRecord, JsonNode actualRecord) { LinkedHashMap extraFields = new LinkedHashMap<>(); for (String column : Streams.stream(actualRecord.fieldNames()).sorted().toList()) { // loaded_at and raw_id are generated dynamically, so we just ignore them. @@ -478,15 +491,19 @@ private static LinkedHashMap checkForExtraFields(JsonNode expe return extraFields; } + /** + * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". It's indented intentionally. + */ private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); String actualString = actualValue == null ? "SQL NULL (i.e. no value)" : actualValue.toString(); return " For " + fieldname + ", expected " + expectedString + " but got " + actualString + "\n"; } + // These asFoo methods are used for sorting records, so their defaults are intended to make broken records stand out. private static long asInt(JsonNode node) { if (node == null || !node.isIntegralNumber()) { - return Integer.MIN_VALUE; + return Long.MIN_VALUE; } else { return node.longValue(); } @@ -512,11 +529,11 @@ private static Instant asTimestamp(JsonNode node) { /* * !!!!!! WARNING !!!!!! The code below was mostly copypasted from DestinationAcceptanceTest. If you - * make edits here, you probably want to also edit there. !!!!!!!!!!!!!!!!!!!!! + * make edits here, you probably want to also edit there. */ + // These contain some state, so they are instanced per test (i.e. cannot be static) private Path jobRoot; - // This contains some state, so it needs to be instanced per test (i.e. 
cannot be static) private ProcessFactory processFactory; @BeforeEach From a40935b2e85432b82e2cedc7e885d413ca4fcbcc Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 20:09:38 +0000 Subject: [PATCH 16/46] Automated Commit - Formatting Changes --- .../BaseTypingDedupingTest.java | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 3db76a63de18..025758bb2179 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -397,7 +397,8 @@ private static String diffRecords(List originalExpectedRecords, // For the raw data in particular, we should also diff the fields inside _airbyte_data. JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); JsonNode actualRawData = actualRecord.get("_airbyte_data"); - // Iterate through all the subfields of the expected raw data and check that they match the actual record... + // Iterate through all the subfields of the expected raw data and check that they match the actual + // record... for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { JsonNode expectedValue = expectedRawData.get(field); JsonNode actualValue = actualRawData.get(field); @@ -473,12 +474,13 @@ private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode a } /** - * Verify that all fields in the actual record are present in the expected record. 
This is primarily relevant for - * detecting fields that we expected to be null, but actually were not. See {@link #dumpFinalTableRecords(String, String)} - * for an explanation of how SQL/JSON nulls are represented in the expected record. + * Verify that all fields in the actual record are present in the expected record. This is primarily + * relevant for detecting fields that we expected to be null, but actually were not. See + * {@link #dumpFinalTableRecords(String, String)} for an explanation of how SQL/JSON nulls are + * represented in the expected record. *

- * This has the side benefit of detecting completely unexpected columns, which would be a very weird bug but is - * probably still useful to catch. + * This has the side benefit of detecting completely unexpected columns, which would be a very weird + * bug but is probably still useful to catch. */ private static LinkedHashMap checkForExtraOrNonNullFields(JsonNode expectedRecord, JsonNode actualRecord) { LinkedHashMap extraFields = new LinkedHashMap<>(); @@ -492,7 +494,8 @@ private static LinkedHashMap checkForExtraOrNonNullFields(Json } /** - * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". It's indented intentionally. + * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". It's + * indented intentionally. */ private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); @@ -500,7 +503,8 @@ private static String generateFieldError(String fieldname, JsonNode expectedValu return " For " + fieldname + ", expected " + expectedString + " but got " + actualString + "\n"; } - // These asFoo methods are used for sorting records, so their defaults are intended to make broken records stand out. + // These asFoo methods are used for sorting records, so their defaults are intended to make broken + // records stand out. 
private static long asInt(JsonNode node) { if (node == null || !node.isIntegralNumber()) { return Long.MIN_VALUE; From 7bb4b2cf57b2eaee744d460b930a928a4035271d Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 13:18:30 -0700 Subject: [PATCH 17/46] add test for gcs mode --- .../AbstractBigQueryTypingDedupingTest.java | 75 +++++++++++++++++++ .../BigQueryGcsTypingDedupingTest.java | 12 +++ ...ueryStandardInsertsTypingDedupingTest.java | 66 +--------------- 3 files changed, 89 insertions(+), 64 deletions(-) create mode 100644 airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java create mode 100644 airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java new file mode 100644 index 000000000000..2ec980e3cb60 --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java @@ -0,0 +1,75 @@ +package io.airbyte.integrations.destination.bigquery.typing_deduping; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.DatasetId; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableResult; +import 
io.airbyte.commons.json.Jsons; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; +import io.airbyte.integrations.destination.bigquery.BigQueryDestination; +import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; +import java.io.IOException; +import java.nio.file.Path; +import java.time.Instant; +import java.util.LinkedHashMap; +import java.util.List; + +public abstract class AbstractBigQueryTypingDedupingTest extends BaseTypingDedupingTest { + + private static BigQuery bq; + + /** + * Subclasses should call this in an @BeforeAll block rather than directly setting {@see BaseTypingDedupingTest#config}. + */ + protected static void setConfig(String configPath) throws IOException { + final String datasetId = Strings.addRandomSuffix("typing_deduping_default_dataset", "_", 5); + config = BigQueryDestinationTestUtils.createConfig(Path.of(configPath), datasetId); + bq = BigQueryDestination.getBigQuery(config); + } + + @Override + protected String getImageName() { + return "airbyte/destination-bigquery:dev"; + } + + @Override + protected List dumpRawTableRecords(String streamNamespace, String streamName) throws InterruptedException { + TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM airbyte." + streamNamespace + "_" + streamName)); + List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); + return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); + } + + @Override + protected List dumpFinalTableRecords(String streamNamespace, String streamName) throws InterruptedException { + TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM " + streamNamespace + "." 
+ streamName)); + List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); + return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); + } + + @Override + protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { + // bq.delete simply returns false if the table/schema doesn't exist (e.g. if the connector failed to create it) + // so we don't need to do any existence checks here. + bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); + bq.delete(DatasetId.of(streamNamespace), BigQuery.DatasetDeleteOption.deleteContents()); + } + + private static JsonNode toJson(LinkedHashMap map) { + ObjectNode o = (ObjectNode) Jsons.emptyObject(); + map.forEach((key, value) -> { + if (value == null) { + // If the value is null, do nothing. We don't want to insert it into the json at all. + } else if (value instanceof Instant i) { + // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it + o.set(key, Jsons.jsonNode(i.toString())); + } else { + o.set(key, Jsons.jsonNode(value)); + } + }); + return o; + } +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java new file mode 100644 index 000000000000..b79d783df3de --- /dev/null +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java @@ -0,0 +1,12 @@ +package io.airbyte.integrations.destination.bigquery.typing_deduping; + +import java.io.IOException; +import org.junit.jupiter.api.BeforeAll; + +public class BigQueryGcsTypingDedupingTest extends 
AbstractBigQueryTypingDedupingTest { + + @BeforeAll + public static void buildConfig() throws IOException { + setConfig("secrets/credentials-1s1t-gcs.json"); + } +} diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 0da1f2945c65..16ed7cabd58e 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -1,75 +1,13 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.DatasetId; -import com.google.cloud.bigquery.QueryJobConfiguration; -import com.google.cloud.bigquery.TableId; -import com.google.cloud.bigquery.TableResult; -import io.airbyte.commons.json.Jsons; -import io.airbyte.commons.string.Strings; -import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; -import io.airbyte.integrations.destination.bigquery.BigQueryDestination; -import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; import java.io.IOException; -import java.nio.file.Path; -import java.time.Instant; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; import org.junit.jupiter.api.BeforeAll; -public class BigQueryStandardInsertsTypingDedupingTest 
extends BaseTypingDedupingTest { - - private static BigQuery bq; +public class BigQueryStandardInsertsTypingDedupingTest extends AbstractBigQueryTypingDedupingTest { // Note that this is not an @Override, because it's a static method. I would love suggestions on how to do this better :) @BeforeAll public static void buildConfig() throws IOException { - final String datasetId = Strings.addRandomSuffix("typing_deduping_default_dataset", "_", 5); - config = BigQueryDestinationTestUtils.createConfig(Path.of("secrets/credentials-1s1t-standard.json"), datasetId); - bq = BigQueryDestination.getBigQuery(config); - } - - @Override - protected String getImageName() { - return "airbyte/destination-bigquery:dev"; - } - - @Override - protected List dumpRawTableRecords(String streamNamespace, String streamName) throws InterruptedException { - TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM airbyte." + streamNamespace + "_" + streamName)); - List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); - return rowsAsMaps.stream().map(BigQueryStandardInsertsTypingDedupingTest::toJson).toList(); - } - - @Override - protected List dumpFinalTableRecords(String streamNamespace, String streamName) throws InterruptedException { - TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM " + streamNamespace + "." + streamName)); - List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); - return rowsAsMaps.stream().map(BigQueryStandardInsertsTypingDedupingTest::toJson).toList(); - } - - @Override - protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { - // bq.delete simply returns false if the table/schema doesn't exist (e.g. 
if the connector failed to create it) - bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); - bq.delete(DatasetId.of(streamNamespace), BigQuery.DatasetDeleteOption.deleteContents()); - } - - private static JsonNode toJson(LinkedHashMap map) { - ObjectNode o = (ObjectNode) Jsons.emptyObject(); - map.forEach((key, value) -> { - if (value == null) { - // If the value is null, do nothing. We don't want to insert it into the json at all. - } else if (value instanceof Instant i) { - // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it - o.set(key, Jsons.jsonNode(i.toString())); - } else { - o.set(key, Jsons.jsonNode(value)); - } - }); - return o; + setConfig("secrets/credentials-1s1t-standard.json"); } } From 4da21d1866e85e22e42887c86cc54654a8dc192b Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 14:21:12 -0700 Subject: [PATCH 18/46] remove static fields --- .../BaseTypingDedupingTest.java | 40 +++++++++++++------ .../destination-bigquery/build.gradle | 1 - .../AbstractBigQueryTypingDedupingTest.java | 16 ++++---- .../BigQueryGcsTypingDedupingTest.java | 9 ++--- ...ueryStandardInsertsTypingDedupingTest.java | 10 ++--- 5 files changed, 41 insertions(+), 35 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 025758bb2179..8a3994ce1701 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -13,7 +13,6 @@ import 
io.airbyte.commons.json.Jsons; import io.airbyte.commons.lang.Exceptions; import io.airbyte.commons.resources.MoreResources; -import io.airbyte.commons.string.Strings; import io.airbyte.configoss.WorkerDestinationConfig; import io.airbyte.protocol.models.v0.AirbyteMessage; import io.airbyte.protocol.models.v0.AirbyteStream; @@ -38,6 +37,7 @@ import java.util.Objects; import java.util.UUID; import java.util.function.Function; +import org.apache.commons.lang3.RandomStringUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -60,6 +60,7 @@ // Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's // build.gradle. // See destination-bigquery for an example. +// If you're running from inside intellij, you must run your specific subclass to get concurrent execution. @Execution(ExecutionMode.CONCURRENT) public abstract class BaseTypingDedupingTest { @@ -79,15 +80,8 @@ public abstract class BaseTypingDedupingTest { private static final Comparator FINAL_RECORD_SORT_COMPARATOR = FINAL_RECORD_IDENTITY_COMPARATOR .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - /** - * Subclasses MUST implement a static {@link org.junit.jupiter.api.BeforeAll} method that sets this - * field. - *

- * That method should also start testcontainer(s), if you're using them. That test container will be - * used for all tests. This is safe because each test uses a randomized stream namespace+name. - */ - protected static JsonNode config; - + private String randomSuffix; + private JsonNode config; private String streamNamespace; private String streamName; @@ -96,6 +90,15 @@ public abstract class BaseTypingDedupingTest { */ protected abstract String getImageName(); + /** + * Get the destination connector config. Subclasses may use this method for other setup work, e.g. opening a connection + * to the destination. + *

+ * Subclasses should _not_ start testcontainers in this method; that belongs in a BeforeAll method. The tests in this + * class are intended to be run concurrently on a shared database and will not interfere with each other. + */ + protected abstract JsonNode getConfig() throws Exception; + /** * For a given stream, return the records that exist in the destination's raw table. Each record * must be in the format {"_airbyte_raw_id": "...", "_airbyte_extracted_at": "...", @@ -131,10 +134,21 @@ public abstract class BaseTypingDedupingTest { */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; + /** + * @return A suffix which is different for each concurrent test run. + */ + protected synchronized String getUniqueSuffix() { + if (randomSuffix == null) { + randomSuffix = "_" + RandomStringUtils.randomAlphabetic(5).toLowerCase(); + } + return randomSuffix; + } + @BeforeEach - public void setup() { - streamNamespace = Strings.addRandomSuffix("typing_deduping_test", "_", 5); - streamName = Strings.addRandomSuffix("test_stream", "_", 5); + public void setup() throws Exception { + config = getConfig(); + streamNamespace = "typing_deduping_test" + getUniqueSuffix(); + streamName = "test_stream" + getUniqueSuffix(); LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); } diff --git a/airbyte-integrations/connectors/destination-bigquery/build.gradle b/airbyte-integrations/connectors/destination-bigquery/build.gradle index 3867ce22c210..2229ad250b72 100644 --- a/airbyte-integrations/connectors/destination-bigquery/build.gradle +++ b/airbyte-integrations/connectors/destination-bigquery/build.gradle @@ -60,4 +60,3 @@ integrationTestJava { // 'junit.jupiter.execution.parallel.mode.default': 'concurrent' ] } - diff --git 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java index 2ec980e3cb60..ea0f99c0632c 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java @@ -8,7 +8,6 @@ import com.google.cloud.bigquery.TableId; import com.google.cloud.bigquery.TableResult; import io.airbyte.commons.json.Jsons; -import io.airbyte.commons.string.Strings; import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; import io.airbyte.integrations.destination.bigquery.BigQueryDestination; import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; @@ -20,15 +19,16 @@ public abstract class AbstractBigQueryTypingDedupingTest extends BaseTypingDedupingTest { - private static BigQuery bq; + private BigQuery bq; - /** - * Subclasses should call this in an @BeforeAll block rather than directly setting {@see BaseTypingDedupingTest#config}. 
- */ - protected static void setConfig(String configPath) throws IOException { - final String datasetId = Strings.addRandomSuffix("typing_deduping_default_dataset", "_", 5); - config = BigQueryDestinationTestUtils.createConfig(Path.of(configPath), datasetId); + protected abstract String getConfigPath(); + + @Override + public JsonNode getConfig() throws IOException { + final String datasetId = "typing_deduping_default_dataset" + getUniqueSuffix(); + JsonNode config = BigQueryDestinationTestUtils.createConfig(Path.of(getConfigPath()), datasetId); bq = BigQueryDestination.getBigQuery(config); + return config; } @Override diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java index b79d783df3de..df201d6c687b 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryGcsTypingDedupingTest.java @@ -1,12 +1,9 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; -import java.io.IOException; -import org.junit.jupiter.api.BeforeAll; - public class BigQueryGcsTypingDedupingTest extends AbstractBigQueryTypingDedupingTest { - @BeforeAll - public static void buildConfig() throws IOException { - setConfig("secrets/credentials-1s1t-gcs.json"); + @Override + public String getConfigPath() { + return "secrets/credentials-1s1t-gcs.json"; } } diff --git 
a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java index 16ed7cabd58e..be86379f2719 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryStandardInsertsTypingDedupingTest.java @@ -1,13 +1,9 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; -import java.io.IOException; -import org.junit.jupiter.api.BeforeAll; - public class BigQueryStandardInsertsTypingDedupingTest extends AbstractBigQueryTypingDedupingTest { - // Note that this is not an @Override, because it's a static method. 
I would love suggestions on how to do this better :) - @BeforeAll - public static void buildConfig() throws IOException { - setConfig("secrets/credentials-1s1t-standard.json"); + @Override + public String getConfigPath() { + return "secrets/credentials-1s1t-standard.json"; } } From a8fa7d40bb2988391d218792f4bdfd1ed43f87a6 Mon Sep 17 00:00:00 2001 From: octavia-approvington Date: Fri, 30 Jun 2023 21:26:21 +0000 Subject: [PATCH 19/46] Automated Commit - Formatting Changes --- .../typing_deduping/BaseTypingDedupingTest.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 8a3994ce1701..9b4dd9b6409f 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -60,7 +60,8 @@ // Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's // build.gradle. // See destination-bigquery for an example. -// If you're running from inside intellij, you must run your specific subclass to get concurrent execution. +// If you're running from inside intellij, you must run your specific subclass to get concurrent +// execution. @Execution(ExecutionMode.CONCURRENT) public abstract class BaseTypingDedupingTest { @@ -91,11 +92,12 @@ public abstract class BaseTypingDedupingTest { protected abstract String getImageName(); /** - * Get the destination connector config. Subclasses may use this method for other setup work, e.g. 
opening a connection - * to the destination. + * Get the destination connector config. Subclasses may use this method for other setup work, e.g. + * opening a connection to the destination. *

- * Subclasses should _not_ start testcontainers in this method; that belongs in a BeforeAll method. The tests in this - * class are intended to be run concurrently on a shared database and will not interfere with each other. + * Subclasses should _not_ start testcontainers in this method; that belongs in a BeforeAll method. + * The tests in this class are intended to be run concurrently on a shared database and will not + * interfere with each other. */ protected abstract JsonNode getConfig() throws Exception; From 067ee0db5496eb6c70a9fb92ad0dbadea77ef72b Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 15:23:48 -0700 Subject: [PATCH 20/46] add more test cases, tweak test scaffold --- .../BaseTypingDedupingTest.java | 106 +++++++++++++++++- .../AbstractBigQueryTypingDedupingTest.java | 9 +- 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 9b4dd9b6409f..72f3a88a0fec 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -8,6 +8,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Streams; import io.airbyte.commons.features.EnvVariableFeatureFlags; import io.airbyte.commons.json.Jsons; @@ -40,6 +41,7 @@ import org.apache.commons.lang3.RandomStringUtils; import org.junit.jupiter.api.AfterEach; import 
org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; @@ -99,7 +101,7 @@ public abstract class BaseTypingDedupingTest { * The tests in this class are intended to be run concurrently on a shared database and will not * interfere with each other. */ - protected abstract JsonNode getConfig() throws Exception; + protected abstract JsonNode generateConfig() throws Exception; /** * For a given stream, return the records that exist in the destination's raw table. Each record @@ -108,6 +110,8 @@ public abstract class BaseTypingDedupingTest { *

* The {@code _airbyte_data} column must be an * {@link com.fasterxml.jackson.databind.node.ObjectNode} (i.e. it cannot be a string value). + *

+ * streamNamespace may be null, in which case you should query from the default namespace. */ protected abstract List dumpRawTableRecords(String streamNamespace, String streamName) throws Exception; @@ -123,6 +127,8 @@ public abstract class BaseTypingDedupingTest { *

* The corresponding SQL looks like * {@code INSERT INTO ... (name, address) VALUES ('null' :: jsonb, NULL)}. + *

+ * streamNamespace may be null, in which case you should query from the default namespace. */ protected abstract List dumpFinalTableRecords(String streamNamespace, String streamName) throws Exception; @@ -146,9 +152,13 @@ protected synchronized String getUniqueSuffix() { return randomSuffix; } + protected JsonNode getConfig() { + return config; + } + @BeforeEach public void setup() throws Exception { - config = getConfig(); + config = generateConfig(); streamNamespace = "typing_deduping_test" + getUniqueSuffix(); streamName = "test_stream" + getUniqueSuffix(); LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); @@ -305,6 +315,98 @@ public void incrementalDedup() throws Exception { verifySyncResult(expectedRawRecords2, expectedFinalRecords2); } + @Test + @Disabled("Not yet implemented") + public void testLineBreakCharacters() throws Exception { + // TODO verify that we can handle strings with interesting characters + // build an airbyterecordmessage using something like this, and add it to the input messages: + Jsons.jsonNode(ImmutableMap.builder() + .put("id", 1) + .put("currency", "USD\u2028") + .put("date", "2020-03-\n31T00:00:00Z\r") + // TODO(sherifnada) hack: write decimals with sigfigs because Snowflake stores 10.1 as "10" which + // fails destination tests + .put("HKD", 10.1) + .put("NZD", 700.1) + .build()); + } + + @Test + @Disabled("Not yet implemented") + public void testIncrementalSyncDropOneColumn() throws Exception { + // TODO in incremental dedup mode: run a sync, remove a column from the schema, run another sync + // verify that the column is dropped from the destination table + } + + @Test + @Disabled("Not yet implemented") + public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { + // TODO duplicate this test for each sync mode. 
Run 1st+2nd syncs using a stream with null namespace: + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) + .withStream(new AirbyteStream() + .withNamespace(null) + .withName(streamName) + .withJsonSchema(getSchema())))); + } + + @Test + @Disabled("Not yet implemented") + public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { + // TODO duplicate this test for each sync mode. Run 1st+2nd syncs using two streams with the same name but different namespace: + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace + "_1") + .withName(streamName) + .withJsonSchema(getSchema())), + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) + .withStream(new AirbyteStream() + .withNamespace(streamNamespace + "_2") + .withName(streamName) + .withJsonSchema(getSchema())))); + } + + @Test + @Disabled("Not yet implemented") + public void testSyncNotFailsWithNewFields() throws Exception { + // TODO duplicate this test for each sync mode. Run a sync, then add a new field to the schema, then run another sync + // We might want to write a test that verifies more general schema evolution (e.g. 
all valid evolutions) + } + + @Test + @Disabled("Not yet implemented") + public void testSyncWithLargeRecordBatch() throws Exception { + // TODO duplicate this test for each sync mode. Run a single sync with many records + /* + copied from DATs: + This serves to test MSSQL 2100 limit parameters in a single query. this means that for Airbyte + insert data need to limit to ~ 700 records (3 columns for the raw tables) = 2100 params + + this maybe needs configuration per destination to specify that limit? + */ + } + + @Test + @Disabled("Not yet implemented") + public void testDataTypes() throws Exception { + // TODO duplicate this test for each sync mode. See DataTypeTestArgumentProvider for what this test does in DAT-land + // we probably don't want to do the exact same thing, but the general spirit of testing a wide range of values for every data type is approximately correct + // this test probably needs some configuration per destination to specify what values are supported? + } + private static JsonNode getSchema() throws IOException { return Jsons.deserialize(MoreResources.readResource("schema.json")); } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java index ea0f99c0632c..912db29823be 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java @@ -11,6 +11,7 @@ import 
io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest; import io.airbyte.integrations.destination.bigquery.BigQueryDestination; import io.airbyte.integrations.destination.bigquery.BigQueryDestinationTestUtils; +import io.airbyte.integrations.destination.bigquery.BigQueryUtils; import java.io.IOException; import java.nio.file.Path; import java.time.Instant; @@ -24,7 +25,7 @@ public abstract class AbstractBigQueryTypingDedupingTest extends BaseTypingDedup protected abstract String getConfigPath(); @Override - public JsonNode getConfig() throws IOException { + public JsonNode generateConfig() throws IOException { final String datasetId = "typing_deduping_default_dataset" + getUniqueSuffix(); JsonNode config = BigQueryDestinationTestUtils.createConfig(Path.of(getConfigPath()), datasetId); bq = BigQueryDestination.getBigQuery(config); @@ -38,6 +39,9 @@ protected String getImageName() { @Override protected List dumpRawTableRecords(String streamNamespace, String streamName) throws InterruptedException { + if (streamNamespace == null) { + streamNamespace = BigQueryUtils.getDatasetId(getConfig()); + } TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM airbyte." + streamNamespace + "_" + streamName)); List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); @@ -45,6 +49,9 @@ protected List dumpRawTableRecords(String streamNamespace, String stre @Override protected List dumpFinalTableRecords(String streamNamespace, String streamName) throws InterruptedException { + if (streamNamespace == null) { + streamNamespace = BigQueryUtils.getDatasetId(getConfig()); + } TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM " + streamNamespace + "." 
+ streamName)); List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); From c0089158d6f05ced366158ce4a0974f190dd3535 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Fri, 30 Jun 2023 15:26:56 -0700 Subject: [PATCH 21/46] cleanup --- .../typing_deduping/BigQuerySqlGenerator.java | 4 ++-- .../BigQuerySqlGeneratorIntegrationTest.java | 22 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java index 93f21b472f89..3de7c9395ac9 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGenerator.java @@ -69,7 +69,7 @@ public ColumnId buildColumnId(final String name) { return new ColumnId(nameTransformer.getIdentifier(quotedName), name, canonicalized); } - public static StandardSQLTypeName toDialectType(final AirbyteType type) { + public StandardSQLTypeName toDialectType(final AirbyteType type) { // switch pattern-matching is still in preview at language level 17 :( if (type instanceof final AirbyteProtocolType p) { return toDialectType(p); @@ -137,7 +137,7 @@ ELSE JSON_QUERY(`_airbyte_data`, '$.${column_name}') } } - public static StandardSQLTypeName toDialectType(final AirbyteProtocolType airbyteProtocolType) { + public StandardSQLTypeName toDialectType(final AirbyteProtocolType airbyteProtocolType) { return switch (airbyteProtocolType) { // TODO doublecheck these case STRING -> 
StandardSQLTypeName.STRING; diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index ac4f3bf71a32..4ec3b2876be9 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -894,6 +894,17 @@ private static void logAndExecute(final String sql) throws InterruptedException bq.query(QueryJobConfiguration.newBuilder(sql).build()); } + /** + * TableResult contains records in a somewhat nonintuitive format (and it avoids loading them all into memory). + * That's annoying for us since we're working with small test data, so pull everything into a list, and convert them + * into maps of column name -> value. + *

+ * Note that the values have reasonable types; see {@link #toMap(Schema, FieldValueList)} for details. + */ + public static List> toMaps(TableResult result) { + return result.streamAll().map(row -> toMap(result.getSchema(), row)).toList(); + } + /** * FieldValueList stores everything internally as string (I think?) but provides conversions to more useful types. * This method does that conversion, using the schema to determine which type is most appropriate. @@ -981,17 +992,6 @@ private void assertQueryResult(final List>> expecte } } - /** - * TableResult contains records in a somewhat nonintuitive format (and it avoids loading them all into memory). - * That's annoying for us since we're working with small test data, so pull everything into a list, and convert them - * into maps of column name -> value. - *

- * Note that the values have reasonable types; see {@link #toMap(Schema, FieldValueList)} for details. - */ - public static List> toMaps(TableResult result) { - return result.streamAll().map(row -> toMap(result.getSchema(), row)).toList(); - } - private static String sortedToString(Map record) { return sortedToString(record, Function.identity()); } From 1b376a248b960673242201f5952426b3ccbaa408 Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 30 Jun 2023 22:30:34 +0000 Subject: [PATCH 22/46] Automated Commit - Formatting Changes --- .../BaseTypingDedupingTest.java | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 72f3a88a0fec..af94737bc28f 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -341,7 +341,8 @@ public void testIncrementalSyncDropOneColumn() throws Exception { @Test @Disabled("Not yet implemented") public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { - // TODO duplicate this test for each sync mode. Run 1st+2nd syncs using a stream with null namespace: + // TODO duplicate this test for each sync mode. 
Run 1st+2nd syncs using a stream with null + // namespace: ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( new ConfiguredAirbyteStream() .withSyncMode(SyncMode.FULL_REFRESH) @@ -357,7 +358,8 @@ public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { @Test @Disabled("Not yet implemented") public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { - // TODO duplicate this test for each sync mode. Run 1st+2nd syncs using two streams with the same name but different namespace: + // TODO duplicate this test for each sync mode. Run 1st+2nd syncs using two streams with the same + // name but different namespace: ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( new ConfiguredAirbyteStream() .withSyncMode(SyncMode.FULL_REFRESH) @@ -382,8 +384,10 @@ public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { @Test @Disabled("Not yet implemented") public void testSyncNotFailsWithNewFields() throws Exception { - // TODO duplicate this test for each sync mode. Run a sync, then add a new field to the schema, then run another sync - // We might want to write a test that verifies more general schema evolution (e.g. all valid evolutions) + // TODO duplicate this test for each sync mode. Run a sync, then add a new field to the schema, then + // run another sync + // We might want to write a test that verifies more general schema evolution (e.g. all valid + // evolutions) } @Test @@ -391,19 +395,21 @@ public void testSyncNotFailsWithNewFields() throws Exception { public void testSyncWithLargeRecordBatch() throws Exception { // TODO duplicate this test for each sync mode. Run a single sync with many records /* - copied from DATs: - This serves to test MSSQL 2100 limit parameters in a single query. 
this means that for Airbyte - insert data need to limit to ~ 700 records (3 columns for the raw tables) = 2100 params - - this maybe needs configuration per destination to specify that limit? + * copied from DATs: This serves to test MSSQL 2100 limit parameters in a single query. this means + * that for Airbyte insert data need to limit to ~ 700 records (3 columns for the raw tables) = 2100 + * params + * + * this maybe needs configuration per destination to specify that limit? */ } @Test @Disabled("Not yet implemented") public void testDataTypes() throws Exception { - // TODO duplicate this test for each sync mode. See DataTypeTestArgumentProvider for what this test does in DAT-land - // we probably don't want to do the exact same thing, but the general spirit of testing a wide range of values for every data type is approximately correct + // TODO duplicate this test for each sync mode. See DataTypeTestArgumentProvider for what this test + // does in DAT-land + // we probably don't want to do the exact same thing, but the general spirit of testing a wide range + // of values for every data type is approximately correct // this test probably needs some configuration per destination to specify what values are supported? 
} From d86dd300a2961000ac9c8221d2da1efa3d1fabbe Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 10:17:03 -0700 Subject: [PATCH 23/46] extract recorddiffer --- .../BaseTypingDedupingTest.java | 269 ++----------- .../typing_deduping/RecordDiffer.java | 365 ++++++++++++++++++ .../typing_deduping/AirbyteType.java | 19 + 3 files changed, 410 insertions(+), 243 deletions(-) create mode 100644 airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index af94737bc28f..a8e82447f661 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -15,6 +15,7 @@ import io.airbyte.commons.lang.Exceptions; import io.airbyte.commons.resources.MoreResources; import io.airbyte.configoss.WorkerDestinationConfig; +import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.AirbyteProtocolType; import io.airbyte.protocol.models.v0.AirbyteMessage; import io.airbyte.protocol.models.v0.AirbyteStream; import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; @@ -39,6 +40,7 @@ import java.util.UUID; import java.util.function.Function; import org.apache.commons.lang3.RandomStringUtils; +import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -68,20 +70,19 @@ public abstract class 
BaseTypingDedupingTest { private static final Logger LOGGER = LoggerFactory.getLogger(BaseTypingDedupingTest.class); - private static final Comparator RAW_RECORD_IDENTITY_COMPARATOR = Comparator - .comparingLong((JsonNode record) -> asInt(record.get("_airbyte_data").get("id1"))) - .thenComparingLong(record -> asInt(record.get("_airbyte_data").get("id2"))) - .thenComparing(record -> asTimestamp(record.get("_airbyte_data").get("updated_at"))) - .thenComparing(record -> asTimestamp(record.get("_airbyte_extracted_at"))); - private static final Comparator RAW_RECORD_SORT_COMPARATOR = RAW_RECORD_IDENTITY_COMPARATOR - .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - private static final Comparator FINAL_RECORD_IDENTITY_COMPARATOR = Comparator - .comparingLong((JsonNode record) -> asInt(record.get("id1"))) - .thenComparingLong(record -> asInt(record.get("id2"))) - .thenComparing(record -> asTimestamp(record.get("updated_at"))) - .thenComparing(record -> asTimestamp(record.get("_airbyte_extracted_at"))); - private static final Comparator FINAL_RECORD_SORT_COMPARATOR = FINAL_RECORD_IDENTITY_COMPARATOR - .thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + private static final JsonNode SCHEMA; + static { + try { + SCHEMA = Jsons.deserialize(MoreResources.readResource("schema.json")); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + private static final RecordDiffer DIFFER = new RecordDiffer( + Pair.of("id1", AirbyteProtocolType.INTEGER), + Pair.of("id2", AirbyteProtocolType.INTEGER), + Pair.of("updated_at", AirbyteProtocolType.TIMESTAMP_WITH_TIMEZONE) + ); private String randomSuffix; private JsonNode config; @@ -183,7 +184,7 @@ public void fullRefreshOverwrite() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace) .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); // First sync List messages1 = readMessages("sync1_messages.jsonl"); @@ -218,7 +219,7 @@ 
public void fullRefreshAppend() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace) .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); // First sync List messages1 = readMessages("sync1_messages.jsonl"); @@ -257,7 +258,7 @@ public void incrementalAppend() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace) .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); // First sync List messages1 = readMessages("sync1_messages.jsonl"); @@ -294,7 +295,7 @@ public void incrementalDedup() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace) .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); // First sync List messages1 = readMessages("sync1_messages.jsonl"); @@ -352,7 +353,7 @@ public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { .withStream(new AirbyteStream() .withNamespace(null) .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); } @Test @@ -369,7 +370,7 @@ public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace + "_1") .withName(streamName) - .withJsonSchema(getSchema())), + .withJsonSchema(SCHEMA)), new ConfiguredAirbyteStream() .withSyncMode(SyncMode.FULL_REFRESH) .withCursorField(List.of("updated_at")) @@ -378,7 +379,7 @@ public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { .withStream(new AirbyteStream() .withNamespace(streamNamespace + "_2") .withName(streamName) - .withJsonSchema(getSchema())))); + .withJsonSchema(SCHEMA)))); } @Test @@ -413,8 +414,10 @@ public void testDataTypes() throws Exception { // this test probably needs some configuration per destination to specify what values are supported? 
} - private static JsonNode getSchema() throws IOException { - return Jsons.deserialize(MoreResources.readResource("schema.json")); + private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords) throws Exception { + List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); + List actualFinalRecords = dumpFinalTableRecords(streamNamespace, streamName); + DIFFER.verifySyncResult(expectedRawRecords, actualRawRecords, expectedFinalRecords, actualFinalRecords); } private List readRecords(String filename) throws IOException { @@ -435,226 +438,6 @@ private List readMessages(String filename) throws IOException { }).toList(); } - private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords) throws Exception { - List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); - String rawDiff = diffRawTableRecords(expectedRawRecords, actualRawRecords); - List actualFinalRecords = dumpFinalTableRecords(streamNamespace, streamName); - String finalDiff = diffFinalTableRecords(expectedFinalRecords, actualFinalRecords); - - assertAll( - () -> assertTrue(rawDiff.isEmpty(), "Raw table was incorrect.\n" + rawDiff), - () -> assertTrue(finalDiff.isEmpty(), "Final table was incorrect.\n" + finalDiff)); - } - - private static String diffRawTableRecords(List expectedRecords, List actualRecords) { - return diffRecords( - expectedRecords, - actualRecords, - RAW_RECORD_IDENTITY_COMPARATOR, - RAW_RECORD_SORT_COMPARATOR, - record -> getFieldIfPresent(record.get("_airbyte_data"), "id1") - + getFieldIfPresent(record.get("_airbyte_data"), "id2") - + getFieldIfPresent(record.get("_airbyte_data"), "updated_at") - + getFieldIfPresent(record, "_airbyte_extracted_at"), - true); - } - - private static String diffFinalTableRecords(List expectedRecords, List actualRecords) { - return diffRecords( - expectedRecords, - actualRecords, - FINAL_RECORD_IDENTITY_COMPARATOR, - FINAL_RECORD_SORT_COMPARATOR, - record -> getFieldIfPresent(record, 
"id1") - + getFieldIfPresent(record, "id2") - + getFieldIfPresent(record, "updated_at") - + getFieldIfPresent(record, "_airbyte_extracted_at"), - false); - } - - private static String getFieldIfPresent(JsonNode record, String field) { - if (record.has(field)) { - return field + "=" + record.get(field) + "; "; - } else { - return ""; - } - } - - /** - * Generate a human-readable diff between the two lists. Only checks the keys specified in - * expectedRecords. Assumes (in general) that two records with the same PK, cursor, and extracted_at - * are the same record. - * - * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same - * PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, - * breaks that tie using _airbyte_raw_id - * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string - * @param extractRawData Whether to look inside the _airbyte_data column and diff its subfields - * @return The diff, or empty string if there were no differences - */ - private static String diffRecords(List originalExpectedRecords, - List originalActualRecords, - Comparator identityComparator, - Comparator sortComparator, - Function recordIdExtractor, - boolean extractRawData) { - List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); - List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); - - // Iterate through both lists in parallel and compare each record. - // Build up an error message listing any incorrect, missing, or unexpected records. 
- String message = ""; - int expectedRecordIndex = 0; - int actualRecordIndex = 0; - while (expectedRecordIndex < expectedRecords.size() && actualRecordIndex < actualRecords.size()) { - JsonNode expectedRecord = expectedRecords.get(expectedRecordIndex); - JsonNode actualRecord = actualRecords.get(actualRecordIndex); - int compare = identityComparator.compare(expectedRecord, actualRecord); - if (compare == 0) { - // These records should be the same. Find the specific fields that are different. - boolean foundMismatch = false; - String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; - // Iterate through each column in the expected record and compare it to the actual record's value. - for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { - if (extractRawData && "_airbyte_data".equals(column)) { - // For the raw data in particular, we should also diff the fields inside _airbyte_data. - JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); - JsonNode actualRawData = actualRecord.get("_airbyte_data"); - // Iterate through all the subfields of the expected raw data and check that they match the actual - // record... - for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { - JsonNode expectedValue = expectedRawData.get(field); - JsonNode actualValue = actualRawData.get(field); - if (jsonNodesNotEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); - foundMismatch = true; - } - } - // ... and then check the actual raw data for any subfields that we weren't expecting. - LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); - if (extraColumns.size() > 0) { - for (Map.Entry extraColumn : extraColumns.entrySet()) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." 
+ extraColumn.getKey(), null, extraColumn.getValue()); - foundMismatch = true; - } - } - } else { - // For all other columns, we can just compare their values directly. - JsonNode expectedValue = expectedRecord.get(column); - JsonNode actualValue = actualRecord.get(column); - if (jsonNodesNotEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); - foundMismatch = true; - } - } - } - // Then check the entire actual record for any columns that we weren't expecting. - LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRecord, actualRecord); - if (extraColumns.size() > 0) { - for (Map.Entry extraColumn : extraColumns.entrySet()) { - mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); - foundMismatch = true; - } - } - if (foundMismatch) { - message += mismatchedRecordMessage; - } - - expectedRecordIndex++; - actualRecordIndex++; - } else if (compare < 0) { - // The expected record is missing from the actual records. Print it and move on to the next expected - // record. - message += "Row was expected but missing: " + expectedRecord + "\n"; - expectedRecordIndex++; - } else { - // There's an actual record which isn't present in the expected records. Print it and move on to the - // next actual record. - message += "Row was not expected but present: " + actualRecord + "\n"; - actualRecordIndex++; - } - } - // Tail loops in case we reached the end of one list before the other. 
- while (expectedRecordIndex < expectedRecords.size()) { - message += "Row was expected but missing: " + expectedRecords.get(expectedRecordIndex) + "\n"; - expectedRecordIndex++; - } - while (actualRecordIndex < actualRecords.size()) { - message += "Row was not expected but present: " + actualRecords.get(actualRecordIndex) + "\n"; - actualRecordIndex++; - } - - return message; - } - - private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode actualValue) { - // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - return !Objects.equals(expectedValue, actualValue) - // Objects.equals expects the two values to be the same class. - // We need to handle comparisons between e.g. LongNode and IntNode. - && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); - } - - /** - * Verify that all fields in the actual record are present in the expected record. This is primarily - * relevant for detecting fields that we expected to be null, but actually were not. See - * {@link #dumpFinalTableRecords(String, String)} for an explanation of how SQL/JSON nulls are - * represented in the expected record. - *

- * This has the side benefit of detecting completely unexpected columns, which would be a very weird - * bug but is probably still useful to catch. - */ - private static LinkedHashMap checkForExtraOrNonNullFields(JsonNode expectedRecord, JsonNode actualRecord) { - LinkedHashMap extraFields = new LinkedHashMap<>(); - for (String column : Streams.stream(actualRecord.fieldNames()).sorted().toList()) { - // loaded_at and raw_id are generated dynamically, so we just ignore them. - if (!"_airbyte_loaded_at".equals(column) && !"_airbyte_raw_id".equals(column) && !expectedRecord.has(column)) { - extraFields.put(column, actualRecord.get(column)); - } - } - return extraFields; - } - - /** - * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". It's - * indented intentionally. - */ - private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { - String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); - String actualString = actualValue == null ? "SQL NULL (i.e. no value)" : actualValue.toString(); - return " For " + fieldname + ", expected " + expectedString + " but got " + actualString + "\n"; - } - - // These asFoo methods are used for sorting records, so their defaults are intended to make broken - // records stand out. - private static long asInt(JsonNode node) { - if (node == null || !node.isIntegralNumber()) { - return Long.MIN_VALUE; - } else { - return node.longValue(); - } - } - - private static String asString(JsonNode node) { - if (node == null || node.isNull()) { - return ""; - } else if (node.isTextual()) { - return node.asText(); - } else { - return Jsons.serialize(node); - } - } - - private static Instant asTimestamp(JsonNode node) { - if (node == null || !node.isTextual()) { - return Instant.ofEpochMilli(Long.MIN_VALUE); - } else { - return Instant.parse(node.asText()); - } - } - /* * !!!!!! WARNING !!!!!! 
The code below was mostly copypasted from DestinationAcceptanceTest. If you * make edits here, you probably want to also edit there. diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java new file mode 100644 index 000000000000..c62a1bcfa706 --- /dev/null +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -0,0 +1,365 @@ +package io.airbyte.integrations.base.destination.typing_deduping; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.Streams; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.AirbyteProtocolType; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.Pair; + +/** + * Utility class to generate human-readable diffs between expected and actual records. Assumes 1s1t output format. 
+ */ +public class RecordDiffer { + + private final Comparator rawRecordIdentityComparator; + private final Comparator rawRecordSortComparator; + private final Function rawRecordIdentityExtractor; + private final Comparator finalRecordIdentityComparator; + private final Comparator finalRecordSortComparator; + private final Function finalRecordIdentityExtractor; + + public RecordDiffer(Pair... columns) { + // Start with a noop comparator for convenience + Comparator rawIdComp = Comparator.comparing(record -> 0); + Comparator finalIdComp = Comparator.comparing(record -> 0); + for (Pair column : columns) { + rawIdComp = rawIdComp.thenComparing(record -> extract(record.get("_airbyte_data"), column.getKey(), column.getValue())); + finalIdComp = finalIdComp.thenComparing(record -> extract(record, column.getKey(), column.getValue())); + } + this.rawRecordIdentityComparator = rawIdComp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); + this.rawRecordSortComparator = rawRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + this.finalRecordIdentityComparator = finalIdComp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); + this.finalRecordSortComparator = finalRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + + rawRecordIdentityExtractor = record -> Arrays.stream(columns) + .map(column -> getPrintableFieldIfPresent(record.get("_airbyte_data"), column.getKey())) + .collect(Collectors.joining("; ")) + + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); + finalRecordIdentityExtractor = record -> Arrays.stream(columns) + .map(column -> getPrintableFieldIfPresent(record, column.getKey())) + .collect(Collectors.joining("; ")) + + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); + } + + /** + * In the expected records, a SQL null is represented as a JsonNode without that field at all, and a JSON null is + 
* represented as a NullNode. For example, in the JSON blob {"name": null}, the `name` field is a JSON null, and the + * `address` field is a SQL null. + */ + public void verifySyncResult(List expectedRawRecords, + List actualRawRecords, + List expectedFinalRecords, + List actualFinalRecords) { + assertAll( + () -> diffRawTableRecords(expectedRawRecords, actualRawRecords), + () -> diffFinalTableRecords(expectedFinalRecords, actualFinalRecords) + ); + } + + private void diffRawTableRecords(List expectedRecords, List actualRecords) { + String diff = diffRecords( + expectedRecords, + actualRecords, + rawRecordIdentityComparator, + rawRecordSortComparator, + rawRecordIdentityExtractor, + true); + + assertTrue(diff.isEmpty(), "Raw table was incorrect.\n" + diff); + } + + private void diffFinalTableRecords(List expectedRecords, List actualRecords) { + String diff = diffRecords( + expectedRecords, + actualRecords, + finalRecordIdentityComparator, + finalRecordSortComparator, + finalRecordIdentityExtractor, + false); + + assertTrue(diff.isEmpty(), "Final table was incorrect.\n" + diff); + } + + private static String getPrintableFieldIfPresent(JsonNode record, String field) { + if (record.has(field)) { + return field + "=" + record.get(field) + "; "; + } else { + return ""; + } + } + + /** + * Generate a human-readable diff between the two lists. Only checks the keys specified in + * expectedRecords. Assumes (in general) that two records with the same PK, cursor, and extracted_at + * are the same record. + * + * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same + * PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, + * breaks that tie using _airbyte_raw_id + * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string + * @param extractRawData Whether to look inside the _airbyte_data column and diff its subfields + * @return The diff, or empty string if there were no differences + */ + private static String diffRecords(List originalExpectedRecords, + List originalActualRecords, + Comparator identityComparator, + Comparator sortComparator, + Function recordIdExtractor, + boolean extractRawData) { + List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); + List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); + + // Iterate through both lists in parallel and compare each record. + // Build up an error message listing any incorrect, missing, or unexpected records. + String message = ""; + int expectedRecordIndex = 0; + int actualRecordIndex = 0; + while (expectedRecordIndex < expectedRecords.size() && actualRecordIndex < actualRecords.size()) { + JsonNode expectedRecord = expectedRecords.get(expectedRecordIndex); + JsonNode actualRecord = actualRecords.get(actualRecordIndex); + int compare = identityComparator.compare(expectedRecord, actualRecord); + if (compare == 0) { + // These records should be the same. Find the specific fields that are different. + boolean foundMismatch = false; + String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; + // Iterate through each column in the expected record and compare it to the actual record's value. + for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { + if (extractRawData && "_airbyte_data".equals(column)) { + // For the raw data in particular, we should also diff the fields inside _airbyte_data. 
+ JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); + JsonNode actualRawData = actualRecord.get("_airbyte_data"); + // Iterate through all the subfields of the expected raw data and check that they match the actual + // record... + for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { + JsonNode expectedValue = expectedRawData.get(field); + JsonNode actualValue = actualRawData.get(field); + if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); + foundMismatch = true; + } + } + // ... and then check the actual raw data for any subfields that we weren't expecting. + LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); + foundMismatch = true; + } + } + } else { + // For all other columns, we can just compare their values directly. + JsonNode expectedValue = expectedRecord.get(column); + JsonNode actualValue = actualRecord.get(column); + if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); + foundMismatch = true; + } + } + } + // Then check the entire actual record for any columns that we weren't expecting. 
+ LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRecord, actualRecord); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); + foundMismatch = true; + } + } + if (foundMismatch) { + message += mismatchedRecordMessage; + } + + expectedRecordIndex++; + actualRecordIndex++; + } else if (compare < 0) { + // The expected record is missing from the actual records. Print it and move on to the next expected + // record. + message += "Row was expected but missing: " + expectedRecord + "\n"; + expectedRecordIndex++; + } else { + // There's an actual record which isn't present in the expected records. Print it and move on to the + // next actual record. + message += "Row was not expected but present: " + actualRecord + "\n"; + actualRecordIndex++; + } + } + // Tail loops in case we reached the end of one list before the other. + while (expectedRecordIndex < expectedRecords.size()) { + message += "Row was expected but missing: " + expectedRecords.get(expectedRecordIndex) + "\n"; + expectedRecordIndex++; + } + while (actualRecordIndex < actualRecords.size()) { + message += "Row was not expected but present: " + actualRecords.get(actualRecordIndex) + "\n"; + actualRecordIndex++; + } + + return message; + } + + private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode actualValue) { + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. + return !Objects.equals(expectedValue, actualValue) + // Objects.equals expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. 
+ && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); + } + + /** + * Verify that all fields in the actual record are present in the expected record. This is primarily + * relevant for detecting fields that we expected to be null, but actually were not. See + * {@link BaseTypingDedupingTest#dumpFinalTableRecords(String, String)} for an explanation of how SQL/JSON nulls are + * represented in the expected record. + *

+ * This has the side benefit of detecting completely unexpected columns, which would be a very weird + * bug but is probably still useful to catch. + */ + private static LinkedHashMap checkForExtraOrNonNullFields(JsonNode expectedRecord, JsonNode actualRecord) { + LinkedHashMap extraFields = new LinkedHashMap<>(); + for (String column : Streams.stream(actualRecord.fieldNames()).sorted().toList()) { + // loaded_at and raw_id are generated dynamically, so we just ignore them. + if (!"_airbyte_loaded_at".equals(column) && !"_airbyte_raw_id".equals(column) && !expectedRecord.has(column)) { + extraFields.put(column, actualRecord.get(column)); + } + } + return extraFields; + } + + /** + * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". The leading spaces are + * intentional, to make the message easier to read when it's embedded in a larger stacktrace. + */ + private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { + String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); + String actualString = actualValue == null ? "SQL NULL (i.e. no value)" : actualValue.toString(); + return " For " + fieldname + ", expected " + expectedString + " but got " + actualString + "\n"; + } + + // These asFoo methods are used for sorting records, so their defaults are intended to make broken + // records stand out. 
+ private static String asString(JsonNode node) { + if (node == null || node.isNull()) { + return ""; + } else if (node.isTextual()) { + return node.asText(); + } else { + return Jsons.serialize(node); + } + } + + private static double asDouble(JsonNode node) { + if (node == null || !node.isNumber()) { + return Double.MIN_VALUE; + } else { + return node.longValue(); + } + } + + private static long asInt(JsonNode node) { + if (node == null || !node.isIntegralNumber()) { + return Long.MIN_VALUE; + } else { + return node.longValue(); + } + } + + private static boolean asBoolean(JsonNode node) { + if (node == null || !node.isBoolean()) { + return false; + } else { + return node.asBoolean(); + } + } + + private static Instant asTimestampWithTimezone(JsonNode node) { + if (node == null || !node.isTextual()) { + return Instant.ofEpochMilli(Long.MIN_VALUE); + } else { + try { + return Instant.parse(node.asText()); + } catch (Exception e) { + return Instant.ofEpochMilli(Long.MIN_VALUE); + } + } + } + + private static LocalDateTime asTimestampWithoutTimezone(JsonNode node) { + if (node == null || !node.isTextual()) { + return LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.MIN_VALUE), ZoneOffset.UTC); + } else { + try { + return LocalDateTime.parse(node.asText()); + } catch (Exception e) { + return LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.MIN_VALUE), ZoneOffset.UTC); + } + } + } + + private static OffsetTime asTimeWithTimezone(JsonNode node) { + if (node == null || !node.isTextual()) { + return OffsetTime.of(0, 0, 0, 0, ZoneOffset.UTC); + } else { + return OffsetTime.parse(node.asText()); + } + } + + private static LocalTime asTimeWithoutTimezone(JsonNode node) { + if (node == null || !node.isTextual()) { + return LocalTime.of(0, 0, 0); + } else { + try { + return LocalTime.parse(node.asText()); + } catch (Exception e) { + return LocalTime.of(0, 0, 0); + } + } + } + + private static LocalDate asDate(JsonNode node) { + if (node == null || !node.isTextual()) { + 
return LocalDate.ofInstant(Instant.ofEpochMilli(Long.MIN_VALUE), ZoneOffset.UTC); + } else { + try { + return LocalDate.parse(node.asText()); + } catch (Exception e) { + return LocalDate.ofInstant(Instant.ofEpochMilli(Long.MIN_VALUE), ZoneOffset.UTC); + } + } + } + + private static Comparable extract(JsonNode node, String field, AirbyteType type) { + if (type instanceof AirbyteProtocolType t) { + return switch (t) { + case STRING -> asString(node.get(field)); + case NUMBER -> asDouble(node.get(field)); + case INTEGER -> asInt(node.get(field)); + case BOOLEAN -> asBoolean(node.get(field)); + case TIMESTAMP_WITH_TIMEZONE -> asTimestampWithTimezone(node.get(field)); + case TIMESTAMP_WITHOUT_TIMEZONE -> asTimestampWithoutTimezone(node.get(field)); + case TIME_WITH_TIMEZONE -> asTimeWithTimezone(node.get(field)); + case TIME_WITHOUT_TIMEZONE -> asTimeWithoutTimezone(node.get(field)); + case DATE -> asDate(node.get(field)); + case UNKNOWN -> node.toString(); + }; + } else { + return node.toString(); + } + } +} diff --git a/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java b/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java index 7c6dcc28597d..2d3b0628e8c4 100644 --- a/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java +++ b/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java @@ -74,6 +74,8 @@ static AirbyteType fromJsonSchema(final JsonNode schema) { return AirbyteTypeUtils.getAirbyteProtocolType(schema); } + public LinkedHashMap asColumns(); + private static Struct getStruct(final JsonNode schema) { final LinkedHashMap propertiesMap = new LinkedHashMap<>(); final JsonNode properties = schema.get("properties"); @@ -107,6 +109,11 @@ 
public static AirbyteProtocolType matches(final String type) { } } + @Override + public LinkedHashMap asColumns() { + throw new UnsupportedOperationException("Basic types cannot be converted to columns."); + } + } /** @@ -114,10 +121,18 @@ public static AirbyteProtocolType matches(final String type) { */ record Struct(LinkedHashMap properties) implements AirbyteType { + @Override + public LinkedHashMap asColumns() { + return properties; + } } record Array(AirbyteType items) implements AirbyteType { + @Override + public LinkedHashMap asColumns() { + throw new UnsupportedOperationException("Arrays cannot be converted to columns."); + } } /** @@ -127,6 +142,10 @@ record Array(AirbyteType items) implements AirbyteType { */ record UnsupportedOneOf(List options) implements AirbyteType { + @Override + public LinkedHashMap asColumns() { + throw new UnsupportedOperationException("OneOf cannot be converted to columns."); + } } /** From 9c136f7ab616b3640a23cb6d1420ebd2f3d785b5 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 10:46:50 -0700 Subject: [PATCH 24/46] and use it in the sql generator test --- .../typing_deduping/RecordDiffer.java | 4 +- .../AbstractBigQueryTypingDedupingTest.java | 21 +- .../BigQuerySqlGeneratorIntegrationTest.java | 919 +++++++----------- 3 files changed, 358 insertions(+), 586 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index c62a1bcfa706..5e4385c0f4e9 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -73,7 +73,7 @@ 
public void verifySyncResult(List expectedRawRecords, ); } - private void diffRawTableRecords(List expectedRecords, List actualRecords) { + public void diffRawTableRecords(List expectedRecords, List actualRecords) { String diff = diffRecords( expectedRecords, actualRecords, @@ -85,7 +85,7 @@ private void diffRawTableRecords(List expectedRecords, List assertTrue(diff.isEmpty(), "Raw table was incorrect.\n" + diff); } - private void diffFinalTableRecords(List expectedRecords, List actualRecords) { + public void diffFinalTableRecords(List expectedRecords, List actualRecords) { String diff = diffRecords( expectedRecords, actualRecords, diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java index 912db29823be..4fa25ee9b73f 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java @@ -43,8 +43,7 @@ protected List dumpRawTableRecords(String streamNamespace, String stre streamNamespace = BigQueryUtils.getDatasetId(getConfig()); } TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM airbyte." 
+ streamNamespace + "_" + streamName)); - List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); - return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); + return BigQuerySqlGeneratorIntegrationTest.toJsonRecords(result); } @Override @@ -53,8 +52,7 @@ protected List dumpFinalTableRecords(String streamNamespace, String st streamNamespace = BigQueryUtils.getDatasetId(getConfig()); } TableResult result = bq.query(QueryJobConfiguration.of("SELECT * FROM " + streamNamespace + "." + streamName)); - List> rowsAsMaps = BigQuerySqlGeneratorIntegrationTest.toMaps(result); - return rowsAsMaps.stream().map(AbstractBigQueryTypingDedupingTest::toJson).toList(); + return BigQuerySqlGeneratorIntegrationTest.toJsonRecords(result); } @Override @@ -64,19 +62,4 @@ protected void teardownStreamAndNamespace(String streamNamespace, String streamN bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); bq.delete(DatasetId.of(streamNamespace), BigQuery.DatasetDeleteOption.deleteContents()); } - - private static JsonNode toJson(LinkedHashMap map) { - ObjectNode o = (ObjectNode) Jsons.emptyObject(); - map.forEach((key, value) -> { - if (value == null) { - // If the value is null, do nothing. We don't want to insert it into the json at all. 
- } else if (value instanceof Instant i) { - // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it - o.set(key, Jsons.jsonNode(i.toString())); - } else { - o.set(key, Jsons.jsonNode(value)); - } - }); - return o; - } } diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index 4ec3b2876be9..2c88a54b74c9 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -5,33 +5,43 @@ package io.airbyte.integrations.destination.bigquery.typing_deduping; import static com.google.cloud.bigquery.LegacySQLTypeName.legacySQLTypeName; -import static java.util.stream.Collectors.toSet; import static org.junit.jupiter.api.Assertions.*; import com.fasterxml.jackson.databind.JsonNode; -import com.google.cloud.bigquery.*; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryException; +import com.google.cloud.bigquery.DatasetInfo; +import com.google.cloud.bigquery.Field; import com.google.cloud.bigquery.Field.Mode; -import com.google.common.collect.ImmutableMap; +import com.google.cloud.bigquery.FieldValue; +import com.google.cloud.bigquery.FieldValueList; +import com.google.cloud.bigquery.QueryJobConfiguration; +import com.google.cloud.bigquery.Schema; +import 
com.google.cloud.bigquery.StandardSQLTypeName; +import com.google.cloud.bigquery.Table; +import com.google.cloud.bigquery.TableResult; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType; import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.AirbyteProtocolType; import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.Array; import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.Struct; import io.airbyte.integrations.base.destination.typing_deduping.CatalogParser.StreamConfig; +import io.airbyte.integrations.base.destination.typing_deduping.RecordDiffer; import io.airbyte.integrations.base.destination.typing_deduping.SqlGenerator.ColumnId; import io.airbyte.integrations.base.destination.typing_deduping.SqlGenerator.StreamId; import io.airbyte.integrations.destination.bigquery.BigQueryDestination; import io.airbyte.protocol.models.v0.DestinationSyncMode; import io.airbyte.protocol.models.v0.SyncMode; -import java.math.BigDecimal; import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; -import java.time.Instant; -import java.util.*; -import java.util.Map.Entry; -import java.util.function.Function; -import java.util.stream.Collectors; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.text.StringSubstitutor; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -52,22 +62,11 @@ public class BigQuerySqlGeneratorIntegrationTest { public static final List PRIMARY_KEY = List.of(ID_COLUMN); public static final ColumnId CURSOR = GENERATOR.buildColumnId("updated_at"); public static final ColumnId CDC_CURSOR = GENERATOR.buildColumnId("_ab_cdc_lsn"); - /** - * Super hacky way to sort rows represented as {@code Map} - */ - public static final 
Comparator> ROW_COMPARATOR = (row1, row2) -> { - int cmp; - cmp = compareRowsOnColumn(ID_COLUMN.name(), row1, row2); - if (cmp != 0) { - return cmp; - } - cmp = compareRowsOnColumn(CURSOR.name(), row1, row2); - if (cmp != 0) { - return cmp; - } - cmp = compareRowsOnColumn(CDC_CURSOR.name(), row1, row2); - return cmp; - }; + public static final RecordDiffer DIFFER = new RecordDiffer( + Pair.of("id", AirbyteProtocolType.INTEGER), + Pair.of("updated_at", AirbyteProtocolType.TIMESTAMP_WITH_TIMEZONE), + Pair.of("_ab_cdc_lsn", AirbyteProtocolType.INTEGER) + ); public static final String QUOTE = "`"; private static final LinkedHashMap COLUMNS; private static final LinkedHashMap CDC_COLUMNS; @@ -182,13 +181,13 @@ public void testCreateTableIncremental() throws InterruptedException { public void testVerifyPrimaryKeysIncremental() throws InterruptedException { createRawTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{}', '10d6e27d-ae7a-41b5-baf8-c4c277ef9c11', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1}', '5ce60e70-98aa-4fe3-8159-67207352c4f0', '2023-01-01T00:00:00Z'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{}', '10d6e27d-ae7a-41b5-baf8-c4c277ef9c11', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1}', '5ce60e70-98aa-4fe3-8159-67207352c4f0', '2023-01-01T00:00:00Z'); + """)) .build()); // This variable is declared outside of the transaction, so we need to do it manually here @@ -206,60 +205,58 @@ public void testInsertNewRecordsIncremental() throws InterruptedException { createRawTable(); createFinalTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + 
"dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}}', '972fa08a-aa06-4b91-a6af-a371aee4cb1c', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}}', '233ad43d-de50-4a47-bbe6-7a417ce60d9d', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'd4aeb036-2d95-4880-acd2-dc69b42b03c6', '2023-01-01T00:00:00Z'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}}', '972fa08a-aa06-4b91-a6af-a371aee4cb1c', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}}', '233ad43d-de50-4a47-bbe6-7a417ce60d9d', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'd4aeb036-2d95-4880-acd2-dc69b42b03c6', '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.insertNewRecords(streamId, "", COLUMNS, DestinationSyncMode.OVERWRITE); logAndExecute(sql); final TableResult result = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.finalTableId(QUOTE)).build()); - assertQueryResult( + DIFFER.diffFinalTableRecords( List.of( - Map.of( - "id", Optional.of(1L), - "updated_at", Optional.of(Instant.parse("2023-01-01T01:00:00Z")), - "string", Optional.of("Alice"), - "struct", Optional.of(Jsons.deserialize( - """ - {"city": "San Francisco", "state": "CA"} - """)), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - 
"_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":[]} - """))), - Map.of( - "id", Optional.of(1L), - "updated_at", Optional.of(Instant.parse("2023-01-01T02:00:00Z")), - "string", Optional.of("Alice"), - "struct", Optional.of(Jsons.deserialize( + Jsons.deserialize( + """ + { + "id": 1, + "updated_at": "2023-01-01T01:00:00Z", + "string": "Alice", + "struct": {"city": "San Francisco", "state": "CA"}, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors":[]} + } """ - {"city": "San Diego", "state": "CA"} - """)), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_meta", Optional.of(Jsons.deserialize( + ), + Jsons.deserialize( + """ + { + "id": 1, + "updated_at": "2023-01-01T02:00:00Z", + "string": "Alice", + "struct": {"city": "San Diego", "state": "CA"}, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors":[]} + } """ - {"errors":[]} - """))), - Map.of( - "id", Optional.of(2L), - "updated_at", Optional.of(Instant.parse("2023-01-01T03:00:00Z")), - "string", Optional.of("Bob"), - "struct", Optional.empty(), - "integer", Optional.empty(), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_meta", Optional.of(Jsons.deserialize( + ), + Jsons.deserialize( + """ + { + "id": 2, + "updated_at": "2023-01-01T03:00:00Z", + "string": "Bob", + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors":["Problem with `integer`"]} + } """ - {"errors":["Problem with `integer`"]} - """)))), - result); + )), + toJsonRecords(result)); } @Test @@ -267,52 +264,50 @@ public void testDedupFinalTable() throws InterruptedException { createRawTable(); createFinalTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, 
`_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - - INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values - ('d7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T01:00:00Z', 'Alice', JSON'{"city": "San Francisco", "state": "CA"}', 42), - ('80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T02:00:00Z', 'Alice', JSON'{"city": "San Diego", "state": "CA"}', 84), - ('ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z', JSON'{"errors": ["blah blah integer"]}', 2, '2023-01-01T03:00:00Z', 'Bob', NULL, NULL); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + + INSERT INTO ${dataset}.users_final 
(_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values + ('d7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T01:00:00Z', 'Alice', JSON'{"city": "San Francisco", "state": "CA"}', 42), + ('80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T02:00:00Z', 'Alice', JSON'{"city": "San Diego", "state": "CA"}', 84), + ('ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z', JSON'{"errors": ["blah blah integer"]}', 2, '2023-01-01T03:00:00Z', 'Bob', NULL, NULL); + """)) .build()); final String sql = GENERATOR.dedupFinalTable(streamId, "", PRIMARY_KEY, CURSOR, COLUMNS); logAndExecute(sql); final TableResult result = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.finalTableId(QUOTE)).build()); - assertQueryResult( + DIFFER.diffFinalTableRecords( List.of( - Map.of( - "id", Optional.of(1L), - "updated_at", Optional.of(Instant.parse("2023-01-01T02:00:00Z")), - "string", Optional.of("Alice"), - "struct", Optional.of(Jsons.deserialize( - """ - {"city": "San Diego", "state": "CA"} + Jsons.deserialize( + """ + { + "id": 1, + "updated_at": "2023-01-01T02:00:00Z", + "string": "Alice", + "struct": {"city": "San Diego", "state": "CA"}, + "integer": 84, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors":[]} + } + """), + Jsons.deserialize( + """ + { + "id": 2, + "updated_at": "2023-01-01T03:00:00Z", + "string": "Bob", + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors":["blah blah integer"]} + } """)), - "integer", Optional.of(84L), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":[]} - """))), - Map.of( - "id", Optional.of(2L), - "updated_at", Optional.of(Instant.parse("2023-01-01T03:00:00Z")), - "string", Optional.of("Bob"), - "struct", 
Optional.empty(), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":["blah blah integer"]} - """)))), - result); + toJsonRecords(result)); } @Test @@ -320,54 +315,58 @@ public void testDedupRawTable() throws InterruptedException { createRawTable(); createFinalTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - - INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values - ('80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T02:00:00Z', 'Alice', JSON'{"city": "San Diego", "state": "CA"}', 84), - ('ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z', JSON'{"errors": ["blah blah integer"]}', 2, '2023-01-01T03:00:00Z', 'Bob', NULL, NULL); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 
'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + + INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values + ('80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z', JSON'{"errors":[]}', 1, '2023-01-01T02:00:00Z', 'Alice', JSON'{"city": "San Diego", "state": "CA"}', 84), + ('ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z', JSON'{"errors": ["blah blah integer"]}', 2, '2023-01-01T03:00:00Z', 'Bob', NULL, NULL); + """)) .build()); final String sql = GENERATOR.dedupRawTable(streamId, "", CDC_COLUMNS); logAndExecute(sql); final TableResult result = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.rawTableId(QUOTE)).build()); - assertQueryResult( + DIFFER.diffFinalTableRecords( List.of( - Map.of( - "_airbyte_raw_id", Optional.of("80c99b54-54b4-43bd-b51b-1f67dafa2c52"), - "_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_data", Optional.of(Jsons.deserialize( + Jsons.deserialize( + """ + { + "_airbyte_raw_id": "80c99b54-54b4-43bd-b51b-1f67dafa2c52", + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_data": {"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84} + } """ - {"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84} - """))), - Map.of( - "_airbyte_raw_id", Optional.of("ad690bfb-c2c2-4172-bd73-a16c86ccbb67"), - "_airbyte_extracted_at", 
Optional.of(Instant.parse("2023-01-01T00:00:00Z")), - "_airbyte_data", Optional.of(Jsons.deserialize( + ), + Jsons.deserialize( + """ + { + "_airbyte_raw_id": "ad690bfb-c2c2-4172-bd73-a16c86ccbb67", + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_data": {"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"} + } """ - {"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"} - """)))), - result); + )), + toJsonRecords(result)); } @Test public void testCommitRawTable() throws InterruptedException { createRawTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.commitRawTable(streamId); @@ -383,107 +382,72 @@ public void testFullUpdateAllTypes() throws InterruptedException { createRawTable(); createFinalTable("_foo"); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( 
- "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_data`) VALUES - (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "array": ["foo"], "struct": {"foo": "bar"}, "string": "foo", "number": 42.1, "integer": 42, "boolean": true, "timestamp_with_timezone": "2023-01-23T12:34:56Z", "timestamp_without_timezone": "2023-01-23T12:34:56", "time_with_timezone": "12:34:56Z", "time_without_timezone": "12:34:56", "date": "2023-01-23", "unknown": {}}'), - (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 2, "updated_at": "2023-01-01T01:00:00Z", "array": null, "struct": null, "string": null, "number": null, "integer": null, "boolean": null, "timestamp_with_timezone": null, "timestamp_without_timezone": null, "time_with_timezone": null, "time_without_timezone": null, "date": null, "unknown": null}'), - (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 3, "updated_at": "2023-01-01T01:00:00Z"}'), - (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 4, "updated_at": "2023-01-01T01:00:00Z", "array": {}, "struct": [], "string": {}, "number": {}, "integer": {}, "boolean": {}, "timestamp_with_timezone": {}, "timestamp_without_timezone": {}, "time_with_timezone": {}, "time_without_timezone": {}, "date": {}, "unknown": null}'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_data`) VALUES + (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "array": ["foo"], "struct": {"foo": "bar"}, "string": "foo", "number": 42.1, "integer": 42, "boolean": true, "timestamp_with_timezone": "2023-01-23T12:34:56Z", "timestamp_without_timezone": "2023-01-23T12:34:56", "time_with_timezone": "12:34:56Z", "time_without_timezone": "12:34:56", "date": "2023-01-23", "unknown": {}}'), + (generate_uuid(), 
'2023-01-01T00:00:00Z', JSON'{"id": 2, "updated_at": "2023-01-01T01:00:00Z", "array": null, "struct": null, "string": null, "number": null, "integer": null, "boolean": null, "timestamp_with_timezone": null, "timestamp_without_timezone": null, "time_with_timezone": null, "time_without_timezone": null, "date": null, "unknown": null}'), + (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 3, "updated_at": "2023-01-01T01:00:00Z"}'), + (generate_uuid(), '2023-01-01T00:00:00Z', JSON'{"id": 4, "updated_at": "2023-01-01T01:00:00Z", "array": {}, "struct": [], "string": {}, "number": {}, "integer": {}, "boolean": {}, "timestamp_with_timezone": {}, "timestamp_without_timezone": {}, "time_with_timezone": {}, "time_without_timezone": {}, "date": {}, "unknown": null}'); + """)) .build()); final String sql = GENERATOR.updateTable("_foo", incrementalDedupStreamConfig()); logAndExecute(sql); final TableResult finalTable = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.finalTableId("_foo", QUOTE)).build()); - assertQueryResult( + DIFFER.diffFinalTableRecords( List.of( - new ImmutableMap.Builder>() - .put("id", Optional.of(1L)) - .put("updated_at", Optional.of(Instant.parse("2023-01-01T01:00:00Z"))) - .put("array", Optional.of(Jsons.deserialize( - """ - ["foo"] - """))) - .put("struct", Optional.of(Jsons.deserialize( - """ - {"foo": "bar"} - """))) - .put("string", Optional.of("foo")) - .put("number", Optional.of(new BigDecimal("42.1"))) - .put("integer", Optional.of(42L)) - .put("boolean", Optional.of(true)) - .put("timestamp_with_timezone", Optional.of(Instant.parse("2023-01-23T12:34:56Z"))) - .put("timestamp_without_timezone", Optional.of("2023-01-23T12:34:56")) - .put("time_with_timezone", Optional.of("12:34:56Z")) - .put("time_without_timezone", Optional.of("12:34:56")) - .put("date", Optional.of("2023-01-23")) - .put("_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z"))) - .put("_airbyte_meta", Optional.of(Jsons.deserialize( - """ 
- {"errors":[]} - """))) - .build(), - new ImmutableMap.Builder>() - .put("id", Optional.of(2L)) - .put("updated_at", Optional.of(Instant.parse("2023-01-01T01:00:00Z"))) - .put("array", Optional.empty()) - .put("struct", Optional.empty()) - .put("string", Optional.empty()) - .put("number", Optional.empty()) - .put("integer", Optional.empty()) - .put("boolean", Optional.empty()) - .put("timestamp_with_timezone", Optional.empty()) - .put("timestamp_without_timezone", Optional.empty()) - .put("time_with_timezone", Optional.empty()) - .put("time_without_timezone", Optional.empty()) - .put("date", Optional.empty()) - .put("_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z"))) - .put("_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":[]} - """))) - .build(), - new ImmutableMap.Builder>() - .put("id", Optional.of(3L)) - .put("updated_at", Optional.of(Instant.parse("2023-01-01T01:00:00Z"))) - .put("array", Optional.empty()) - .put("struct", Optional.empty()) - .put("string", Optional.empty()) - .put("number", Optional.empty()) - .put("integer", Optional.empty()) - .put("boolean", Optional.empty()) - .put("timestamp_with_timezone", Optional.empty()) - .put("timestamp_without_timezone", Optional.empty()) - .put("time_with_timezone", Optional.empty()) - .put("time_without_timezone", Optional.empty()) - .put("date", Optional.empty()) - .put("_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z"))) - .put("_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":[]} - """))) - .build(), - new ImmutableMap.Builder>() - .put("id", Optional.of(4L)) - .put("updated_at", Optional.of(Instant.parse("2023-01-01T01:00:00Z"))) - .put("array", Optional.empty()) - .put("struct", Optional.empty()) - .put("string", Optional.empty()) - .put("number", Optional.empty()) - .put("integer", Optional.empty()) - .put("boolean", Optional.empty()) - .put("timestamp_with_timezone", Optional.empty()) - .put("timestamp_without_timezone", 
Optional.empty()) - .put("time_with_timezone", Optional.empty()) - .put("time_without_timezone", Optional.empty()) - .put("date", Optional.empty()) - .put("_airbyte_extracted_at", Optional.of(Instant.parse("2023-01-01T00:00:00Z"))) - .put("_airbyte_meta", Optional.of(Jsons.deserialize( - """ - {"errors":[ + Jsons.deserialize( + """ + { + "id": 1, + "updated_at": "2023-01-01T01:00:00Z", + "array": ["foo"], + "struct": {"foo": "bar"}, + "string": "foo", + "number": 42.1, + "integer": 42, + "boolean": true, + "timestamp_with_timezone": "2023-01-23T12:34:56Z", + "timestamp_without_timezone": "2023-01-23T12:34:56", + "time_with_timezone": "12:34:56Z", + "time_without_timezone": "12:34:56", + "date": "2023-01-23", + "unknown": {}, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors": []} + } + """), + Jsons.deserialize( + """ + { + "id": 2, + "updated_at": "2023-01-01T01:00:00Z", + "unknown": null, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors": []} + } + """), + Jsons.deserialize( + """ + { + "id": 3, + "updated_at": "2023-01-01T01:00:00Z", + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": {"errors": []} + } + """), + Jsons.deserialize( + """ + { + "id": 4, + "updated_at": "2023-01-01T01:00:00Z", + "unknown": null, + "_airbyte_extracted_at": "2023-01-01T00:00:00Z", + "_airbyte_meta": { + "errors": [ "Problem with `struct`", "Problem with `array`", "Problem with `string`", @@ -495,10 +459,11 @@ public void testFullUpdateAllTypes() throws InterruptedException { "Problem with `time_with_timezone`", "Problem with `time_without_timezone`", "Problem with `date`" - ]} - """))) - .build()), - finalTable); + ] + } + } + """)), + toJsonRecords(finalTable)); final long rawRows = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.rawTableId(QUOTE)).build()).getTotalRows(); assertEquals(4, rawRows); @@ -512,14 +477,14 @@ public void testFullUpdateIncrementalDedup() throws 
InterruptedException { createRawTable(); createFinalTable(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.updateTable("", incrementalDedupStreamConfig()); @@ -540,14 +505,14 @@ public void testFullUpdateIncrementalAppend() throws InterruptedException { createRawTable(); createFinalTable("_foo"); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + 
"dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.updateTable("_foo", incrementalAppendStreamConfig()); @@ -571,17 +536,17 @@ public void testFullUpdateFullRefreshAppend() throws InterruptedException { createRawTable(); createFinalTable("_foo"); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": 
"Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); - - INSERT INTO ${dataset}.users_final_foo (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values - ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', JSON'{"errors": []}', 1, '2022-12-31T00:00:00Z', 'Alice', NULL, NULL); - """)) + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "updated_at": "2023-01-01T01:00:00Z", "string": "Alice", "struct": {"city": "San Francisco", "state": "CA"}, "integer": 42}', 'd7b81af0-01da-4846-a650-cc398986bc99', '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "updated_at": "2023-01-01T02:00:00Z", "string": "Alice", "struct": {"city": "San Diego", "state": "CA"}, "integer": 84}', '80c99b54-54b4-43bd-b51b-1f67dafa2c52', '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "updated_at": "2023-01-01T03:00:00Z", "string": "Bob", "integer": "oops"}', 'ad690bfb-c2c2-4172-bd73-a16c86ccbb67', '2023-01-01T00:00:00Z'); + + INSERT INTO ${dataset}.users_final_foo (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `updated_at`, `string`, `struct`, `integer`) values + ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', JSON'{"errors": []}', 1, '2022-12-31T00:00:00Z', 'Alice', NULL, NULL); + """)) .build()); final String sql = GENERATOR.updateTable("_foo", fullRefreshAppendStreamConfig()); @@ -614,30 +579,30 @@ public void testCdcUpdate() throws InterruptedException { createRawTable(); 
createFinalTableCdc(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - -- records from a previous sync - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES - (JSON'{"id": 1, "_ab_cdc_lsn": 900, "string": "spooky ghost", "_ab_cdc_deleted_at": null}', '64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'), - (JSON'{"id": 0, "_ab_cdc_lsn": 901, "string": "zombie", "_ab_cdc_deleted_at": "2022-12-31T00:O0:00Z"}', generate_uuid(), '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'), - (JSON'{"id": 5, "_ab_cdc_lsn": 902, "string": "will be deleted", "_ab_cdc_deleted_at": null}', 'b6139181-a42c-45c3-89f2-c4b4bb3a8c9d', '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'); - INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `_ab_cdc_lsn`, `string`, `struct`, `integer`) values - ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', JSON'{}', 1, 900, 'spooky ghost', NULL, NULL), - ('b6139181-a42c-45c3-89f2-c4b4bb3a8c9d', '2022-12-31T00:00:00Z', JSON'{}', 5, 901, 'will be deleted', NULL, NULL); - - -- new records from the current sync - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 2, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": null, "string": "alice"}', generate_uuid(), '2023-01-01T00:00:00Z'), - (JSON'{"id": 2, "_ab_cdc_lsn": 10002, "_ab_cdc_deleted_at": null, "string": "alice2"}', generate_uuid(), '2023-01-01T00:00:00Z'), - (JSON'{"id": 3, "_ab_cdc_lsn": 10003, "_ab_cdc_deleted_at": null, "string": "bob"}', generate_uuid(), '2023-01-01T00:00:00Z'), - (JSON'{"id": 1, "_ab_cdc_lsn": 10004, "_ab_cdc_deleted_at": "2022-12-31T23:59:59Z"}', generate_uuid(), '2023-01-01T00:00:00Z'), - (JSON'{"id": 0, "_ab_cdc_lsn": 10005, 
"_ab_cdc_deleted_at": null, "string": "zombie_returned"}', generate_uuid(), '2023-01-01T00:00:00Z'), - -- CDC generally outputs an explicit null for deleted_at, but verify that we can also handle the case where deleted_at is unset. - (JSON'{"id": 4, "_ab_cdc_lsn": 10006, "string": "charlie"}', generate_uuid(), '2023-01-01T00:00:00Z'), - -- Verify that we can handle weird values in deleted_at - (JSON'{"id": 5, "_ab_cdc_lsn": 10007, "_ab_cdc_deleted_at": {}, "string": "david"}', generate_uuid(), '2023-01-01T00:00:00Z'); - """)) + -- records from a previous sync + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES + (JSON'{"id": 1, "_ab_cdc_lsn": 900, "string": "spooky ghost", "_ab_cdc_deleted_at": null}', '64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'), + (JSON'{"id": 0, "_ab_cdc_lsn": 901, "string": "zombie", "_ab_cdc_deleted_at": "2022-12-31T00:O0:00Z"}', generate_uuid(), '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'), + (JSON'{"id": 5, "_ab_cdc_lsn": 902, "string": "will be deleted", "_ab_cdc_deleted_at": null}', 'b6139181-a42c-45c3-89f2-c4b4bb3a8c9d', '2022-12-31T00:00:00Z', '2022-12-31T00:00:01Z'); + INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `_ab_cdc_lsn`, `string`, `struct`, `integer`) values + ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2022-12-31T00:00:00Z', JSON'{}', 1, 900, 'spooky ghost', NULL, NULL), + ('b6139181-a42c-45c3-89f2-c4b4bb3a8c9d', '2022-12-31T00:00:00Z', JSON'{}', 5, 901, 'will be deleted', NULL, NULL); + + -- new records from the current sync + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 2, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": null, "string": "alice"}', generate_uuid(), '2023-01-01T00:00:00Z'), + (JSON'{"id": 2, "_ab_cdc_lsn": 10002, "_ab_cdc_deleted_at": null, "string": "alice2"}', generate_uuid(), 
'2023-01-01T00:00:00Z'), + (JSON'{"id": 3, "_ab_cdc_lsn": 10003, "_ab_cdc_deleted_at": null, "string": "bob"}', generate_uuid(), '2023-01-01T00:00:00Z'), + (JSON'{"id": 1, "_ab_cdc_lsn": 10004, "_ab_cdc_deleted_at": "2022-12-31T23:59:59Z"}', generate_uuid(), '2023-01-01T00:00:00Z'), + (JSON'{"id": 0, "_ab_cdc_lsn": 10005, "_ab_cdc_deleted_at": null, "string": "zombie_returned"}', generate_uuid(), '2023-01-01T00:00:00Z'), + -- CDC generally outputs an explicit null for deleted_at, but verify that we can also handle the case where deleted_at is unset. + (JSON'{"id": 4, "_ab_cdc_lsn": 10006, "string": "charlie"}', generate_uuid(), '2023-01-01T00:00:00Z'), + -- Verify that we can handle weird values in deleted_at + (JSON'{"id": 5, "_ab_cdc_lsn": 10007, "_ab_cdc_deleted_at": {}, "string": "david"}', generate_uuid(), '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.updateTable("", cdcStreamConfig()); @@ -678,18 +643,18 @@ public void testCdcOrdering_updateAfterDelete() throws InterruptedException { createRawTable(); createFinalTableCdc(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - -- Write raw deletion record from the first batch, which resulted in an empty final table. - -- Note the non-null loaded_at - this is to simulate that we previously ran T+D on this record. - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES - (JSON'{"id": 1, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": "2023-01-01T00:01:00Z"}', generate_uuid(), '2023-01-01T00:00:00Z', '2023-01-01T00:00:01Z'); - - -- insert raw record from the second record batch - this is an outdated record that should be ignored. 
- INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "_ab_cdc_lsn": 10000, "string": "alice"}', generate_uuid(), '2023-01-01T00:00:00Z'); - """)) + -- Write raw deletion record from the first batch, which resulted in an empty final table. + -- Note the non-null loaded_at - this is to simulate that we previously ran T+D on this record. + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES + (JSON'{"id": 1, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": "2023-01-01T00:01:00Z"}', generate_uuid(), '2023-01-01T00:00:00Z', '2023-01-01T00:00:01Z'); + + -- insert raw record from the second record batch - this is an outdated record that should be ignored. + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "_ab_cdc_lsn": 10000, "string": "alice"}', generate_uuid(), '2023-01-01T00:00:00Z'); + """)) .build()); final String sql = GENERATOR.updateTable("", cdcStreamConfig()); @@ -724,19 +689,19 @@ public void testCdcOrdering_insertAfterDelete() throws InterruptedException { createRawTable(); createFinalTableCdc(); bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - -- records from the first batch - INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES - (JSON'{"id": 1, "_ab_cdc_lsn": 10002, "string": "alice_reinsert"}', '64f4390f-3da1-4b65-b64a-a6c67497f18d', '2023-01-01T00:00:00Z', '2023-01-01T00:00:01Z'); - INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `_ab_cdc_lsn`, `string`) values - ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2023-01-01T00:00:00Z', JSON'{}', 1, 10002, 'alice_reinsert'); - - -- second record batch - INSERT 
INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES - (JSON'{"id": 1, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": "2023-01-01T00:01:00Z"}', generate_uuid(), '2023-01-01T00:00:00Z'); - """)) + -- records from the first batch + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`, `_airbyte_loaded_at`) VALUES + (JSON'{"id": 1, "_ab_cdc_lsn": 10002, "string": "alice_reinsert"}', '64f4390f-3da1-4b65-b64a-a6c67497f18d', '2023-01-01T00:00:00Z', '2023-01-01T00:00:01Z'); + INSERT INTO ${dataset}.users_final (_airbyte_raw_id, _airbyte_extracted_at, _airbyte_meta, `id`, `_ab_cdc_lsn`, `string`) values + ('64f4390f-3da1-4b65-b64a-a6c67497f18d', '2023-01-01T00:00:00Z', JSON'{}', 1, 10002, 'alice_reinsert'); + + -- second record batch + INSERT INTO ${dataset}.users_raw (`_airbyte_data`, `_airbyte_raw_id`, `_airbyte_extracted_at`) VALUES + (JSON'{"id": 1, "_ab_cdc_lsn": 10001, "_ab_cdc_deleted_at": "2023-01-01T00:01:00Z"}', generate_uuid(), '2023-01-01T00:00:00Z'); + """)) .build()); // Run the second round of typing and deduping. This should do nothing to the final table, because // the delete is outdated. @@ -808,18 +773,18 @@ private StreamConfig fullRefreshOverwriteStreamConfig() { // Some of them are identical to what the sql generator does, and that's intentional. 
private void createRawTable() throws InterruptedException { bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - CREATE TABLE ${dataset}.users_raw ( - _airbyte_raw_id STRING NOT NULL, - _airbyte_data JSON NOT NULL, - _airbyte_extracted_at TIMESTAMP NOT NULL, - _airbyte_loaded_at TIMESTAMP - ) PARTITION BY ( - DATE_TRUNC(_airbyte_extracted_at, DAY) - ) CLUSTER BY _airbyte_loaded_at; - """)) + CREATE TABLE ${dataset}.users_raw ( + _airbyte_raw_id STRING NOT NULL, + _airbyte_data JSON NOT NULL, + _airbyte_extracted_at TIMESTAMP NOT NULL, + _airbyte_loaded_at TIMESTAMP + ) PARTITION BY ( + DATE_TRUNC(_airbyte_extracted_at, DAY) + ) CLUSTER BY _airbyte_loaded_at; + """)) .build()); } @@ -829,63 +794,63 @@ private void createFinalTable() throws InterruptedException { private void createFinalTable(String suffix) throws InterruptedException { bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset, - "suffix", suffix)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset, + "suffix", suffix)).replace( """ - CREATE TABLE ${dataset}.users_final${suffix} ( - _airbyte_raw_id STRING NOT NULL, - _airbyte_extracted_at TIMESTAMP NOT NULL, - _airbyte_meta JSON NOT NULL, - `id` INT64, - `updated_at` TIMESTAMP, - `struct` JSON, - `array` JSON, - `string` STRING, - `number` NUMERIC, - `integer` INT64, - `boolean` BOOL, - `timestamp_with_timezone` TIMESTAMP, - `timestamp_without_timezone` DATETIME, - `time_with_timezone` STRING, - `time_without_timezone` TIME, - `date` DATE, - `unknown` JSON - ) - PARTITION BY (DATE_TRUNC(_airbyte_extracted_at, DAY)) - CLUSTER BY id, _airbyte_extracted_at; - """)) + CREATE TABLE ${dataset}.users_final${suffix} ( + _airbyte_raw_id STRING NOT NULL, + _airbyte_extracted_at TIMESTAMP NOT NULL, + _airbyte_meta JSON NOT NULL, + `id` INT64, + `updated_at` TIMESTAMP, + 
`struct` JSON, + `array` JSON, + `string` STRING, + `number` NUMERIC, + `integer` INT64, + `boolean` BOOL, + `timestamp_with_timezone` TIMESTAMP, + `timestamp_without_timezone` DATETIME, + `time_with_timezone` STRING, + `time_without_timezone` TIME, + `date` DATE, + `unknown` JSON + ) + PARTITION BY (DATE_TRUNC(_airbyte_extracted_at, DAY)) + CLUSTER BY id, _airbyte_extracted_at; + """)) .build()); } private void createFinalTableCdc() throws InterruptedException { bq.query(QueryJobConfiguration.newBuilder( - new StringSubstitutor(Map.of( - "dataset", testDataset)).replace( + new StringSubstitutor(Map.of( + "dataset", testDataset)).replace( """ - CREATE TABLE ${dataset}.users_final ( - _airbyte_raw_id STRING NOT NULL, - _airbyte_extracted_at TIMESTAMP NOT NULL, - _airbyte_meta JSON NOT NULL, - `id` INT64, - `_ab_cdc_deleted_at` TIMESTAMP, - `_ab_cdc_lsn` INT64, - `struct` JSON, - `array` JSON, - `string` STRING, - `number` NUMERIC, - `integer` INT64, - `boolean` BOOL, - `timestamp_with_timezone` TIMESTAMP, - `timestamp_without_timezone` DATETIME, - `time_with_timezone` STRING, - `time_without_timezone` TIME, - `date` DATE, - `unknown` JSON - ) - PARTITION BY (DATE_TRUNC(_airbyte_extracted_at, DAY)) - CLUSTER BY id, _airbyte_extracted_at; - """)) + CREATE TABLE ${dataset}.users_final ( + _airbyte_raw_id STRING NOT NULL, + _airbyte_extracted_at TIMESTAMP NOT NULL, + _airbyte_meta JSON NOT NULL, + `id` INT64, + `_ab_cdc_deleted_at` TIMESTAMP, + `_ab_cdc_lsn` INT64, + `struct` JSON, + `array` JSON, + `string` STRING, + `number` NUMERIC, + `integer` INT64, + `boolean` BOOL, + `timestamp_with_timezone` TIMESTAMP, + `timestamp_without_timezone` DATETIME, + `time_with_timezone` STRING, + `time_without_timezone` TIME, + `date` DATE, + `unknown` JSON + ) + PARTITION BY (DATE_TRUNC(_airbyte_extracted_at, DAY)) + CLUSTER BY id, _airbyte_extracted_at; + """)) .build()); } @@ -899,10 +864,10 @@ private static void logAndExecute(final String sql) throws InterruptedException * 
That's annoying for us since we're working with small test data, so pull everything into a list, and convert them * into maps of column name -> value. *

- * Note that the values have reasonable types; see {@link #toMap(Schema, FieldValueList)} for details. + * Note that the values have reasonable types; see {@link #toJson(Schema, FieldValueList)} for details. */ - public static List> toMaps(TableResult result) { - return result.streamAll().map(row -> toMap(result.getSchema(), row)).toList(); + public static List toJsonRecords(TableResult result) { + return result.streamAll().map(row -> toJson(result.getSchema(), row)).toList(); } /** @@ -911,209 +876,33 @@ public static List> toMaps(TableResult result) { *

* SQL nulls are represented as explicit null values. JSON nulls are represented as {@link com.fasterxml.jackson.databind.node.NullNode}. */ - private static LinkedHashMap toMap(Schema schema, FieldValueList row) { - final LinkedHashMap map = new LinkedHashMap<>(); + private static JsonNode toJson(Schema schema, FieldValueList row) { + final ObjectNode json = (ObjectNode) Jsons.emptyObject(); for (int i = 0; i < schema.getFields().size(); i++) { final Field field = schema.getFields().get(i); final FieldValue value = row.get(i); - Object typedValue; - if (value.isNull()) { - typedValue = null; - } else { + JsonNode typedValue; + if (!value.isNull()) { typedValue = switch (field.getType().getStandardType()) { - case BOOL -> value.getBooleanValue(); - case INT64 -> value.getLongValue(); - case FLOAT64 -> value.getDoubleValue(); - case NUMERIC, BIGNUMERIC -> value.getNumericValue(); - case STRING -> value.getStringValue(); - case BYTES -> value.getBytesValue(); - case TIMESTAMP -> value.getTimestampInstant(); + case BOOL -> Jsons.jsonNode(value.getBooleanValue()); + case INT64 -> Jsons.jsonNode(value.getLongValue()); + case FLOAT64 -> Jsons.jsonNode(value.getDoubleValue()); + case NUMERIC, BIGNUMERIC -> Jsons.jsonNode(value.getNumericValue()); + case STRING -> Jsons.jsonNode(value.getStringValue()); + // naively converting an Instant returns a DecimalNode with the unix epoch, so instead we manually stringify it + case TIMESTAMP -> Jsons.jsonNode(value.getTimestampInstant().toString()); // value.getTimestampInstant() fails to parse these types - case DATE, DATETIME, TIME -> value.getStringValue(); + case DATE, DATETIME, TIME -> Jsons.jsonNode(value.getStringValue()); // bigquery returns JSON columns as string; manually parse it into a JsonNode - case JSON -> Jsons.deserialize(value.getStringValue()); + case JSON -> Jsons.jsonNode(Jsons.deserialize(value.getStringValue())); // Default case for weird types (struct, array, geography, interval) - default -> 
value.getStringValue(); + default -> Jsons.jsonNode(value.getStringValue()); }; + json.set(field.getName(), typedValue); } - map.put(field.getName(), typedValue); - } - return map; - } - - /** - * Asserts that the expected rows match the query result. Please don't read this code. Trust the - * logs. - */ - private void assertQueryResult(final List>> expectedRows, final TableResult result) { - List> actualRows = toMaps(result); - List>> missingRows = new ArrayList<>(); - Set> matchedRows = new HashSet<>(); - boolean foundMultiMatch = false; - // For each expected row, iterate through all actual rows to find a match. - for (Map> expectedRow : expectedRows) { - final List> matchingRows = actualRows.stream().filter(actualRow -> { - // We only want to check the fields that are specified in the expected row. - // E.g.we shouldn't assert against randomized UUIDs. - for (Entry> expectedEntry : expectedRow.entrySet()) { - // If the expected value is empty, we just check that the actual value is null. - if (expectedEntry.getValue().isEmpty()) { - if (actualRow.get(expectedEntry.getKey()) != null) { - // It wasn't null, so this actualRow doesn't match the expected row - return false; - } else { - // It _was_ null, so we can move on the next key. - continue; - } - } - // If the expected value is non-empty, we check that the actual value matches. - if (!expectedEntry.getValue().get().equals(actualRow.get(expectedEntry.getKey()))) { - return false; - } - } - return true; - }).toList(); - - if (matchingRows.size() == 0) { - missingRows.add(expectedRow); - } else if (matchingRows.size() > 1) { - foundMultiMatch = true; - } - matchedRows.addAll(matchingRows); - } - - // TODO is the foundMultiMatch condition correct? E.g. what if we try to write the same row twice - // (because of a retry)? Are we - // guaranteed to have some differentiator? 
- if (foundMultiMatch || !missingRows.isEmpty() || matchedRows.size() != actualRows.size()) { - Set> extraRows = actualRows.stream().filter(row -> !matchedRows.contains(row)).collect(toSet()); - fail(diff(missingRows, extraRows)); - } - } - - private static String sortedToString(Map record) { - return sortedToString(record, Function.identity()); - } - - private static String sortedToString(Map record, Function valueMapper) { - return "{" - + record.entrySet().stream() - .sorted(Entry.comparingByKey()) - .map(entry -> entry.getKey() + "=" + valueMapper.apply(entry.getValue())) - .collect(Collectors.joining(", ")) - + "}"; - } - - /** - * Attempts to generate a pretty-print diff of the rows. Output will look something like: - * {@code Missing row: {id=1} Extra row: {id=2} Mismatched row: id=3; foo_column expected String - * arst, got Long 42 } - * - * Assumes that rows with the same id and cursor are the same row. - */ - private static String diff(List>> missingRowsRaw, Set> extraRowsRaw) { - List> missingRows = missingRowsRaw.stream() - .map(row -> { - // Extract everything from inside the optionals. 
- Map newRow = new HashMap<>(); - for (Entry> entry : row.entrySet()) { - newRow.put(entry.getKey(), entry.getValue().orElse(null)); - } - return newRow; - }).sorted(ROW_COMPARATOR) - .toList(); - - List> extraRows = extraRowsRaw.stream().sorted(ROW_COMPARATOR).toList(); - - String output = ""; - int missingIndex = 0; - int extraIndex = 0; - while (missingIndex < missingRows.size() && extraIndex < extraRows.size()) { - Map missingRow = missingRows.get(missingIndex); - Map extraRow = extraRows.get(extraIndex); - int compare = ROW_COMPARATOR.compare(missingRow, extraRow); - if (compare < 0) { - // missing row is too low - we should print missing rows until we catch up - output += "Missing row: " + sortedToString(missingRow) + "\n"; - missingIndex++; - } else if (compare == 0) { - // rows match - we should print the diff between them - output += "Mismatched row: "; - if (missingRow.containsKey(ID_COLUMN.name())) { - output += "id=" + missingRow.get(ID_COLUMN.name()) + "; "; - } - if (missingRow.containsKey(CURSOR.name())) { - output += "updated_at=" + missingRow.get(CURSOR.name()) + "; "; - } - if (missingRow.containsKey(CDC_CURSOR.name())) { - output += "_ab_cdc_lsn=" + missingRow.get(CDC_CURSOR.name()) + "; "; - } - output += "\n"; - for (String key : missingRow.keySet().stream().sorted().toList()) { - Object missingValue = missingRow.get(key); - Object extraValue = extraRow.get(key); - if (!Objects.equals(missingValue, extraValue)) { - output += " " + key + " expected " + getClassAndValue(missingValue) + ", got " + getClassAndValue(extraValue) + "\n"; - } - } - - missingIndex++; - extraIndex++; - } else { - // extra row is too low - we should print extra rows until we catch up - output += "Extra row: " + sortedToString(extraRow) + "\n"; - extraIndex++; - } - } - while (missingIndex < missingRows.size()) { - Map missingRow = missingRows.get(missingIndex); - output += "Missing row: " + sortedToString(missingRow) + "\n"; - missingIndex++; - } - while (extraIndex < 
extraRows.size()) { - Map extraRow = extraRows.get(extraIndex); - output += "Extra row: " + sortedToString(extraRow) + "\n"; - extraIndex++; - } - return output; - } - - /** - * Compare two rows on the given column. Sorts nulls first. If the values are not the same type, - * assumes the left value is smaller. - */ - private static int compareRowsOnColumn(String column, Map row1, Map row2) { - Comparable r1id = (Comparable) row1.get(column); - Comparable r2id = (Comparable) row2.get(column); - if (r1id == null) { - if (r2id == null) { - return 0; - } else { - return -1; - } - } else { - if (r2id == null) { - return 1; - } else { - if (r1id.getClass().equals(r2id.getClass())) { - // We're doing some very sketchy type-casting nonsense here, but it's guarded by the class equality - // check. - return ((Comparable) r1id).compareTo(r2id); - } else { - // Both values are non-null, but they're not the same type. Assume left is smaller. - return -1; - } - } - } - } - - private static String getClassAndValue(Object o) { - if (o == null) { - return null; - } else { - return o.getClass().getSimpleName() + " " + o; } + return json; } } From 844bba66f733a6e92cb46570acc4e04b7641a474 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 10:56:52 -0700 Subject: [PATCH 25/46] fix --- .../base/destination/typing_deduping/RecordDiffer.java | 8 ++++---- .../BigQuerySqlGeneratorIntegrationTest.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 5e4385c0f4e9..96607adde3de 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -50,11 +50,11 @@ public RecordDiffer(Pair... columns) { rawRecordIdentityExtractor = record -> Arrays.stream(columns) .map(column -> getPrintableFieldIfPresent(record.get("_airbyte_data"), column.getKey())) - .collect(Collectors.joining("; ")) + .collect(Collectors.joining(", ")) + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); finalRecordIdentityExtractor = record -> Arrays.stream(columns) .map(column -> getPrintableFieldIfPresent(record, column.getKey())) - .collect(Collectors.joining("; ")) + .collect(Collectors.joining(", ")) + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); } @@ -99,7 +99,7 @@ public void diffFinalTableRecords(List expectedRecords, List private static String getPrintableFieldIfPresent(JsonNode record, String field) { if (record.has(field)) { - return field + "=" + record.get(field) + "; "; + return field + "=" + record.get(field); } else { return ""; } @@ -139,7 +139,7 @@ private static String diffRecords(List originalExpectedRecords, if (compare == 0) { // These records should be the same. Find the specific fields that are different. boolean foundMismatch = false; - String mismatchedRecordMessage = "Row had incorrect data:" + recordIdExtractor.apply(expectedRecord) + "\n"; + String mismatchedRecordMessage = "Row had incorrect data: " + recordIdExtractor.apply(expectedRecord) + "\n"; // Iterate through each column in the expected record and compare it to the actual record's value. 
for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { if (extractRawData && "_airbyte_data".equals(column)) { diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index 2c88a54b74c9..e04a963c2fec 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -333,7 +333,7 @@ public void testDedupRawTable() throws InterruptedException { logAndExecute(sql); final TableResult result = bq.query(QueryJobConfiguration.newBuilder("SELECT * FROM " + streamId.rawTableId(QUOTE)).build()); - DIFFER.diffFinalTableRecords( + DIFFER.diffRawTableRecords( List.of( Jsons.deserialize( """ From 0f1e7ab62de8c359958ee0eb892216213f7e6100 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 10:59:47 -0700 Subject: [PATCH 26/46] comment --- .../base/destination/typing_deduping/RecordDiffer.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 96607adde3de..daed5a4f2bbc 100644 --- 
a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -106,9 +106,13 @@ private static String getPrintableFieldIfPresent(JsonNode record, String field) } /** - * Generate a human-readable diff between the two lists. Only checks the keys specified in - * expectedRecords. Assumes (in general) that two records with the same PK, cursor, and extracted_at - * are the same record. + * Generate a human-readable diff between the two lists. Assumes (in general) that two records with + * the same PK, cursor, and extracted_at are the same record. + *

+ * Verifies that all values specified in the expected records are correct (_including_ raw_id), and + * that no other fields are present (except for loaded_at and raw_id). We assume that it's impossible + * to verify loaded_at, since it's generated dynamically; however, we do provide the ability to assert + * on the exact raw_id if desired; we simply assume that raw_id is always expected to be present. * * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same * PK+cursor+extracted_at) From dc5ab242f700e426f9436060db429b21df6b1921 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 11:04:12 -0700 Subject: [PATCH 27/46] naming+comment --- .../destination/typing_deduping/RecordDiffer.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index daed5a4f2bbc..c3ee5bba7234 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -35,11 +35,16 @@ public class RecordDiffer { private final Comparator finalRecordSortComparator; private final Function finalRecordIdentityExtractor; - public RecordDiffer(Pair... columns) { + /** + * @param identifyingColumns Which fields constitute a unique record (typically PK+cursor). Do _not_ + * include extracted_at; it is handled automatically. + */ + public RecordDiffer(Pair... 
identifyingColumns) { // Start with a noop comparator for convenience + // The raw and final stuff are almost identical, except the raw version has to extract _airbyte_data first. Comparator rawIdComp = Comparator.comparing(record -> 0); Comparator finalIdComp = Comparator.comparing(record -> 0); - for (Pair column : columns) { + for (Pair column : identifyingColumns) { rawIdComp = rawIdComp.thenComparing(record -> extract(record.get("_airbyte_data"), column.getKey(), column.getValue())); finalIdComp = finalIdComp.thenComparing(record -> extract(record, column.getKey(), column.getValue())); } @@ -48,11 +53,11 @@ public RecordDiffer(Pair... columns) { this.finalRecordIdentityComparator = finalIdComp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); this.finalRecordSortComparator = finalRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - rawRecordIdentityExtractor = record -> Arrays.stream(columns) + rawRecordIdentityExtractor = record -> Arrays.stream(identifyingColumns) .map(column -> getPrintableFieldIfPresent(record.get("_airbyte_data"), column.getKey())) .collect(Collectors.joining(", ")) + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); - finalRecordIdentityExtractor = record -> Arrays.stream(columns) + finalRecordIdentityExtractor = record -> Arrays.stream(identifyingColumns) .map(column -> getPrintableFieldIfPresent(record, column.getKey())) .collect(Collectors.joining(", ")) + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); From a607793dc05ed7b59e182af6c2a35d1870953cc0 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 11:09:43 -0700 Subject: [PATCH 28/46] one more comment --- .../base/destination/typing_deduping/RecordDiffer.java | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index c3ee5bba7234..38d772d9e5cc 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -353,6 +353,7 @@ private static LocalDate asDate(JsonNode node) { } } + // Generics? Never heard of 'em. (I'm sorry) private static Comparable extract(JsonNode node, String field, AirbyteType type) { if (type instanceof AirbyteProtocolType t) { return switch (t) { From 84c387da95ab03b77b1ab7018ad78ea8d69f07e1 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 11:18:18 -0700 Subject: [PATCH 29/46] better assert --- .../destination/typing_deduping/RecordDiffer.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 38d772d9e5cc..88e4feb751b5 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -1,7 +1,6 @@ package io.airbyte.integrations.base.destination.typing_deduping; -import static org.junit.jupiter.api.Assertions.assertAll; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.*; import com.fasterxml.jackson.databind.JsonNode; import 
com.google.common.collect.Streams; @@ -87,7 +86,9 @@ public void diffRawTableRecords(List expectedRecords, List a rawRecordIdentityExtractor, true); - assertTrue(diff.isEmpty(), "Raw table was incorrect.\n" + diff); + if (!diff.isEmpty()) { + fail("Raw table was incorrect.\n" + diff); + } } public void diffFinalTableRecords(List expectedRecords, List actualRecords) { @@ -99,7 +100,9 @@ public void diffFinalTableRecords(List expectedRecords, List finalRecordIdentityExtractor, false); - assertTrue(diff.isEmpty(), "Final table was incorrect.\n" + diff); + if (!diff.isEmpty()) { + fail("Final table was incorrect.\n" + diff); + } } private static String getPrintableFieldIfPresent(JsonNode record, String field) { From ffa9df0afc2f7604e2f5b409995c7ee2fb7a87e2 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 11:19:47 -0700 Subject: [PATCH 30/46] remove unnecessary thing --- .../typing_deduping/AirbyteType.java | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java b/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java index 2d3b0628e8c4..7c6dcc28597d 100644 --- a/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java +++ b/airbyte-integrations/bases/base-typing-deduping/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/AirbyteType.java @@ -74,8 +74,6 @@ static AirbyteType fromJsonSchema(final JsonNode schema) { return AirbyteTypeUtils.getAirbyteProtocolType(schema); } - public LinkedHashMap asColumns(); - private static Struct getStruct(final JsonNode schema) { final LinkedHashMap propertiesMap = new LinkedHashMap<>(); final JsonNode properties = schema.get("properties"); @@ -109,11 +107,6 @@ public static AirbyteProtocolType 
matches(final String type) { } } - @Override - public LinkedHashMap asColumns() { - throw new UnsupportedOperationException("Basic types cannot be converted to columns."); - } - } /** @@ -121,18 +114,10 @@ public LinkedHashMap asColumns() { */ record Struct(LinkedHashMap properties) implements AirbyteType { - @Override - public LinkedHashMap asColumns() { - return properties; - } } record Array(AirbyteType items) implements AirbyteType { - @Override - public LinkedHashMap asColumns() { - throw new UnsupportedOperationException("Arrays cannot be converted to columns."); - } } /** @@ -142,10 +127,6 @@ public LinkedHashMap asColumns() { */ record UnsupportedOneOf(List options) implements AirbyteType { - @Override - public LinkedHashMap asColumns() { - throw new UnsupportedOperationException("OneOf cannot be converted to columns."); - } } /** From ffd3e3f28e7cd47a98ef146010878b9a985d8cf4 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 3 Jul 2023 11:24:17 -0700 Subject: [PATCH 31/46] one last thing --- .../BigQuerySqlGeneratorIntegrationTest.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java index e04a963c2fec..fd3c9bb20088 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQuerySqlGeneratorIntegrationTest.java @@ -861,10 +861,7 @@ private static void logAndExecute(final String sql) throws 
InterruptedException /** * TableResult contains records in a somewhat nonintuitive format (and it avoids loading them all into memory). - * That's annoying for us since we're working with small test data, so pull everything into a list, and convert them - * into maps of column name -> value. - *

- * Note that the values have reasonable types; see {@link #toJson(Schema, FieldValueList)} for details. + * That's annoying for us since we're working with small test data, so just pull everything into a list. */ public static List toJsonRecords(TableResult result) { return result.streamAll().map(row -> toJson(result.getSchema(), row)).toList(); @@ -872,9 +869,8 @@ public static List toJsonRecords(TableResult result) { /** * FieldValueList stores everything internally as string (I think?) but provides conversions to more useful types. - * This method does that conversion, using the schema to determine which type is most appropriate. - *

- * SQL nulls are represented as explicit null values. JSON nulls are represented as {@link com.fasterxml.jackson.databind.node.NullNode}. + * This method does that conversion, using the schema to determine which type is most appropriate. Then we just dump + * everything into a jsonnode for interop with RecordDiffer. */ private static JsonNode toJson(Schema schema, FieldValueList row) { final ObjectNode json = (ObjectNode) Jsons.emptyObject(); @@ -896,7 +892,7 @@ private static JsonNode toJson(Schema schema, FieldValueList row) { // bigquery returns JSON columns as string; manually parse it into a JsonNode case JSON -> Jsons.jsonNode(Jsons.deserialize(value.getStringValue())); - // Default case for weird types (struct, array, geography, interval) + // Default case for weird types (struct, array, geography, interval, bytes) default -> Jsons.jsonNode(value.getStringValue()); }; json.set(field.getName(), typedValue); From 3dbaa160f0367ad2eb0e8dc43d09d71a7dbf4622 Mon Sep 17 00:00:00 2001 From: edgao Date: Mon, 3 Jul 2023 23:08:13 +0000 Subject: [PATCH 32/46] Automated Commit - Formatting Changes --- .../BaseTypingDedupingTest.java | 13 +------ .../typing_deduping/RecordDiffer.java | 38 +++++++++++-------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index a8e82447f661..c8051c58cad9 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -4,12 +4,8 @@ package 
io.airbyte.integrations.base.destination.typing_deduping; -import static org.junit.jupiter.api.Assertions.assertAll; -import static org.junit.jupiter.api.Assertions.assertTrue; - import com.fasterxml.jackson.databind.JsonNode; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Streams; import io.airbyte.commons.features.EnvVariableFeatureFlags; import io.airbyte.commons.json.Jsons; import io.airbyte.commons.lang.Exceptions; @@ -30,15 +26,9 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.time.Instant; import java.util.Collections; -import java.util.Comparator; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.Objects; import java.util.UUID; -import java.util.function.Function; import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.AfterEach; @@ -81,8 +71,7 @@ public abstract class BaseTypingDedupingTest { private static final RecordDiffer DIFFER = new RecordDiffer( Pair.of("id1", AirbyteProtocolType.INTEGER), Pair.of("id2", AirbyteProtocolType.INTEGER), - Pair.of("updated_at", AirbyteProtocolType.TIMESTAMP_WITH_TIMEZONE) - ); + Pair.of("updated_at", AirbyteProtocolType.TIMESTAMP_WITH_TIMEZONE)); private String randomSuffix; private JsonNode config; diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 88e4feb751b5..0bc8543a3c24 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java 
@@ -1,3 +1,7 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + package io.airbyte.integrations.base.destination.typing_deduping; import static org.junit.jupiter.api.Assertions.*; @@ -23,7 +27,8 @@ import org.apache.commons.lang3.tuple.Pair; /** - * Utility class to generate human-readable diffs between expected and actual records. Assumes 1s1t output format. + * Utility class to generate human-readable diffs between expected and actual records. Assumes 1s1t + * output format. */ public class RecordDiffer { @@ -36,11 +41,12 @@ public class RecordDiffer { /** * @param identifyingColumns Which fields constitute a unique record (typically PK+cursor). Do _not_ - * include extracted_at; it is handled automatically. + * include extracted_at; it is handled automatically. */ public RecordDiffer(Pair... identifyingColumns) { // Start with a noop comparator for convenience - // The raw and final stuff are almost identical, except the raw version has to extract _airbyte_data first. + // The raw and final stuff are almost identical, except the raw version has to extract _airbyte_data + // first. Comparator rawIdComp = Comparator.comparing(record -> 0); Comparator finalIdComp = Comparator.comparing(record -> 0); for (Pair column : identifyingColumns) { @@ -63,9 +69,9 @@ public RecordDiffer(Pair... identifyingColumns) { } /** - * In the expected records, a SQL null is represented as a JsonNode without that field at all, and a JSON null is - * represented as a NullNode. For example, in the JSON blob {"name": null}, the `name` field is a JSON null, and the - * `address` field is a SQL null. + * In the expected records, a SQL null is represented as a JsonNode without that field at all, and a + * JSON null is represented as a NullNode. For example, in the JSON blob {"name": null}, the `name` + * field is a JSON null, and the `address` field is a SQL null. 
*/ public void verifySyncResult(List expectedRawRecords, List actualRawRecords, @@ -73,8 +79,7 @@ public void verifySyncResult(List expectedRawRecords, List actualFinalRecords) { assertAll( () -> diffRawTableRecords(expectedRawRecords, actualRawRecords), - () -> diffFinalTableRecords(expectedFinalRecords, actualFinalRecords) - ); + () -> diffFinalTableRecords(expectedFinalRecords, actualFinalRecords)); } public void diffRawTableRecords(List expectedRecords, List actualRecords) { @@ -118,9 +123,10 @@ private static String getPrintableFieldIfPresent(JsonNode record, String field) * the same PK, cursor, and extracted_at are the same record. *

* Verifies that all values specified in the expected records are correct (_including_ raw_id), and - * that no other fields are present (except for loaded_at and raw_id). We assume that it's impossible - * to verify loaded_at, since it's generated dynamically; however, we do provide the ability to assert - * on the exact raw_id if desired; we simply assume that raw_id is always expected to be present. + * that no other fields are present (except for loaded_at and raw_id). We assume that it's + * impossible to verify loaded_at, since it's generated dynamically; however, we do provide the + * ability to assert on the exact raw_id if desired; we simply assume that raw_id is always expected + * to be present. * * @param identityComparator Returns 0 iff two records are the "same" record (i.e. have the same * PK+cursor+extracted_at) @@ -237,8 +243,8 @@ private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode a /** * Verify that all fields in the actual record are present in the expected record. This is primarily * relevant for detecting fields that we expected to be null, but actually were not. See - * {@link BaseTypingDedupingTest#dumpFinalTableRecords(String, String)} for an explanation of how SQL/JSON nulls are - * represented in the expected record. + * {@link BaseTypingDedupingTest#dumpFinalTableRecords(String, String)} for an explanation of how + * SQL/JSON nulls are represented in the expected record. *

* This has the side benefit of detecting completely unexpected columns, which would be a very weird * bug but is probably still useful to catch. @@ -255,8 +261,9 @@ private static LinkedHashMap checkForExtraOrNonNullFields(Json } /** - * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". The leading spaces are - * intentional, to make the message easier to read when it's embedded in a larger stacktrace. + * Produce a pretty-printed error message, e.g. " For column foo, expected 1 but got 2". The leading + * spaces are intentional, to make the message easier to read when it's embedded in a larger + * stacktrace. */ private static String generateFieldError(String fieldname, JsonNode expectedValue, JsonNode actualValue) { String expectedString = expectedValue == null ? "SQL NULL (i.e. no value)" : expectedValue.toString(); @@ -375,4 +382,5 @@ private static Comparable extract(JsonNode node, String field, AirbyteType type) return node.toString(); } } + } From 97f8c199083dedc6e1ca5ec761201877797f1cd3 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Wed, 5 Jul 2023 10:06:55 -0700 Subject: [PATCH 33/46] enable concurrent execution on all java integration tests --- .../typing_deduping/BaseTypingDedupingTest.java | 3 --- .../connectors/destination-bigquery/build.gradle | 8 -------- .../src/main/groovy/airbyte-integration-test-java.gradle | 5 +++++ 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index c8051c58cad9..8d264a5cd2e5 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -51,9 +51,6 @@ * sync modes which use a cursor, the stream provides an updated_at field. The stream also has an * _ab_cdc_deleted_at field. */ -// Remember to set `'junit.jupiter.execution.parallel.enabled': 'true'` in your connector's -// build.gradle. -// See destination-bigquery for an example. // If you're running from inside intellij, you must run your specific subclass to get concurrent // execution. @Execution(ExecutionMode.CONCURRENT) diff --git a/airbyte-integrations/connectors/destination-bigquery/build.gradle b/airbyte-integrations/connectors/destination-bigquery/build.gradle index 2229ad250b72..a413759fed0f 100644 --- a/airbyte-integrations/connectors/destination-bigquery/build.gradle +++ b/airbyte-integrations/connectors/destination-bigquery/build.gradle @@ -52,11 +52,3 @@ configurations.all { force 'com.google.api-client:google-api-client:1.31.5' } } - -integrationTestJava { - systemProperties = [ - 'junit.jupiter.execution.parallel.enabled': 'true' - // TODO what's preventing us from turning this on? 
(probably a lot of things) - // 'junit.jupiter.execution.parallel.mode.default': 'concurrent' - ] -} diff --git a/buildSrc/src/main/groovy/airbyte-integration-test-java.gradle b/buildSrc/src/main/groovy/airbyte-integration-test-java.gradle index a6938bc96791..e650889c417c 100644 --- a/buildSrc/src/main/groovy/airbyte-integration-test-java.gradle +++ b/buildSrc/src/main/groovy/airbyte-integration-test-java.gradle @@ -53,6 +53,11 @@ class AirbyteIntegrationTestJavaPlugin implements Plugin { // This is needed to make the destination-snowflake tests succeed - https://github.com/snowflakedb/snowflake-jdbc/issues/589#issuecomment-983944767 jvmArgs = ["--add-opens=java.base/java.nio=ALL-UNNAMED"] + + systemProperties = [ + // Allow tests to set @Execution(ExecutionMode.CONCURRENT) + 'junit.jupiter.execution.parallel.enabled': 'true' + ] } // make sure we create the integrationTest task once in case a standard source test was already initialized From 9934901454a8ddf9877f1a027941fc82254c38fd Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 6 Jul 2023 09:44:19 -0700 Subject: [PATCH 34/46] add test for default namespace --- .../BaseTypingDedupingTest.java | 50 ++++++++++++++++++- .../AbstractBigQueryTypingDedupingTest.java | 3 ++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 8d264a5cd2e5..75763214272a 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -87,6 +87,8 @@ public 
abstract class BaseTypingDedupingTest { * Subclasses should _not_ start testcontainers in this method; that belongs in a BeforeAll method. * The tests in this class are intended to be run concurrently on a shared database and will not * interfere with each other. + *

+ Subclasses which need access to the config may use {@link #getConfig()}. */ protected abstract JsonNode generateConfig() throws Exception; @@ -302,6 +304,41 @@ public void incrementalDedup() throws Exception { verifySyncResult(expectedRawRecords2, expectedFinalRecords2); } + /** + * Identical to {@link #incrementalDedup()}, except that the stream has no namespace. + */ + @Test + public void incrementalDedupDefaultNamespace() throws Exception { + ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.INCREMENTAL) + .withCursorField(List.of("updated_at")) + .withDestinationSyncMode(DestinationSyncMode.APPEND_DEDUP) + .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) + .withStream(new AirbyteStream() + // NB: we don't call `withNamespace` here + .withName(streamName) + .withJsonSchema(SCHEMA)))); + + // First sync + List messages1 = readMessages("sync1_messages.jsonl", null, streamName); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_dedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1, null, streamName); + + // Second sync + List messages2 = readMessages("sync2_messages.jsonl", null, streamName); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2, null, streamName); + } + @Test @Disabled("Not yet implemented") public void testLineBreakCharacters() throws Exception { @@ -401,12 +438,19 @@ public void testDataTypes() throws Exception { } private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords) throws Exception { +
verifySyncResult(expectedRawRecords, expectedFinalRecords, streamNamespace, streamName); + } + + private void verifySyncResult(List expectedRawRecords, + List expectedFinalRecords, + String streamNamespace, + String streamName) throws Exception { List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); List actualFinalRecords = dumpFinalTableRecords(streamNamespace, streamName); DIFFER.verifySyncResult(expectedRawRecords, actualRawRecords, expectedFinalRecords, actualFinalRecords); } - private List readRecords(String filename) throws IOException { + private static List readRecords(String filename) throws IOException { return MoreResources.readResource(filename).lines() .map(String::trim) .filter(line -> !line.isEmpty()) @@ -416,6 +460,10 @@ private List readRecords(String filename) throws IOException { } private List readMessages(String filename) throws IOException { + return readMessages(filename, streamNamespace, streamName); + } + + private static List readMessages(String filename, String streamNamespace, String streamName) throws IOException { return readRecords(filename).stream() .map(record -> Jsons.convertValue(record, AirbyteMessage.class)) .peek(message -> { diff --git a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java index 4fa25ee9b73f..15fc029302b3 100644 --- a/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java +++ b/airbyte-integrations/connectors/destination-bigquery/src/test-integration/java/io/airbyte/integrations/destination/bigquery/typing_deduping/AbstractBigQueryTypingDedupingTest.java @@ -57,6 
+57,9 @@ protected List dumpFinalTableRecords(String streamNamespace, String st @Override protected void teardownStreamAndNamespace(String streamNamespace, String streamName) { + if (streamNamespace == null) { + streamNamespace = BigQueryUtils.getDatasetId(getConfig()); + } // bq.delete simply returns false if the table/schema doesn't exist (e.g. if the connector failed to create it) // so we don't need to do any existence checks here. bq.delete(TableId.of("airbyte", streamNamespace + "_" + streamName)); From 760f8298069610b5d7ff7f0913da97c8f8d7af1f Mon Sep 17 00:00:00 2001 From: edgao Date: Thu, 6 Jul 2023 16:47:45 +0000 Subject: [PATCH 35/46] Automated Commit - Formatting Changes --- .../destination/typing_deduping/BaseTypingDedupingTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 75763214272a..d850fbe204f4 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -444,7 +444,8 @@ private void verifySyncResult(List expectedRawRecords, List private void verifySyncResult(List expectedRawRecords, List expectedFinalRecords, String streamNamespace, - String streamName) throws Exception { + String streamName) + throws Exception { List actualRawRecords = dumpRawTableRecords(streamNamespace, streamName); List actualFinalRecords = dumpFinalTableRecords(streamNamespace, streamName); DIFFER.verifySyncResult(expectedRawRecords, actualRawRecords, expectedFinalRecords, 
actualFinalRecords); From c82cadc1b54469b27ac1ef9e1c3d3567442f7e9e Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Thu, 6 Jul 2023 18:44:14 -0700 Subject: [PATCH 36/46] implement a 2-stream test --- .../BaseTypingDedupingTest.java | 71 +++++++++++++++---- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index d850fbe204f4..42793ff0ec97 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -14,6 +14,7 @@ import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.AirbyteProtocolType; import io.airbyte.protocol.models.v0.AirbyteMessage; import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; import io.airbyte.protocol.models.v0.DestinationSyncMode; @@ -26,9 +27,11 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.UUID; +import java.util.stream.Stream; import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.AfterEach; @@ -74,6 +77,7 @@ public abstract class BaseTypingDedupingTest { private JsonNode config; private String streamNamespace; private String streamName; + private List 
streamsToTearDown; /** * @return the docker image to run, e.g. {@code "airbyte/destination-bigquery:dev"}. @@ -124,10 +128,11 @@ public abstract class BaseTypingDedupingTest { /** * Delete any resources in the destination associated with this stream AND its namespace. We need * this because we write raw tables to a shared {@code airbyte} namespace, which we can't drop - * wholesale. + * wholesale. Must handle the case where the table/namespace doesn't exist (e.g. if the connector + * crashed without writing any data). *

* In general, this should resemble - * {@code DROP TABLE IF EXISTS airbyte.namespace_name; DROP SCHEMA IF EXISTS namespace}. + * {@code DROP TABLE IF EXISTS airbyte.<namespace>_<name>; DROP SCHEMA IF EXISTS <namespace>}. */ protected abstract void teardownStreamAndNamespace(String streamNamespace, String streamName) throws Exception; @@ -150,12 +155,15 @@ public void setup() throws Exception { config = generateConfig(); streamNamespace = "typing_deduping_test" + getUniqueSuffix(); streamName = "test_stream" + getUniqueSuffix(); + streamsToTearDown = new ArrayList<>(); LOGGER.info("Using stream namespace {} and name {}", streamNamespace, streamName); } @AfterEach public void teardown() throws Exception { - teardownStreamAndNamespace(streamNamespace, streamName); + for (AirbyteStreamNameNamespacePair streamId : streamsToTearDown) { + teardownStreamAndNamespace(streamId.getNamespace(), streamId.getName()); + } } /** @@ -379,30 +387,63 @@ public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { .withJsonSchema(SCHEMA)))); } + // TODO duplicate this test for each sync mode. Run 1st+2nd syncs using two streams with the same + // name but different namespace + // TODO maybe we don't even need the single-stream versions... + /** + * Identical to {@link #incrementalDedup()}, except there are two streams with the same name and different namespace. + */ @Test - @Disabled("Not yet implemented") - public void testSyncWriteSameTableNameDifferentNamespace() throws Exception { - // TODO duplicate this test for each sync mode.
Run 1st+2nd syncs using two streams with the same - // name but different namespace: + public void incrementalDedupIdenticalName() throws Exception { + String namespace1 = streamNamespace + "_1"; + String namespace2 = streamNamespace + "_2"; ConfiguredAirbyteCatalog catalog = new ConfiguredAirbyteCatalog().withStreams(List.of( new ConfiguredAirbyteStream() - .withSyncMode(SyncMode.FULL_REFRESH) + .withSyncMode(SyncMode.INCREMENTAL) .withCursorField(List.of("updated_at")) - .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withDestinationSyncMode(DestinationSyncMode.APPEND_DEDUP) .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) .withStream(new AirbyteStream() - .withNamespace(streamNamespace + "_1") + .withNamespace(namespace1) .withName(streamName) .withJsonSchema(SCHEMA)), new ConfiguredAirbyteStream() - .withSyncMode(SyncMode.FULL_REFRESH) + .withSyncMode(SyncMode.INCREMENTAL) .withCursorField(List.of("updated_at")) - .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withDestinationSyncMode(DestinationSyncMode.APPEND_DEDUP) .withPrimaryKey(List.of(List.of("id1"), List.of("id2"))) .withStream(new AirbyteStream() - .withNamespace(streamNamespace + "_2") + .withNamespace(namespace2) .withName(streamName) - .withJsonSchema(SCHEMA)))); + .withJsonSchema(SCHEMA)) + )); + + // First sync + // Read the same set of messages for both streams + List messages1 = Stream.concat( + readMessages("sync1_messages.jsonl", namespace1, streamName).stream(), + readMessages("sync1_messages.jsonl", namespace2, streamName).stream() + ).toList(); + + runSync(catalog, messages1); + + List expectedRawRecords1 = readRecords("sync1_expectedrecords_dedup_raw.jsonl"); + List expectedFinalRecords1 = readRecords("sync1_expectedrecords_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1, namespace1, streamName); + verifySyncResult(expectedRawRecords1, expectedFinalRecords1, namespace2, streamName); + + // Second sync + List messages2 = 
Stream.concat( + readMessages("sync2_messages.jsonl", namespace1, streamName).stream(), + readMessages("sync2_messages.jsonl", namespace2, streamName).stream() + ).toList(); + + runSync(catalog, messages2); + + List expectedRawRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_raw.jsonl"); + List expectedFinalRecords2 = readRecords("sync2_expectedrecords_incremental_dedup_final.jsonl"); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2, namespace1, streamName); + verifySyncResult(expectedRawRecords2, expectedFinalRecords2, namespace2, streamName); } @Test @@ -498,6 +539,8 @@ public void setupProcessFactory() throws IOException { } private void runSync(ConfiguredAirbyteCatalog catalog, List messages) throws Exception { + catalog.getStreams().forEach(s -> streamsToTearDown.add(AirbyteStreamNameNamespacePair.fromAirbyteStream(s.getStream()))); + final WorkerDestinationConfig destinationConfig = new WorkerDestinationConfig() .withConnectionId(UUID.randomUUID()) .withCatalog(convertProtocolObject(catalog, io.airbyte.protocol.models.ConfiguredAirbyteCatalog.class)) From 0fdea845fcf70c8f194038ae17e5866617e4fe04 Mon Sep 17 00:00:00 2001 From: edgao Date: Fri, 7 Jul 2023 01:48:31 +0000 Subject: [PATCH 37/46] Automated Commit - Formatting Changes --- .../typing_deduping/BaseTypingDedupingTest.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java index 42793ff0ec97..85ca77f81b69 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java @@ -391,7 +391,8 @@ public void testSyncUsesAirbyteStreamNamespaceIfNotNull() throws Exception { // name but different namespace // TODO maybe we don't even need the single-stream versions... /** - * Identical to {@link #incrementalDedup()}, except there are two streams with the same name and different namespace. + * Identical to {@link #incrementalDedup()}, except there are two streams with the same name and + * different namespace. */ @Test public void incrementalDedupIdenticalName() throws Exception { @@ -415,15 +416,13 @@ public void incrementalDedupIdenticalName() throws Exception { .withStream(new AirbyteStream() .withNamespace(namespace2) .withName(streamName) - .withJsonSchema(SCHEMA)) - )); + .withJsonSchema(SCHEMA)))); // First sync // Read the same set of messages for both streams List messages1 = Stream.concat( readMessages("sync1_messages.jsonl", namespace1, streamName).stream(), - readMessages("sync1_messages.jsonl", namespace2, streamName).stream() - ).toList(); + readMessages("sync1_messages.jsonl", namespace2, streamName).stream()).toList(); runSync(catalog, messages1); @@ -435,8 +434,7 @@ public void incrementalDedupIdenticalName() throws Exception { // Second sync List messages2 = Stream.concat( readMessages("sync2_messages.jsonl", namespace1, streamName).stream(), - readMessages("sync2_messages.jsonl", namespace2, streamName).stream() - ).toList(); + readMessages("sync2_messages.jsonl", namespace2, streamName).stream()).toList(); runSync(catalog, messages2); From 4442b065118a2543cc70e28a8ec91b5fafebce12 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 10 Jul 2023 16:16:37 -0700 Subject: [PATCH 38/46] extract methods --- .../typing_deduping/RecordDiffer.java | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git 
a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 0bc8543a3c24..23b97f7ca139 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -43,29 +43,16 @@ public class RecordDiffer { * @param identifyingColumns Which fields constitute a unique record (typically PK+cursor). Do _not_ * include extracted_at; it is handled automatically. */ - public RecordDiffer(Pair... identifyingColumns) { - // Start with a noop comparator for convenience - // The raw and final stuff are almost identical, except the raw version has to extract _airbyte_data - // first. - Comparator rawIdComp = Comparator.comparing(record -> 0); - Comparator finalIdComp = Comparator.comparing(record -> 0); - for (Pair column : identifyingColumns) { - rawIdComp = rawIdComp.thenComparing(record -> extract(record.get("_airbyte_data"), column.getKey(), column.getValue())); - finalIdComp = finalIdComp.thenComparing(record -> extract(record, column.getKey(), column.getValue())); - } - this.rawRecordIdentityComparator = rawIdComp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); + @SafeVarargs + public RecordDiffer(final Pair... 
identifyingColumns) { + this.rawRecordIdentityComparator = buildIdentityComparator(record -> record.get("_airbyte_data"), identifyingColumns); + this.finalRecordIdentityComparator = buildIdentityComparator(record -> record, identifyingColumns); + this.rawRecordSortComparator = rawRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - this.finalRecordIdentityComparator = finalIdComp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); this.finalRecordSortComparator = finalRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - rawRecordIdentityExtractor = record -> Arrays.stream(identifyingColumns) - .map(column -> getPrintableFieldIfPresent(record.get("_airbyte_data"), column.getKey())) - .collect(Collectors.joining(", ")) - + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); - finalRecordIdentityExtractor = record -> Arrays.stream(identifyingColumns) - .map(column -> getPrintableFieldIfPresent(record, column.getKey())) - .collect(Collectors.joining(", ")) - + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); + this.rawRecordIdentityExtractor = buildIdentityExtractor(record -> record.get("_airbyte_data"), identifyingColumns); + this.finalRecordIdentityExtractor = buildIdentityExtractor(record -> record, identifyingColumns); } /** @@ -110,6 +97,34 @@ public void diffFinalTableRecords(List expectedRecords, List } } + /** + * Build a Comparator to detect equality between two records. It first compares all the identifying + * columns in order, and breaks ties using extracted_at. + * + * @param dataExtractor A function that extracts the data from a record. For raw records, this should + * return the _airbyte_data field; for final records, this should return the + * record itself. 
+ */ + private Comparator buildIdentityComparator(Function dataExtractor, Pair[] identifyingColumns) { + // Start with a noop comparator for convenience + Comparator comp = Comparator.comparing(record -> 0); + for (Pair column : identifyingColumns) { + comp = comp.thenComparing(record -> extract(dataExtractor.apply(record), column.getKey(), column.getValue())); + } + comp = comp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); + return comp; + } + + /** + * See {@link #buildIdentityComparator(Function, Pair[])} for an explanation of dataExtractor. + */ + private Function buildIdentityExtractor(Function dataExtractor, Pair[] identifyingColumns) { + return record -> Arrays.stream(identifyingColumns) + .map(column -> getPrintableFieldIfPresent(dataExtractor.apply(record), column.getKey())) + .collect(Collectors.joining(", ")) + + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); + } + private static String getPrintableFieldIfPresent(JsonNode record, String field) { if (record.has(field)) { return field + "=" + record.get(field); From 6d437da026aff24db93ad94e043469746b0fb3c4 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 10 Jul 2023 16:18:11 -0700 Subject: [PATCH 39/46] invert jsonNodesNotEquivalent --- .../base/destination/typing_deduping/RecordDiffer.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 23b97f7ca139..5088140b1973 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ 
b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -184,7 +184,7 @@ private static String diffRecords(List originalExpectedRecords, for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { JsonNode expectedValue = expectedRawData.get(field); JsonNode actualValue = actualRawData.get(field); - if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + if (!areJsonNodesEquivalent(expectedValue, actualValue)) { mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); foundMismatch = true; } @@ -201,7 +201,7 @@ private static String diffRecords(List originalExpectedRecords, // For all other columns, we can just compare their values directly. JsonNode expectedValue = expectedRecord.get(column); JsonNode actualValue = actualRecord.get(column); - if (jsonNodesNotEquivalent(expectedValue, actualValue)) { + if (!areJsonNodesEquivalent(expectedValue, actualValue)) { mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); foundMismatch = true; } @@ -246,7 +246,7 @@ private static String diffRecords(List originalExpectedRecords, return message; } - private static boolean jsonNodesNotEquivalent(JsonNode expectedValue, JsonNode actualValue) { + private static boolean areJsonNodesEquivalent(JsonNode expectedValue, JsonNode actualValue) { // This is kind of sketchy, but seems to work fine for the data we have in our test cases. return !Objects.equals(expectedValue, actualValue) // Objects.equals expects the two values to be the same class. 
From de3c2e902e839ef44ba9a5e795b37235d4c63be5 Mon Sep 17 00:00:00 2001 From: edgao Date: Mon, 10 Jul 2023 23:27:38 +0000 Subject: [PATCH 40/46] Automated Commit - Formatting Changes --- .../base/destination/typing_deduping/RecordDiffer.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 5088140b1973..3df9e167d110 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -101,9 +101,9 @@ public void diffFinalTableRecords(List expectedRecords, List * Build a Comparator to detect equality between two records. It first compares all the identifying * columns in order, and breaks ties using extracted_at. * - * @param dataExtractor A function that extracts the data from a record. For raw records, this should - * return the _airbyte_data field; for final records, this should return the - * record itself. + * @param dataExtractor A function that extracts the data from a record. For raw records, this + * should return the _airbyte_data field; for final records, this should return the record + * itself. 
*/ private Comparator buildIdentityComparator(Function dataExtractor, Pair[] identifyingColumns) { // Start with a noop comparator for convenience @@ -118,7 +118,8 @@ private Comparator buildIdentityComparator(Function buildIdentityExtractor(Function dataExtractor, Pair[] identifyingColumns) { + private Function buildIdentityExtractor(Function dataExtractor, + Pair[] identifyingColumns) { return record -> Arrays.stream(identifyingColumns) .map(column -> getPrintableFieldIfPresent(dataExtractor.apply(record), column.getKey())) .collect(Collectors.joining(", ")) From 58e5d10338667f2625879380d9da9b07bcc3b7d8 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 10 Jul 2023 16:48:54 -0700 Subject: [PATCH 41/46] fix conditional --- .../base/destination/typing_deduping/RecordDiffer.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 3df9e167d110..0f494d26b481 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -249,11 +249,11 @@ private static String diffRecords(List originalExpectedRecords, private static boolean areJsonNodesEquivalent(JsonNode expectedValue, JsonNode actualValue) { // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - return !Objects.equals(expectedValue, actualValue) + return Objects.equals(expectedValue, actualValue) // Objects.equals expects the two values to be the same class. // We need to handle comparisons between e.g. LongNode and IntNode. 
- && !(expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - && !(expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); + || (expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + || (expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); } /** From 6ef65b18d7cd6c7860432098f47580711d767823 Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 10 Jul 2023 16:36:22 -0700 Subject: [PATCH 42/46] pull out diffSingleRecord --- .../typing_deduping/RecordDiffer.java | 104 +++++++++--------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 0f494d26b481..3a14b1901b23 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -171,55 +171,9 @@ private static String diffRecords(List originalExpectedRecords, JsonNode actualRecord = actualRecords.get(actualRecordIndex); int compare = identityComparator.compare(expectedRecord, actualRecord); if (compare == 0) { - // These records should be the same. Find the specific fields that are different. - boolean foundMismatch = false; - String mismatchedRecordMessage = "Row had incorrect data: " + recordIdExtractor.apply(expectedRecord) + "\n"; - // Iterate through each column in the expected record and compare it to the actual record's value. 
- for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { - if (extractRawData && "_airbyte_data".equals(column)) { - // For the raw data in particular, we should also diff the fields inside _airbyte_data. - JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); - JsonNode actualRawData = actualRecord.get("_airbyte_data"); - // Iterate through all the subfields of the expected raw data and check that they match the actual - // record... - for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { - JsonNode expectedValue = expectedRawData.get(field); - JsonNode actualValue = actualRawData.get(field); - if (!areJsonNodesEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); - foundMismatch = true; - } - } - // ... and then check the actual raw data for any subfields that we weren't expecting. - LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); - if (extraColumns.size() > 0) { - for (Map.Entry extraColumn : extraColumns.entrySet()) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); - foundMismatch = true; - } - } - } else { - // For all other columns, we can just compare their values directly. - JsonNode expectedValue = expectedRecord.get(column); - JsonNode actualValue = actualRecord.get(column); - if (!areJsonNodesEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); - foundMismatch = true; - } - } - } - // Then check the entire actual record for any columns that we weren't expecting. 
- LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRecord, actualRecord); - if (extraColumns.size() > 0) { - for (Map.Entry extraColumn : extraColumns.entrySet()) { - mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); - foundMismatch = true; - } - } - if (foundMismatch) { - message += mismatchedRecordMessage; - } - + // These records should be the same. Find the specific fields that are different and move on + // to the next records in both lists. + message += diffSingleRecord(recordIdExtractor, extractRawData, expectedRecord, actualRecord); expectedRecordIndex++; actualRecordIndex++; } else if (compare < 0) { @@ -247,6 +201,58 @@ private static String diffRecords(List originalExpectedRecords, return message; } + private static String diffSingleRecord(Function recordIdExtractor, boolean extractRawData, JsonNode expectedRecord, JsonNode actualRecord) { + boolean foundMismatch = false; + String mismatchedRecordMessage = "Row had incorrect data: " + recordIdExtractor.apply(expectedRecord) + "\n"; + // Iterate through each column in the expected record and compare it to the actual record's value. + for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { + if (extractRawData && "_airbyte_data".equals(column)) { + // For the raw data in particular, we should also diff the fields inside _airbyte_data. + JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); + JsonNode actualRawData = actualRecord.get("_airbyte_data"); + // Iterate through all the subfields of the expected raw data and check that they match the actual + // record... + for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { + JsonNode expectedValue = expectedRawData.get(field); + JsonNode actualValue = actualRawData.get(field); + if (!areJsonNodesEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." 
+ field, expectedValue, actualValue); + foundMismatch = true; + } + } + // ... and then check the actual raw data for any subfields that we weren't expecting. + LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); + foundMismatch = true; + } + } + } else { + // For all other columns, we can just compare their values directly. + JsonNode expectedValue = expectedRecord.get(column); + JsonNode actualValue = actualRecord.get(column); + if (!areJsonNodesEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); + foundMismatch = true; + } + } + } + // Then check the entire actual record for any columns that we weren't expecting. + LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRecord, actualRecord); + if (extraColumns.size() > 0) { + for (Map.Entry extraColumn : extraColumns.entrySet()) { + mismatchedRecordMessage += generateFieldError("column " + extraColumn.getKey(), null, extraColumn.getValue()); + foundMismatch = true; + } + } + if (foundMismatch) { + return mismatchedRecordMessage; + } else { + return ""; + } + } + private static boolean areJsonNodesEquivalent(JsonNode expectedValue, JsonNode actualValue) { // This is kind of sketchy, but seems to work fine for the data we have in our test cases. 
return Objects.equals(expectedValue, actualValue) From 797b60f128f2ac568208297d8d048b221c2cdea1 Mon Sep 17 00:00:00 2001 From: edgao Date: Mon, 10 Jul 2023 23:53:01 +0000 Subject: [PATCH 43/46] Automated Commit - Formatting Changes --- .../base/destination/typing_deduping/RecordDiffer.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 3a14b1901b23..21f4c53b8887 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -201,7 +201,10 @@ private static String diffRecords(List originalExpectedRecords, return message; } - private static String diffSingleRecord(Function recordIdExtractor, boolean extractRawData, JsonNode expectedRecord, JsonNode actualRecord) { + private static String diffSingleRecord(Function recordIdExtractor, + boolean extractRawData, + JsonNode expectedRecord, + JsonNode actualRecord) { boolean foundMismatch = false; String mismatchedRecordMessage = "Row had incorrect data: " + recordIdExtractor.apply(expectedRecord) + "\n"; // Iterate through each column in the expected record and compare it to the actual record's value. 
From 060d30f9c0a175d77a086cd78f32e2b3cd910fcb Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Mon, 10 Jul 2023 16:58:01 -0700 Subject: [PATCH 44/46] handle nulls correctly --- .../typing_deduping/RecordDiffer.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 21f4c53b8887..ac6f0b4977f6 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -257,12 +257,18 @@ private static String diffSingleRecord(Function recordIdExtrac } private static boolean areJsonNodesEquivalent(JsonNode expectedValue, JsonNode actualValue) { - // This is kind of sketchy, but seems to work fine for the data we have in our test cases. - return Objects.equals(expectedValue, actualValue) - // Objects.equals expects the two values to be the same class. - // We need to handle comparisons between e.g. LongNode and IntNode. - || (expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) - || (expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); + if (expectedValue == null || actualValue == null) { + // If one of the values is null, then we expect both of them to be null. + return expectedValue == null && actualValue == null; + } else { + // Otherwise, we need to compare the actual values. + // This is kind of sketchy, but seems to work fine for the data we have in our test cases. 
+ return expectedValue.equals(actualValue) + // equals() expects the two values to be the same class. + // We need to handle comparisons between e.g. LongNode and IntNode. + || (expectedValue.isIntegralNumber() && actualValue.isIntegralNumber() && expectedValue.asLong() == actualValue.asLong()) + || (expectedValue.isNumber() && actualValue.isNumber() && expectedValue.asDouble() == actualValue.asDouble()); + } } /** From 7539094d533a0432bcc82ed5030bd1d27abacd7c Mon Sep 17 00:00:00 2001 From: Edward Gao Date: Tue, 11 Jul 2023 09:02:45 -0700 Subject: [PATCH 45/46] remove raw-specific handling; break up methods --- .../typing_deduping/RecordDiffer.java | 134 ++++++++---------- 1 file changed, 56 insertions(+), 78 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index ac6f0b4977f6..34879a9cbef4 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -4,9 +4,12 @@ package io.airbyte.integrations.base.destination.typing_deduping; -import static org.junit.jupiter.api.Assertions.*; +import static java.util.stream.Collectors.toList; +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.fail; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.Streams; import io.airbyte.commons.json.Jsons; import io.airbyte.integrations.base.destination.typing_deduping.AirbyteType.AirbyteProtocolType; @@ -21,7 +24,6 @@ import java.util.LinkedHashMap; 
import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.function.Function; import java.util.stream.Collectors; import org.apache.commons.lang3.tuple.Pair; @@ -32,12 +34,9 @@ */ public class RecordDiffer { - private final Comparator rawRecordIdentityComparator; - private final Comparator rawRecordSortComparator; - private final Function rawRecordIdentityExtractor; - private final Comparator finalRecordIdentityComparator; - private final Comparator finalRecordSortComparator; - private final Function finalRecordIdentityExtractor; + private final Comparator recordIdentityComparator; + private final Comparator recordSortComparator; + private final Function recordIdentityExtractor; /** * @param identifyingColumns Which fields constitute a unique record (typically PK+cursor). Do _not_ @@ -45,14 +44,9 @@ public class RecordDiffer { */ @SafeVarargs public RecordDiffer(final Pair... identifyingColumns) { - this.rawRecordIdentityComparator = buildIdentityComparator(record -> record.get("_airbyte_data"), identifyingColumns); - this.finalRecordIdentityComparator = buildIdentityComparator(record -> record, identifyingColumns); - - this.rawRecordSortComparator = rawRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - this.finalRecordSortComparator = finalRecordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); - - this.rawRecordIdentityExtractor = buildIdentityExtractor(record -> record.get("_airbyte_data"), identifyingColumns); - this.finalRecordIdentityExtractor = buildIdentityExtractor(record -> record, identifyingColumns); + this.recordIdentityComparator = buildIdentityComparator(identifyingColumns); + this.recordSortComparator = recordIdentityComparator.thenComparing(record -> asString(record.get("_airbyte_raw_id"))); + this.recordIdentityExtractor = buildIdentityExtractor(identifyingColumns); } /** @@ -71,12 +65,12 @@ public void verifySyncResult(List 
expectedRawRecords, public void diffRawTableRecords(List expectedRecords, List actualRecords) { String diff = diffRecords( - expectedRecords, - actualRecords, - rawRecordIdentityComparator, - rawRecordSortComparator, - rawRecordIdentityExtractor, - true); + expectedRecords.stream().map(RecordDiffer::copyWithLiftedData).collect(toList()), + actualRecords.stream().map(RecordDiffer::copyWithLiftedData).collect(toList()), + recordIdentityComparator, + recordSortComparator, + recordIdentityExtractor + ); if (!diff.isEmpty()) { fail("Raw table was incorrect.\n" + diff); @@ -87,41 +81,54 @@ public void diffFinalTableRecords(List expectedRecords, List String diff = diffRecords( expectedRecords, actualRecords, - finalRecordIdentityComparator, - finalRecordSortComparator, - finalRecordIdentityExtractor, - false); + recordIdentityComparator, + recordSortComparator, + recordIdentityExtractor + ); if (!diff.isEmpty()) { fail("Final table was incorrect.\n" + diff); } } + /** + * @return A copy of the record, but with all fields in _airbyte_data lifted to the top level. + */ + private static JsonNode copyWithLiftedData(JsonNode record) { + ObjectNode copy = record.deepCopy(); + copy.remove("_airbyte_data"); + Streams.stream(record.get("_airbyte_data").fields()).forEach(field -> { + if (!copy.has(field.getKey())) { + copy.set(field.getKey(), field.getValue()); + } else { + // This would only happen if the record has one of the metadata columns (e.g. _airbyte_raw_id) + // We don't support that in production, so we don't support it here either. + throw new RuntimeException("Cannot lift field " + field.getKey() + " because it already exists in the record."); + } + }); + return copy; + } + /** * Build a Comparator to detect equality between two records. It first compares all the identifying * columns in order, and breaks ties using extracted_at. - * - * @param dataExtractor A function that extracts the data from a record. 
For raw records, this - * should return the _airbyte_data field; for final records, this should return the record - * itself. */ - private Comparator buildIdentityComparator(Function dataExtractor, Pair[] identifyingColumns) { + private Comparator buildIdentityComparator(Pair[] identifyingColumns) { // Start with a noop comparator for convenience Comparator comp = Comparator.comparing(record -> 0); for (Pair column : identifyingColumns) { - comp = comp.thenComparing(record -> extract(dataExtractor.apply(record), column.getKey(), column.getValue())); + comp = comp.thenComparing(record -> extract(record, column.getKey(), column.getValue())); } comp = comp.thenComparing(record -> asTimestampWithTimezone(record.get("_airbyte_extracted_at"))); return comp; } /** - * See {@link #buildIdentityComparator(Function, Pair[])} for an explanation of dataExtractor. + * See {@link #buildIdentityComparator(Pair[])} for an explanation of dataExtractor. */ - private Function buildIdentityExtractor(Function dataExtractor, - Pair[] identifyingColumns) { + private Function buildIdentityExtractor(Pair[] identifyingColumns) { return record -> Arrays.stream(identifyingColumns) - .map(column -> getPrintableFieldIfPresent(dataExtractor.apply(record), column.getKey())) + .map(column -> getPrintableFieldIfPresent(record, column.getKey())) .collect(Collectors.joining(", ")) + getPrintableFieldIfPresent(record, "_airbyte_extracted_at"); } @@ -145,19 +152,17 @@ private static String getPrintableFieldIfPresent(JsonNode record, String field) * to be present. * * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same - * PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, - * breaks that tie using _airbyte_raw_id - * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string - * @param extractRawData Whether to look inside the _airbyte_data column and diff its subfields + * PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, + * breaks that tie using _airbyte_raw_id + * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string * @return The diff, or empty string if there were no differences */ private static String diffRecords(List originalExpectedRecords, List originalActualRecords, Comparator identityComparator, Comparator sortComparator, - Function recordIdExtractor, - boolean extractRawData) { + Function recordIdExtractor) { List expectedRecords = originalExpectedRecords.stream().sorted(sortComparator).toList(); List actualRecords = originalActualRecords.stream().sorted(sortComparator).toList(); @@ -173,7 +178,7 @@ private static String diffRecords(List originalExpectedRecords, if (compare == 0) { // These records should be the same. Find the specific fields that are different and move on // to the next records in both lists. 
- message += diffSingleRecord(recordIdExtractor, extractRawData, expectedRecord, actualRecord); + message += diffSingleRecord(recordIdExtractor, expectedRecord, actualRecord); expectedRecordIndex++; actualRecordIndex++; } else if (compare < 0) { @@ -201,44 +206,17 @@ private static String diffRecords(List originalExpectedRecords, return message; } - private static String diffSingleRecord(Function recordIdExtractor, - boolean extractRawData, - JsonNode expectedRecord, - JsonNode actualRecord) { + private static String diffSingleRecord(Function recordIdExtractor, JsonNode expectedRecord, JsonNode actualRecord) { boolean foundMismatch = false; String mismatchedRecordMessage = "Row had incorrect data: " + recordIdExtractor.apply(expectedRecord) + "\n"; // Iterate through each column in the expected record and compare it to the actual record's value. for (String column : Streams.stream(expectedRecord.fieldNames()).sorted().toList()) { - if (extractRawData && "_airbyte_data".equals(column)) { - // For the raw data in particular, we should also diff the fields inside _airbyte_data. - JsonNode expectedRawData = expectedRecord.get("_airbyte_data"); - JsonNode actualRawData = actualRecord.get("_airbyte_data"); - // Iterate through all the subfields of the expected raw data and check that they match the actual - // record... - for (String field : Streams.stream(expectedRawData.fieldNames()).sorted().toList()) { - JsonNode expectedValue = expectedRawData.get(field); - JsonNode actualValue = actualRawData.get(field); - if (!areJsonNodesEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." + field, expectedValue, actualValue); - foundMismatch = true; - } - } - // ... and then check the actual raw data for any subfields that we weren't expecting. 
- LinkedHashMap extraColumns = checkForExtraOrNonNullFields(expectedRawData, actualRawData); - if (extraColumns.size() > 0) { - for (Map.Entry extraColumn : extraColumns.entrySet()) { - mismatchedRecordMessage += generateFieldError("_airbyte_data." + extraColumn.getKey(), null, extraColumn.getValue()); - foundMismatch = true; - } - } - } else { - // For all other columns, we can just compare their values directly. - JsonNode expectedValue = expectedRecord.get(column); - JsonNode actualValue = actualRecord.get(column); - if (!areJsonNodesEquivalent(expectedValue, actualValue)) { - mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); - foundMismatch = true; - } + // For all other columns, we can just compare their values directly. + JsonNode expectedValue = expectedRecord.get(column); + JsonNode actualValue = actualRecord.get(column); + if (!areJsonNodesEquivalent(expectedValue, actualValue)) { + mismatchedRecordMessage += generateFieldError("column " + column, expectedValue, actualValue); + foundMismatch = true; } } // Then check the entire actual record for any columns that we weren't expecting. 
From 144970b77163cece6931e9607752468c6f15d3df Mon Sep 17 00:00:00 2001 From: edgao Date: Tue, 11 Jul 2023 16:09:16 +0000 Subject: [PATCH 46/46] Automated Commit - Formatting Changes --- .../destination/typing_deduping/RecordDiffer.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java index 34879a9cbef4..846fb4a88bff 100644 --- a/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java +++ b/airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/RecordDiffer.java @@ -69,8 +69,7 @@ public void diffRawTableRecords(List expectedRecords, List a actualRecords.stream().map(RecordDiffer::copyWithLiftedData).collect(toList()), recordIdentityComparator, recordSortComparator, - recordIdentityExtractor - ); + recordIdentityExtractor); if (!diff.isEmpty()) { fail("Raw table was incorrect.\n" + diff); @@ -83,8 +82,7 @@ public void diffFinalTableRecords(List expectedRecords, List actualRecords, recordIdentityComparator, recordSortComparator, - recordIdentityExtractor - ); + recordIdentityExtractor); if (!diff.isEmpty()) { fail("Final table was incorrect.\n" + diff); @@ -152,10 +150,10 @@ private static String getPrintableFieldIfPresent(JsonNode record, String field) * to be present. * * @param identityComparator Returns 0 iff two records are the "same" record (i.e. 
have the same - * PK+cursor+extracted_at) - * @param sortComparator Behaves identically to identityComparator, but if two records are the same, - * breaks that tie using _airbyte_raw_id - * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string + * PK+cursor+extracted_at) + * @param sortComparator Behaves identically to identityComparator, but if two records are the same, + * breaks that tie using _airbyte_raw_id + * @param recordIdExtractor Dump the record's PK+cursor+extracted_at into a human-readable string * @return The diff, or empty string if there were no differences */ private static String diffRecords(List originalExpectedRecords,