From f46354b1b2d83d07312cf37812c0036e1e44e8f1 Mon Sep 17 00:00:00 2001 From: subodh Date: Thu, 20 May 2021 20:37:47 +0530 Subject: [PATCH 01/13] source: implementation for mysql cdc --- .../source/StandardSourceTest.java | 6 +- .../source/jdbc/AbstractJdbcSource.java | 2 + .../connectors/source-mysql/build.gradle | 7 +- ...eFileDatabaseHistoryStorageOperations.java | 147 ++++ .../mysql/AirbyteFileOffsetBackingStore.java | 162 ++++ .../source/mysql/DebeziumEventUtils.java | 82 ++ .../source/mysql/DebeziumRecordIterator.java | 176 ++++ .../source/mysql/DebeziumRecordPublisher.java | 203 +++++ .../mysql/FilteredFileDatabaseHistory.java | 130 +++ .../source/mysql/MySqlSource.java | 203 +++++ .../source-mysql/src/main/resources/spec.json | 30 +- .../source/mysql/CdcMySqlStandardTest.java | 156 ++++ .../source/mysql/CdcMySqlSourceTest.java | 668 +++++++++++++++ .../source/postgres/PostgresSource.java | 8 + .../CdcPostgresSourceStandardTest.java | 22 +- .../resources/postgresql.conf | 783 ++++++++++++++++++ 16 files changed, 2774 insertions(+), 11 deletions(-) create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumEventUtils.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java create mode 100644 airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java create mode 100644 airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf diff --git a/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java b/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java index 85e9b9efd0fb..f36d18eec022 100644 --- a/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java +++ b/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java @@ -79,6 +79,8 @@ public abstract class StandardSourceTest { public static final String CDC_LSN = "_ab_cdc_lsn"; public static final String CDC_UPDATED_AT = "_ab_cdc_updated_at"; public static final String CDC_DELETED_AT = "_ab_cdc_deleted_at"; + public static final String CDC_LOG_FILE = "_ab_cdc_log_file"; + public static final String CDC_LOG_POS = "_ab_cdc_log_pos"; private static final long JOB_ID = 0L; private static final int JOB_ATTEMPT = 0; @@ -383,7 +385,7 @@ private List filterRecords(Collection mess .collect(Collectors.toList()); } - private ConfiguredAirbyteCatalog 
withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { + public ConfiguredAirbyteCatalog withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { final ConfiguredAirbyteCatalog clone = Jsons.clone(catalog); for (ConfiguredAirbyteStream configuredStream : clone.getStreams()) { if (configuredStream.getSyncMode() == INCREMENTAL @@ -472,6 +474,8 @@ private AirbyteRecordMessage pruneEmittedAt(AirbyteRecordMessage m) { private AirbyteRecordMessage pruneCdcMetadata(AirbyteRecordMessage m) { final AirbyteRecordMessage clone = Jsons.clone(m); ((ObjectNode) clone.getData()).remove(CDC_LSN); + ((ObjectNode) clone.getData()).remove(CDC_LOG_FILE); + ((ObjectNode) clone.getData()).remove(CDC_LOG_POS); ((ObjectNode) clone.getData()).remove(CDC_UPDATED_AT); ((ObjectNode) clone.getData()).remove(CDC_DELETED_AT); return clone; diff --git a/airbyte-integrations/connectors/source-jdbc/src/main/java/io/airbyte/integrations/source/jdbc/AbstractJdbcSource.java b/airbyte-integrations/connectors/source-jdbc/src/main/java/io/airbyte/integrations/source/jdbc/AbstractJdbcSource.java index e2cd9aeccadd..2913c6c8d386 100644 --- a/airbyte-integrations/connectors/source-jdbc/src/main/java/io/airbyte/integrations/source/jdbc/AbstractJdbcSource.java +++ b/airbyte-integrations/connectors/source-jdbc/src/main/java/io/airbyte/integrations/source/jdbc/AbstractJdbcSource.java @@ -87,6 +87,8 @@ public abstract class AbstractJdbcSource extends BaseConnector implements Source public static final String CDC_LSN = "_ab_cdc_lsn"; public static final String CDC_UPDATED_AT = "_ab_cdc_updated_at"; public static final String CDC_DELETED_AT = "_ab_cdc_deleted_at"; + public static final String CDC_LOG_FILE = "_ab_cdc_log_file"; + public static final String CDC_LOG_POS = "_ab_cdc_log_pos"; private static final String JDBC_COLUMN_DATABASE_NAME = "TABLE_CAT"; private static final String JDBC_COLUMN_SCHEMA_NAME = "TABLE_SCHEM"; diff --git a/airbyte-integrations/connectors/source-mysql/build.gradle b/airbyte-integrations/connectors/source-mysql/build.gradle index 6e2955a268e0..546b471995d4 100644 --- a/airbyte-integrations/connectors/source-mysql/build.gradle +++ b/airbyte-integrations/connectors/source-mysql/build.gradle @@ -16,13 +16,18 @@ dependencies { implementation 'mysql:mysql-connector-java:8.0.22' implementation 'org.apache.commons:commons-lang3:3.11' + implementation 'io.debezium:debezium-embedded:1.4.2.Final' + implementation 'io.debezium:debezium-api:1.4.2.Final' + implementation 'io.debezium:debezium-connector-mysql:1.4.2.Final' testImplementation testFixtures(project(':airbyte-integrations:connectors:source-jdbc')) - + testImplementation project(":airbyte-json-validation") + testImplementation project(':airbyte-test-utils') testImplementation 'org.apache.commons:commons-lang3:3.11' testImplementation 'org.testcontainers:mysql:1.15.1' integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-source-test') + integrationTestJavaImplementation project(':airbyte-integrations:connectors:source-mysql') implementation files(project(':airbyte-integrations:bases:base-java').airbyteDocker.outputs) integrationTestJavaImplementation files(project(':airbyte-integrations:bases:base-java').airbyteDocker.outputs) diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java new 
file mode 100644
index 000000000000..3137a38c2fc5
--- /dev/null
+++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java
@@ -0,0 +1,147 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2020 Airbyte
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package io.airbyte.integrations.source.mysql;
+
+import static io.airbyte.integrations.source.mysql.MySqlSource.MYSQL_DB_HISTORY;
+
+import io.airbyte.commons.json.Jsons;
+import io.airbyte.integrations.source.jdbc.models.CdcState;
+import io.debezium.document.Document;
+import io.debezium.document.DocumentReader;
+import io.debezium.document.DocumentWriter;
+import io.debezium.relational.history.HistoryRecord;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.FileAlreadyExistsException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.function.Consumer;
+import org.apache.commons.io.FileUtils;
+
+public class AirbyteFileDatabaseHistoryStorageOperations {
+
+  private final Path path;
+  private static final Charset UTF8 = StandardCharsets.UTF_8;
+  private final DocumentReader reader = DocumentReader.defaultReader();
+  private final DocumentWriter writer = DocumentWriter.defaultWriter();
+
+  public AirbyteFileDatabaseHistoryStorageOperations(final Path path) {
+    this.path = path;
+  }
+
+  public Path getPath() {
+    return path;
+  }
+
+  /**
+   * This implementation is kind of similar to
+   * {@link io.debezium.relational.history.FileDatabaseHistory#recoverRecords(Consumer)}
+   */
+  public String read() {
+    StringBuilder fileAsString = new StringBuilder();
+    try {
+      for (String line : Files.readAllLines(path, UTF8)) {
+        if (line != null && !line.isEmpty()) {
+          Document record = reader.read(line);
+          String recordAsString = writer.write(record);
+          fileAsString.append(recordAsString);
+          fileAsString.append(System.lineSeparator());
+        }
+      }
+      return fileAsString.toString();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * This implementation is kind of similar to
+   * {@link io.debezium.relational.history.FileDatabaseHistory#start()}
+   */
+  private void makeSureFileExists() {
+    try {
+      // Make sure the file exists ...
+      if (!Files.exists(path)) {
+        // Create parent directories if we have them ...
+ if (path.getParent() != null) { + Files.createDirectories(path.getParent()); + } + try { + Files.createFile(path); + } catch (FileAlreadyExistsException e) { + // do nothing + } + } + } catch (IOException e) { + throw new RuntimeException( + "Unable to create history file at " + path + ": " + e.getMessage(), e); + } + } + + public void persist(CdcState cdcState) { + String fileAsString = cdcState != null && cdcState.getState() != null ? Jsons + .object(cdcState.getState().get(MYSQL_DB_HISTORY), String.class) : null; + + if (fileAsString == null || fileAsString.isEmpty()) { + return; + } + + FileUtils.deleteQuietly(path.toFile()); + makeSureFileExists(); + writeToFile(fileAsString); + } + + /** + * This implementation is kind of similar to + * {@link io.debezium.relational.history.FileDatabaseHistory#storeRecord(HistoryRecord)} + * + * @param fileAsString Represents the contents of the file saved in state from previous syncs + */ + private void writeToFile(String fileAsString) { + try { + String[] split = fileAsString.split(System.lineSeparator()); + for (String element : split) { + Document read = reader.read(element); + String line = writer.write(read); + + try (BufferedWriter historyWriter = Files + .newBufferedWriter(path, StandardOpenOption.APPEND)) { + try { + historyWriter.append(line); + historyWriter.newLine(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java new file mode 100644 index 000000000000..ef228afcd3c0 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java @@ -0,0 +1,162 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package io.airbyte.integrations.source.mysql; + +import static io.airbyte.integrations.source.mysql.MySqlSource.MYSQL_CDC_OFFSET; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Preconditions; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.source.jdbc.models.CdcState; +import java.io.EOFException; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; +import org.apache.commons.io.FileUtils; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.util.SafeObjectInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class handles reading and writing a debezium offset file. In many cases it is duplicating + * logic in debezium because that logic is not exposed in the public API. We mostly treat the + * contents of this state file like a black box. We know it is a Map. We + * deserialize it to a Map so that the state file can be human readable. If we ever + * discover that any of the contents of these offset files is not string serializable we will likely + * have to drop the human readability support and just base64 encode it. + */ +public class AirbyteFileOffsetBackingStore { + + private static final Logger LOGGER = LoggerFactory.getLogger(AirbyteFileOffsetBackingStore.class); + + private final Path offsetFilePath; + + public AirbyteFileOffsetBackingStore(final Path offsetFilePath) { + this.offsetFilePath = offsetFilePath; + } + + public Path getOffsetFilePath() { + return offsetFilePath; + } + + public CdcState read() { + final Map raw = load(); + + final Map mappedAsStrings = raw.entrySet().stream().collect(Collectors.toMap( + e -> byteBufferToString(e.getKey()), + e -> byteBufferToString(e.getValue()))); + final JsonNode asJson = Jsons.jsonNode(mappedAsStrings); + + LOGGER.info("debezium state: {}", asJson); + + return new CdcState().withState(asJson); + } + + public Map readMap() { + final Map raw = load(); + + return raw.entrySet().stream().collect(Collectors.toMap( + e -> byteBufferToString(e.getKey()), + e -> byteBufferToString(e.getValue()))); + } + + @SuppressWarnings("unchecked") + public void persist(CdcState cdcState) { + final Map mapAsString = + cdcState != null && cdcState.getState() != null ? Jsons.object(cdcState.getState().get(MYSQL_CDC_OFFSET), Map.class) : Collections.emptyMap(); + final Map mappedAsStrings = mapAsString.entrySet().stream().collect(Collectors.toMap( + e -> stringToByteBuffer(e.getKey()), + e -> stringToByteBuffer(e.getValue()))); + + FileUtils.deleteQuietly(offsetFilePath.toFile()); + save(mappedAsStrings); + } + + private static String byteBufferToString(ByteBuffer byteBuffer) { + Preconditions.checkNotNull(byteBuffer); + return new String(byteBuffer.array(), StandardCharsets.UTF_8); + } + + private static ByteBuffer stringToByteBuffer(String s) { + Preconditions.checkNotNull(s); + return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8)); + } + + /** + * See FileOffsetBackingStore#load - logic is mostly borrowed from here. duplicated because this + * method is not public. 
+ */ + @SuppressWarnings("unchecked") + private Map load() { + try (final SafeObjectInputStream is = new SafeObjectInputStream(Files.newInputStream(offsetFilePath))) { + final Object obj = is.readObject(); + if (!(obj instanceof HashMap)) + throw new ConnectException("Expected HashMap but found " + obj.getClass()); + final Map raw = (Map) obj; + final Map data = new HashMap<>(); + for (Map.Entry mapEntry : raw.entrySet()) { + final ByteBuffer key = (mapEntry.getKey() != null) ? ByteBuffer.wrap(mapEntry.getKey()) : null; + final ByteBuffer value = (mapEntry.getValue() != null) ? ByteBuffer.wrap(mapEntry.getValue()) : null; + data.put(key, value); + } + + return data; + } catch (NoSuchFileException | EOFException e) { + // NoSuchFileException: Ignore, may be new. + // EOFException: Ignore, this means the file was missing or corrupt + return Collections.emptyMap(); + } catch (IOException | ClassNotFoundException e) { + throw new ConnectException(e); + } + } + + /** + * See FileOffsetBackingStore#save - logic is mostly borrowed from here. duplicated because this + * method is not public. + */ + private void save(Map data) { + try (ObjectOutputStream os = new ObjectOutputStream(Files.newOutputStream(offsetFilePath))) { + Map raw = new HashMap<>(); + for (Map.Entry mapEntry : data.entrySet()) { + byte[] key = (mapEntry.getKey() != null) ? mapEntry.getKey().array() : null; + byte[] value = (mapEntry.getValue() != null) ? mapEntry.getValue().array() : null; + raw.put(key, value); + } + os.writeObject(raw); + } catch (IOException e) { + throw new ConnectException(e); + } + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumEventUtils.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumEventUtils.java new file mode 100644 index 000000000000..02db98401481 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumEventUtils.java @@ -0,0 +1,82 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package io.airbyte.integrations.source.mysql; + +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_DELETED_AT; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_LOG_FILE; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_LOG_POS; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_UPDATED_AT; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.debezium.engine.ChangeEvent; +import java.time.Instant; + +public class DebeziumEventUtils { + + public static AirbyteMessage toAirbyteMessage(ChangeEvent event, Instant emittedAt) { + final JsonNode debeziumRecord = Jsons.deserialize(event.value()); + final JsonNode before = debeziumRecord.get("before"); + final JsonNode after = debeziumRecord.get("after"); + final JsonNode source = debeziumRecord.get("source"); + + final JsonNode data = formatDebeziumData(before, after, source); + final String schemaName = source.get("db").asText(); + final String streamName = source.get("table").asText(); + + final AirbyteRecordMessage airbyteRecordMessage = new AirbyteRecordMessage() + .withStream(streamName) + .withNamespace(schemaName) + .withEmittedAt(emittedAt.toEpochMilli()) + .withData(data); + + return new AirbyteMessage() + .withType(AirbyteMessage.Type.RECORD) + .withRecord(airbyteRecordMessage); + } + + // warning mutates input args. + private static JsonNode formatDebeziumData(JsonNode before, JsonNode after, JsonNode source) { + final ObjectNode base = (ObjectNode) (after.isNull() ? before : after); + + long transactionMillis = source.get("ts_ms").asLong(); + + base.put(CDC_UPDATED_AT, transactionMillis); + base.put(CDC_LOG_FILE, source.get("file").asText()); + base.put(CDC_LOG_POS, source.get("pos").asLong()); + + if (after.isNull()) { + base.put(CDC_DELETED_AT, transactionMillis); + } else { + base.put("_ab_cdc_deleted_at", (Long) null); + } + + return base; + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java new file mode 100644 index 000000000000..6e19dca1a933 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java @@ -0,0 +1,176 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package io.airbyte.integrations.source.mysql;
+
+import com.google.common.collect.AbstractIterator;
+import io.airbyte.commons.concurrency.VoidCallable;
+import io.airbyte.commons.json.Jsons;
+import io.airbyte.commons.lang.MoreBooleans;
+import io.airbyte.commons.util.AutoCloseableIterator;
+import io.airbyte.db.PgLsn;
+import io.debezium.engine.ChangeEvent;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Optional;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Supplier;
+import org.apache.kafka.connect.data.Struct;
+import org.apache.kafka.connect.source.SourceRecord;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The record iterator is the consumer (in the producer / consumer relationship with debezium) and
+ * is responsible for 1. making sure every record produced by the record publisher is processed and
+ * 2. signalling to the record publisher when it is time for it to stop producing records. It emits
+ * this signal either when the publisher has not produced a new record for a long time or when it
+ * has processed at least all of the records that were present in the database when the source was
+ * started. Because the publisher might publish more records between the consumer sending this
+ * signal and the publisher actually shutting down, the consumer must stay alive as long as the
+ * publisher is not closed or there are any new records for it to process (even if the publisher
+ * is closed).
+ */
+public class DebeziumRecordIterator extends AbstractIterator<ChangeEvent<String, String>>
+    implements AutoCloseableIterator<ChangeEvent<String, String>> {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(DebeziumRecordIterator.class);
+
+  private static final TimeUnit SLEEP_TIME_UNIT = TimeUnit.SECONDS;
+  private static final int SLEEP_TIME_AMOUNT = 5;
+
+  private final LinkedBlockingQueue<ChangeEvent<String, String>> queue;
+  // private final PgLsn targetLsn;
+  private final Supplier<Boolean> publisherStatusSupplier;
+  private final VoidCallable requestClose;
+
+  public DebeziumRecordIterator(LinkedBlockingQueue<ChangeEvent<String, String>> queue,
+                                // PgLsn targetLsn,
+                                Supplier<Boolean> publisherStatusSupplier,
+                                VoidCallable requestClose) {
+    this.queue = queue;
+    // this.targetLsn = targetLsn;
+    this.publisherStatusSupplier = publisherStatusSupplier;
+    this.requestClose = requestClose;
+  }
+
+  @Override
+  protected ChangeEvent<String, String> computeNext() {
+    // keep trying until the publisher is closed or until the queue is empty. the latter case is
+    // possible when the publisher has shut down but the consumer has not yet processed all messages
+    // it emitted.
+    while (!MoreBooleans.isTruthy(publisherStatusSupplier.get()) || !queue.isEmpty()) {
+      final ChangeEvent<String, String> next;
+      try {
+        next = queue.poll(SLEEP_TIME_AMOUNT, SLEEP_TIME_UNIT);
+      } catch (InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+
+      // if, within the timeout, the consumer could not get a record, it is time to tell the producer
+      // to shutdown.
+      if (next == null) {
+        requestClose();
+        LOGGER.info("no record found. polling again.");
+        continue;
+      }
+
+      // if the last record matches the target lsn, it is time to tell the producer to shutdown.
+ // if (shouldSignalClose(next)) { + // requestClose(); + // } + + return next; + } + return endOfData(); + } + + @Override + public void close() throws Exception { + requestClose.call(); + } + + // private boolean shouldSignalClose(ChangeEvent event) { + + // for mysql + // SnapshotMetadata.valueOf(Jsons.deserialize(next.value()).get("source").get("snapshot").asText().toUpperCase()) + // + // Jsons.deserialize(next.value()).get("source").get("file") + // + // Jsons.deserialize(next.value()).get("source").get("pos") + + // final PgLsn eventLsn = extractLsn(event); + // + // if (targetLsn.compareTo(eventLsn) > 0) { + // return false; + // } else { + // final SnapshotMetadata snapshotMetadata = getSnapshotMetadata(event); + // // if not snapshot or is snapshot but last record in snapshot. + // return SnapshotMetadata.TRUE != snapshotMetadata; + // } + // } + + private SnapshotMetadata getSnapshotMetadata(ChangeEvent event) { + try { + final Method sourceRecordMethod = event.getClass().getMethod("sourceRecord"); + sourceRecordMethod.setAccessible(true); + final SourceRecord sourceRecord = (SourceRecord) sourceRecordMethod.invoke(event); + final String snapshot = ((Struct) sourceRecord.value()).getStruct("source").getString("snapshot"); + + if (snapshot == null) { + return null; + } + + // the snapshot field is an enum of true, false, and last. + return SnapshotMetadata.valueOf(snapshot.toUpperCase()); + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + private PgLsn extractLsn(ChangeEvent event) { + return Optional.ofNullable(event.value()) + .flatMap(value -> Optional.ofNullable(Jsons.deserialize(value).get("source"))) + .flatMap(source -> Optional.ofNullable(source.get("lsn").asText())) + .map(Long::parseLong) + .map(PgLsn::fromLong) + .orElseThrow(() -> new IllegalStateException("Could not find LSN")); + } + + private void requestClose() { + try { + requestClose.call(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + enum SnapshotMetadata { + TRUE, + FALSE, + LAST + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java new file mode 100644 index 000000000000..ffa41b0ed0d5 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java @@ -0,0 +1,203 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package io.airbyte.integrations.source.mysql; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.SyncMode; +import io.debezium.engine.ChangeEvent; +import io.debezium.engine.DebeziumEngine; +import io.debezium.engine.format.Json; +import io.debezium.engine.spi.OffsetCommitPolicy; +import java.util.Properties; +import java.util.Queue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import org.codehaus.plexus.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DebeziumRecordPublisher implements AutoCloseable { + + private static final Logger LOGGER = LoggerFactory.getLogger(DebeziumRecordPublisher.class); + private final ExecutorService executor; + private DebeziumEngine> engine; + + private final JsonNode config; + private final ConfiguredAirbyteCatalog catalog; + private final AirbyteFileOffsetBackingStore offsetManager; + private final AirbyteFileDatabaseHistoryStorageOperations airbyteFileDatabaseHistoryStorageOperations; + + private final AtomicBoolean hasClosed; + private final AtomicBoolean isClosing; + private final AtomicReference thrownError; + private final CountDownLatch engineLatch; + + public DebeziumRecordPublisher(JsonNode config, + ConfiguredAirbyteCatalog catalog, + AirbyteFileOffsetBackingStore offsetManager, + AirbyteFileDatabaseHistoryStorageOperations airbyteFileDatabaseHistoryStorageOperations) { + this.config = config; + this.catalog = catalog; + this.offsetManager = offsetManager; + this.airbyteFileDatabaseHistoryStorageOperations = airbyteFileDatabaseHistoryStorageOperations; + this.hasClosed = new AtomicBoolean(false); + this.isClosing = new AtomicBoolean(false); + this.thrownError = new AtomicReference<>(); + this.executor = Executors.newSingleThreadExecutor(); + this.engineLatch = new CountDownLatch(1); + } + + public void start(Queue> queue) { + engine = DebeziumEngine.create(Json.class) + .using(getDebeziumProperties(config, catalog, offsetManager)) + .using(new OffsetCommitPolicy.AlwaysCommitOffsetPolicy()) + .notifying(e -> { + // debezium outputs a tombstone event that has a value of null. this is an artifact of how it + // interacts with kafka. we want to ignore it. + // more on the tombstone: + // https://debezium.io/documentation/reference/configuration/event-flattening.html + if (e.value() != null) { + queue.add(e); + } + }) + .using((success, message, error) -> { + LOGGER.info("Debezium engine shutdown."); + thrownError.set(error); + engineLatch.countDown(); + }) + .build(); + + // Run the engine asynchronously ... 
+ executor.execute(engine); + } + + public boolean hasClosed() { + return hasClosed.get(); + } + + public void close() throws Exception { + if (isClosing.compareAndSet(false, true)) { + // consumers should assume records can be produced until engine has closed. + if (engine != null) { + engine.close(); + } + + // wait for closure before shutting down executor service + engineLatch.await(5, TimeUnit.MINUTES); + + // shut down and await for thread to actually go down + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.MINUTES); + + // after the engine is completely off, we can mark this as closed + hasClosed.set(true); + + if (thrownError.get() != null) { + throw new RuntimeException(thrownError.get()); + } + } + } + + protected Properties getDebeziumProperties(JsonNode config, + ConfiguredAirbyteCatalog catalog, + AirbyteFileOffsetBackingStore offsetManager) { + final Properties props = new Properties(); + + // debezium engine configuration + props.setProperty("name", "engine"); + props.setProperty("connector.class", "io.debezium.connector.mysql.MySqlConnector"); + props.setProperty("offset.storage", "org.apache.kafka.connect.storage.FileOffsetBackingStore"); + props.setProperty("offset.storage.file.filename", offsetManager.getOffsetFilePath().toString()); + props.setProperty("offset.flush.interval.ms", "1000"); // todo: make this longer + + // snapshot config + // https://debezium.io/documentation/reference/1.4/connectors/mysql.html#mysql-property-snapshot-mode + props.setProperty("snapshot.mode", "initial"); + // https://debezium.io/documentation/reference/1.4/connectors/mysql.html#mysql-property-snapshot-locking-mode + // This is to make sure other database clients are allowed to write to a table while Airbyte is + // taking a snapshot. There is a risk involved that + // if any database client makes a schema change then the sync might break + props.setProperty("snapshot.locking.mode", "none"); + + // https://debezium.io/documentation/reference/1.4/operations/debezium-server.html#debezium-source-database-history-file-filename + // https://debezium.io/documentation/reference/development/engine.html#_in_the_code + // As mentioned in the documents above, debezium connector for MySQL needs to track the schema + // changes. 
If we don't do this, we can't fetch records for the table + // We have implemented our own implementation to filter out the schema information from other + // databases that the connector is not syncing + props.setProperty("database.history", + "io.airbyte.integrations.source.mysql.FilteredFileDatabaseHistory"); + props.setProperty("database.history.file.filename", + airbyteFileDatabaseHistoryStorageOperations.getPath().toString()); + + // https://debezium.io/documentation/reference/configuration/avro.html + props.setProperty("key.converter.schemas.enable", "false"); + props.setProperty("value.converter.schemas.enable", "false"); + + // https://debezium.io/documentation/reference/1.4/connectors/mysql.html#mysql-property-include-schema-changes + props.setProperty("include.schema.changes", "false"); + + // debezium names + props.setProperty("name", config.get("database").asText()); + props.setProperty("database.server.name", config.get("database").asText()); + + // db connection configuration + props.setProperty("database.hostname", config.get("host").asText()); + props.setProperty("database.port", config.get("port").asText()); + props.setProperty("database.user", config.get("username").asText()); + props.setProperty("database.dbname", config.get("database").asText()); + + if (config.has("password")) { + props.setProperty("database.password", config.get("password").asText()); + } + + // table selection + final String tableWhitelist = getTableWhitelist(catalog); + props.setProperty("table.include.list", tableWhitelist); + props.setProperty("database.include.list", config.get("database").asText()); + + return props; + } + + @VisibleForTesting + protected static String getTableWhitelist(ConfiguredAirbyteCatalog catalog) { + return catalog.getStreams().stream() + .filter(s -> s.getSyncMode() == SyncMode.INCREMENTAL) + .map(ConfiguredAirbyteStream::getStream) + .map(stream -> stream.getNamespace() + "." + stream.getName()) + // debezium needs commas escaped to split properly + .map(x -> StringUtils.escape(x, new char[] {','}, "\\,")) + .collect(Collectors.joining(",")); + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java new file mode 100644 index 000000000000..3b8e7c1bb7c1 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java @@ -0,0 +1,130 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package io.airbyte.integrations.source.mysql; + +import io.debezium.config.Configuration; +import io.debezium.relational.history.AbstractDatabaseHistory; +import io.debezium.relational.history.DatabaseHistoryException; +import io.debezium.relational.history.DatabaseHistoryListener; +import io.debezium.relational.history.FileDatabaseHistory; +import io.debezium.relational.history.HistoryRecord; +import io.debezium.relational.history.HistoryRecord.Fields; +import io.debezium.relational.history.HistoryRecordComparator; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.function.Consumer; + +public class FilteredFileDatabaseHistory extends AbstractDatabaseHistory { + + private final FileDatabaseHistory fileDatabaseHistory; + private static String databaseName; + + public FilteredFileDatabaseHistory() { + this.fileDatabaseHistory = new FileDatabaseHistory(); + } + + static void setDatabaseName(String databaseName) { + if (FilteredFileDatabaseHistory.databaseName == null) { + FilteredFileDatabaseHistory.databaseName = databaseName; + } else if (!FilteredFileDatabaseHistory.databaseName.equals(databaseName)) { + throw new RuntimeException( + "Database name has already been set : " + FilteredFileDatabaseHistory.databaseName + + " can't set to : " + databaseName); + } + } + + @Override + public void configure(Configuration config, + HistoryRecordComparator comparator, + DatabaseHistoryListener listener, + boolean useCatalogBeforeSchema) { + fileDatabaseHistory.configure(config, comparator, listener, useCatalogBeforeSchema); + } + + @Override + public void start() { + fileDatabaseHistory.start(); + } + + @Override + public void storeRecord(HistoryRecord record) throws DatabaseHistoryException { + if (record == null) { + return; + } + try { + String dbNameInRecord = record.document().getString(Fields.DATABASE_NAME); + if (databaseName != null && dbNameInRecord != null && !dbNameInRecord.equals(databaseName)) { + return; + } + + final Method storeRecordMethod = fileDatabaseHistory.getClass() + .getDeclaredMethod("storeRecord", record.getClass()); + storeRecordMethod.setAccessible(true); + storeRecordMethod.invoke(fileDatabaseHistory, record); + } catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + @Override + public void stop() { + fileDatabaseHistory.stop(); + // this is mainly for tests + databaseName = null; + } + + @Override + protected void recoverRecords(Consumer records) { + try { + final Method recoverRecords = fileDatabaseHistory.getClass() + .getDeclaredMethod("recoverRecords", Consumer.class); + recoverRecords.setAccessible(true); + recoverRecords.invoke(fileDatabaseHistory, records); + } catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + @Override + public boolean storageExists() { + return fileDatabaseHistory.storageExists(); + } + + @Override + public void initializeStorage() { + fileDatabaseHistory.initializeStorage(); + } + + @Override + public boolean exists() { + return fileDatabaseHistory.exists(); + } + + @Override + public String toString() { + return fileDatabaseHistory.toString(); + } + +} 
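Debezium does not take a DatabaseHistory instance directly; it constructs the class named by the "database.history" connector property, which is presumably why FilteredFileDatabaseHistory exposes a static, package-private setDatabaseName rather than a constructor argument, and why the filter has to be primed before the engine starts. A minimal illustrative sketch of that wiring, condensed from the DebeziumRecordPublisher and MySqlSource hunks later in this patch; the sketch class itself, the database name, and the literal file path are placeholders and not part of the patch:

package io.airbyte.integrations.source.mysql;

import java.util.Properties;

// Illustrative sketch only, not part of the patch. It sits in the same package because
// FilteredFileDatabaseHistory.setDatabaseName(...) is package-private.
public class FilteredHistoryWiringSketch {

  public static void main(String[] args) {
    // Prime the filter before the engine is built: Debezium will instantiate
    // FilteredFileDatabaseHistory itself from the "database.history" property below,
    // so this static setter is the only hand-off point for the database name.
    FilteredFileDatabaseHistory.setDatabaseName("example_db"); // placeholder database name

    final Properties props = new Properties();
    // Same property keys that DebeziumRecordPublisher#getDebeziumProperties sets below.
    props.setProperty("database.history",
        "io.airbyte.integrations.source.mysql.FilteredFileDatabaseHistory");
    props.setProperty("database.history.file.filename", "/tmp/cdc-db-history/dbhistory.dat"); // placeholder path

    // These entries would be merged into the full engine configuration passed to
    // DebeziumEngine.create(Json.class).using(...) in DebeziumRecordPublisher#start.
    System.out.println(props);
  }

}

Keeping the filter in a static field is also why stop() resets databaseName to null: the same JVM (for example, the unit tests later in this patch) can then run another engine against a different database.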
diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 334e080acdbf..2e6d07d37c28 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -24,13 +24,44 @@ package io.airbyte.integrations.source.mysql; +import static java.util.stream.Collectors.toList; + import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableMap; import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.commons.util.AutoCloseableIterators; +import io.airbyte.commons.util.CompositeIterator; +import io.airbyte.commons.util.MoreIterators; +import io.airbyte.db.jdbc.JdbcDatabase; import io.airbyte.integrations.base.IntegrationRunner; import io.airbyte.integrations.base.Source; import io.airbyte.integrations.source.jdbc.AbstractJdbcSource; +import io.airbyte.integrations.source.jdbc.JdbcStateManager; +import io.airbyte.integrations.source.jdbc.models.CdcState; +import io.airbyte.protocol.models.AirbyteCatalog; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteMessage.Type; +import io.airbyte.protocol.models.AirbyteStateMessage; +import io.airbyte.protocol.models.AirbyteStream; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.SyncMode; +import io.debezium.engine.ChangeEvent; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.Set; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,11 +70,62 @@ public class MySqlSource extends AbstractJdbcSource implements Source { private static final Logger LOGGER = LoggerFactory.getLogger(MySqlSource.class); public static final String DRIVER_CLASS = "com.mysql.cj.jdbc.Driver"; + public static final String MYSQL_CDC_OFFSET = "mysql_cdc_offset"; + public static final String MYSQL_DB_HISTORY = "mysql_db_history"; public MySqlSource() { super(DRIVER_CLASS, new MySqlJdbcStreamingQueryConfiguration()); } + private static AirbyteStream removeIncrementalWithoutPk(AirbyteStream stream) { + if (stream.getSourceDefinedPrimaryKey().isEmpty()) { + stream.getSupportedSyncModes().remove(SyncMode.INCREMENTAL); + } + + return stream; + } + + private static AirbyteStream setIncrementalToSourceDefined(AirbyteStream stream) { + if (stream.getSupportedSyncModes().contains(SyncMode.INCREMENTAL)) { + stream.setSourceDefinedCursor(true); + } + + return stream; + } + + // Note: in place mutation. 
+ private static AirbyteStream addCdcMetadataColumns(AirbyteStream stream) { + + ObjectNode jsonSchema = (ObjectNode) stream.getJsonSchema(); + ObjectNode properties = (ObjectNode) jsonSchema.get("properties"); + + final JsonNode numberType = Jsons.jsonNode(ImmutableMap.of("type", "number")); + final JsonNode stringType = Jsons.jsonNode(ImmutableMap.of("type", "string")); + properties.set(CDC_LOG_FILE, stringType); + properties.set(CDC_LOG_POS, numberType); + properties.set(CDC_UPDATED_AT, numberType); + properties.set(CDC_DELETED_AT, numberType); + + return stream; + } + + @Override + public AirbyteCatalog discover(JsonNode config) throws Exception { + AirbyteCatalog catalog = super.discover(config); + + if (isCdc(config)) { + final List streams = catalog.getStreams().stream() + .map(MySqlSource::removeIncrementalWithoutPk) + .map(MySqlSource::setIncrementalToSourceDefined) + .map(MySqlSource::addCdcMetadataColumns) + .collect(toList()); + + catalog.setStreams(streams); + } + + return catalog; + } + @Override public JsonNode toJdbcConfig(JsonNode config) { final StringBuilder jdbc_url = new StringBuilder(String.format("jdbc:mysql://%s:%s/%s", @@ -66,6 +148,122 @@ public JsonNode toJdbcConfig(JsonNode config) { return Jsons.jsonNode(configBuilder.build()); } + private static boolean isCdc(JsonNode config) { + final boolean isCdc = config.hasNonNull("replication_method") + && ReplicationMethod.valueOf(config.get("replication_method").asText()) + .equals(ReplicationMethod.CDC); + + return isCdc; + } + + static boolean shouldUseCDC(ConfiguredAirbyteCatalog catalog) { + Optional any = catalog.getStreams().stream().map(ConfiguredAirbyteStream::getSyncMode) + .filter(syncMode -> syncMode == SyncMode.INCREMENTAL).findAny(); + return any.isPresent(); + } + + @Override + public List> getIncrementalIterators(JsonNode config, + JdbcDatabase database, + ConfiguredAirbyteCatalog catalog, + Map tableNameToTable, + JdbcStateManager stateManager, + Instant emittedAt) { + if (isCdc(config) && shouldUseCDC(catalog)) { + LOGGER.info("using CDC: {}", true); + // TODO: Figure out how to set the isCDC of stateManager to true. Its always false + final AirbyteFileOffsetBackingStore offsetManager = initializeState(stateManager); + AirbyteFileDatabaseHistoryStorageOperations dbHistoryStorageManager = initializeDBHistory( + stateManager); + FilteredFileDatabaseHistory.setDatabaseName(config.get("database").asText()); + final LinkedBlockingQueue> queue = new LinkedBlockingQueue<>(); + final DebeziumRecordPublisher publisher = new DebeziumRecordPublisher(config, catalog, + offsetManager, dbHistoryStorageManager); + publisher.start(queue); + + // handle state machine around pub/sub logic. + final AutoCloseableIterator> eventIterator = new DebeziumRecordIterator( + queue, + // targetLsn, + publisher::hasClosed, + publisher::close); + + // convert to airbyte message. + final AutoCloseableIterator messageIterator = AutoCloseableIterators + .transform( + eventIterator, + (event) -> DebeziumEventUtils.toAirbyteMessage(event, emittedAt)); + + // our goal is to get the state at the time this supplier is called (i.e. 
after all message records + // have been produced) + final Supplier stateMessageSupplier = () -> { + Map offset = offsetManager.readMap(); + String dbHistory = dbHistoryStorageManager.read(); + + Map state = new HashMap<>(); + state.put(MYSQL_CDC_OFFSET, offset); + state.put(MYSQL_DB_HISTORY, dbHistory); + + final JsonNode asJson = Jsons.jsonNode(state); + + LOGGER.info("debezium state: {}", asJson); + + CdcState cdcState = new CdcState().withState(asJson); + stateManager.getCdcStateManager().setCdcState(cdcState); + final AirbyteStateMessage stateMessage = stateManager.emit(); + return new AirbyteMessage().withType(Type.STATE).withState(stateMessage); + + }; + + // wrap the supplier in an iterator so that we can concat it to the message iterator. + final Iterator stateMessageIterator = MoreIterators + .singletonIteratorFromSupplier(stateMessageSupplier); + + // this structure guarantees that the debezium engine will be closed, before we attempt to emit the + // state file. we want this so that we have a guarantee that the debezium offset file (which we use + // to produce the state file) is up-to-date. + final CompositeIterator messageIteratorWithStateDecorator = AutoCloseableIterators + .concatWithEagerClose(messageIterator, + AutoCloseableIterators.fromIterator(stateMessageIterator)); + + return Collections.singletonList(messageIteratorWithStateDecorator); + } else { + LOGGER.info("using CDC: {}", false); + return super.getIncrementalIterators(config, database, catalog, tableNameToTable, stateManager, + emittedAt); + } + } + + private AirbyteFileOffsetBackingStore initializeState(JdbcStateManager stateManager) { + final Path cdcWorkingDir; + try { + cdcWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-state-offset"); + } catch (IOException e) { + throw new RuntimeException(e); + } + final Path cdcOffsetFilePath = cdcWorkingDir.resolve("offset.dat"); + + final AirbyteFileOffsetBackingStore offsetManager = new AirbyteFileOffsetBackingStore( + cdcOffsetFilePath); + offsetManager.persist(stateManager.getCdcStateManager().getCdcState()); + return offsetManager; + } + + private AirbyteFileDatabaseHistoryStorageOperations initializeDBHistory( + JdbcStateManager stateManager) { + final Path dbHistoryWorkingDir; + try { + dbHistoryWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-db-history"); + } catch (IOException e) { + throw new RuntimeException(e); + } + final Path dbHistoryFilePath = dbHistoryWorkingDir.resolve("dbhistory.dat"); + + final AirbyteFileDatabaseHistoryStorageOperations dbHistoryStorageManager = new AirbyteFileDatabaseHistoryStorageOperations(dbHistoryFilePath); + dbHistoryStorageManager.persist(stateManager.getCdcStateManager().getCdcState()); + return dbHistoryStorageManager; + } + @Override public Set getExcludedInternalSchemas() { return Set.of( @@ -82,4 +280,9 @@ public static void main(String[] args) throws Exception { LOGGER.info("completed source: {}", MySqlSource.class); } + public enum ReplicationMethod { + STANDARD, + CDC + } + } diff --git a/airbyte-integrations/connectors/source-mysql/src/main/resources/spec.json b/airbyte-integrations/connectors/source-mysql/src/main/resources/spec.json index 9d76f8aadbaa..69adb708927b 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/resources/spec.json +++ b/airbyte-integrations/connectors/source-mysql/src/main/resources/spec.json @@ -1,15 +1,16 @@ { - "documentationUrl": "https://docs.airbyte.io/integrations/destinations/mysql", + "documentationUrl": 
"https://docs.airbyte.io/integrations/source/mysql", "connectionSpecification": { "$schema": "http://json-schema.org/draft-07/schema#", "title": "MySql Source Spec", "type": "object", - "required": ["host", "port", "database", "username"], + "required": ["host", "port", "database", "username", "replication_method"], "additionalProperties": false, "properties": { "host": { "description": "Hostname of the database.", - "type": "string" + "type": "string", + "order": 0 }, "port": { "description": "Port of the database.", @@ -17,24 +18,37 @@ "minimum": 0, "maximum": 65536, "default": 3306, - "examples": ["3306"] + "examples": ["3306"], + "order": 1 }, "database": { "description": "Name of the database.", - "type": "string" + "type": "string", + "order": 2 }, "username": { "description": "Username to use to access the database.", - "type": "string" + "type": "string", + "order": 3 }, "password": { "description": "Password associated with the username.", "type": "string", - "airbyte_secret": true + "airbyte_secret": true, + "order": 4 }, "jdbc_url_params": { "description": "Additional properties to pass to the jdbc url string when connecting to the database formatted as 'key=value' pairs separated by the symbol '&'. (example: key1=value1&key2=value2&key3=value3)", - "type": "string" + "type": "string", + "order": 5 + }, + "replication_method": { + "type": "string", + "title": "Replication Method", + "description": "Replication method to use for extracting data from the database. STANDARD replication requires no setup on the DB side but will not be able to represent deletions incrementally. CDC uses the Binlog to detect inserts, updates, and deletes. This needs to be configured on the source database itself.", + "order": 6, + "default": "STANDARD", + "enum": ["STANDARD", "CDC"] } } } diff --git a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java new file mode 100644 index 000000000000..05755cc52b4a --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java @@ -0,0 +1,156 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package io.airbyte.integrations.source.mysql; + +import static io.airbyte.integrations.source.mysql.MySqlSource.DRIVER_CLASS; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.db.Database; +import io.airbyte.db.Databases; +import io.airbyte.integrations.standardtest.source.StandardSourceTest; +import io.airbyte.protocol.models.CatalogHelpers; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.ConnectorSpecification; +import io.airbyte.protocol.models.DestinationSyncMode; +import io.airbyte.protocol.models.Field; +import io.airbyte.protocol.models.Field.JsonSchemaPrimitive; +import io.airbyte.protocol.models.SyncMode; +import java.util.Collections; +import java.util.List; +import org.jooq.SQLDialect; +import org.testcontainers.containers.MySQLContainer; + +public class CdcMySqlStandardTest extends StandardSourceTest { + + private static final String STREAM_NAME = "id_and_name"; + private static final String STREAM_NAME2 = "starships"; + private MySQLContainer container; + private JsonNode config; + + @Override + protected String getImageName() { + return "airbyte/source-mysql:dev"; + } + + @Override + protected ConnectorSpecification getSpec() throws Exception { + return Jsons.deserialize(MoreResources.readResource("spec.json"), ConnectorSpecification.class); + } + + @Override + protected JsonNode getConfig() { + return config; + } + + @Override + public ConfiguredAirbyteCatalog withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { + return catalog; + } + + @Override + protected ConfiguredAirbyteCatalog getConfiguredCatalog() { + return new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList( + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.INCREMENTAL) + .withDestinationSyncMode(DestinationSyncMode.APPEND) + .withStream(CatalogHelpers.createAirbyteStream( + String.format("%s", STREAM_NAME), + String.format("%s", config.get("database").asText()), + Field.of("id", JsonSchemaPrimitive.NUMBER), + Field.of("name", JsonSchemaPrimitive.STRING)) + .withSourceDefinedCursor(true) + .withSourceDefinedPrimaryKey(List.of(List.of("id"))) + .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))), + new ConfiguredAirbyteStream() + .withSyncMode(SyncMode.INCREMENTAL) + .withDestinationSyncMode(DestinationSyncMode.APPEND) + .withStream(CatalogHelpers.createAirbyteStream( + String.format("%s", STREAM_NAME2), + String.format("%s", config.get("database").asText()), + Field.of("id", JsonSchemaPrimitive.NUMBER), + Field.of("name", JsonSchemaPrimitive.STRING)) + .withSourceDefinedCursor(true) + .withSourceDefinedPrimaryKey(List.of(List.of("id"))) + .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))))); + } + + @Override + protected JsonNode getState() { + return null; + } + + @Override + protected List getRegexTests() { + return Collections.emptyList(); + } + + @Override + protected void setup(TestDestinationEnv testEnv) throws Exception { + container = new MySQLContainer<>("mysql:8.0"); + container.start(); + + config = Jsons.jsonNode(ImmutableMap.builder() + .put("host", container.getHost()) + .put("port", container.getFirstMappedPort()) + .put("database", container.getDatabaseName()) + .put("username", "root") + 
.put("password", "test") + .put("replication_method", "CDC") + .build()); + + final Database database = Databases.createDatabase( + config.get("username").asText(), + config.get("password").asText(), + String.format("jdbc:mysql://%s:%s/%s", + config.get("host").asText(), + config.get("port").asText(), + config.get("database").asText()), + DRIVER_CLASS, + SQLDialect.MYSQL); + + database.query(ctx -> { + ctx.fetch("CREATE TABLE id_and_name(id INTEGER PRIMARY KEY, name VARCHAR(200));"); + ctx.fetch( + "INSERT INTO id_and_name (id, name) VALUES (1,'picard'), (2, 'crusher'), (3, 'vash');"); + ctx.fetch("CREATE TABLE starships(id INTEGER PRIMARY KEY, name VARCHAR(200));"); + ctx.fetch( + "INSERT INTO starships (id, name) VALUES (1,'enterprise-d'), (2, 'defiant'), (3, 'yamato');"); + return null; + }); + + database.close(); + } + + @Override + protected void tearDown(TestDestinationEnv testEnv) { + container.close(); + } + +} diff --git a/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java b/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java new file mode 100644 index 000000000000..6e9f1dd2efa5 --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java @@ -0,0 +1,668 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package io.airbyte.integrations.source.mysql; + +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_DELETED_AT; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_LOG_FILE; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_LOG_POS; +import static io.airbyte.integrations.source.jdbc.AbstractJdbcSource.CDC_UPDATED_AT; +import static io.airbyte.integrations.source.mysql.MySqlSource.DRIVER_CLASS; +import static io.airbyte.integrations.source.mysql.MySqlSource.MYSQL_CDC_OFFSET; +import static io.airbyte.integrations.source.mysql.MySqlSource.MYSQL_DB_HISTORY; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.google.common.collect.Streams; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.commons.util.AutoCloseableIterators; +import io.airbyte.db.Database; +import io.airbyte.db.Databases; +import io.airbyte.protocol.models.AirbyteCatalog; +import io.airbyte.protocol.models.AirbyteConnectionStatus; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteMessage.Type; +import io.airbyte.protocol.models.AirbyteRecordMessage; +import io.airbyte.protocol.models.AirbyteStateMessage; +import io.airbyte.protocol.models.AirbyteStream; +import io.airbyte.protocol.models.CatalogHelpers; +import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.Field; +import io.airbyte.protocol.models.Field.JsonSchemaPrimitive; +import io.airbyte.protocol.models.SyncMode; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.jooq.DSLContext; +import org.jooq.SQLDialect; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.MySQLContainer; + +public class CdcMySqlSourceTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(CdcMySqlSourceTest.class); + + private static final String MODELS_SCHEMA = "models_schema"; + private static final String MODELS_STREAM_NAME = "models"; + private static final Set STREAM_NAMES = Sets + .newHashSet(MODELS_STREAM_NAME); + private static final String COL_ID = "id"; + private static final String COL_MAKE_ID = "make_id"; + private static final String COL_MODEL = "model"; + private static final String dbName = MODELS_SCHEMA; + + private static final AirbyteCatalog CATALOG = new AirbyteCatalog().withStreams(List.of( + CatalogHelpers.createAirbyteStream( + MODELS_STREAM_NAME, + MODELS_SCHEMA, + Field.of(COL_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MAKE_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MODEL, 
JsonSchemaPrimitive.STRING)) + .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedPrimaryKey(List.of(List.of(COL_ID))))); + private static final ConfiguredAirbyteCatalog CONFIGURED_CATALOG = CatalogHelpers + .toDefaultConfiguredCatalog(CATALOG); + + // set all streams to incremental. + static { + CONFIGURED_CATALOG.getStreams().forEach(s -> s.setSyncMode(SyncMode.INCREMENTAL)); + } + + private static final List MODEL_RECORDS = ImmutableList.of( + Jsons.jsonNode(ImmutableMap.of(COL_ID, 11, COL_MAKE_ID, 1, COL_MODEL, "Fiesta")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 12, COL_MAKE_ID, 1, COL_MODEL, "Focus")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 13, COL_MAKE_ID, 1, COL_MODEL, "Ranger")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 14, COL_MAKE_ID, 2, COL_MODEL, "GLA")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 15, COL_MAKE_ID, 2, COL_MODEL, "A 220")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 16, COL_MAKE_ID, 2, COL_MODEL, "E 350"))); + + private static MySQLContainer container; + + private Database database; + private MySqlSource source; + + @AfterEach + public void tearDown() { + container.close(); + container.stop(); + } + + @BeforeEach + public void setup() throws Exception { + container = new MySQLContainer<>("mysql:8.0"); + container.start(); + source = new MySqlSource(); + final JsonNode config = getConfig(container, dbName); + database = getDatabaseFromConfig(config); + database.query(ctx -> { + ctx.execute("CREATE DATABASE " + MODELS_SCHEMA + ";"); + ctx.execute(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); + + for (JsonNode recordJson : MODEL_RECORDS) { + writeModelRecord(ctx, recordJson); + } + + return null; + }); + /** + * This database and table is not part of Airbyte sync. 
It is being created just to make sure the + * databases not being synced by Airbyte are not causing issues with our debezium logic + */ + database.query(ctx -> { + ctx.execute("CREATE DATABASE " + MODELS_SCHEMA + "_random" + ";"); + ctx.execute(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", COL_ID + "_random", + COL_MAKE_ID + "_random", + COL_MODEL + "_random", COL_ID + "_random")); + + final List MODEL_RECORDS_RANDOM = ImmutableList.of( + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 11000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Fiesta-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 12000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Focus-random")), + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 13000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Ranger-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 14000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "GLA-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 15000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "A 220-random")), + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 16000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "E 350-random"))); + for (JsonNode recordJson : MODEL_RECORDS_RANDOM) { + writeRecords(ctx, recordJson, MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", + COL_ID + "_random", COL_MAKE_ID + "_random", COL_MODEL + "_random"); + } + + return null; + }); + } + + private JsonNode getConfig(MySQLContainer db, String dbName) { + + return Jsons.jsonNode(ImmutableMap.builder() + .put("host", db.getHost()) + .put("port", db.getFirstMappedPort()) + .put("database", dbName) + .put("username", "root") + .put("password", "test") + .put("replication_method", "CDC") + .build()); + } + + private Database getDatabaseFromConfig(JsonNode config) { + return Databases.createDatabase( + config.get("username").asText(), + config.get("password").asText(), + String.format("jdbc:mysql://%s:%s", + config.get("host").asText(), + config.get("port").asText()), + DRIVER_CLASS, + SQLDialect.MYSQL); + } + + @Test + @DisplayName("On the first sync, produce returns records that exist in the database.") + void testExistingData() throws Exception { + final AutoCloseableIterator read = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List actualRecords = AutoCloseableIterators.toListAndClose(read); + + final Set recordMessages = extractRecordMessages(actualRecords); + final List stateMessages = extractStateMessages(actualRecords); + + assertExpectedRecords( + new HashSet<>(MODEL_RECORDS), recordMessages); + assertExpectedStateMessages(stateMessages); + } + + @Test + @DisplayName("When a record is deleted, produces a deletion record.") + void testDelete() throws Exception { + final AutoCloseableIterator read1 = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); + final List stateMessages1 = extractStateMessages(actualRecords1); + + assertExpectedStateMessages(stateMessages1); + + database.query(ctx -> { + ctx.execute(String + .format("DELETE FROM %s.%s WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, + 11)); + return null; + }); + + final JsonNode state = stateMessages1.get(0).getData(); + final AutoCloseableIterator read2 = source + .read(getConfig(container, 
dbName), CONFIGURED_CATALOG, state); + final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); + final List recordMessages2 = new ArrayList<>( + extractRecordMessages(actualRecords2)); + final List stateMessages2 = extractStateMessages(actualRecords2); + + assertExpectedStateMessages(stateMessages2); + assertEquals(1, recordMessages2.size()); + assertEquals(11, recordMessages2.get(0).getData().get(COL_ID).asInt()); + assertNotNull(recordMessages2.get(0).getData().get(CDC_LOG_FILE)); + assertNotNull(recordMessages2.get(0).getData().get(CDC_UPDATED_AT)); + assertNotNull(recordMessages2.get(0).getData().get(CDC_DELETED_AT)); + } + + @Test + @DisplayName("When a record is updated, produces an update record.") + void testUpdate() throws Exception { + final String updatedModel = "Explorer"; + final AutoCloseableIterator read1 = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); + final List stateMessages1 = extractStateMessages(actualRecords1); + + assertExpectedStateMessages(stateMessages1); + + database.query(ctx -> { + ctx.execute(String + .format("UPDATE %s.%s SET %s = '%s' WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, + COL_MODEL, updatedModel, COL_ID, 11)); + return null; + }); + + final JsonNode state = stateMessages1.get(0).getData(); + final AutoCloseableIterator read2 = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); + final List recordMessages2 = new ArrayList<>( + extractRecordMessages(actualRecords2)); + final List stateMessages2 = extractStateMessages(actualRecords2); + + assertExpectedStateMessages(stateMessages2); + assertEquals(1, recordMessages2.size()); + assertEquals(11, recordMessages2.get(0).getData().get(COL_ID).asInt()); + assertEquals(updatedModel, recordMessages2.get(0).getData().get(COL_MODEL).asText()); + assertNotNull(recordMessages2.get(0).getData().get(CDC_LOG_FILE)); + assertNotNull(recordMessages2.get(0).getData().get(CDC_UPDATED_AT)); + assertTrue(recordMessages2.get(0).getData().get(CDC_DELETED_AT).isNull()); + } + + @SuppressWarnings({"BusyWait", "CodeBlock2Expr"}) + @Test + @DisplayName("Verify that when data is inserted into the database while a sync is happening and after the first sync, it all gets replicated.") + void testRecordsProducedDuringAndAfterSync() throws Exception { + + final int recordsToCreate = 20; + final int[] recordsCreated = {0}; + // first batch of records. 20 created here and 6 created in setup method. 
+ database.query(ctx -> { + while (recordsCreated[0] < recordsToCreate) { + final JsonNode record = + Jsons.jsonNode(ImmutableMap + .of(COL_ID, 100 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, + "F-" + recordsCreated[0])); + writeModelRecord(ctx, record); + recordsCreated[0]++; + } + return null; + }); + + final AutoCloseableIterator firstBatchIterator = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List dataFromFirstBatch = AutoCloseableIterators + .toListAndClose(firstBatchIterator); + List stateAfterFirstBatch = extractStateMessages(dataFromFirstBatch); + assertExpectedStateMessages(stateAfterFirstBatch); + Set recordsFromFirstBatch = extractRecordMessages( + dataFromFirstBatch); + assertEquals((MODEL_RECORDS.size() + 20), recordsFromFirstBatch.size()); + + // second batch of records again 20 being created + recordsCreated[0] = 0; + database.query(ctx -> { + while (recordsCreated[0] < recordsToCreate) { + final JsonNode record = + Jsons.jsonNode(ImmutableMap + .of(COL_ID, 200 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, + "F-" + recordsCreated[0])); + writeModelRecord(ctx, record); + recordsCreated[0]++; + } + return null; + }); + + final JsonNode state = stateAfterFirstBatch.get(0).getData(); + final AutoCloseableIterator secondBatchIterator = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + final List dataFromSecondBatch = AutoCloseableIterators + .toListAndClose(secondBatchIterator); + + List stateAfterSecondBatch = extractStateMessages(dataFromSecondBatch); + assertExpectedStateMessages(stateAfterSecondBatch); + + Set recordsFromSecondBatch = extractRecordMessages( + dataFromSecondBatch); + assertEquals(20, recordsFromSecondBatch.size(), + "Expected 20 records to be replicated in the second sync."); + + // sometimes there can be more than one of these at the end of the snapshot and just before the + // first incremental. 
+ final Set recordsFromFirstBatchWithoutDuplicates = removeDuplicates( + recordsFromFirstBatch); + final Set recordsFromSecondBatchWithoutDuplicates = removeDuplicates( + recordsFromSecondBatch); + + final int recordsCreatedBeforeTestCount = MODEL_RECORDS.size(); + assertTrue(recordsCreatedBeforeTestCount < recordsFromFirstBatchWithoutDuplicates.size(), + "Expected first sync to include records created while the test was running."); + assertEquals(40 + recordsCreatedBeforeTestCount, + recordsFromFirstBatchWithoutDuplicates.size() + recordsFromSecondBatchWithoutDuplicates + .size()); + } + + private static Set removeDuplicates(Set messages) { + final Set existingDataRecordsWithoutUpdated = new HashSet<>(); + final Set output = new HashSet<>(); + + for (AirbyteRecordMessage message : messages) { + ObjectNode node = message.getData().deepCopy(); + node.remove("_ab_cdc_updated_at"); + + if (existingDataRecordsWithoutUpdated.contains(node)) { + LOGGER.info("Removing duplicate node: " + node); + } else { + output.add(message); + existingDataRecordsWithoutUpdated.add(node); + } + } + + return output; + } + + @Test + @DisplayName("When both incremental CDC and full refresh are configured for different streams in a sync, the data is replicated as expected.") + void testCdcAndFullRefreshInSameSync() throws Exception { + final ConfiguredAirbyteCatalog configuredCatalog = Jsons.clone(CONFIGURED_CATALOG); + + final List MODEL_RECORDS_2 = ImmutableList.of( + Jsons.jsonNode(ImmutableMap.of(COL_ID, 110, COL_MAKE_ID, 1, COL_MODEL, "Fiesta-2")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 120, COL_MAKE_ID, 1, COL_MODEL, "Focus-2")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 130, COL_MAKE_ID, 1, COL_MODEL, "Ranger-2")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 140, COL_MAKE_ID, 2, COL_MODEL, "GLA-2")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 150, COL_MAKE_ID, 2, COL_MODEL, "A 220-2")), + Jsons.jsonNode(ImmutableMap.of(COL_ID, 160, COL_MAKE_ID, 2, COL_MODEL, "E 350-2"))); + + database.query(ctx -> { + ctx.execute(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); + + for (JsonNode recordJson : MODEL_RECORDS_2) { + writeRecords(ctx, recordJson, MODELS_SCHEMA, MODELS_STREAM_NAME + "_2"); + } + + return null; + }); + + ConfiguredAirbyteStream airbyteStream = new ConfiguredAirbyteStream() + .withStream(CatalogHelpers.createAirbyteStream( + MODELS_STREAM_NAME + "_2", + MODELS_SCHEMA, + Field.of(COL_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MAKE_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MODEL, JsonSchemaPrimitive.STRING)) + .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSourceDefinedPrimaryKey(List.of(List.of(COL_ID)))); + airbyteStream.setSyncMode(SyncMode.FULL_REFRESH); + + List streams = configuredCatalog.getStreams(); + streams.add(airbyteStream); + configuredCatalog.withStreams(streams); + + final AutoCloseableIterator read1 = source + .read(getConfig(container, dbName), configuredCatalog, null); + final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); + + final Set recordMessages1 = extractRecordMessages(actualRecords1); + final List stateMessages1 = extractStateMessages(actualRecords1); + HashSet names = new HashSet<>(STREAM_NAMES); + names.add(MODELS_STREAM_NAME + "_2"); + assertExpectedStateMessages(stateMessages1); + assertExpectedRecords(Streams.concat(MODEL_RECORDS_2.stream(), 
MODEL_RECORDS.stream()) + .collect(Collectors.toSet()), + recordMessages1, + Collections.singleton(MODELS_STREAM_NAME), + names); + + final JsonNode puntoRecord = Jsons + .jsonNode(ImmutableMap.of(COL_ID, 100, COL_MAKE_ID, 3, COL_MODEL, "Punto")); + database.query(ctx -> { + writeModelRecord(ctx, puntoRecord); + return null; + }); + + final JsonNode state = extractStateMessages(actualRecords1).get(0).getData(); + final AutoCloseableIterator read2 = source + .read(getConfig(container, dbName), configuredCatalog, state); + final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); + + final Set recordMessages2 = extractRecordMessages(actualRecords2); + final List stateMessages2 = extractStateMessages(actualRecords2); + + assertExpectedStateMessages(stateMessages2); + assertExpectedRecords( + Streams.concat(MODEL_RECORDS_2.stream(), Stream.of(puntoRecord)) + .collect(Collectors.toSet()), + recordMessages2, + Collections.singleton(MODELS_STREAM_NAME), + names); + } + + @Test + @DisplayName("When no records exist, no records are returned.") + void testNoData() throws Exception { + + database.query(ctx -> { + ctx.execute(String.format("DELETE FROM %s.%s", MODELS_SCHEMA, MODELS_STREAM_NAME)); + return null; + }); + + final AutoCloseableIterator read = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List actualRecords = AutoCloseableIterators.toListAndClose(read); + + final Set recordMessages = extractRecordMessages(actualRecords); + final List stateMessages = extractStateMessages(actualRecords); + + assertExpectedRecords(Collections.emptySet(), recordMessages); + assertExpectedStateMessages(stateMessages); + } + + @Test + @DisplayName("When no changes have been made to the database since the previous sync, no records are returned.") + void testNoDataOnSecondSync() throws Exception { + final AutoCloseableIterator read1 = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); + final JsonNode state = extractStateMessages(actualRecords1).get(0).getData(); + + final AutoCloseableIterator read2 = source + .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); + + final Set recordMessages2 = extractRecordMessages(actualRecords2); + final List stateMessages2 = extractStateMessages(actualRecords2); + + assertExpectedRecords(Collections.emptySet(), recordMessages2); + assertExpectedStateMessages(stateMessages2); + } + + @Test + void testCheck() { + final AirbyteConnectionStatus status = source.check(getConfig(container, dbName)); + assertEquals(status.getStatus(), AirbyteConnectionStatus.Status.SUCCEEDED); + } + + @Test + void testDiscover() throws Exception { + final AirbyteCatalog expectedCatalog = Jsons.clone(CATALOG); + + database.query(ctx -> { + ctx.execute(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200));", + MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL)); + return null; + }); + + List streams = expectedCatalog.getStreams(); + // stream with PK + streams.get(0).setSourceDefinedCursor(true); + addCdcMetadataColumns(streams.get(0)); + + AirbyteStream streamWithoutPK = CatalogHelpers.createAirbyteStream( + MODELS_STREAM_NAME + "_2", + MODELS_SCHEMA, + Field.of(COL_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MAKE_ID, JsonSchemaPrimitive.NUMBER), + Field.of(COL_MODEL, JsonSchemaPrimitive.STRING)); + 
streamWithoutPK.setSourceDefinedPrimaryKey(Collections.emptyList()); + streamWithoutPK.setSupportedSyncModes(List.of(SyncMode.FULL_REFRESH)); + addCdcMetadataColumns(streamWithoutPK); + + streams.add(streamWithoutPK); + expectedCatalog.withStreams(streams); + + final AirbyteCatalog actualCatalog = source.discover(getConfig(container, dbName)); + + assertEquals( + expectedCatalog.getStreams().stream().sorted(Comparator.comparing(AirbyteStream::getName)) + .collect(Collectors.toList()), + actualCatalog.getStreams().stream().sorted(Comparator.comparing(AirbyteStream::getName)) + .collect(Collectors.toList())); + } + + private static AirbyteStream addCdcMetadataColumns(AirbyteStream stream) { + ObjectNode jsonSchema = (ObjectNode) stream.getJsonSchema(); + ObjectNode properties = (ObjectNode) jsonSchema.get("properties"); + + final JsonNode numberType = Jsons.jsonNode(ImmutableMap.of("type", "number")); + final JsonNode stringType = Jsons.jsonNode(ImmutableMap.of("type", "string")); + properties.set(CDC_LOG_FILE, stringType); + properties.set(CDC_LOG_POS, numberType); + properties.set(CDC_UPDATED_AT, numberType); + properties.set(CDC_DELETED_AT, numberType); + + return stream; + } + + private void writeModelRecord(DSLContext ctx, JsonNode recordJson) { + writeRecords(ctx, recordJson, MODELS_SCHEMA, MODELS_STREAM_NAME); + } + + private void writeRecords(DSLContext ctx, JsonNode recordJson, String dbName, String streamName) { + writeRecords(ctx, recordJson, dbName, streamName, COL_ID, COL_MAKE_ID, COL_MODEL); + } + + private void writeRecords(DSLContext ctx, + JsonNode recordJson, + String dbName, + String streamName, + String idCol, + String makeIdCol, + String modelCol) { + ctx.execute( + String.format("INSERT INTO %s.%s (%s, %s, %s) VALUES (%s, %s, '%s');", dbName, streamName, idCol, makeIdCol, modelCol, + recordJson.get(idCol).asInt(), recordJson.get(makeIdCol).asInt(), + recordJson.get(modelCol).asText())); + } + + private Set extractRecordMessages(List messages) { + final List recordMessageList = messages + .stream() + .filter(r -> r.getType() == Type.RECORD).map(AirbyteMessage::getRecord) + .collect(Collectors.toList()); + final Set recordMessageSet = new HashSet<>(recordMessageList); + + assertEquals(recordMessageList.size(), recordMessageSet.size(), + "Expected no duplicates in airbyte record message output for a single sync."); + + return recordMessageSet; + } + + private List extractStateMessages(List messages) { + return messages.stream().filter(r -> r.getType() == Type.STATE).map(AirbyteMessage::getState) + .collect(Collectors.toList()); + } + + private static void assertExpectedStateMessages(List stateMessages) { + // TODO: add assertion for boolean cdc is true + assertEquals(1, stateMessages.size()); + assertNotNull(stateMessages.get(0).getData()); + assertNotNull( + stateMessages.get(0).getData().get("cdc_state").get("state").get(MYSQL_CDC_OFFSET)); + assertNotNull( + stateMessages.get(0).getData().get("cdc_state").get("state").get(MYSQL_DB_HISTORY)); + } + + private static void assertExpectedRecords(Set expectedRecords, + Set actualRecords) { + // assume all streams are cdc. 
+ assertExpectedRecords( + expectedRecords, + actualRecords, + actualRecords.stream().map(AirbyteRecordMessage::getStream).collect(Collectors.toSet())); + } + + private static void assertExpectedRecords(Set expectedRecords, + Set actualRecords, + Set cdcStreams) { + assertExpectedRecords(expectedRecords, actualRecords, cdcStreams, STREAM_NAMES); + } + + private static void assertExpectedRecords(Set expectedRecords, + Set actualRecords, + Set cdcStreams, + Set streamNames) { + final Set actualData = actualRecords + .stream() + .map(recordMessage -> { + assertTrue(streamNames.contains(recordMessage.getStream())); + assertNotNull(recordMessage.getEmittedAt()); + + assertEquals(MODELS_SCHEMA, recordMessage.getNamespace()); + + final JsonNode data = recordMessage.getData(); + + if (cdcStreams.contains(recordMessage.getStream())) { + assertNotNull(data.get(CDC_LOG_FILE)); + assertNotNull(data.get(CDC_LOG_POS)); + assertNotNull(data.get(CDC_UPDATED_AT)); + } else { + assertNull(data.get(CDC_LOG_FILE)); + assertNull(data.get(CDC_LOG_POS)); + assertNull(data.get(CDC_UPDATED_AT)); + assertNull(data.get(CDC_DELETED_AT)); + } + + ((ObjectNode) data).remove(CDC_LOG_FILE); + ((ObjectNode) data).remove(CDC_LOG_POS); + ((ObjectNode) data).remove(CDC_UPDATED_AT); + ((ObjectNode) data).remove(CDC_DELETED_AT); + + return data; + }) + .collect(Collectors.toSet()); + + assertEquals(expectedRecords, actualData); + } + +} diff --git a/airbyte-integrations/connectors/source-postgres/src/main/java/io/airbyte/integrations/source/postgres/PostgresSource.java b/airbyte-integrations/connectors/source-postgres/src/main/java/io/airbyte/integrations/source/postgres/PostgresSource.java index 66fb76d101e2..6482eef659eb 100644 --- a/airbyte-integrations/connectors/source-postgres/src/main/java/io/airbyte/integrations/source/postgres/PostgresSource.java +++ b/airbyte-integrations/connectors/source-postgres/src/main/java/io/airbyte/integrations/source/postgres/PostgresSource.java @@ -219,6 +219,14 @@ public List> getIncrementalIterators(JsonN Map tableNameToTable, JdbcStateManager stateManager, Instant emittedAt) { + /** + * If a customer sets up a postgres source with cdc parameters (replication_slot and publication) + * but selects all the tables in FULL_REFRESH mode, then we would still end up going through this + * path. We do have a check in place for debezium to make sure only tables in INCREMENTAL mode are + * synced {@link DebeziumRecordPublisher#getTableWhitelist(ConfiguredAirbyteCatalog)}, but we should + * also have a check here so that if no table is in INCREMENTAL mode we skip this + * part. + */ if (isCdc(config)) { // State works differently in CDC than it does in conventional incremental. The state is written to an // offset file that debezium reads from. 
Then once all records are replicated, we read back that diff --git a/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceStandardTest.java b/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceStandardTest.java index b757effd354e..a17786a5627d 100644 --- a/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceStandardTest.java +++ b/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceStandardTest.java @@ -50,6 +50,10 @@ // todo (cgardens) - Sanity check that when configured for CDC that postgres performs like any other // incremental source. As we have more sources support CDC we will find a more reusable way of doing // this, but for now this is a solid sanity check. +/** + * None of the tests in this class use the cdc path (run the tests and search for `using CDC: false` + * in logs). This is exactly the same as {@link PostgresSourceStandardTest}. + */ public class CdcPostgresSourceStandardTest extends StandardSourceTest { private static final String SLOT_NAME_BASE = "debezium_slot"; @@ -66,6 +70,11 @@ protected void setup(TestDestinationEnv testEnv) throws Exception { .withCommand("postgres -c config_file=/etc/postgresql/postgresql.conf"); container.start(); + /** + * The publication is not being set as part of the config, and because of that + * {@link io.airbyte.integrations.source.postgres.PostgresSource#isCdc(JsonNode)} returns false; as + * a result, no test in this class runs through the cdc path. + */ config = Jsons.jsonNode(ImmutableMap.builder() .put("host", container.getHost()) .put("port", container.getFirstMappedPort()) @@ -84,7 +93,10 @@ protected void setup(TestDestinationEnv testEnv) throws Exception { config.get("database").asText()), "org.postgresql.Driver", SQLDialect.POSTGRES); - + /** + * cdc expects the INCREMENTAL tables to contain a primary key; see + * {@link io.airbyte.integrations.source.postgres.PostgresSource#removeIncrementalWithoutPk(AirbyteStream)} + */ database.query(ctx -> { ctx.execute("SELECT pg_create_logical_replication_slot('" + SLOT_NAME_BASE + "', 'pgoutput');"); ctx.execute("CREATE TABLE id_and_name(id INTEGER, name VARCHAR(200));"); @@ -119,6 +131,14 @@ protected JsonNode getConfig() { @Override protected ConfiguredAirbyteCatalog getConfiguredCatalog() { + /** + * This catalog config is incorrect for CDC replication. 
We specify + * withCursorField(Lists.newArrayList("id")), but with CDC customers can't/shouldn't be able to + * specify a cursor field for INCREMENTAL tables. Take a look at + * {@link io.airbyte.integrations.source.postgres.PostgresSource#setIncrementalToSourceDefined(AirbyteStream)}. + * We should also specify the primary keys for INCREMENTAL tables; see + * {@link io.airbyte.integrations.source.postgres.PostgresSource#removeIncrementalWithoutPk(AirbyteStream)} + */ return new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList( new ConfiguredAirbyteStream() .withSyncMode(SyncMode.INCREMENTAL) diff --git a/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf b/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf new file mode 100644 index 000000000000..3aff81384e4b --- /dev/null +++ b/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf @@ -0,0 +1,783 @@ +# ----------------------------- +# PostgreSQL configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# (The "=" is optional.) Whitespace may be used. Comments are introduced with +# "#" anywhere on a line. The complete list of parameter names and allowed +# values can be found in the PostgreSQL documentation. +# +# The commented-out settings shown in this file represent the default values. +# Re-commenting a setting is NOT sufficient to revert it to the default value; +# you need to reload the server. +# +# This file is read on server startup and when the server receives a SIGHUP +# signal. If you edit the file on a running system, you have to SIGHUP the +# server for the changes to take effect, run "pg_ctl reload", or execute +# "SELECT pg_reload_conf()". Some parameters, which are marked below, +# require a server shutdown and restart to take effect. +# +# Any parameter can also be given as a command-line option to the server, e.g., +# "postgres -c log_connections=on". Some parameters can be changed at run time +# with the "SET" SQL command. +# +# Memory units: kB = kilobytes Time units: ms = milliseconds +# MB = megabytes s = seconds +# GB = gigabytes min = minutes +# TB = terabytes h = hours +# d = days + + +#------------------------------------------------------------------------------ +# FILE LOCATIONS +#------------------------------------------------------------------------------ + +# The default values of these variables are driven from the -D command-line +# option or PGDATA environment variable, represented here as ConfigDir. + +#data_directory = 'ConfigDir' # use data in another directory + # (change requires restart) +#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file + # (change requires restart) +#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file + # (change requires restart) + +# If external_pid_file is not explicitly set, no extra PID file is written. 
+#external_pid_file = '' # write an extra PID file + # (change requires restart) + + +#------------------------------------------------------------------------------ +# CONNECTIONS AND AUTHENTICATION +#------------------------------------------------------------------------------ + +# - Connection Settings - + +listen_addresses = '*' + # comma-separated list of addresses; + # defaults to 'localhost'; use '*' for all + # (change requires restart) +#port = 5432 # (change requires restart) +#max_connections = 100 # (change requires restart) +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories + # (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation + # (change requires restart) +#bonjour = off # advertise server via Bonjour + # (change requires restart) +#bonjour_name = '' # defaults to the computer name + # (change requires restart) + +# - TCP settings - +# see "man tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default + +# - Authentication - + +#authentication_timeout = 1min # 1s-600s +#password_encryption = md5 # md5 or scram-sha-256 +#db_user_namespace = off + +# GSSAPI using Kerberos +#krb_server_keyfile = '' +#krb_caseins_users = off + +# - SSL - + +#ssl = off +#ssl_ca_file = '' +#ssl_cert_file = 'server.crt' +#ssl_crl_file = '' +#ssl_key_file = 'server.key' +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers +#ssl_prefer_server_ciphers = on +#ssl_ecdh_curve = 'prime256v1' +#ssl_min_protocol_version = 'TLSv1.2' +#ssl_max_protocol_version = '' +#ssl_dh_params_file = '' +#ssl_passphrase_command = '' +#ssl_passphrase_command_supports_reload = off + + +#------------------------------------------------------------------------------ +# RESOURCE USAGE (except WAL) +#------------------------------------------------------------------------------ + +# - Memory - + +#shared_buffers = 32MB # min 128kB + # (change requires restart) +#huge_pages = try # on, off, or try + # (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature + # (change requires restart) +# Caution: it is not advisable to set max_prepared_transactions nonzero unless +# you actively intend to use prepared transactions. 
+#work_mem = 4MB # min 64kB +#hash_mem_multiplier = 1.0 # 1-1000.0 multiplier on hash table work_mem +#maintenance_work_mem = 64MB # min 1MB +#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem +#logical_decoding_work_mem = 64MB # min 64kB +#max_stack_depth = 2MB # min 100kB +#shared_memory_type = mmap # the default is the first option + # supported by the operating system: + # mmap + # sysv + # windows + # (change requires restart) +#dynamic_shared_memory_type = posix # the default is the first option + # supported by the operating system: + # posix + # sysv + # windows + # mmap + # (change requires restart) + +# - Disk - + +#temp_file_limit = -1 # limits per-process temp file space + # in kilobytes, or -1 for no limit + +# - Kernel Resources - + +#max_files_per_process = 1000 # min 64 + # (change requires restart) + +# - Cost-Based Vacuum Delay - + +#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 10 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits + +# - Background Writer - + +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables +#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round +#bgwriter_flush_after = 0 # measured in pages, 0 disables + +# - Asynchronous Behavior - + +#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +#maintenance_io_concurrency = 10 # 1-1000; 0 disables prefetching +#max_worker_processes = 8 # (change requires restart) +#max_parallel_maintenance_workers = 2 # taken from max_parallel_workers +#max_parallel_workers_per_gather = 2 # taken from max_parallel_workers +#parallel_leader_participation = on +#max_parallel_workers = 8 # maximum number of max_worker_processes that + # can be used in parallel operations +#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate + # (change requires restart) +#backend_flush_after = 0 # measured in pages, 0 disables + + +#------------------------------------------------------------------------------ +# WRITE-AHEAD LOG +#------------------------------------------------------------------------------ + +# - Settings - + +#wal_level = replica # minimal, replica, or logical + # (change requires restart) +#fsync = on # flush data to disk for crash safety + # (turning this off can cause + # unrecoverable data corruption) +#synchronous_commit = on # synchronization level; + # off, local, remote_write, remote_apply, or on +#wal_sync_method = fsync # the default is the first option + # supported by the operating system: + # open_datasync + # fdatasync (default on Linux) + # fsync + # fsync_writethrough + # open_sync +#full_page_writes = on # recover from partial page writes +#wal_compression = off # enable compression of full-page writes +#wal_log_hints = off # also do full page writes of non-critical updates + # (change requires restart) +#wal_init_zero = on # zero-fill new WAL files +#wal_recycle = on # recycle WAL files +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers + # (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_writer_flush_after = 1MB # measured in pages, 0 disables +#wal_skip_threshold = 2MB + +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 1-1000 + +# - Checkpoints - + +#checkpoint_timeout = 5min # range 30s-1d +#max_wal_size = 1GB +#min_wal_size = 80MB 
+#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_flush_after = 0 # measured in pages, 0 disables +#checkpoint_warning = 30s # 0 disables + +# - Archiving - + +#archive_mode = off # enables archiving; off, on, or always + # (change requires restart) +#archive_command = '' # command to use to archive a logfile segment + # placeholders: %p = path of file to archive + # %f = file name only + # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_timeout = 0 # force a logfile segment switch after this + # number of seconds; 0 disables + +# - Archive Recovery - + +# These are only used in recovery mode. + +#restore_command = '' # command to use to restore an archived logfile segment + # placeholders: %p = path of file to restore + # %f = file name only + # e.g. 'cp /mnt/server/archivedir/%f %p' + # (change requires restart) +#archive_cleanup_command = '' # command to execute at every restartpoint +#recovery_end_command = '' # command to execute at completion of recovery + +# - Recovery Target - + +# Set these only when performing a targeted recovery. + +#recovery_target = '' # 'immediate' to end recovery as soon as a + # consistent state is reached + # (change requires restart) +#recovery_target_name = '' # the named restore point to which recovery will proceed + # (change requires restart) +#recovery_target_time = '' # the time stamp up to which recovery will proceed + # (change requires restart) +#recovery_target_xid = '' # the transaction ID up to which recovery will proceed + # (change requires restart) +#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed + # (change requires restart) +#recovery_target_inclusive = on # Specifies whether to stop: + # just after the specified recovery target (on) + # just before the recovery target (off) + # (change requires restart) +#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID + # (change requires restart) +#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' + # (change requires restart) + + +#------------------------------------------------------------------------------ +# REPLICATION +#------------------------------------------------------------------------------ + +# - Sending Servers - + +# Set these on the master and on any standby that will send replication data. + +#max_wal_senders = 10 # max number of walsender processes + # (change requires restart) +#wal_keep_size = 0 # in megabytes; 0 disables +#max_slot_wal_keep_size = -1 # in megabytes; -1 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables + +#max_replication_slots = 10 # max number of replication slots + # (change requires restart) +#track_commit_timestamp = off # collect timestamp of transaction commit + # (change requires restart) + +# - Master Server - + +# These settings are ignored on a standby server. + +#synchronous_standby_names = '' # standby servers that provide sync rep + # method to choose sync standbys, number of sync standbys, + # and comma-separated list of application_name + # from standby(s); '*' = all +#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed + +# - Standby Servers - + +# These settings are ignored on a master server. 
+ +#primary_conninfo = '' # connection string to sending server +#primary_slot_name = '' # replication slot on sending server +#promote_trigger_file = '' # file name whose presence ends recovery +#hot_standby = on # "off" disallows queries during recovery + # (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries + # when reading WAL from archive; + # -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries + # when reading streaming WAL; + # -1 allows indefinite delay +#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name + # is not set +#wal_receiver_status_interval = 10s # send replies at least this often + # 0 disables +#hot_standby_feedback = off # send info from standby to prevent + # query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for + # communication from master + # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt +#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery + +# - Subscribers - + +# These settings are ignored on a publisher. + +#max_logical_replication_workers = 4 # taken from max_worker_processes + # (change requires restart) +#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers + + +#------------------------------------------------------------------------------ +# QUERY TUNING +#------------------------------------------------------------------------------ + +# - Planner Method Configuration - + +#enable_bitmapscan = on +#enable_hashagg = on +#enable_hashjoin = on +#enable_indexscan = on +#enable_indexonlyscan = on +#enable_material = on +#enable_mergejoin = on +#enable_nestloop = on +#enable_parallel_append = on +#enable_seqscan = on +#enable_sort = on +#enable_incremental_sort = on +#enable_tidscan = on +#enable_partitionwise_join = off +#enable_partitionwise_aggregate = off +#enable_parallel_hash = on +#enable_partition_pruning = on + +# - Planner Cost Constants - + +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#parallel_tuple_cost = 0.1 # same scale as above +#parallel_setup_cost = 1000.0 # same scale as above + +#jit_above_cost = 100000 # perform JIT compilation if available + # and query more expensive than this; + # -1 disables +#jit_inline_above_cost = 500000 # inline small functions if query is + # more expensive than this; -1 disables +#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if + # query is more expensive than this; + # -1 disables + +#min_parallel_table_scan_size = 8MB +#min_parallel_index_scan_size = 512kB +#effective_cache_size = 4GB + +# - Genetic Query Optimizer - + +#geqo = on +#geqo_threshold = 12 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 + +# - Other Planner Options - + +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition +#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#from_collapse_limit = 8 +#join_collapse_limit = 8 # 1 disables collapsing of explicit + # JOIN clauses +#force_parallel_mode = off +#jit = on # 
allow JIT compilation +#plan_cache_mode = auto # auto, force_generic_plan or + # force_custom_plan + + +#------------------------------------------------------------------------------ +# REPORTING AND LOGGING +#------------------------------------------------------------------------------ + +# - Where to Log - + +#log_destination = 'stderr' # Valid values are combinations of + # stderr, csvlog, syslog, and eventlog, + # depending on platform. csvlog + # requires logging_collector to be on. + +# This is used when logging to stderr: +#logging_collector = off # Enable capturing of stderr and csvlog + # into log files. Required to be on for + # csvlogs. + # (change requires restart) + +# These are only used if logging_collector is on: +#log_directory = 'log' # directory where log files are written, + # can be absolute or relative to PGDATA +#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, + # can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, + # begin with 0 to use octal notation +#log_truncate_on_rotation = off # If on, an existing log file with the + # same name as the new log file will be + # truncated rather than appended to. + # But such truncation only occurs on + # time-driven rotation, not on restarts + # or size-driven rotation. Default is + # off, meaning append to existing files + # in all cases. +#log_rotation_age = 1d # Automatic rotation of logfiles will + # happen after that time. 0 disables. +#log_rotation_size = 10MB # Automatic rotation of logfiles will + # happen after that much log output. + # 0 disables. + +# These are relevant when logging to syslog: +#syslog_facility = 'LOCAL0' +#syslog_ident = 'postgres' +#syslog_sequence_numbers = on +#syslog_split_messages = on + +# This is only relevant when logging to eventlog (win32): +# (change requires restart) +#event_source = 'PostgreSQL' + +# - When to Log - + +#log_min_messages = warning # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic + +#log_min_error_statement = error # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements + # and their durations, > 0 logs only + # statements running at least this number + # of milliseconds + +#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements + # and their durations, > 0 logs only a sample of + # statements running at least this number + # of milliseconds; + # sample fraction is determined by log_statement_sample_rate + +#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding + # log_min_duration_sample to be logged; + # 1.0 logs all such statements, 0.0 never logs + + +#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements + # are logged regardless of their duration; 1.0 logs all + # statements from all transactions, 0.0 never logs + +# - What to Log - + +#debug_print_parse = off +#debug_print_rewritten = off +#debug_print_plan = off +#debug_pretty_print = on +#log_checkpoints = off +#log_connections = off +#log_disconnections = off +#log_duration = off +#log_error_verbosity = default # terse, default, or verbose messages +#log_hostname = off +#log_line_prefix = '%m [%p] ' # special values: + # %a = application name + # %u = user name + # %d = 
database name + # %r = remote host and port + # %h = remote host + # %b = backend type + # %p = process ID + # %t = timestamp without milliseconds + # %m = timestamp with milliseconds + # %n = timestamp with milliseconds (as a Unix epoch) + # %i = command tag + # %e = SQL state + # %c = session ID + # %l = session line number + # %s = session start timestamp + # %v = virtual transaction ID + # %x = transaction ID (0 if none) + # %q = stop here in non-session + # processes + # %% = '%' + # e.g. '<%u%%%d> ' +#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_parameter_max_length = -1 # when logging statements, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_parameter_max_length_on_error = 0 # when logging an error, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_statement = 'none' # none, ddl, mod, all +#log_replication_commands = off +#log_temp_files = -1 # log temporary files equal or larger + # than the specified size in kilobytes; + # -1 disables, 0 logs all temp files +#log_timezone = 'GMT' + +#------------------------------------------------------------------------------ +# PROCESS TITLE +#------------------------------------------------------------------------------ + +#cluster_name = '' # added to process titles if nonempty + # (change requires restart) +#update_process_title = on + + +#------------------------------------------------------------------------------ +# STATISTICS +#------------------------------------------------------------------------------ + +# - Query and Index Statistics Collector - + +#track_activities = on +#track_counts = on +#track_io_timing = off +#track_functions = none # none, pl, all +#track_activity_query_size = 1024 # (change requires restart) +#stats_temp_directory = 'pg_stat_tmp' + + +# - Monitoring - + +#log_parser_stats = off +#log_planner_stats = off +#log_executor_stats = off +#log_statement_stats = off + + +#------------------------------------------------------------------------------ +# AUTOVACUUM +#------------------------------------------------------------------------------ + +#autovacuum = on # Enable autovacuum subprocess? 'on' + # requires track_counts to also be on. +#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and + # their durations, > 0 logs only + # actions running at least this number + # of milliseconds. 
+#autovacuum_max_workers = 3 # max number of autovacuum subprocesses + # (change requires restart) +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before + # vacuum +#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts + # before vacuum; -1 disables insert + # vacuums +#autovacuum_analyze_threshold = 50 # min number of row updates before + # analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of inserts over table + # size before insert vacuum +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum + # (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age + # before forced vacuum + # (change requires restart) +#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for + # autovacuum, in milliseconds; + # -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for + # autovacuum, -1 means use + # vacuum_cost_limit + + +#------------------------------------------------------------------------------ +# CLIENT CONNECTION DEFAULTS +#------------------------------------------------------------------------------ + +# - Statement Behavior - + +#client_min_messages = notice # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # log + # notice + # warning + # error +#search_path = '"$user", public' # schema names +#row_security = on +#default_tablespace = '' # a tablespace name, '' uses the default +#temp_tablespaces = '' # a list of tablespace names, '' uses + # only default tablespace +#default_table_access_method = 'heap' +#check_function_bodies = on +#default_transaction_isolation = 'read committed' +#default_transaction_read_only = off +#default_transaction_deferrable = off +#session_replication_role = 'origin' +#statement_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled +#vacuum_freeze_min_age = 50000000 +#vacuum_freeze_table_age = 150000000 +#vacuum_multixact_freeze_min_age = 5000000 +#vacuum_multixact_freeze_table_age = 150000000 +#vacuum_cleanup_index_scale_factor = 0.1 # fraction of total number of tuples + # before index cleanup, 0 always performs + # index cleanup +#bytea_output = 'hex' # hex, escape +#xmlbinary = 'base64' +#xmloption = 'content' +#gin_fuzzy_search_limit = 0 +#gin_pending_list_limit = 4MB + +# - Locale and Formatting - + +#datestyle = 'iso, mdy' +#intervalstyle = 'postgres' +#timezone = 'GMT' +#timezone_abbreviations = 'Default' # Select the set of available time zone + # abbreviations. Currently, there are + # Default + # Australia (historical usage) + # India + # You can create your own file in + # share/timezonesets/. +#extra_float_digits = 1 # min -15, max 3; any value >0 actually + # selects precise output mode +#client_encoding = sql_ascii # actually, defaults to database + # encoding + +# These settings are initialized by initdb, but they can be changed. 
+#lc_messages = 'C' # locale for system error message + # strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting + +# default configuration for text search +#default_text_search_config = 'pg_catalog.simple' + +# - Shared Library Preloading - + +#shared_preload_libraries = '' # (change requires restart) +#local_preload_libraries = '' +#session_preload_libraries = '' +#jit_provider = 'llvmjit' # JIT library to use + +# - Other Defaults - + +#dynamic_library_path = '$libdir' +#extension_destdir = '' # prepend path when loading extensions + # and shared objects (added by Debian) + + +#------------------------------------------------------------------------------ +# LOCK MANAGEMENT +#------------------------------------------------------------------------------ + +#deadlock_timeout = 1s +#max_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_relation = -2 # negative values mean + # (max_pred_locks_per_transaction + # / -max_pred_locks_per_relation) - 1 +#max_pred_locks_per_page = 2 # min 0 + + +#------------------------------------------------------------------------------ +# VERSION AND PLATFORM COMPATIBILITY +#------------------------------------------------------------------------------ + +# - Previous PostgreSQL Versions - + +#array_nulls = on +#backslash_quote = safe_encoding # on, off, or safe_encoding +#escape_string_warning = on +#lo_compat_privileges = off +#operator_precedence_warning = off +#quote_all_identifiers = off +#standard_conforming_strings = on +#synchronize_seqscans = on + +# - Other Platforms and Clients - + +#transform_null_equals = off + + +#------------------------------------------------------------------------------ +# ERROR HANDLING +#------------------------------------------------------------------------------ + +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? +#data_sync_retry = off # retry or panic on failure to fsync + # data? + # (change requires restart) + + +#------------------------------------------------------------------------------ +# CONFIG FILE INCLUDES +#------------------------------------------------------------------------------ + +# These options allow settings to be loaded from files other than the +# default postgresql.conf. Note that these are directives, not variable +# assignments, so they can usefully be given more than once. + +#include_dir = '...' # include files ending in '.conf' from + # a directory, e.g., 'conf.d' +#include_if_exists = '...' # include file only if it exists +#include = '...' 
# include file + + +#------------------------------------------------------------------------------ +# CUSTOMIZED OPTIONS +#------------------------------------------------------------------------------ +wal_level = logical +max_wal_senders = 30 +max_replication_slots = 30 + From f709724ade42dd1e99bd53f0b452fe53fff8122b Mon Sep 17 00:00:00 2001 From: subodh Date: Thu, 20 May 2021 21:52:44 +0530 Subject: [PATCH 02/13] add target file and position --- .../connectors/source-mysql/build.gradle | 2 - .../source/mysql/DebeziumRecordIterator.java | 76 ++++++------------- .../source/mysql/MySqlSource.java | 5 +- .../source/mysql/TargetFilePosition.java | 75 ++++++++++++++++++ 4 files changed, 101 insertions(+), 57 deletions(-) create mode 100644 airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/TargetFilePosition.java diff --git a/airbyte-integrations/connectors/source-mysql/build.gradle b/airbyte-integrations/connectors/source-mysql/build.gradle index 546b471995d4..6e582da14ef5 100644 --- a/airbyte-integrations/connectors/source-mysql/build.gradle +++ b/airbyte-integrations/connectors/source-mysql/build.gradle @@ -21,8 +21,6 @@ dependencies { implementation 'io.debezium:debezium-connector-mysql:1.4.2.Final' testImplementation testFixtures(project(':airbyte-integrations:connectors:source-jdbc')) - testImplementation project(":airbyte-json-validation") - testImplementation project(':airbyte-test-utils') testImplementation 'org.apache.commons:commons-lang3:3.11' testImplementation 'org.testcontainers:mysql:1.15.1' diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java index 6e19dca1a933..701fe3da3733 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java @@ -29,16 +29,11 @@ import io.airbyte.commons.json.Jsons; import io.airbyte.commons.lang.MoreBooleans; import io.airbyte.commons.util.AutoCloseableIterator; -import io.airbyte.db.PgLsn; import io.debezium.engine.ChangeEvent; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; import java.util.Optional; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,16 +57,16 @@ public class DebeziumRecordIterator extends AbstractIterator> queue; - // private final PgLsn targetLsn; + private final Optional targetFilePosition; private final Supplier publisherStatusSupplier; private final VoidCallable requestClose; public DebeziumRecordIterator(LinkedBlockingQueue> queue, - // PgLsn targetLsn, + Optional targetFilePosition, Supplier publisherStatusSupplier, VoidCallable requestClose) { this.queue = queue; - // this.targetLsn = targetLsn; + this.targetFilePosition = targetFilePosition; this.publisherStatusSupplier = publisherStatusSupplier; this.requestClose = requestClose; } @@ -97,10 +92,10 @@ protected ChangeEvent computeNext() { continue; } - // if the last record matches the target lsn, it is time to tell the producer to shutdown. 
- // if (shouldSignalClose(next)) { - // requestClose(); - // } + // if the last record matches the target file position, it is time to tell the producer to shutdown. + if (shouldSignalClose(next)) { + requestClose(); + } return next; } @@ -112,51 +107,24 @@ public void close() throws Exception { requestClose.call(); } - // private boolean shouldSignalClose(ChangeEvent event) { - - // for mysql - // SnapshotMetadata.valueOf(Jsons.deserialize(next.value()).get("source").get("snapshot").asText().toUpperCase()) - // - // Jsons.deserialize(next.value()).get("source").get("file") - // - // Jsons.deserialize(next.value()).get("source").get("pos") - - // final PgLsn eventLsn = extractLsn(event); - // - // if (targetLsn.compareTo(eventLsn) > 0) { - // return false; - // } else { - // final SnapshotMetadata snapshotMetadata = getSnapshotMetadata(event); - // // if not snapshot or is snapshot but last record in snapshot. - // return SnapshotMetadata.TRUE != snapshotMetadata; - // } - // } - - private SnapshotMetadata getSnapshotMetadata(ChangeEvent event) { - try { - final Method sourceRecordMethod = event.getClass().getMethod("sourceRecord"); - sourceRecordMethod.setAccessible(true); - final SourceRecord sourceRecord = (SourceRecord) sourceRecordMethod.invoke(event); - final String snapshot = ((Struct) sourceRecord.value()).getStruct("source").getString("snapshot"); - - if (snapshot == null) { - return null; + private boolean shouldSignalClose(ChangeEvent event) { + if (targetFilePosition.isPresent()) { + String file = Jsons.deserialize(event.value()).get("source").get("file").asText(); + int position = Jsons.deserialize(event.value()).get("source").get("pos").asInt(); + if (file.equals(targetFilePosition.get().fileName)) { + if (targetFilePosition.get().position >= position) { + return false; + } else { + // if not snapshot or is snapshot but last record in snapshot. + return SnapshotMetadata.TRUE != SnapshotMetadata.valueOf( + Jsons.deserialize(event.value()).get("source").get("snapshot").asText() + .toUpperCase()); + + } } - - // the snapshot field is an enum of true, false, and last. - return SnapshotMetadata.valueOf(snapshot.toUpperCase()); - } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { - throw new RuntimeException(e); } - } + return false; - private PgLsn extractLsn(ChangeEvent event) { - return Optional.ofNullable(event.value()) - .flatMap(value -> Optional.ofNullable(Jsons.deserialize(value).get("source"))) - .flatMap(source -> Optional.ofNullable(source.get("lsn").asText())) - .map(Long::parseLong) - .map(PgLsn::fromLong) - .orElseThrow(() -> new IllegalStateException("Could not find LSN")); } private void requestClose() { diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 2e6d07d37c28..3afb2c8efeaa 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -181,10 +181,13 @@ public List> getIncrementalIterators(JsonN offsetManager, dbHistoryStorageManager); publisher.start(queue); + Optional targetFilePosition = TargetFilePosition + .targetFilePosition(database); + // handle state machine around pub/sub logic. 
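+    // When a target position was captured above, DebeziumRecordIterator uses it as a high-water mark
+    // for this sync: once an event's binlog coordinates move past targetFilePosition it asks the
+    // publisher to close, so a single sync does not tail the binlog indefinitely.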
final AutoCloseableIterator> eventIterator = new DebeziumRecordIterator( queue, - // targetLsn, + targetFilePosition, publisher::hasClosed, publisher::close); diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/TargetFilePosition.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/TargetFilePosition.java new file mode 100644 index 000000000000..8e258ca432fe --- /dev/null +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/TargetFilePosition.java @@ -0,0 +1,75 @@ +/* + * MIT License + * + * Copyright (c) 2020 Airbyte + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package io.airbyte.integrations.source.mysql; + +import io.airbyte.db.jdbc.JdbcDatabase; +import java.sql.SQLException; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TargetFilePosition { + + private static final Logger LOGGER = LoggerFactory.getLogger(TargetFilePosition.class); + public final String fileName; + public final Integer position; + + public TargetFilePosition(String fileName, Integer position) { + this.fileName = fileName; + this.position = position; + } + + @Override + public String toString() { + return "FileName: " + fileName + ", Position : " + position; + } + + public static Optional targetFilePosition(JdbcDatabase database) { + try { + List masterStatus = database.resultSetQuery( + connection -> connection.createStatement().executeQuery("SHOW MASTER STATUS"), + resultSet -> { + String file = resultSet.getString("File"); + int position = resultSet.getInt("Position"); + if (file == null || position == 0) { + return new TargetFilePosition(null, null); + } + return new TargetFilePosition(file, position); + }).collect(Collectors.toList()); + TargetFilePosition targetFilePosition = masterStatus.get(0); + LOGGER.info("Target File position : " + targetFilePosition); + if (targetFilePosition.fileName == null || targetFilePosition == null) { + return Optional.empty(); + } + return Optional.of(targetFilePosition); + } catch (SQLException e) { + throw new RuntimeException(e); + } + + } + +} From 3dc9aaec21d4d9cbe5ea8d3e18c8a519d8c3b38b Mon Sep 17 00:00:00 2001 From: subodh Date: Fri, 21 May 2021 16:55:59 +0530 Subject: [PATCH 03/13] dont want to add file in this PR --- .../resources/postgresql.conf | 783 
------------------ 1 file changed, 783 deletions(-) delete mode 100644 airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf diff --git a/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf b/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf deleted file mode 100644 index 3aff81384e4b..000000000000 --- a/airbyte-integrations/connectors/source-postgres/src/test-integration/resources/postgresql.conf +++ /dev/null @@ -1,783 +0,0 @@ -# ----------------------------- -# PostgreSQL configuration file -# ----------------------------- -# -# This file consists of lines of the form: -# -# name = value -# -# (The "=" is optional.) Whitespace may be used. Comments are introduced with -# "#" anywhere on a line. The complete list of parameter names and allowed -# values can be found in the PostgreSQL documentation. -# -# The commented-out settings shown in this file represent the default values. -# Re-commenting a setting is NOT sufficient to revert it to the default value; -# you need to reload the server. -# -# This file is read on server startup and when the server receives a SIGHUP -# signal. If you edit the file on a running system, you have to SIGHUP the -# server for the changes to take effect, run "pg_ctl reload", or execute -# "SELECT pg_reload_conf()". Some parameters, which are marked below, -# require a server shutdown and restart to take effect. -# -# Any parameter can also be given as a command-line option to the server, e.g., -# "postgres -c log_connections=on". Some parameters can be changed at run time -# with the "SET" SQL command. -# -# Memory units: kB = kilobytes Time units: ms = milliseconds -# MB = megabytes s = seconds -# GB = gigabytes min = minutes -# TB = terabytes h = hours -# d = days - - -#------------------------------------------------------------------------------ -# FILE LOCATIONS -#------------------------------------------------------------------------------ - -# The default values of these variables are driven from the -D command-line -# option or PGDATA environment variable, represented here as ConfigDir. - -#data_directory = 'ConfigDir' # use data in another directory - # (change requires restart) -#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file - # (change requires restart) -#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file - # (change requires restart) - -# If external_pid_file is not explicitly set, no extra PID file is written. 
-#external_pid_file = '' # write an extra PID file - # (change requires restart) - - -#------------------------------------------------------------------------------ -# CONNECTIONS AND AUTHENTICATION -#------------------------------------------------------------------------------ - -# - Connection Settings - - -listen_addresses = '*' - # comma-separated list of addresses; - # defaults to 'localhost'; use '*' for all - # (change requires restart) -#port = 5432 # (change requires restart) -#max_connections = 100 # (change requires restart) -#superuser_reserved_connections = 3 # (change requires restart) -#unix_socket_directories = '/tmp' # comma-separated list of directories - # (change requires restart) -#unix_socket_group = '' # (change requires restart) -#unix_socket_permissions = 0777 # begin with 0 to use octal notation - # (change requires restart) -#bonjour = off # advertise server via Bonjour - # (change requires restart) -#bonjour_name = '' # defaults to the computer name - # (change requires restart) - -# - TCP settings - -# see "man tcp" for details - -#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; - # 0 selects the system default -#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; - # 0 selects the system default -#tcp_keepalives_count = 0 # TCP_KEEPCNT; - # 0 selects the system default -#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; - # 0 selects the system default - -# - Authentication - - -#authentication_timeout = 1min # 1s-600s -#password_encryption = md5 # md5 or scram-sha-256 -#db_user_namespace = off - -# GSSAPI using Kerberos -#krb_server_keyfile = '' -#krb_caseins_users = off - -# - SSL - - -#ssl = off -#ssl_ca_file = '' -#ssl_cert_file = 'server.crt' -#ssl_crl_file = '' -#ssl_key_file = 'server.key' -#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers -#ssl_prefer_server_ciphers = on -#ssl_ecdh_curve = 'prime256v1' -#ssl_min_protocol_version = 'TLSv1.2' -#ssl_max_protocol_version = '' -#ssl_dh_params_file = '' -#ssl_passphrase_command = '' -#ssl_passphrase_command_supports_reload = off - - -#------------------------------------------------------------------------------ -# RESOURCE USAGE (except WAL) -#------------------------------------------------------------------------------ - -# - Memory - - -#shared_buffers = 32MB # min 128kB - # (change requires restart) -#huge_pages = try # on, off, or try - # (change requires restart) -#temp_buffers = 8MB # min 800kB -#max_prepared_transactions = 0 # zero disables the feature - # (change requires restart) -# Caution: it is not advisable to set max_prepared_transactions nonzero unless -# you actively intend to use prepared transactions. 
-#work_mem = 4MB # min 64kB -#hash_mem_multiplier = 1.0 # 1-1000.0 multiplier on hash table work_mem -#maintenance_work_mem = 64MB # min 1MB -#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem -#logical_decoding_work_mem = 64MB # min 64kB -#max_stack_depth = 2MB # min 100kB -#shared_memory_type = mmap # the default is the first option - # supported by the operating system: - # mmap - # sysv - # windows - # (change requires restart) -#dynamic_shared_memory_type = posix # the default is the first option - # supported by the operating system: - # posix - # sysv - # windows - # mmap - # (change requires restart) - -# - Disk - - -#temp_file_limit = -1 # limits per-process temp file space - # in kilobytes, or -1 for no limit - -# - Kernel Resources - - -#max_files_per_process = 1000 # min 64 - # (change requires restart) - -# - Cost-Based Vacuum Delay - - -#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) -#vacuum_cost_page_hit = 1 # 0-10000 credits -#vacuum_cost_page_miss = 10 # 0-10000 credits -#vacuum_cost_page_dirty = 20 # 0-10000 credits -#vacuum_cost_limit = 200 # 1-10000 credits - -# - Background Writer - - -#bgwriter_delay = 200ms # 10-10000ms between rounds -#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables -#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round -#bgwriter_flush_after = 0 # measured in pages, 0 disables - -# - Asynchronous Behavior - - -#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching -#maintenance_io_concurrency = 10 # 1-1000; 0 disables prefetching -#max_worker_processes = 8 # (change requires restart) -#max_parallel_maintenance_workers = 2 # taken from max_parallel_workers -#max_parallel_workers_per_gather = 2 # taken from max_parallel_workers -#parallel_leader_participation = on -#max_parallel_workers = 8 # maximum number of max_worker_processes that - # can be used in parallel operations -#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate - # (change requires restart) -#backend_flush_after = 0 # measured in pages, 0 disables - - -#------------------------------------------------------------------------------ -# WRITE-AHEAD LOG -#------------------------------------------------------------------------------ - -# - Settings - - -#wal_level = replica # minimal, replica, or logical - # (change requires restart) -#fsync = on # flush data to disk for crash safety - # (turning this off can cause - # unrecoverable data corruption) -#synchronous_commit = on # synchronization level; - # off, local, remote_write, remote_apply, or on -#wal_sync_method = fsync # the default is the first option - # supported by the operating system: - # open_datasync - # fdatasync (default on Linux) - # fsync - # fsync_writethrough - # open_sync -#full_page_writes = on # recover from partial page writes -#wal_compression = off # enable compression of full-page writes -#wal_log_hints = off # also do full page writes of non-critical updates - # (change requires restart) -#wal_init_zero = on # zero-fill new WAL files -#wal_recycle = on # recycle WAL files -#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers - # (change requires restart) -#wal_writer_delay = 200ms # 1-10000 milliseconds -#wal_writer_flush_after = 1MB # measured in pages, 0 disables -#wal_skip_threshold = 2MB - -#commit_delay = 0 # range 0-100000, in microseconds -#commit_siblings = 5 # range 1-1000 - -# - Checkpoints - - -#checkpoint_timeout = 5min # range 30s-1d -#max_wal_size = 1GB -#min_wal_size = 80MB 
-#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 -#checkpoint_flush_after = 0 # measured in pages, 0 disables -#checkpoint_warning = 30s # 0 disables - -# - Archiving - - -#archive_mode = off # enables archiving; off, on, or always - # (change requires restart) -#archive_command = '' # command to use to archive a logfile segment - # placeholders: %p = path of file to archive - # %f = file name only - # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' -#archive_timeout = 0 # force a logfile segment switch after this - # number of seconds; 0 disables - -# - Archive Recovery - - -# These are only used in recovery mode. - -#restore_command = '' # command to use to restore an archived logfile segment - # placeholders: %p = path of file to restore - # %f = file name only - # e.g. 'cp /mnt/server/archivedir/%f %p' - # (change requires restart) -#archive_cleanup_command = '' # command to execute at every restartpoint -#recovery_end_command = '' # command to execute at completion of recovery - -# - Recovery Target - - -# Set these only when performing a targeted recovery. - -#recovery_target = '' # 'immediate' to end recovery as soon as a - # consistent state is reached - # (change requires restart) -#recovery_target_name = '' # the named restore point to which recovery will proceed - # (change requires restart) -#recovery_target_time = '' # the time stamp up to which recovery will proceed - # (change requires restart) -#recovery_target_xid = '' # the transaction ID up to which recovery will proceed - # (change requires restart) -#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed - # (change requires restart) -#recovery_target_inclusive = on # Specifies whether to stop: - # just after the specified recovery target (on) - # just before the recovery target (off) - # (change requires restart) -#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID - # (change requires restart) -#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' - # (change requires restart) - - -#------------------------------------------------------------------------------ -# REPLICATION -#------------------------------------------------------------------------------ - -# - Sending Servers - - -# Set these on the master and on any standby that will send replication data. - -#max_wal_senders = 10 # max number of walsender processes - # (change requires restart) -#wal_keep_size = 0 # in megabytes; 0 disables -#max_slot_wal_keep_size = -1 # in megabytes; -1 disables -#wal_sender_timeout = 60s # in milliseconds; 0 disables - -#max_replication_slots = 10 # max number of replication slots - # (change requires restart) -#track_commit_timestamp = off # collect timestamp of transaction commit - # (change requires restart) - -# - Master Server - - -# These settings are ignored on a standby server. - -#synchronous_standby_names = '' # standby servers that provide sync rep - # method to choose sync standbys, number of sync standbys, - # and comma-separated list of application_name - # from standby(s); '*' = all -#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed - -# - Standby Servers - - -# These settings are ignored on a master server. 
- -#primary_conninfo = '' # connection string to sending server -#primary_slot_name = '' # replication slot on sending server -#promote_trigger_file = '' # file name whose presence ends recovery -#hot_standby = on # "off" disallows queries during recovery - # (change requires restart) -#max_standby_archive_delay = 30s # max delay before canceling queries - # when reading WAL from archive; - # -1 allows indefinite delay -#max_standby_streaming_delay = 30s # max delay before canceling queries - # when reading streaming WAL; - # -1 allows indefinite delay -#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name - # is not set -#wal_receiver_status_interval = 10s # send replies at least this often - # 0 disables -#hot_standby_feedback = off # send info from standby to prevent - # query conflicts -#wal_receiver_timeout = 60s # time that receiver waits for - # communication from master - # in milliseconds; 0 disables -#wal_retrieve_retry_interval = 5s # time to wait before retrying to - # retrieve WAL after a failed attempt -#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery - -# - Subscribers - - -# These settings are ignored on a publisher. - -#max_logical_replication_workers = 4 # taken from max_worker_processes - # (change requires restart) -#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers - - -#------------------------------------------------------------------------------ -# QUERY TUNING -#------------------------------------------------------------------------------ - -# - Planner Method Configuration - - -#enable_bitmapscan = on -#enable_hashagg = on -#enable_hashjoin = on -#enable_indexscan = on -#enable_indexonlyscan = on -#enable_material = on -#enable_mergejoin = on -#enable_nestloop = on -#enable_parallel_append = on -#enable_seqscan = on -#enable_sort = on -#enable_incremental_sort = on -#enable_tidscan = on -#enable_partitionwise_join = off -#enable_partitionwise_aggregate = off -#enable_parallel_hash = on -#enable_partition_pruning = on - -# - Planner Cost Constants - - -#seq_page_cost = 1.0 # measured on an arbitrary scale -#random_page_cost = 4.0 # same scale as above -#cpu_tuple_cost = 0.01 # same scale as above -#cpu_index_tuple_cost = 0.005 # same scale as above -#cpu_operator_cost = 0.0025 # same scale as above -#parallel_tuple_cost = 0.1 # same scale as above -#parallel_setup_cost = 1000.0 # same scale as above - -#jit_above_cost = 100000 # perform JIT compilation if available - # and query more expensive than this; - # -1 disables -#jit_inline_above_cost = 500000 # inline small functions if query is - # more expensive than this; -1 disables -#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if - # query is more expensive than this; - # -1 disables - -#min_parallel_table_scan_size = 8MB -#min_parallel_index_scan_size = 512kB -#effective_cache_size = 4GB - -# - Genetic Query Optimizer - - -#geqo = on -#geqo_threshold = 12 -#geqo_effort = 5 # range 1-10 -#geqo_pool_size = 0 # selects default based on effort -#geqo_generations = 0 # selects default based on effort -#geqo_selection_bias = 2.0 # range 1.5-2.0 -#geqo_seed = 0.0 # range 0.0-1.0 - -# - Other Planner Options - - -#default_statistics_target = 100 # range 1-10000 -#constraint_exclusion = partition # on, off, or partition -#cursor_tuple_fraction = 0.1 # range 0.0-1.0 -#from_collapse_limit = 8 -#join_collapse_limit = 8 # 1 disables collapsing of explicit - # JOIN clauses -#force_parallel_mode = off -#jit = on # 
allow JIT compilation -#plan_cache_mode = auto # auto, force_generic_plan or - # force_custom_plan - - -#------------------------------------------------------------------------------ -# REPORTING AND LOGGING -#------------------------------------------------------------------------------ - -# - Where to Log - - -#log_destination = 'stderr' # Valid values are combinations of - # stderr, csvlog, syslog, and eventlog, - # depending on platform. csvlog - # requires logging_collector to be on. - -# This is used when logging to stderr: -#logging_collector = off # Enable capturing of stderr and csvlog - # into log files. Required to be on for - # csvlogs. - # (change requires restart) - -# These are only used if logging_collector is on: -#log_directory = 'log' # directory where log files are written, - # can be absolute or relative to PGDATA -#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, - # can include strftime() escapes -#log_file_mode = 0600 # creation mode for log files, - # begin with 0 to use octal notation -#log_truncate_on_rotation = off # If on, an existing log file with the - # same name as the new log file will be - # truncated rather than appended to. - # But such truncation only occurs on - # time-driven rotation, not on restarts - # or size-driven rotation. Default is - # off, meaning append to existing files - # in all cases. -#log_rotation_age = 1d # Automatic rotation of logfiles will - # happen after that time. 0 disables. -#log_rotation_size = 10MB # Automatic rotation of logfiles will - # happen after that much log output. - # 0 disables. - -# These are relevant when logging to syslog: -#syslog_facility = 'LOCAL0' -#syslog_ident = 'postgres' -#syslog_sequence_numbers = on -#syslog_split_messages = on - -# This is only relevant when logging to eventlog (win32): -# (change requires restart) -#event_source = 'PostgreSQL' - -# - When to Log - - -#log_min_messages = warning # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # info - # notice - # warning - # error - # log - # fatal - # panic - -#log_min_error_statement = error # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # info - # notice - # warning - # error - # log - # fatal - # panic (effectively off) - -#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements - # and their durations, > 0 logs only - # statements running at least this number - # of milliseconds - -#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements - # and their durations, > 0 logs only a sample of - # statements running at least this number - # of milliseconds; - # sample fraction is determined by log_statement_sample_rate - -#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding - # log_min_duration_sample to be logged; - # 1.0 logs all such statements, 0.0 never logs - - -#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements - # are logged regardless of their duration; 1.0 logs all - # statements from all transactions, 0.0 never logs - -# - What to Log - - -#debug_print_parse = off -#debug_print_rewritten = off -#debug_print_plan = off -#debug_pretty_print = on -#log_checkpoints = off -#log_connections = off -#log_disconnections = off -#log_duration = off -#log_error_verbosity = default # terse, default, or verbose messages -#log_hostname = off -#log_line_prefix = '%m [%p] ' # special values: - # %a = application name - # %u = user name - # %d = 
database name - # %r = remote host and port - # %h = remote host - # %b = backend type - # %p = process ID - # %t = timestamp without milliseconds - # %m = timestamp with milliseconds - # %n = timestamp with milliseconds (as a Unix epoch) - # %i = command tag - # %e = SQL state - # %c = session ID - # %l = session line number - # %s = session start timestamp - # %v = virtual transaction ID - # %x = transaction ID (0 if none) - # %q = stop here in non-session - # processes - # %% = '%' - # e.g. '<%u%%%d> ' -#log_lock_waits = off # log lock waits >= deadlock_timeout -#log_parameter_max_length = -1 # when logging statements, limit logged - # bind-parameter values to N bytes; - # -1 means print in full, 0 disables -#log_parameter_max_length_on_error = 0 # when logging an error, limit logged - # bind-parameter values to N bytes; - # -1 means print in full, 0 disables -#log_statement = 'none' # none, ddl, mod, all -#log_replication_commands = off -#log_temp_files = -1 # log temporary files equal or larger - # than the specified size in kilobytes; - # -1 disables, 0 logs all temp files -#log_timezone = 'GMT' - -#------------------------------------------------------------------------------ -# PROCESS TITLE -#------------------------------------------------------------------------------ - -#cluster_name = '' # added to process titles if nonempty - # (change requires restart) -#update_process_title = on - - -#------------------------------------------------------------------------------ -# STATISTICS -#------------------------------------------------------------------------------ - -# - Query and Index Statistics Collector - - -#track_activities = on -#track_counts = on -#track_io_timing = off -#track_functions = none # none, pl, all -#track_activity_query_size = 1024 # (change requires restart) -#stats_temp_directory = 'pg_stat_tmp' - - -# - Monitoring - - -#log_parser_stats = off -#log_planner_stats = off -#log_executor_stats = off -#log_statement_stats = off - - -#------------------------------------------------------------------------------ -# AUTOVACUUM -#------------------------------------------------------------------------------ - -#autovacuum = on # Enable autovacuum subprocess? 'on' - # requires track_counts to also be on. -#log_autovacuum_min_duration = -1 # -1 disables, 0 logs all actions and - # their durations, > 0 logs only - # actions running at least this number - # of milliseconds. 
-#autovacuum_max_workers = 3 # max number of autovacuum subprocesses - # (change requires restart) -#autovacuum_naptime = 1min # time between autovacuum runs -#autovacuum_vacuum_threshold = 50 # min number of row updates before - # vacuum -#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts - # before vacuum; -1 disables insert - # vacuums -#autovacuum_analyze_threshold = 50 # min number of row updates before - # analyze -#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum -#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of inserts over table - # size before insert vacuum -#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze -#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum - # (change requires restart) -#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age - # before forced vacuum - # (change requires restart) -#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for - # autovacuum, in milliseconds; - # -1 means use vacuum_cost_delay -#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for - # autovacuum, -1 means use - # vacuum_cost_limit - - -#------------------------------------------------------------------------------ -# CLIENT CONNECTION DEFAULTS -#------------------------------------------------------------------------------ - -# - Statement Behavior - - -#client_min_messages = notice # values in order of decreasing detail: - # debug5 - # debug4 - # debug3 - # debug2 - # debug1 - # log - # notice - # warning - # error -#search_path = '"$user", public' # schema names -#row_security = on -#default_tablespace = '' # a tablespace name, '' uses the default -#temp_tablespaces = '' # a list of tablespace names, '' uses - # only default tablespace -#default_table_access_method = 'heap' -#check_function_bodies = on -#default_transaction_isolation = 'read committed' -#default_transaction_read_only = off -#default_transaction_deferrable = off -#session_replication_role = 'origin' -#statement_timeout = 0 # in milliseconds, 0 is disabled -#lock_timeout = 0 # in milliseconds, 0 is disabled -#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled -#vacuum_freeze_min_age = 50000000 -#vacuum_freeze_table_age = 150000000 -#vacuum_multixact_freeze_min_age = 5000000 -#vacuum_multixact_freeze_table_age = 150000000 -#vacuum_cleanup_index_scale_factor = 0.1 # fraction of total number of tuples - # before index cleanup, 0 always performs - # index cleanup -#bytea_output = 'hex' # hex, escape -#xmlbinary = 'base64' -#xmloption = 'content' -#gin_fuzzy_search_limit = 0 -#gin_pending_list_limit = 4MB - -# - Locale and Formatting - - -#datestyle = 'iso, mdy' -#intervalstyle = 'postgres' -#timezone = 'GMT' -#timezone_abbreviations = 'Default' # Select the set of available time zone - # abbreviations. Currently, there are - # Default - # Australia (historical usage) - # India - # You can create your own file in - # share/timezonesets/. -#extra_float_digits = 1 # min -15, max 3; any value >0 actually - # selects precise output mode -#client_encoding = sql_ascii # actually, defaults to database - # encoding - -# These settings are initialized by initdb, but they can be changed. 
-#lc_messages = 'C' # locale for system error message - # strings -#lc_monetary = 'C' # locale for monetary formatting -#lc_numeric = 'C' # locale for number formatting -#lc_time = 'C' # locale for time formatting - -# default configuration for text search -#default_text_search_config = 'pg_catalog.simple' - -# - Shared Library Preloading - - -#shared_preload_libraries = '' # (change requires restart) -#local_preload_libraries = '' -#session_preload_libraries = '' -#jit_provider = 'llvmjit' # JIT library to use - -# - Other Defaults - - -#dynamic_library_path = '$libdir' -#extension_destdir = '' # prepend path when loading extensions - # and shared objects (added by Debian) - - -#------------------------------------------------------------------------------ -# LOCK MANAGEMENT -#------------------------------------------------------------------------------ - -#deadlock_timeout = 1s -#max_locks_per_transaction = 64 # min 10 - # (change requires restart) -#max_pred_locks_per_transaction = 64 # min 10 - # (change requires restart) -#max_pred_locks_per_relation = -2 # negative values mean - # (max_pred_locks_per_transaction - # / -max_pred_locks_per_relation) - 1 -#max_pred_locks_per_page = 2 # min 0 - - -#------------------------------------------------------------------------------ -# VERSION AND PLATFORM COMPATIBILITY -#------------------------------------------------------------------------------ - -# - Previous PostgreSQL Versions - - -#array_nulls = on -#backslash_quote = safe_encoding # on, off, or safe_encoding -#escape_string_warning = on -#lo_compat_privileges = off -#operator_precedence_warning = off -#quote_all_identifiers = off -#standard_conforming_strings = on -#synchronize_seqscans = on - -# - Other Platforms and Clients - - -#transform_null_equals = off - - -#------------------------------------------------------------------------------ -# ERROR HANDLING -#------------------------------------------------------------------------------ - -#exit_on_error = off # terminate session on any error? -#restart_after_crash = on # reinitialize after backend crash? -#data_sync_retry = off # retry or panic on failure to fsync - # data? - # (change requires restart) - - -#------------------------------------------------------------------------------ -# CONFIG FILE INCLUDES -#------------------------------------------------------------------------------ - -# These options allow settings to be loaded from files other than the -# default postgresql.conf. Note that these are directives, not variable -# assignments, so they can usefully be given more than once. - -#include_dir = '...' # include files ending in '.conf' from - # a directory, e.g., 'conf.d' -#include_if_exists = '...' # include file only if it exists -#include = '...' 
# include file
-
-
-#------------------------------------------------------------------------------
-# CUSTOMIZED OPTIONS
-#------------------------------------------------------------------------------
-wal_level = logical
-max_wal_senders = 30
-max_replication_slots = 30
-
From 17c50ee836bcaa11aca7197d070fc3e2c7ebe3fe Mon Sep 17 00:00:00 2001
From: subodh
Date: Fri, 21 May 2021 18:20:52 +0530
Subject: [PATCH 04/13] refine tests + add comments

---
 ...eFileDatabaseHistoryStorageOperations.java |   7 +
 .../source/mysql/DebeziumRecordPublisher.java |   8 +-
 .../mysql/FilteredFileDatabaseHistory.java    |  40 +-
 .../source/mysql/CdcMySqlStandardTest.java    |  73 ++--
 .../source/mysql/CdcMySqlSourceTest.java      | 350 +++++++++---------
 5 files changed, 270 insertions(+), 208 deletions(-)

diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java
index 3137a38c2fc5..10f1a383a42f 100644
--- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java
+++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java
@@ -43,6 +43,13 @@
 import java.util.function.Consumer;
 import org.apache.commons.io.FileUtils;
 
+/**
+ * The purpose of this class is twofold:
+ * 1. Read the contents of the file {@link #path} at the end of the sync so that it can be saved in
+ * the state for future syncs; see {@link #read()}.
+ * 2. Write the saved content back to the file {@link #path} at the beginning of the sync so that
+ * Debezium can function smoothly; see {@link #persist(CdcState)}.
+ * To understand more about this file, please refer to {@link FilteredFileDatabaseHistory}.
+ */
 public class AirbyteFileDatabaseHistoryStorageOperations {
 
   private final Path path;
diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java
index ffa41b0ed0d5..5df48c604eb5 100644
--- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java
+++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java
@@ -25,7 +25,6 @@
 package io.airbyte.integrations.source.mysql;
 
 import com.fasterxml.jackson.databind.JsonNode;
-import com.google.common.annotations.VisibleForTesting;
 import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
 import io.airbyte.protocol.models.ConfiguredAirbyteStream;
 import io.airbyte.protocol.models.SyncMode;
@@ -182,19 +181,18 @@ protected Properties getDebeziumProperties(JsonNode config,
     }
 
     // table selection
-    final String tableWhitelist = getTableWhitelist(catalog);
+    final String tableWhitelist = getTableWhitelist(catalog, config);
     props.setProperty("table.include.list", tableWhitelist);
     props.setProperty("database.include.list", config.get("database").asText());
 
     return props;
   }
 
-  @VisibleForTesting
-  protected static String getTableWhitelist(ConfiguredAirbyteCatalog catalog) {
+  private static String getTableWhitelist(ConfiguredAirbyteCatalog catalog, JsonNode config) {
     return catalog.getStreams().stream()
         .filter(s -> s.getSyncMode() == SyncMode.INCREMENTAL)
         .map(ConfiguredAirbyteStream::getStream)
-        .map(stream -> stream.getNamespace() + "." + stream.getName())
+        .map(stream -> config.get("database").asText() + "." + stream.getName())
         // debezium needs commas escaped to split properly
         .map(x -> StringUtils.escape(x, new char[] {','}, "\\,"))
         .collect(Collectors.joining(","));
diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java
index 3b8e7c1bb7c1..741adbea98e9 100644
--- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java
+++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java
@@ -24,6 +24,8 @@
 
 package io.airbyte.integrations.source.mysql;
 
+import com.fasterxml.jackson.databind.JsonNode;
+import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
 import io.debezium.config.Configuration;
 import io.debezium.relational.history.AbstractDatabaseHistory;
 import io.debezium.relational.history.DatabaseHistoryException;
@@ -36,6 +38,24 @@
 import java.lang.reflect.Method;
 import java.util.function.Consumer;
 
+/**
+ * The MySQL Debezium connector monitors database schema evolution over time and stores the data in
+ * a database history file. Without this file we can't fetch the records from the binlog, so we need
+ * to save the contents of the file. By default Debezium uses the
+ * {@link io.debezium.relational.history.FileDatabaseHistory} class to write the schema information
+ * to the file.
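+ * (The history file complements the offset store: the offsets kept by
+ * {@link AirbyteFileOffsetBackingStore} record where to resume reading the binlog, while the history
+ * file is what lets Debezium rebuild the table schemas that were in effect at that position.)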
+ * The problem is that Debezium tracks the schema evolution of all the tables in all the databases,
+ * so the file content can keep growing. To make sure that Debezium tracks only the schema of the
+ * tables that are present in the database that Airbyte is syncing, we created this class. In the
+ * method {@link #storeRecord(HistoryRecord)} we introduced a check so that only those records are
+ * saved whose database name matches the database Airbyte is syncing. We tell Debezium to use this
+ * class by passing it as a property to the Debezium engine; look for the "database.history"
+ * property in
+ * {@link DebeziumRecordPublisher#getDebeziumProperties(JsonNode, ConfiguredAirbyteCatalog, AirbyteFileOffsetBackingStore)}.
+ * Ideally {@link FilteredFileDatabaseHistory} would have extended
+ * {@link io.debezium.relational.history.FileDatabaseHistory} and overridden the
+ * {@link #storeRecord(HistoryRecord)} method, but {@link io.debezium.relational.history.FileDatabaseHistory}
+ * is a final class and cannot be inherited.
+ */
 public class FilteredFileDatabaseHistory extends AbstractDatabaseHistory {
 
   private final FileDatabaseHistory fileDatabaseHistory;
@@ -45,6 +65,14 @@ public FilteredFileDatabaseHistory() {
     this.fileDatabaseHistory = new FileDatabaseHistory();
   }
 
+  /**
+   * Ideally the databaseName should have been initialized in the constructor of the class. But since
+   * we supply the class name to debezium and it uses reflection to construct the object of the class,
+   * we can't pass in the databaseName as a parameter to the constructor. That's why we had to take
+   * the static approach.
+   *
+   * @param databaseName Name of the database that the connector is syncing
+   */
   static void setDatabaseName(String databaseName) {
     if (FilteredFileDatabaseHistory.databaseName == null) {
       FilteredFileDatabaseHistory.databaseName = databaseName;
@@ -79,6 +107,11 @@ public void storeRecord(HistoryRecord record) throws DatabaseHistoryException {
         return;
       }
 
+      /**
+       * We are using reflection because the method
+       * {@link io.debezium.relational.history.FileDatabaseHistory#storeRecord(HistoryRecord)} is
+       * protected and can not be accessed from here
+       */
       final Method storeRecordMethod = fileDatabaseHistory.getClass()
           .getDeclaredMethod("storeRecord", record.getClass());
       storeRecordMethod.setAccessible(true);
@@ -91,13 +124,18 @@ public void storeRecord(HistoryRecord record) throws DatabaseHistoryException {
   @Override
   public void stop() {
     fileDatabaseHistory.stop();
-    // this is mainly for tests
+    // this is just for tests
     databaseName = null;
   }
 
   @Override
   protected void recoverRecords(Consumer<HistoryRecord> records) {
     try {
+      /**
+       * We are using reflection because the method
+       * {@link io.debezium.relational.history.FileDatabaseHistory#recoverRecords(Consumer)} is protected
+       * and can not be accessed from here
+       */
       final Method recoverRecords = fileDatabaseHistory.getClass()
           .getDeclaredMethod("recoverRecords", Consumer.class);
       recoverRecords.setAccessible(true);
diff --git a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java
index 05755cc52b4a..7262d8d4dc5a 100644
--- a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java
+++
b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java @@ -24,8 +24,6 @@ package io.airbyte.integrations.source.mysql; -import static io.airbyte.integrations.source.mysql.MySqlSource.DRIVER_CLASS; - import com.fasterxml.jackson.databind.JsonNode; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -87,7 +85,8 @@ protected ConfiguredAirbyteCatalog getConfiguredCatalog() { Field.of("name", JsonSchemaPrimitive.STRING)) .withSourceDefinedCursor(true) .withSourceDefinedPrimaryKey(List.of(List.of("id"))) - .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))), + .withSupportedSyncModes( + Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))), new ConfiguredAirbyteStream() .withSyncMode(SyncMode.INCREMENTAL) .withDestinationSyncMode(DestinationSyncMode.APPEND) @@ -98,7 +97,8 @@ protected ConfiguredAirbyteCatalog getConfiguredCatalog() { Field.of("name", JsonSchemaPrimitive.STRING)) .withSourceDefinedCursor(true) .withSourceDefinedPrimaryKey(List.of(List.of("id"))) - .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))))); + .withSupportedSyncModes( + Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL))))); } @Override @@ -112,7 +112,7 @@ protected List getRegexTests() { } @Override - protected void setup(TestDestinationEnv testEnv) throws Exception { + protected void setup(TestDestinationEnv testEnv) { container = new MySQLContainer<>("mysql:8.0"); container.start(); @@ -120,32 +120,51 @@ protected void setup(TestDestinationEnv testEnv) throws Exception { .put("host", container.getHost()) .put("port", container.getFirstMappedPort()) .put("database", container.getDatabaseName()) - .put("username", "root") - .put("password", "test") + .put("username", container.getUsername()) + .put("password", container.getPassword()) .put("replication_method", "CDC") .build()); - final Database database = Databases.createDatabase( - config.get("username").asText(), - config.get("password").asText(), + revokeAllPermissions(); + grantCorrectPermissions(); + createAndPopulateTables(); + } + + private void createAndPopulateTables() { + executeQuery("CREATE TABLE id_and_name(id INTEGER PRIMARY KEY, name VARCHAR(200));"); + executeQuery( + "INSERT INTO id_and_name (id, name) VALUES (1,'picard'), (2, 'crusher'), (3, 'vash');"); + executeQuery("CREATE TABLE starships(id INTEGER PRIMARY KEY, name VARCHAR(200));"); + executeQuery( + "INSERT INTO starships (id, name) VALUES (1,'enterprise-d'), (2, 'defiant'), (3, 'yamato');"); + } + + private void revokeAllPermissions() { + executeQuery("REVOKE ALL PRIVILEGES, GRANT OPTION FROM " + container.getUsername() + "@'%';"); + } + + private void grantCorrectPermissions() { + executeQuery( + "GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO " + + container.getUsername() + "@'%';"); + } + + private void executeQuery(String query) { + try (Database database = Databases.createDatabase( + "root", + "test", String.format("jdbc:mysql://%s:%s/%s", - config.get("host").asText(), - config.get("port").asText(), - config.get("database").asText()), - DRIVER_CLASS, - SQLDialect.MYSQL); - - database.query(ctx -> { - ctx.fetch("CREATE TABLE id_and_name(id INTEGER PRIMARY KEY, name VARCHAR(200));"); - ctx.fetch( - "INSERT INTO id_and_name (id, name) VALUES (1,'picard'), (2, 'crusher'), (3, 'vash');"); - ctx.fetch("CREATE TABLE starships(id INTEGER 
PRIMARY KEY, name VARCHAR(200));"); - ctx.fetch( - "INSERT INTO starships (id, name) VALUES (1,'enterprise-d'), (2, 'defiant'), (3, 'yamato');"); - return null; - }); - - database.close(); + container.getHost(), + container.getFirstMappedPort(), + container.getDatabaseName()), + MySqlSource.DRIVER_CLASS, + SQLDialect.MYSQL)) { + database.query( + ctx -> ctx + .execute(query)); + } catch (Exception e) { + throw new RuntimeException(e); + } } @Override diff --git a/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java b/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java index 6e9f1dd2efa5..28982bb462f1 100644 --- a/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java +++ b/airbyte-integrations/connectors/source-mysql/src/test/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceTest.java @@ -61,6 +61,7 @@ import io.airbyte.protocol.models.Field; import io.airbyte.protocol.models.Field.JsonSchemaPrimitive; import io.airbyte.protocol.models.SyncMode; +import java.sql.SQLException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -69,7 +70,6 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.jooq.DSLContext; import org.jooq.SQLDialect; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -90,7 +90,7 @@ public class CdcMySqlSourceTest { private static final String COL_ID = "id"; private static final String COL_MAKE_ID = "make_id"; private static final String COL_MODEL = "model"; - private static final String dbName = MODELS_SCHEMA; + private static final String DB_NAME = MODELS_SCHEMA; private static final AirbyteCatalog CATALOG = new AirbyteCatalog().withStreams(List.of( CatalogHelpers.createAirbyteStream( @@ -117,107 +117,151 @@ public class CdcMySqlSourceTest { Jsons.jsonNode(ImmutableMap.of(COL_ID, 15, COL_MAKE_ID, 2, COL_MODEL, "A 220")), Jsons.jsonNode(ImmutableMap.of(COL_ID, 16, COL_MAKE_ID, 2, COL_MODEL, "E 350"))); - private static MySQLContainer container; - + private MySQLContainer container; private Database database; private MySqlSource source; + private JsonNode config; - @AfterEach - public void tearDown() { - container.close(); - container.stop(); + @BeforeEach + public void setup() { + init(); + revokeAllPermissions(); + grantCorrectPermissions(); + createAndPopulateTables(); } - @BeforeEach - public void setup() throws Exception { + private void init() { container = new MySQLContainer<>("mysql:8.0"); container.start(); source = new MySqlSource(); - final JsonNode config = getConfig(container, dbName); - database = getDatabaseFromConfig(config); - database.query(ctx -> { - ctx.execute("CREATE DATABASE " + MODELS_SCHEMA + ";"); - ctx.execute(String - .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", - MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); - - for (JsonNode recordJson : MODEL_RECORDS) { - writeModelRecord(ctx, recordJson); - } + database = Databases.createDatabase( + "root", + "test", + String.format("jdbc:mysql://%s:%s", + container.getHost(), + container.getFirstMappedPort()), + DRIVER_CLASS, + SQLDialect.MYSQL); - return null; - }); - /** - * This database and table is not part of Airbyte sync. 
It is being created just to make sure the - * databases not being synced by Airbyte are not causing issues with our debezium logic - */ - database.query(ctx -> { - ctx.execute("CREATE DATABASE " + MODELS_SCHEMA + "_random" + ";"); - ctx.execute(String - .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", - MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", COL_ID + "_random", - COL_MAKE_ID + "_random", - COL_MODEL + "_random", COL_ID + "_random")); - - final List MODEL_RECORDS_RANDOM = ImmutableList.of( - Jsons - .jsonNode(ImmutableMap - .of(COL_ID + "_random", 11000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", - "Fiesta-random")), - Jsons.jsonNode(ImmutableMap - .of(COL_ID + "_random", 12000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", - "Focus-random")), - Jsons - .jsonNode(ImmutableMap - .of(COL_ID + "_random", 13000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", - "Ranger-random")), - Jsons.jsonNode(ImmutableMap - .of(COL_ID + "_random", 14000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", - "GLA-random")), - Jsons.jsonNode(ImmutableMap - .of(COL_ID + "_random", 15000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", - "A 220-random")), - Jsons - .jsonNode(ImmutableMap - .of(COL_ID + "_random", 16000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", - "E 350-random"))); - for (JsonNode recordJson : MODEL_RECORDS_RANDOM) { - writeRecords(ctx, recordJson, MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", - COL_ID + "_random", COL_MAKE_ID + "_random", COL_MODEL + "_random"); - } + config = Jsons.jsonNode(ImmutableMap.builder() + .put("host", container.getHost()) + .put("port", container.getFirstMappedPort()) + .put("database", CdcMySqlSourceTest.DB_NAME) + .put("username", container.getUsername()) + .put("password", container.getPassword()) + .put("replication_method", "CDC") + .build()); + } - return null; - }); + private void revokeAllPermissions() { + executeQuery("REVOKE ALL PRIVILEGES, GRANT OPTION FROM " + container.getUsername() + "@'%';"); } - private JsonNode getConfig(MySQLContainer db, String dbName) { + private void grantCorrectPermissions() { + executeQuery( + "GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO " + + container.getUsername() + "@'%';"); + } - return Jsons.jsonNode(ImmutableMap.builder() - .put("host", db.getHost()) - .put("port", db.getFirstMappedPort()) - .put("database", dbName) - .put("username", "root") - .put("password", "test") - .put("replication_method", "CDC") - .build()); + private void executeQuery(String query) { + try { + database.query( + ctx -> ctx + .execute(query)); + } catch (SQLException e) { + throw new RuntimeException(e); + } } - private Database getDatabaseFromConfig(JsonNode config) { - return Databases.createDatabase( - config.get("username").asText(), - config.get("password").asText(), - String.format("jdbc:mysql://%s:%s", - config.get("host").asText(), - config.get("port").asText()), - DRIVER_CLASS, - SQLDialect.MYSQL); + private void createAndPopulateTables() { + createAndPopulateActualTable(); + createAndPopulateRandomTable(); + } + + private void createAndPopulateActualTable() { + executeQuery("CREATE DATABASE " + MODELS_SCHEMA + ";"); + executeQuery(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); + for (JsonNode recordJson : MODEL_RECORDS) { + writeModelRecord(recordJson); + } + } + + 
/** + * This database and table is not part of Airbyte sync. It is being created just to make sure the + * databases not being synced by Airbyte are not causing issues with our debezium logic + */ + private void createAndPopulateRandomTable() { + executeQuery("CREATE DATABASE " + MODELS_SCHEMA + "_random" + ";"); + executeQuery(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", COL_ID + "_random", + COL_MAKE_ID + "_random", + COL_MODEL + "_random", COL_ID + "_random")); + final List MODEL_RECORDS_RANDOM = ImmutableList.of( + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 11000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Fiesta-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 12000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Focus-random")), + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 13000, COL_MAKE_ID + "_random", 1, COL_MODEL + "_random", + "Ranger-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 14000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "GLA-random")), + Jsons.jsonNode(ImmutableMap + .of(COL_ID + "_random", 15000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "A 220-random")), + Jsons + .jsonNode(ImmutableMap + .of(COL_ID + "_random", 16000, COL_MAKE_ID + "_random", 2, COL_MODEL + "_random", + "E 350-random"))); + for (JsonNode recordJson : MODEL_RECORDS_RANDOM) { + writeRecords(recordJson, MODELS_SCHEMA + "_random", MODELS_STREAM_NAME + "_random", + COL_ID + "_random", COL_MAKE_ID + "_random", COL_MODEL + "_random"); + } + } + + private void writeModelRecord(JsonNode recordJson) { + writeRecords(recordJson, CdcMySqlSourceTest.MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, + COL_MAKE_ID, + COL_MODEL); + } + + private void writeRecords( + JsonNode recordJson, + String dbName, + String streamName, + String idCol, + String makeIdCol, + String modelCol) { + executeQuery( + String.format("INSERT INTO %s.%s (%s, %s, %s) VALUES (%s, %s, '%s');", dbName, streamName, + idCol, makeIdCol, modelCol, + recordJson.get(idCol).asInt(), recordJson.get(makeIdCol).asInt(), + recordJson.get(modelCol).asText())); + } + + @AfterEach + public void tearDown() { + try { + database.close(); + container.close(); + } catch (Exception e) { + throw new RuntimeException(e); + } } @Test @DisplayName("On the first sync, produce returns records that exist in the database.") void testExistingData() throws Exception { final AutoCloseableIterator read = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List actualRecords = AutoCloseableIterators.toListAndClose(read); final Set recordMessages = extractRecordMessages(actualRecords); @@ -232,22 +276,19 @@ void testExistingData() throws Exception { @DisplayName("When a record is deleted, produces a deletion record.") void testDelete() throws Exception { final AutoCloseableIterator read1 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); final List stateMessages1 = extractStateMessages(actualRecords1); assertExpectedStateMessages(stateMessages1); - database.query(ctx -> { - ctx.execute(String - .format("DELETE FROM %s.%s WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, - 11)); - return null; - }); + executeQuery(String + .format("DELETE FROM %s.%s 
WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, COL_ID, + 11)); final JsonNode state = stateMessages1.get(0).getData(); final AutoCloseableIterator read2 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + .read(config, CONFIGURED_CATALOG, state); final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); final List recordMessages2 = new ArrayList<>( extractRecordMessages(actualRecords2)); @@ -266,22 +307,19 @@ void testDelete() throws Exception { void testUpdate() throws Exception { final String updatedModel = "Explorer"; final AutoCloseableIterator read1 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); final List stateMessages1 = extractStateMessages(actualRecords1); assertExpectedStateMessages(stateMessages1); - database.query(ctx -> { - ctx.execute(String - .format("UPDATE %s.%s SET %s = '%s' WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, - COL_MODEL, updatedModel, COL_ID, 11)); - return null; - }); + executeQuery(String + .format("UPDATE %s.%s SET %s = '%s' WHERE %s = %s", MODELS_SCHEMA, MODELS_STREAM_NAME, + COL_MODEL, updatedModel, COL_ID, 11)); final JsonNode state = stateMessages1.get(0).getData(); final AutoCloseableIterator read2 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + .read(config, CONFIGURED_CATALOG, state); final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); final List recordMessages2 = new ArrayList<>( extractRecordMessages(actualRecords2)); @@ -304,20 +342,17 @@ void testRecordsProducedDuringAndAfterSync() throws Exception { final int recordsToCreate = 20; final int[] recordsCreated = {0}; // first batch of records. 20 created here and 6 created in setup method. 
- database.query(ctx -> { - while (recordsCreated[0] < recordsToCreate) { - final JsonNode record = - Jsons.jsonNode(ImmutableMap - .of(COL_ID, 100 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, - "F-" + recordsCreated[0])); - writeModelRecord(ctx, record); - recordsCreated[0]++; - } - return null; - }); + while (recordsCreated[0] < recordsToCreate) { + final JsonNode record = + Jsons.jsonNode(ImmutableMap + .of(COL_ID, 100 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, + "F-" + recordsCreated[0])); + writeModelRecord(record); + recordsCreated[0]++; + } final AutoCloseableIterator firstBatchIterator = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List dataFromFirstBatch = AutoCloseableIterators .toListAndClose(firstBatchIterator); List stateAfterFirstBatch = extractStateMessages(dataFromFirstBatch); @@ -328,21 +363,18 @@ void testRecordsProducedDuringAndAfterSync() throws Exception { // second batch of records again 20 being created recordsCreated[0] = 0; - database.query(ctx -> { - while (recordsCreated[0] < recordsToCreate) { - final JsonNode record = - Jsons.jsonNode(ImmutableMap - .of(COL_ID, 200 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, - "F-" + recordsCreated[0])); - writeModelRecord(ctx, record); - recordsCreated[0]++; - } - return null; - }); + while (recordsCreated[0] < recordsToCreate) { + final JsonNode record = + Jsons.jsonNode(ImmutableMap + .of(COL_ID, 200 + recordsCreated[0], COL_MAKE_ID, 1, COL_MODEL, + "F-" + recordsCreated[0])); + writeModelRecord(record); + recordsCreated[0]++; + } final JsonNode state = stateAfterFirstBatch.get(0).getData(); final AutoCloseableIterator secondBatchIterator = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + .read(config, CONFIGURED_CATALOG, state); final List dataFromSecondBatch = AutoCloseableIterators .toListAndClose(secondBatchIterator); @@ -401,17 +433,14 @@ void testCdcAndFullRefreshInSameSync() throws Exception { Jsons.jsonNode(ImmutableMap.of(COL_ID, 150, COL_MAKE_ID, 2, COL_MODEL, "A 220-2")), Jsons.jsonNode(ImmutableMap.of(COL_ID, 160, COL_MAKE_ID, 2, COL_MODEL, "E 350-2"))); - database.query(ctx -> { - ctx.execute(String - .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", - MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); + executeQuery(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200), PRIMARY KEY (%s));", + MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL, COL_ID)); - for (JsonNode recordJson : MODEL_RECORDS_2) { - writeRecords(ctx, recordJson, MODELS_SCHEMA, MODELS_STREAM_NAME + "_2"); - } - - return null; - }); + for (JsonNode recordJson : MODEL_RECORDS_2) { + writeRecords(recordJson, CdcMySqlSourceTest.MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, + COL_MAKE_ID, COL_MODEL); + } ConfiguredAirbyteStream airbyteStream = new ConfiguredAirbyteStream() .withStream(CatalogHelpers.createAirbyteStream( @@ -420,7 +449,8 @@ void testCdcAndFullRefreshInSameSync() throws Exception { Field.of(COL_ID, JsonSchemaPrimitive.NUMBER), Field.of(COL_MAKE_ID, JsonSchemaPrimitive.NUMBER), Field.of(COL_MODEL, JsonSchemaPrimitive.STRING)) - .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) + .withSupportedSyncModes( + Lists.newArrayList(SyncMode.FULL_REFRESH, SyncMode.INCREMENTAL)) .withSourceDefinedPrimaryKey(List.of(List.of(COL_ID)))); 
airbyteStream.setSyncMode(SyncMode.FULL_REFRESH); @@ -429,7 +459,7 @@ void testCdcAndFullRefreshInSameSync() throws Exception { configuredCatalog.withStreams(streams); final AutoCloseableIterator read1 = source - .read(getConfig(container, dbName), configuredCatalog, null); + .read(config, configuredCatalog, null); final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); final Set recordMessages1 = extractRecordMessages(actualRecords1); @@ -445,14 +475,11 @@ void testCdcAndFullRefreshInSameSync() throws Exception { final JsonNode puntoRecord = Jsons .jsonNode(ImmutableMap.of(COL_ID, 100, COL_MAKE_ID, 3, COL_MODEL, "Punto")); - database.query(ctx -> { - writeModelRecord(ctx, puntoRecord); - return null; - }); + writeModelRecord(puntoRecord); final JsonNode state = extractStateMessages(actualRecords1).get(0).getData(); final AutoCloseableIterator read2 = source - .read(getConfig(container, dbName), configuredCatalog, state); + .read(config, configuredCatalog, state); final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); final Set recordMessages2 = extractRecordMessages(actualRecords2); @@ -471,13 +498,10 @@ void testCdcAndFullRefreshInSameSync() throws Exception { @DisplayName("When no records exist, no records are returned.") void testNoData() throws Exception { - database.query(ctx -> { - ctx.execute(String.format("DELETE FROM %s.%s", MODELS_SCHEMA, MODELS_STREAM_NAME)); - return null; - }); + executeQuery(String.format("DELETE FROM %s.%s", MODELS_SCHEMA, MODELS_STREAM_NAME)); final AutoCloseableIterator read = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List actualRecords = AutoCloseableIterators.toListAndClose(read); final Set recordMessages = extractRecordMessages(actualRecords); @@ -491,12 +515,12 @@ void testNoData() throws Exception { @DisplayName("When no changes have been made to the database since the previous sync, no records are returned.") void testNoDataOnSecondSync() throws Exception { final AutoCloseableIterator read1 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, null); + .read(config, CONFIGURED_CATALOG, null); final List actualRecords1 = AutoCloseableIterators.toListAndClose(read1); final JsonNode state = extractStateMessages(actualRecords1).get(0).getData(); final AutoCloseableIterator read2 = source - .read(getConfig(container, dbName), CONFIGURED_CATALOG, state); + .read(config, CONFIGURED_CATALOG, state); final List actualRecords2 = AutoCloseableIterators.toListAndClose(read2); final Set recordMessages2 = extractRecordMessages(actualRecords2); @@ -508,7 +532,7 @@ void testNoDataOnSecondSync() throws Exception { @Test void testCheck() { - final AirbyteConnectionStatus status = source.check(getConfig(container, dbName)); + final AirbyteConnectionStatus status = source.check(config); assertEquals(status.getStatus(), AirbyteConnectionStatus.Status.SUCCEEDED); } @@ -516,12 +540,9 @@ void testCheck() { void testDiscover() throws Exception { final AirbyteCatalog expectedCatalog = Jsons.clone(CATALOG); - database.query(ctx -> { - ctx.execute(String - .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200));", - MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL)); - return null; - }); + executeQuery(String + .format("CREATE TABLE %s.%s(%s INTEGER, %s INTEGER, %s VARCHAR(200));", + MODELS_SCHEMA, MODELS_STREAM_NAME + "_2", COL_ID, COL_MAKE_ID, COL_MODEL)); List streams = expectedCatalog.getStreams(); // stream 
with PK @@ -541,7 +562,7 @@ void testDiscover() throws Exception { streams.add(streamWithoutPK); expectedCatalog.withStreams(streams); - final AirbyteCatalog actualCatalog = source.discover(getConfig(container, dbName)); + final AirbyteCatalog actualCatalog = source.discover(config); assertEquals( expectedCatalog.getStreams().stream().sorted(Comparator.comparing(AirbyteStream::getName)) @@ -564,27 +585,6 @@ private static AirbyteStream addCdcMetadataColumns(AirbyteStream stream) { return stream; } - private void writeModelRecord(DSLContext ctx, JsonNode recordJson) { - writeRecords(ctx, recordJson, MODELS_SCHEMA, MODELS_STREAM_NAME); - } - - private void writeRecords(DSLContext ctx, JsonNode recordJson, String dbName, String streamName) { - writeRecords(ctx, recordJson, dbName, streamName, COL_ID, COL_MAKE_ID, COL_MODEL); - } - - private void writeRecords(DSLContext ctx, - JsonNode recordJson, - String dbName, - String streamName, - String idCol, - String makeIdCol, - String modelCol) { - ctx.execute( - String.format("INSERT INTO %s.%s (%s, %s, %s) VALUES (%s, %s, '%s');", dbName, streamName, idCol, makeIdCol, modelCol, - recordJson.get(idCol).asInt(), recordJson.get(makeIdCol).asInt(), - recordJson.get(modelCol).asText())); - } - private Set extractRecordMessages(List messages) { final List recordMessageList = messages .stream() From d3b56aa002d952b720aa3ea51c77593904991801 Mon Sep 17 00:00:00 2001 From: subodh Date: Fri, 21 May 2021 18:27:08 +0530 Subject: [PATCH 05/13] fix typo --- .../mysql/AirbyteFileDatabaseHistoryStorageOperations.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java index 10f1a383a42f..84083507d284 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java @@ -46,8 +46,8 @@ /** * The purpose of this class is : to , 1. Read the contents of the file {@link #path} at the end of * the sync so that it can be saved in state for future syncs. Check {@link #read()} 2. Write the - * saved content back to the file {@link #path} at the beginning of the sync so that debezium cant - * function smoothly. Check {@link #persist(CdcState)} To under more about fine please refer + * saved content back to the file {@link #path} at the beginning of the sync so that debezium can + * function smoothly. 
Check {@link #persist(CdcState)} To understand more about file, please refer * {@link FilteredFileDatabaseHistory} */ public class AirbyteFileDatabaseHistoryStorageOperations { From c60f5f72ae159d924def98790ad69dac7b63b142 Mon Sep 17 00:00:00 2001 From: subodh Date: Fri, 21 May 2021 23:29:01 +0530 Subject: [PATCH 06/13] address review comments --- .../source/StandardSourceTest.java | 2 +- .../mysql/AirbyteFileOffsetBackingStore.java | 16 +++++++ ....java => AirbyteSchemaHistoryStorage.java} | 25 ++++++++--- .../source/mysql/DebeziumRecordPublisher.java | 8 ++-- .../source/mysql/MySqlSource.java | 43 +++---------------- .../source/mysql/CdcMySqlStandardTest.java | 5 --- 6 files changed, 46 insertions(+), 53 deletions(-) rename airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/{AirbyteFileDatabaseHistoryStorageOperations.java => AirbyteSchemaHistoryStorage.java} (84%) diff --git a/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java b/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java index f36d18eec022..12096b5a3fa4 100644 --- a/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java +++ b/airbyte-integrations/bases/standard-source-test/src/main/java/io/airbyte/integrations/standardtest/source/StandardSourceTest.java @@ -385,7 +385,7 @@ private List filterRecords(Collection mess .collect(Collectors.toList()); } - public ConfiguredAirbyteCatalog withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { + private ConfiguredAirbyteCatalog withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { final ConfiguredAirbyteCatalog clone = Jsons.clone(catalog); for (ConfiguredAirbyteStream configuredStream : clone.getStreams()) { if (configuredStream.getSyncMode() == INCREMENTAL diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java index ef228afcd3c0..33f490b7e32c 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileOffsetBackingStore.java @@ -29,6 +29,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.google.common.base.Preconditions; import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.source.jdbc.JdbcStateManager; import io.airbyte.integrations.source.jdbc.models.CdcState; import java.io.EOFException; import java.io.IOException; @@ -159,4 +160,19 @@ private void save(Map data) { } } + static AirbyteFileOffsetBackingStore initializeState(JdbcStateManager stateManager) { + final Path cdcWorkingDir; + try { + cdcWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-state-offset"); + } catch (IOException e) { + throw new RuntimeException(e); + } + final Path cdcOffsetFilePath = cdcWorkingDir.resolve("offset.dat"); + + final AirbyteFileOffsetBackingStore offsetManager = new AirbyteFileOffsetBackingStore( + cdcOffsetFilePath); + offsetManager.persist(stateManager.getCdcStateManager().getCdcState()); + return offsetManager; + } + } diff --git 
a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteSchemaHistoryStorage.java similarity index 84% rename from airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java rename to airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteSchemaHistoryStorage.java index 84083507d284..9a1aaf4f708e 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteFileDatabaseHistoryStorageOperations.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/AirbyteSchemaHistoryStorage.java @@ -27,6 +27,7 @@ import static io.airbyte.integrations.source.mysql.MySqlSource.MYSQL_DB_HISTORY; import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.source.jdbc.JdbcStateManager; import io.airbyte.integrations.source.jdbc.models.CdcState; import io.debezium.document.Document; import io.debezium.document.DocumentReader; @@ -47,17 +48,17 @@ * The purpose of this class is : to , 1. Read the contents of the file {@link #path} at the end of * the sync so that it can be saved in state for future syncs. Check {@link #read()} 2. Write the * saved content back to the file {@link #path} at the beginning of the sync so that debezium can - * function smoothly. Check {@link #persist(CdcState)} To understand more about file, please refer + * function smoothly. Check {@link #persist(CdcState)}. To understand more about file, please refer * {@link FilteredFileDatabaseHistory} */ -public class AirbyteFileDatabaseHistoryStorageOperations { +public class AirbyteSchemaHistoryStorage { private final Path path; private static final Charset UTF8 = StandardCharsets.UTF_8; private final DocumentReader reader = DocumentReader.defaultReader(); private final DocumentWriter writer = DocumentWriter.defaultWriter(); - public AirbyteFileDatabaseHistoryStorageOperations(final Path path) { + public AirbyteSchemaHistoryStorage(final Path path) { this.path = path; } @@ -67,7 +68,7 @@ public Path getPath() { /** * This implementation is is kind of similar to - * {@link io.debezium.relational.history.FileDatabaseHistory#recoverRecords(Consumer)} ()} + * {@link io.debezium.relational.history.FileDatabaseHistory#recoverRecords(Consumer)} */ public String read() { StringBuilder fileAsString = new StringBuilder(); @@ -105,7 +106,7 @@ private void makeSureFileExists() { } } } catch (IOException e) { - throw new RuntimeException( + throw new IllegalStateException( "Unable to create history file at " + path + ": " + e.getMessage(), e); } } @@ -151,4 +152,18 @@ private void writeToFile(String fileAsString) { } } + static AirbyteSchemaHistoryStorage initializeDBHistory(JdbcStateManager stateManager) { + final Path dbHistoryWorkingDir; + try { + dbHistoryWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-db-history"); + } catch (IOException e) { + throw new RuntimeException(e); + } + final Path dbHistoryFilePath = dbHistoryWorkingDir.resolve("dbhistory.dat"); + + final AirbyteSchemaHistoryStorage schemaHistoryManager = new AirbyteSchemaHistoryStorage(dbHistoryFilePath); + schemaHistoryManager.persist(stateManager.getCdcStateManager().getCdcState()); + return schemaHistoryManager; + } + } diff --git 
a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java index 5df48c604eb5..cf93dc75c575 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordPublisher.java @@ -54,7 +54,7 @@ public class DebeziumRecordPublisher implements AutoCloseable { private final JsonNode config; private final ConfiguredAirbyteCatalog catalog; private final AirbyteFileOffsetBackingStore offsetManager; - private final AirbyteFileDatabaseHistoryStorageOperations airbyteFileDatabaseHistoryStorageOperations; + private final AirbyteSchemaHistoryStorage schemaHistoryManager; private final AtomicBoolean hasClosed; private final AtomicBoolean isClosing; @@ -64,11 +64,11 @@ public class DebeziumRecordPublisher implements AutoCloseable { public DebeziumRecordPublisher(JsonNode config, ConfiguredAirbyteCatalog catalog, AirbyteFileOffsetBackingStore offsetManager, - AirbyteFileDatabaseHistoryStorageOperations airbyteFileDatabaseHistoryStorageOperations) { + AirbyteSchemaHistoryStorage schemaHistoryManager) { this.config = config; this.catalog = catalog; this.offsetManager = offsetManager; - this.airbyteFileDatabaseHistoryStorageOperations = airbyteFileDatabaseHistoryStorageOperations; + this.schemaHistoryManager = schemaHistoryManager; this.hasClosed = new AtomicBoolean(false); this.isClosing = new AtomicBoolean(false); this.thrownError = new AtomicReference<>(); @@ -157,7 +157,7 @@ protected Properties getDebeziumProperties(JsonNode config, props.setProperty("database.history", "io.airbyte.integrations.source.mysql.FilteredFileDatabaseHistory"); props.setProperty("database.history.file.filename", - airbyteFileDatabaseHistoryStorageOperations.getPath().toString()); + schemaHistoryManager.getPath().toString()); // https://debezium.io/documentation/reference/configuration/avro.html props.setProperty("key.converter.schemas.enable", "false"); diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 3afb2c8efeaa..87f3c62bd672 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -24,6 +24,8 @@ package io.airbyte.integrations.source.mysql; +import static io.airbyte.integrations.source.mysql.AirbyteSchemaHistoryStorage.initializeDBHistory; +import static io.airbyte.integrations.source.mysql.AirbyteFileOffsetBackingStore.initializeState; import static java.util.stream.Collectors.toList; import com.fasterxml.jackson.databind.JsonNode; @@ -49,9 +51,6 @@ import io.airbyte.protocol.models.ConfiguredAirbyteStream; import io.airbyte.protocol.models.SyncMode; import io.debezium.engine.ChangeEvent; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.time.Instant; import java.util.Collections; import java.util.HashMap; @@ -173,12 +172,10 @@ public List> getIncrementalIterators(JsonN LOGGER.info("using CDC: {}", true); // TODO: 
Figure out how to set the isCDC of stateManager to true. Its always false final AirbyteFileOffsetBackingStore offsetManager = initializeState(stateManager); - AirbyteFileDatabaseHistoryStorageOperations dbHistoryStorageManager = initializeDBHistory( - stateManager); + AirbyteSchemaHistoryStorage schemaHistoryManager = initializeDBHistory(stateManager); FilteredFileDatabaseHistory.setDatabaseName(config.get("database").asText()); final LinkedBlockingQueue> queue = new LinkedBlockingQueue<>(); - final DebeziumRecordPublisher publisher = new DebeziumRecordPublisher(config, catalog, - offsetManager, dbHistoryStorageManager); + final DebeziumRecordPublisher publisher = new DebeziumRecordPublisher(config, catalog, offsetManager, schemaHistoryManager); publisher.start(queue); Optional targetFilePosition = TargetFilePosition @@ -201,7 +198,7 @@ public List> getIncrementalIterators(JsonN // have been produced) final Supplier stateMessageSupplier = () -> { Map offset = offsetManager.readMap(); - String dbHistory = dbHistoryStorageManager.read(); + String dbHistory = schemaHistoryManager.read(); Map state = new HashMap<>(); state.put(MYSQL_CDC_OFFSET, offset); @@ -237,36 +234,6 @@ public List> getIncrementalIterators(JsonN } } - private AirbyteFileOffsetBackingStore initializeState(JdbcStateManager stateManager) { - final Path cdcWorkingDir; - try { - cdcWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-state-offset"); - } catch (IOException e) { - throw new RuntimeException(e); - } - final Path cdcOffsetFilePath = cdcWorkingDir.resolve("offset.dat"); - - final AirbyteFileOffsetBackingStore offsetManager = new AirbyteFileOffsetBackingStore( - cdcOffsetFilePath); - offsetManager.persist(stateManager.getCdcStateManager().getCdcState()); - return offsetManager; - } - - private AirbyteFileDatabaseHistoryStorageOperations initializeDBHistory( - JdbcStateManager stateManager) { - final Path dbHistoryWorkingDir; - try { - dbHistoryWorkingDir = Files.createTempDirectory(Path.of("/tmp"), "cdc-db-history"); - } catch (IOException e) { - throw new RuntimeException(e); - } - final Path dbHistoryFilePath = dbHistoryWorkingDir.resolve("dbhistory.dat"); - - final AirbyteFileDatabaseHistoryStorageOperations dbHistoryStorageManager = new AirbyteFileDatabaseHistoryStorageOperations(dbHistoryFilePath); - dbHistoryStorageManager.persist(stateManager.getCdcStateManager().getCdcState()); - return dbHistoryStorageManager; - } - @Override public Set getExcludedInternalSchemas() { return Set.of( diff --git a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java index 7262d8d4dc5a..357e003cf98e 100644 --- a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java +++ b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java @@ -67,11 +67,6 @@ protected JsonNode getConfig() { return config; } - @Override - public ConfiguredAirbyteCatalog withSourceDefinedCursors(ConfiguredAirbyteCatalog catalog) { - return catalog; - } - @Override protected ConfiguredAirbyteCatalog getConfiguredCatalog() { return new ConfiguredAirbyteCatalog().withStreams(Lists.newArrayList( From 3ff961958bb49c6c5786b7636099bd06eb230528 Mon Sep 17 00:00:00 2001 From: subodh Date: Sat, 
22 May 2021 10:07:07 +0530 Subject: [PATCH 07/13] fix formatting error --- .../java/io/airbyte/integrations/source/mysql/MySqlSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 87f3c62bd672..61e171f4de61 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -24,8 +24,8 @@ package io.airbyte.integrations.source.mysql; -import static io.airbyte.integrations.source.mysql.AirbyteSchemaHistoryStorage.initializeDBHistory; import static io.airbyte.integrations.source.mysql.AirbyteFileOffsetBackingStore.initializeState; +import static io.airbyte.integrations.source.mysql.AirbyteSchemaHistoryStorage.initializeDBHistory; import static java.util.stream.Collectors.toList; import com.fasterxml.jackson.databind.JsonNode; From 790b3e4c2307927b99e7aff9957fc9e40f2f84b6 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 12:00:06 +0530 Subject: [PATCH 08/13] resolve conflicts --- ...SqlStandardTest.java => CdcMySqlSourceAcceptanceTest.java} | 4 ++-- .../sources/CdcPostgresSourceAcceptanceTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/{CdcMySqlStandardTest.java => CdcMySqlSourceAcceptanceTest.java} (97%) diff --git a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceAcceptanceTest.java similarity index 97% rename from airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java rename to airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceAcceptanceTest.java index 357e003cf98e..d5bdee740f22 100644 --- a/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlStandardTest.java +++ b/airbyte-integrations/connectors/source-mysql/src/test-integration/java/io/airbyte/integrations/source/mysql/CdcMySqlSourceAcceptanceTest.java @@ -31,7 +31,7 @@ import io.airbyte.commons.resources.MoreResources; import io.airbyte.db.Database; import io.airbyte.db.Databases; -import io.airbyte.integrations.standardtest.source.StandardSourceTest; +import io.airbyte.integrations.standardtest.source.SourceAcceptanceTest; import io.airbyte.protocol.models.CatalogHelpers; import io.airbyte.protocol.models.ConfiguredAirbyteCatalog; import io.airbyte.protocol.models.ConfiguredAirbyteStream; @@ -45,7 +45,7 @@ import org.jooq.SQLDialect; import org.testcontainers.containers.MySQLContainer; -public class CdcMySqlStandardTest extends StandardSourceTest { +public class CdcMySqlSourceAcceptanceTest extends SourceAcceptanceTest { private static final String STREAM_NAME = "id_and_name"; private static final String STREAM_NAME2 = "starships"; diff --git 
a/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceAcceptanceTest.java b/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceAcceptanceTest.java index 45f80f183b6a..0760f59053c1 100644 --- a/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceAcceptanceTest.java +++ b/airbyte-integrations/connectors/source-postgres/src/test-integration/java/io/airbyte/integrations/io/airbyte/integration_tests/sources/CdcPostgresSourceAcceptanceTest.java @@ -52,7 +52,7 @@ // this, but for now this is a solid sanity check. /** * None of the tests in this class use the cdc path (run the tests and search for `using CDC: false` - * in logs). This is exact same as {@link PostgresSourceStandardTest} + * in logs). This is exact same as {@link PostgresSourceAcceptanceTest} */ public class CdcPostgresSourceAcceptanceTest extends SourceAcceptanceTest { From 40757d11956f48c2720ece29502502b35561bc01 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 14:23:00 +0530 Subject: [PATCH 09/13] update docs + bump docker minor version --- .../connectors/source-mysql/Dockerfile | 2 +- docs/integrations/sources/mysql.md | 66 +++++++++++++++++-- docs/understanding-airbyte/cdc.md | 5 +- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/connectors/source-mysql/Dockerfile b/airbyte-integrations/connectors/source-mysql/Dockerfile index bb9a36c7dc3d..895fc0ad2130 100644 --- a/airbyte-integrations/connectors/source-mysql/Dockerfile +++ b/airbyte-integrations/connectors/source-mysql/Dockerfile @@ -8,6 +8,6 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar RUN tar xf ${APPLICATION}.tar --strip-components=1 -LABEL io.airbyte.version=0.3.0 +LABEL io.airbyte.version=0.3.1 LABEL io.airbyte.name=airbyte/source-mysql diff --git a/docs/integrations/sources/mysql.md b/docs/integrations/sources/mysql.md index a63cbb166d7d..c9272894a903 100644 --- a/docs/integrations/sources/mysql.md +++ b/docs/integrations/sources/mysql.md @@ -37,8 +37,8 @@ If you do not see a type in this list, assume that it is coerced into a string. | :--- | :--- | :--- | | Full Refresh Sync | Yes | | | Incremental - Append Sync | Yes | | -| Replicate Incremental Deletes | Coming soon | | -| Logical Replication \(WAL\) | Coming soon | | +| Replicate Incremental Deletes | Yes | | +| CDC | Yes | | | SSL Support | Yes | | | SSH Tunnel Connection | Coming soon | | | Namespaces | Yes | Enabled by default | @@ -66,13 +66,71 @@ To create a dedicated database user, run the following commands against your dat CREATE USER 'airbyte'@'%' IDENTIFIED BY 'your_password_here'; ``` -Then give it access to the relevant schema: +The right set of permissions differ between the `STANDARD` and `CDC` replication method. +For `STANDARD` replication method, only `SELECT` permission is required. ```sql GRANT SELECT ON .* TO 'airbyte'@'%'; ``` +For `CDC` replication method, `SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT` permissions are required. +```sql +GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON .* TO 'airbyte'@'%'; +``` + +Your database user should now be ready for use with Airbyte. + +#### 3. Set up CDC + +For `STANDARD` replication method this is not applicable. 
If you select the `CDC` replication method then only this is required. Please read [the section on CDC below](mysql.md#setting-up-cdc-for-mysql) for more information. -You can limit this grant down to specific tables instead of the whole database. Note that to replicate data from multiple MySQL schemas, you can re-run the command above to grant access to all the relevant schemas, but you'll need to set up multiple sources connecting to the same db on multiple schemas. +#### 4. That's it! Your database user should now be ready for use with Airbyte. +## Change Data Capture \(CDC\) + +* If you need a record of deletions and can accept the limitations posted below, you should be able to use CDC for MySQL. +* If your data set is small, and you just want snapshot of your table in the destination, consider using Full Refresh replication for your table instead of CDC. +* If the limitations prevent you from using CDC and your goal is to maintain a snapshot of your table in the destination, consider using non-CDC incremental and occasionally reset the data and re-sync. +* If your table has a primary key but doesn't have a reasonable cursor field for incremental syncing \(i.e. `updated_at`\), CDC allows you to sync your table incrementally. + +### CDC Limitations + +* Make sure to read our [CDC docs](../../understanding-airbyte/cdc.md) to see limitations that impact all databases using CDC replication. +* Our CDC implementation uses at least once delivery for all change records. + +### Setting up CDC for MySQL +You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes. + +#### Enable binary logging + +You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes. You can configure your MySQL server configuration file with the following properties, which are described in below: +``` +server-id = 223344 +log_bin = mysql-bin +binlog_format = ROW +binlog_row_image = FULL +expire_logs_days = 10 +``` +* server-id : The value for the server-id must be unique for each server and replication client in the MySQL cluster. The `server-id` should be a non-zero value. If the `server-id` is already set to a non-zero value, you don't need to make any change. You can set the `server-id` to any value between 1 and 4294967295. For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/replication-options.html#sysvar_server_id) +* log_bin : The value of log_bin is the base name of the sequence of binlog files. If the `log_bin` is already set, you don't need to make any change. For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/replication-options-binary-log.html#option_mysqld_log-bin) +* binlog_format : The `binlog_format` must be set to `ROW`. For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/replication-options-binary-log.html#sysvar_binlog_format) +* binlog_row_image : The `binlog_row_image` must be set to `FULL`. It determines how row images are written to the binary log. For more information refer [mysql doc](https://dev.mysql.com/doc/refman/5.7/en/replication-options-binary-log.html#sysvar_binlog_row_image) +* expire_logs_days : This is the number of days for automatic binlog file removal. We recommend 10 days so that in case of a failure in sync or if the sync is paused, we still have some bandwidth to start from the last point in incremental sync. 
We also recommend setting frequent syncs for CDC. + +#### Enable GTIDs \(Optional\) +Global transaction identifiers (GTIDs) uniquely identify transactions that occur on a server within a cluster. +Though not required for a Airbyte MySQL connector, using GTIDs simplifies replication and enables you to more easily confirm if primary and replica servers are consistent. +For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/replication-options-gtids.html#option_mysqld_gtid-mode) +* Enable gtid_mode : Boolean that specifies whether GTID mode of the MySQL server is enabled or not. Enable it via `mysql> gtid_mode=ON` +* Enable enforce_gtid_consistency : Boolean that specifies whether the server enforces GTID consistency by allowing the execution of statements that can be logged in a transactionally safe manner. Required when using GTIDs. Enable it via `mysql> enforce_gtid_consistency=ON` + +####Note + +When a sync runs for the first time using CDC, Airbyte performs an initial consistent snapshot of your database. +Airbyte doesn't acquire any table locks (for tables defined with MyISAM engine, the tables would still be locked) while creating the snapshot to allow writes by other database clients. +But in order for the sync to work without any error/unexpected behaviour, it is assumed that no schema changes are happening while the snapshot is running. + + + + diff --git a/docs/understanding-airbyte/cdc.md b/docs/understanding-airbyte/cdc.md index fc20f286d3bf..737ab07bccaf 100644 --- a/docs/understanding-airbyte/cdc.md +++ b/docs/understanding-airbyte/cdc.md @@ -14,7 +14,8 @@ The Airbyte Protocol outputs records from sources. Records from `UPDATE` stateme We add some metadata columns for CDC sources: -* `ab_cdc_lsn` is the point in the log where the record was retrieved +* `ab_cdc_lsn` (specific to postgres source) is the point in the log where the record was retrieved +* `ab_cdc_log_file` & `ab_cdc_log_pos` (specific to mysql source) is the file name and position in the file where the record was retrieved * `ab_cdc_updated_at` is the timestamp for the database transaction that resulted in this record change and is present for records from `DELETE`/`INSERT`/`UPDATE` statements * `ab_cdc_deleted_at` is the timestamp for the database transaction that resulted in this record change and is only present for records from `DELETE` statements @@ -30,10 +31,10 @@ We add some metadata columns for CDC sources: ## Current Support * [Postgres](../integrations/sources/postgres.md) (For a quick video overview of CDC on Postgres, click [here](https://www.youtube.com/watch?v=NMODvLgZvuE&ab_channel=Airbyte)) +* [MySQL](../integrations/sources/mysql.md) ## Coming Soon -* [MySQL](../integrations/sources/mysql.md) * [SQL Server / MSSQL](../integrations/sources/mssql.md) * Oracle DB * Please [create a ticket](https://github.com/airbytehq/airbyte/issues/new/choose) if you need CDC support on another database! 
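
The binlog requirements documented in this patch (log_bin = ON, binlog_format = ROW, binlog_row_image = FULL) are also what the next patch begins validating inside `MySqlSource#getCheckOperations`. As a rough standalone illustration of that validation (this is not connector code; the class name, connection URL, and credentials below are placeholders), the same three server variables can be inspected over plain JDBC:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Map;

public class BinlogSettingsCheck {

  // Expected values for CDC, as described in the docs added above.
  private static final Map<String, String> EXPECTED = Map.of(
      "log_bin", "ON",
      "binlog_format", "ROW",
      "binlog_row_image", "FULL");

  public static void main(String[] args) throws SQLException {
    // Placeholder connection details; replace with a real host, user and password.
    try (Connection connection = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306", "airbyte", "your_password_here")) {
      for (Map.Entry<String, String> entry : EXPECTED.entrySet()) {
        String actual = readVariable(connection, entry.getKey());
        if (!entry.getValue().equalsIgnoreCase(actual)) {
          throw new RuntimeException(String.format(
              "The variable %s should be set to %s, but it is: %s",
              entry.getKey(), entry.getValue(), actual));
        }
      }
      System.out.println("binlog settings look compatible with CDC");
    }
  }

  private static String readVariable(Connection connection, String name) throws SQLException {
    // SHOW VARIABLES returns two columns: Variable_name and Value.
    try (Statement statement = connection.createStatement();
        ResultSet resultSet = statement.executeQuery(
            "show variables where Variable_name = '" + name + "'")) {
      return resultSet.next() ? resultSet.getString("Value") : null;
    }
  }

}
```

Running the sketch only needs `mysql-connector-java` on the classpath; the connector itself performs the equivalent queries inside its check operations, as the following patch shows.
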
From 94e844271f5c9fd07f5dba85db137e54d8315fb6 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 15:03:55 +0530 Subject: [PATCH 10/13] remove un-necessary new lines + add multiple checks for cdc --- .../source/mysql/MySqlSource.java | 61 +++++++++++++++++++ docs/integrations/sources/mysql.md | 10 ++- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 61e171f4de61..9997d84866db 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -31,6 +31,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableMap; +import io.airbyte.commons.functional.CheckedConsumer; import io.airbyte.commons.json.Jsons; import io.airbyte.commons.util.AutoCloseableIterator; import io.airbyte.commons.util.AutoCloseableIterators; @@ -52,6 +53,7 @@ import io.airbyte.protocol.models.SyncMode; import io.debezium.engine.ChangeEvent; import java.time.Instant; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; @@ -108,6 +110,65 @@ private static AirbyteStream addCdcMetadataColumns(AirbyteStream stream) { return stream; } + @Override + public List> getCheckOperations(JsonNode config) throws Exception { + final List> checkOperations = new ArrayList<>(super.getCheckOperations(config)); + if (isCdc(config)) { + checkOperations.add(database -> { + List matchingSlots = database.resultSetQuery(connection -> { + final String sql = "show variables where Variable_name = 'log_bin'"; + + return connection.createStatement().executeQuery(sql); + }, resultSet -> resultSet.getString("Value")).collect(toList()); + + if (matchingSlots.size() != 1) { + throw new RuntimeException("Could not query the variable log_bin"); + } + + String logBin = matchingSlots.get(0); + if (!logBin.equalsIgnoreCase("ON")) { + throw new RuntimeException("The variable log_bin should be set to ON, but it is : " + logBin); + } + }); + + checkOperations.add(database -> { + List matchingSlots = database.resultSetQuery(connection -> { + final String sql = "show variables where Variable_name = 'binlog_format'"; + + return connection.createStatement().executeQuery(sql); + }, resultSet -> resultSet.getString("Value")).collect(toList()); + + if (matchingSlots.size() != 1) { + throw new RuntimeException("Could not query the variable binlog_format"); + } + + String binlogFormat = matchingSlots.get(0); + if (!binlogFormat.equalsIgnoreCase("ROW")) { + throw new RuntimeException("The variable binlog_format should be set to ROW, but it is : " + binlogFormat); + } + }); + } + + checkOperations.add(database -> { + List matchingSlots = database.resultSetQuery(connection -> { + final String sql = "show variables where Variable_name = 'binlog_row_image'"; + + return connection.createStatement().executeQuery(sql); + }, resultSet -> resultSet.getString("Value")).collect(toList()); + + if (matchingSlots.size() != 1) { + throw new RuntimeException("Could not query the variable binlog_row_image"); + } + + String binlogRowImage = matchingSlots.get(0); + if (!binlogRowImage.equalsIgnoreCase("FULL")) { 
+ throw new RuntimeException("The variable binlog_row_image should be set to FULL, but it is : " + binlogRowImage); + } + }); + + return checkOperations; + } + @Override public AirbyteCatalog discover(JsonNode config) throws Exception { AirbyteCatalog catalog = super.discover(config); diff --git a/docs/integrations/sources/mysql.md b/docs/integrations/sources/mysql.md index c9272894a903..3b3ed868bfda 100644 --- a/docs/integrations/sources/mysql.md +++ b/docs/integrations/sources/mysql.md @@ -74,7 +74,7 @@ GRANT SELECT ON .* TO 'airbyte'@'%'; ``` For `CDC` replication method, `SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT` permissions are required. ```sql -GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON .* TO 'airbyte'@'%'; +GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'airbyte'@'%'; ``` Your database user should now be ready for use with Airbyte. @@ -100,6 +100,7 @@ Your database user should now be ready for use with Airbyte. * Our CDC implementation uses at least once delivery for all change records. ### Setting up CDC for MySQL + You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes. #### Enable binary logging @@ -119,6 +120,7 @@ expire_logs_days = 10 * expire_logs_days : This is the number of days for automatic binlog file removal. We recommend 10 days so that in case of a failure in sync or if the sync is paused, we still have some bandwidth to start from the last point in incremental sync. We also recommend setting frequent syncs for CDC. #### Enable GTIDs \(Optional\) + Global transaction identifiers (GTIDs) uniquely identify transactions that occur on a server within a cluster. Though not required for a Airbyte MySQL connector, using GTIDs simplifies replication and enables you to more easily confirm if primary and replica servers are consistent. For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/replication-options-gtids.html#option_mysqld_gtid-mode) @@ -129,8 +131,4 @@ For more information refer [mysql doc](https://dev.mysql.com/doc/refman/8.0/en/r When a sync runs for the first time using CDC, Airbyte performs an initial consistent snapshot of your database. Airbyte doesn't acquire any table locks (for tables defined with MyISAM engine, the tables would still be locked) while creating the snapshot to allow writes by other database clients. -But in order for the sync to work without any error/unexpected behaviour, it is assumed that no schema changes are happening while the snapshot is running. - - - - +But in order for the sync to work without any error/unexpected behaviour, it is assumed that no schema changes are happening while the snapshot is running. 
\ No newline at end of file From 29b250a2ca291eb41d73e923527e4b3cc453ab01 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 21:05:43 +0530 Subject: [PATCH 11/13] address review comments from Davin --- .../connectors/source-mysql/build.gradle | 8 ++-- .../source/mysql/DebeziumRecordIterator.java | 39 ++++++++++--------- .../mysql/FilteredFileDatabaseHistory.java | 2 +- .../source/mysql/MySqlSource.java | 6 +-- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/airbyte-integrations/connectors/source-mysql/build.gradle b/airbyte-integrations/connectors/source-mysql/build.gradle index 6e582da14ef5..10c28b4d418c 100644 --- a/airbyte-integrations/connectors/source-mysql/build.gradle +++ b/airbyte-integrations/connectors/source-mysql/build.gradle @@ -11,14 +11,14 @@ application { dependencies { implementation project(':airbyte-db') implementation project(':airbyte-integrations:bases:base-java') - implementation project(':airbyte-protocol:models') implementation project(':airbyte-integrations:connectors:source-jdbc') + implementation project(':airbyte-protocol:models') - implementation 'mysql:mysql-connector-java:8.0.22' - implementation 'org.apache.commons:commons-lang3:3.11' - implementation 'io.debezium:debezium-embedded:1.4.2.Final' implementation 'io.debezium:debezium-api:1.4.2.Final' implementation 'io.debezium:debezium-connector-mysql:1.4.2.Final' + implementation 'io.debezium:debezium-embedded:1.4.2.Final' + implementation 'mysql:mysql-connector-java:8.0.22' + implementation 'org.apache.commons:commons-lang3:3.11' testImplementation testFixtures(project(':airbyte-integrations:connectors:source-jdbc')) testImplementation 'org.apache.commons:commons-lang3:3.11' diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java index 701fe3da3733..f302f0f84dbd 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/DebeziumRecordIterator.java @@ -38,15 +38,15 @@ import org.slf4j.LoggerFactory; /** - * The record iterator is the consumer (in the producer / consumer relationship with debezium) is + * The record iterator is the consumer (in the producer / consumer relationship with debezium) * responsible for 1. making sure every record produced by the record publisher is processed 2. * signalling to the record publisher when it is time for it to stop producing records. It emits * this signal either when the publisher had not produced a new record for a long time or when it * has processed at least all of the records that were present in the database when the source was * started. Because the publisher might publish more records between the consumer sending this - * signal and the publisher acutally shutting down, the consumer must stay alive as long as the - * publisher is not closed or if there are any new records for it to process (even if the publisher - * is closed). + * signal and the publisher actually shutting down, the consumer must stay alive as long as the + * publisher is not closed. Even after the publisher is closed, the consumer will finish processing + * any produced records before closing. 
*/ public class DebeziumRecordIterator extends AbstractIterator> implements AutoCloseableIterator> { @@ -108,23 +108,24 @@ public void close() throws Exception { } private boolean shouldSignalClose(ChangeEvent event) { - if (targetFilePosition.isPresent()) { - String file = Jsons.deserialize(event.value()).get("source").get("file").asText(); - int position = Jsons.deserialize(event.value()).get("source").get("pos").asInt(); - if (file.equals(targetFilePosition.get().fileName)) { - if (targetFilePosition.get().position >= position) { - return false; - } else { - // if not snapshot or is snapshot but last record in snapshot. - return SnapshotMetadata.TRUE != SnapshotMetadata.valueOf( - Jsons.deserialize(event.value()).get("source").get("snapshot").asText() - .toUpperCase()); - - } - } + if (targetFilePosition.isEmpty()) { + return false; + } + + String file = Jsons.deserialize(event.value()).get("source").get("file").asText(); + int position = Jsons.deserialize(event.value()).get("source").get("pos").asInt(); + if (!file.equals(targetFilePosition.get().fileName)) { + return false; + } + + if (targetFilePosition.get().position >= position) { + return false; } - return false; + // if not snapshot or is snapshot but last record in snapshot. + return SnapshotMetadata.TRUE != SnapshotMetadata.valueOf( + Jsons.deserialize(event.value()).get("source").get("snapshot").asText() + .toUpperCase()); } private void requestClose() { diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java index 741adbea98e9..91307e679d91 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/FilteredFileDatabaseHistory.java @@ -40,7 +40,7 @@ /** * MySQL Debezium connector monitors the database schema evolution over the time and stores the data - * in database history file. Without this file we can't fetch the records from binlog. We need to + * in a database history file. Without this file we can't fetch the records from binlog. We need to * save the contents of the file. Debezium by default uses * {@link io.debezium.relational.history.FileDatabaseHistory} class to write the schema information * in the file. 
The problem is that the Debezium tracks the schema evolution of all the tables in diff --git a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java index 9997d84866db..53372ed9ead3 100644 --- a/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java +++ b/airbyte-integrations/connectors/source-mysql/src/main/java/io/airbyte/integrations/source/mysql/MySqlSource.java @@ -209,14 +209,12 @@ public JsonNode toJdbcConfig(JsonNode config) { } private static boolean isCdc(JsonNode config) { - final boolean isCdc = config.hasNonNull("replication_method") + return config.hasNonNull("replication_method") && ReplicationMethod.valueOf(config.get("replication_method").asText()) .equals(ReplicationMethod.CDC); - - return isCdc; } - static boolean shouldUseCDC(ConfiguredAirbyteCatalog catalog) { + private static boolean shouldUseCDC(ConfiguredAirbyteCatalog catalog) { Optional any = catalog.getStreams().stream().map(ConfiguredAirbyteStream::getSyncMode) .filter(syncMode -> syncMode == SyncMode.INCREMENTAL).findAny(); return any.isPresent(); From 6a6a55e58428f9335f681a0deda2bf2a8a3c2d47 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 23:38:55 +0530 Subject: [PATCH 12/13] increase the version in source_definitions.yaml --- .../init/src/main/resources/seed/source_definitions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 6c7bf854524b..f749cc25d768 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -61,7 +61,7 @@ - sourceDefinitionId: 435bb9a5-7887-4809-aa58-28c27df0d7ad name: MySQL dockerRepository: airbyte/source-mysql - dockerImageTag: 0.3.0 + dockerImageTag: 0.3.1 documentationUrl: https://docs.airbyte.io/integrations/sources/mysql icon: mysql.svg - sourceDefinitionId: 2470e835-feaf-4db6-96f3-70fd645acc77 From 2de9894a67ccc1e39f3ebc3ca630e3bdb6898c97 Mon Sep 17 00:00:00 2001 From: subodh Date: Mon, 24 May 2021 23:51:55 +0530 Subject: [PATCH 13/13] rebuild seed --- .../435bb9a5-7887-4809-aa58-28c27df0d7ad.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/435bb9a5-7887-4809-aa58-28c27df0d7ad.json b/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/435bb9a5-7887-4809-aa58-28c27df0d7ad.json index 175eacde342a..8951b5276662 100644 --- a/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/435bb9a5-7887-4809-aa58-28c27df0d7ad.json +++ b/airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/435bb9a5-7887-4809-aa58-28c27df0d7ad.json @@ -2,7 +2,7 @@ "sourceDefinitionId": "435bb9a5-7887-4809-aa58-28c27df0d7ad", "name": "MySQL", "dockerRepository": "airbyte/source-mysql", - "dockerImageTag": "0.3.0", + "dockerImageTag": "0.3.1", "documentationUrl": "https://docs.airbyte.io/integrations/sources/mysql", "icon": "mysql.svg" }
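
Stepping back from the two housekeeping patches above, the most subtle piece of the series is the stop condition that [PATCH 11/13] tidies up in `DebeziumRecordIterator#shouldSignalClose`: the consumer asks the embedded engine to shut down only once it has processed an event on the target binlog file past the position recorded at the start of the sync, and only if that event is not still in the middle of the initial snapshot. The sketch below restates that rule in isolation and is illustrative only: `TargetFilePosition` is a local stand-in for the connector's class of the same name, and Debezium's three-valued snapshot metadata (`true`, `last`, `false`) is collapsed into a single boolean.

```java
import java.util.Optional;

public class BinlogStopDecision {

  /** Illustrative stand-in for the target binlog file/position captured at sync start. */
  static class TargetFilePosition {

    final String fileName;
    final long position;

    TargetFilePosition(String fileName, long position) {
      this.fileName = fileName;
      this.position = position;
    }

  }

  /**
   * True once it is safe to ask the embedded engine to shut down: the event sits on the
   * target binlog file, past the recorded position, and is not still mid-snapshot.
   * (Debezium's snapshot metadata of true/last/false is simplified to one boolean here.)
   */
  static boolean shouldSignalClose(Optional<TargetFilePosition> target,
                                   String eventFile,
                                   long eventPosition,
                                   boolean stillInSnapshot) {
    if (target.isEmpty()) {
      return false; // no target recorded: never signal close from the consumer side
    }
    if (!eventFile.equals(target.get().fileName)) {
      return false; // still reading a binlog file other than the target
    }
    if (eventPosition <= target.get().position) {
      return false; // not yet past the position captured when the sync started
    }
    return !stillInSnapshot;
  }

  public static void main(String[] args) {
    Optional<TargetFilePosition> target =
        Optional.of(new TargetFilePosition("mysql-bin.000003", 154));
    System.out.println(shouldSignalClose(target, "mysql-bin.000003", 120, false)); // false
    System.out.println(shouldSignalClose(target, "mysql-bin.000003", 500, true));  // false
    System.out.println(shouldSignalClose(target, "mysql-bin.000003", 500, false)); // true
  }

}
```

The real iterator also keeps draining any records the publisher emitted before it actually shut down, which is why the javadoc reworded in that patch stresses that the consumer outlives the close signal.
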