From 8d73aa020fc82eed0c9d23937111b531911ee462 Mon Sep 17 00:00:00 2001 From: hantmac Date: Sun, 27 Nov 2022 13:23:13 +0800 Subject: [PATCH 01/34] feat: Add databend destination --- .../destination-databend/.dockerignore | 5 + .../6045ffa4-d502-40fe-807f-d21e71b74ab5.csv | 10 ++ .../destination-databend/Dockerfile | 38 ++++ .../connectors/destination-databend/README.md | 123 +++++++++++++ .../destination-databend/build.gradle | 8 + .../destination_databend/__init__.py | 8 + .../destination_databend/__init__.pyc | Bin 0 -> 307 bytes .../destination_databend/client.py | 19 ++ .../destination_databend/destination.py | 85 +++++++++ .../destination_databend/spec.json | 81 +++++++++ .../destination_databend/writer.py | 122 +++++++++++++ .../integration_tests/integration_test.py | 143 +++++++++++++++ .../connectors/destination-databend/main.py | 11 ++ .../destination-databend/requirements.txt | 1 + .../connectors/destination-databend/setup.py | 27 +++ .../unit_tests/test_databend_destination.py | 163 ++++++++++++++++++ .../unit_tests/unit_test.py | 56 ++++++ docs/integrations/destinations/databend.md | 54 ++++++ 18 files changed, 954 insertions(+) create mode 100644 airbyte-integrations/connectors/destination-databend/.dockerignore create mode 100644 airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv create mode 100644 airbyte-integrations/connectors/destination-databend/Dockerfile create mode 100644 airbyte-integrations/connectors/destination-databend/README.md create mode 100644 airbyte-integrations/connectors/destination-databend/build.gradle create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/__init__.py create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/__init__.pyc create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/client.py create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/destination.py create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/spec.json create mode 100644 airbyte-integrations/connectors/destination-databend/destination_databend/writer.py create mode 100644 airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py create mode 100644 airbyte-integrations/connectors/destination-databend/main.py create mode 100644 airbyte-integrations/connectors/destination-databend/requirements.txt create mode 100644 airbyte-integrations/connectors/destination-databend/setup.py create mode 100644 airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py create mode 100644 airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py create mode 100644 docs/integrations/destinations/databend.md diff --git a/airbyte-integrations/connectors/destination-databend/.dockerignore b/airbyte-integrations/connectors/destination-databend/.dockerignore new file mode 100644 index 000000000000..57f4cf36c057 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/.dockerignore @@ -0,0 +1,5 @@ +* +!Dockerfile +!main.py +!destination_databend +!setup.py diff --git a/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv b/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv new file mode 100644 index 000000000000..6dc78f6b56da --- /dev/null +++ 
b/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv @@ -0,0 +1,10 @@ +904c3361-a443-4985-9c81-f1331a5b7ba2,2022-11-27 12:28:25.070203,"{""str_col"": ""0"", ""int_col"": 0}" +7d47f811-da9f-4a2c-9df3-82f14d0cd232,2022-11-27 12:28:25.070218,"{""str_col"": ""1"", ""int_col"": 1}" +f8546f7a-18b2-46f7-b184-bd9b9dc76e41,2022-11-27 12:28:25.070237,"{""str_col"": ""2"", ""int_col"": 2}" +7cbb0577-e6d3-485d-9671-0088d0913cfb,2022-11-27 12:28:25.070250,"{""str_col"": ""3"", ""int_col"": 3}" +3bef3e8d-aaf4-467f-93dd-683249f9c1d0,2022-11-27 12:28:25.070263,"{""str_col"": ""4"", ""int_col"": 4}" +f3f57c21-d980-4cd0-aa1a-3e6f0092fd95,2022-11-27 12:28:25.070340,"{""str_col"": ""5"", ""int_col"": 5}" +7a6525a0-451c-45a8-841f-5d8ba160aa1c,2022-11-27 12:28:25.070353,"{""str_col"": ""6"", ""int_col"": 6}" +b0994bdb-90db-4c23-bce0-25654eb8175c,2022-11-27 12:28:25.070367,"{""str_col"": ""7"", ""int_col"": 7}" +93dc6599-43f3-4567-980b-448f2d383f97,2022-11-27 12:28:25.070380,"{""str_col"": ""8"", ""int_col"": 8}" +645b1c0a-516c-4740-998f-08cd0c162578,2022-11-27 12:28:25.070392,"{""str_col"": ""9"", ""int_col"": 9}" diff --git a/airbyte-integrations/connectors/destination-databend/Dockerfile b/airbyte-integrations/connectors/destination-databend/Dockerfile new file mode 100644 index 000000000000..6619497f7f83 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.9.11-alpine3.15 as base + +# build and load all requirements +FROM base as builder +WORKDIR /airbyte/integration_code + +# upgrade pip to the latest version +RUN apk --no-cache upgrade \ + && pip install --upgrade pip \ + && apk --no-cache add tzdata build-base + + +COPY setup.py ./ +# install necessary packages to a temporary folder +RUN pip install --prefix=/install . + +# build a clean environment +FROM base +WORKDIR /airbyte/integration_code + +# copy all loaded and built libraries to a pure basic image +COPY --from=builder /install /usr/local +# add default timezone settings +COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime +RUN echo "Etc/UTC" > /etc/timezone + +# bash is installed for more convenient debugging. +RUN apk --no-cache add bash + +# copy payload code only +COPY main.py ./ +COPY destination_databend ./destination_databend + +ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" +ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] + +LABEL io.airbyte.version=0.1.0 +LABEL io.airbyte.name=airbyte/destination-databend diff --git a/airbyte-integrations/connectors/destination-databend/README.md b/airbyte-integrations/connectors/destination-databend/README.md new file mode 100644 index 000000000000..8ef9f9a85c16 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/README.md @@ -0,0 +1,123 @@ +# Databend Destination + +This is the repository for the Databend destination connector, written in Python. +For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/destinations/databend). + +## Local development + +### Prerequisites +**To iterate on this connector, make sure to complete this prerequisites section.** + +#### Minimum Python version required `= 3.7.0` + +#### Build & Activate Virtual Environment and install dependencies +From this connector directory, create a virtual environment: +``` +python -m venv .venv +``` + +This will generate a virtualenv for this module in `.venv/`. 
Make sure this venv is active in your +development environment of choice. To activate it from the terminal, run: +``` +source .venv/bin/activate +pip install -r requirements.txt +``` +If you are in an IDE, follow your IDE's instructions to activate the virtualenv. + +Note that while we are installing dependencies from `requirements.txt`, you should only edit `setup.py` for your dependencies. `requirements.txt` is +used for editable installs (`pip install -e`) to pull in Python dependencies from the monorepo and will call `setup.py`. +If this is mumbo jumbo to you, don't worry about it, just put your deps in `setup.py` but install using `pip install -r requirements.txt` and everything +should work as you expect. + +#### Building via Gradle +From the Airbyte repository root, run: +``` +./gradlew :airbyte-integrations:connectors:destination-databend:build +``` + +#### Create credentials +**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.io/integrations/destinations/databend) +to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `destination_databend/spec.json` file. +Note that the `secrets` directory is gitignored by default, so there is no danger of accidentally checking in sensitive information. +See `integration_tests/sample_config.json` for a sample config file. + +**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `destination databend test creds` +and place them into `secrets/config.json`. + +### Locally running the connector +``` +python main.py spec +python main.py check --config secrets/config.json +python main.py discover --config secrets/config.json +python main.py read --config secrets/config.json --catalog integration_tests/configured_catalog.json +``` + +### Locally running the connector docker image + +#### Build +First, make sure you build the latest Docker image: +``` +docker build . -t airbyte/destination-databend:dev +``` + +You can also build the connector image via Gradle: +``` +./gradlew :airbyte-integrations:connectors:destination-databend:airbyteDocker +``` +When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in +the Dockerfile. + +#### Run +Then run any of the connector commands as follows: +``` +docker run --rm airbyte/destination-databend:dev spec +docker run --rm -v $(pwd)/secrets:/secrets airbyte/destination-databend:dev check --config /secrets/config.json +# messages.jsonl is a file containing line-separated JSON representing AirbyteMessages +cat messages.jsonl | docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/destination-databend:dev write --config /secrets/config.json --catalog /integration_tests/configured_catalog.json +``` +## Testing + Make sure to familiarize yourself with [pytest test discovery](https://docs.pytest.org/en/latest/goodpractices.html#test-discovery) to know how your test files and methods should be named. +First install test dependencies into your virtual environment: +``` +pip install .[tests] +``` +### Unit Tests +To run unit tests locally, from the connector directory run: +``` +python -m pytest unit_tests +``` + +### Integration Tests +There are two types of integration tests: Acceptance Tests (Airbyte's test suite for all destination connectors) and custom integration tests (which are specific to this connector). 
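Both the local `python main.py ... --config secrets/config.json` commands above and the integration tests below read the same `secrets/config.json`. A minimal sketch of that file, with every value a placeholder — the keys mirror the unit-test `config` fixture added in this PR, which passes `table` where `spec.json` names the field `schema`:

```json
{
  "protocol": "https",
  "host": "<your-databend-host>",
  "port": 8081,
  "username": "<user>",
  "password": "<password>",
  "database": "default",
  "table": "default"
}
```

Keep the file under `secrets/` so it stays gitignored.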
+#### Custom Integration tests +Place custom tests inside `integration_tests/` folder, then, from the connector root, run +``` +python -m pytest integration_tests +``` +#### Acceptance Tests +Coming soon: + +### Using gradle to run tests +All commands should be run from airbyte project root. +To run unit tests: +``` +./gradlew :airbyte-integrations:connectors:destination-databend:unitTest +``` +To run acceptance and custom integration tests: +``` +./gradlew :airbyte-integrations:connectors:destination-databend:integrationTest +``` + +## Dependency Management +All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development. +We split dependencies between two groups, dependencies that are: +* required for your connector to work need to go to `MAIN_REQUIREMENTS` list. +* required for the testing need to go to `TEST_REQUIREMENTS` list + +### Publishing a new version of the connector +You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what? +1. Make sure your changes are passing unit and integration tests. +1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)). +1. Create a Pull Request. +1. Pat yourself on the back for being an awesome contributor. +1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master. diff --git a/airbyte-integrations/connectors/destination-databend/build.gradle b/airbyte-integrations/connectors/destination-databend/build.gradle new file mode 100644 index 000000000000..dd8a2bfb94e1 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/build.gradle @@ -0,0 +1,8 @@ +plugins { + id 'airbyte-python' + id 'airbyte-docker' +} + +airbytePython { + moduleDirectory 'destination_databend' +} diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/__init__.py b/airbyte-integrations/connectors/destination-databend/destination_databend/__init__.py new file mode 100644 index 000000000000..fe96a70e2b77 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from .destination import DestinationDatabend + +__all__ = ["DestinationDatabend"] diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/__init__.pyc b/airbyte-integrations/connectors/destination-databend/destination_databend/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4538a0add1a44b3d66bbe4f5a67b65c59b0c7072 GIT binary patch literal 307 zcmYjNK~BRk5L_n+P!SRmXYRQKf1pAfdgcJ)!e!;gR#=+aD7y&d#8>zuA7D}yTFW~- zo*io}*FTqw@6A`&(SB9v_b9yqL~KU`(bmz6BNr>`!77S1JvAqAhyye;*fn64)FcO! 
zl2^gPI_3BXnI4*Ywp;+>!Ll3K8ARqV49pEE9Y7}>{j;V~{cFgIdXwaPvG%i!8}&iP z^VEgPQi8lo2%*< D6Y@_X literal 0 HcmV?d00001 diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/client.py b/airbyte-integrations/connectors/destination-databend/destination_databend/client.py new file mode 100644 index 000000000000..161c6f807162 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/client.py @@ -0,0 +1,19 @@ +from databend_sqlalchemy import connector + + +class DatabendClient: + + def __init__(self, protocol: str, host: str, port: int, database: str, table: str, username: str, password: str = None): + self.protocol = protocol + self.host = host + self.port = port + self.database = database + self.table = table + self.username = username + self.password = password + + def open(self): + handle = connector.connect( + f'{self.protocol}://{self.username}:{self.password}@{self.host}:{self.port}').cursor() + + return handle diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py new file mode 100644 index 000000000000..45c20ba9c05f --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from typing import Any, Iterable, Mapping +from logging import getLogger +import json +from datetime import datetime +from airbyte_cdk import AirbyteLogger +from destination_databend.client import DatabendClient +from airbyte_cdk.destinations import Destination +from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, DestinationSyncMode, Status, Type +from .writer import create_databend_wirter +from typing import Any, Dict, Iterable, Mapping, Optional +from uuid import uuid4 + +logger = getLogger("airbyte") + + +class DestinationDatabend(Destination): + def write( + self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] + ) -> Iterable[AirbyteMessage]: + + """ + TODO + Reads the input stream of messages, config, and catalog to write data to the destination. + + This method returns an iterable (typically a generator of AirbyteMessages via yield) containing state messages received + in the input message stream. Outputting a state message means that every AirbyteRecordMessage which came before it has been + successfully persisted to the destination. This is used to ensure fault tolerance in the case that a sync fails before fully completing, + then the source is given the last state message output from this method as the starting point of the next sync. 
+ + :param config: dict of JSON configuration matching the configuration declared in spec.json + :param configured_catalog: The Configured Catalog describing the schema of the data being received and how it should be persisted in the + destination + :param input_messages: The stream of input messages received from the source + :return: Iterable of AirbyteStateMessages wrapped in AirbyteMessage structs + """ + streams = {s.stream.name for s in configured_catalog.streams} + client = DatabendClient(**config) + + writer = create_databend_wirter(client, logger) + + for configured_stream in configured_catalog.streams: + if configured_stream.destination_sync_mode == DestinationSyncMode.overwrite: + writer.delete_table(configured_stream.stream.name) + logger.info(f"Stream {configured_stream.stream.name} is wiped.") + writer.create_raw_table(configured_stream.stream.name) + + for message in input_messages: + if message.type == Type.STATE: + yield message + elif message.type == Type.RECORD: + data = message.record.data + stream = message.record.stream + # Skip unselected streams + if stream not in streams: + logger.debug(f"Stream {stream} was not present in configured streams, skipping") + continue + writer.queue_write_data(stream, str(uuid4()), datetime.now(), json.dumps(data)) + + # Flush any leftover messages + writer.flush() + + def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: + """ + Tests if the input configuration can be used to successfully connect to the destination with the needed permissions + e.g: if a provided API token or password can be used to connect and write to the destination. + + :param logger: Logging object to display debug/info/error to the logs + (logs will not be accessible via airbyte UI if they are not passed to this logger) + :param config: Json object containing the configuration of this destination, content of this json is as specified in + the properties of the spec.json file + + :return: AirbyteConnectionStatus indicating a Success or Failure + """ + try: + client = DatabendClient(**config) + cursor = client.open() + cursor.execute('select 1') + return AirbyteConnectionStatus(status=Status.SUCCEEDED) + except Exception as e: + return AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {repr(e)}") diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json new file mode 100644 index 000000000000..13f842129d6b --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json @@ -0,0 +1,81 @@ +{ + "documentationUrl" : "https://docs.airbyte.com/integrations/destinations/databend", + "supported_destination_sync_modes" : [ + "overwrite", + "append", + "append_dedup" + ], + "supportsIncremental" : true, + "supportsDBT" : true, + "supportsNormalization" : true, + "connectionSpecification" : { + "$schema" : "http://json-schema.org/draft-07/schema#", + "title" : "Destination Databend", + "type" : "object", + "required" : [ + "host", + "username", + "database" + ], + "additionalProperties" : true, + "properties" : { + "host" : { + "title" : "Host", + "description" : "Hostname of the database.", + "type" : "string", + "order" : 0 + }, + "protocol" : { + "title" : "Protocol", + "description" : "Protocol of the host.", + "type" : "string", + "examples" : [ + "https" + ], + "default" : "https", + "order" : 1 + }, + "port" : { + "title" : "Port", + 
"description" : "Port of the database.", + "type" : "integer", + "minimum" : 0, + "maximum" : 65536, + "default" : 8081, + "examples" : [ + "8081" + ], + "order" : 2 + }, + "database" : { + "title" : "DB Name", + "description" : "Name of the database.", + "type" : "string", + "order" : 3 + }, + "schema" : { + "title" : "Default Table", + "description" : "The default table was written to.", + "type" : "string", + "examples" : [ + "default" + ], + "default" : "default", + "order" : 4 + }, + "username" : { + "title" : "User", + "description" : "Username to use to access the database.", + "type" : "string", + "order" : 5 + }, + "password" : { + "title" : "Password", + "description" : "Password associated with the username.", + "type" : "string", + "airbyte_secret" : true, + "order" : 6 + } + } + } +} diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py new file mode 100644 index 000000000000..9afaebc14372 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py @@ -0,0 +1,122 @@ +from collections import defaultdict +import json +from datetime import datetime +from time import time +from airbyte_cdk import AirbyteLogger +from itertools import chain +from destination_databend.client import DatabendClient + + +class DatabendWriter: + """ + Base class for shared writer logic. + """ + + flush_interval = 1000 + + def __init__(self, client: DatabendClient) -> None: + """ + :param client: Databend SDK connection class with established connection + to the databse. + """ + self.client = client + self.cursor = client.open() + self._buffer = defaultdict(list) + self._values = 0 + + def delete_table(self, name: str) -> None: + """ + Delete the resulting table. + Primarily used in Overwrite strategy to clean up previous data. + + :param name: table name to delete. + """ + cursor = self.cursor + cursor.execute(f"DROP TABLE IF EXISTS _airbyte_raw_{name}") + + def create_raw_table(self, name: str): + """ + Create the resulting _airbyte_raw table. + + :param name: table name to create. + """ + query = f""" + CREATE TABLE IF NOT EXISTS _airbyte_raw_{name} ( + _airbyte_ab_id TEXT, + _airbyte_emitted_at TIMESTAMP, + _airbyte_data TEXT + ) + """ + cursor = self.cursor + cursor.execute(query) + + def queue_write_data(self, stream_name: str, id: str, time: datetime, record: str) -> None: + """ + Queue up data in a buffer in memory before writing to the database. + When flush_interval is reached data is persisted. + + :param stream_name: name of the stream for which the data corresponds. + :param id: unique identifier of this data row. + :param time: time of writing. + :param record: string representation of the json data payload. + """ + self._buffer[stream_name].append((id, time, record)) + self._values += 1 + if self._values == self.flush_interval: + self._flush() + + def _flush(self): + """ + Stub for the intermediate data flush that's triggered during the + buffering operation. + """ + raise NotImplementedError() + + def flush(self): + """ + Stub for the data flush at the end of writing operation. + """ + raise NotImplementedError() + + +class DatabendSQLWriter(DatabendWriter): + """ + Data writer using the SQL writing strategy. Data is buffered in memory + and flushed using INSERT INTO SQL statement. This is less effective strategy + better suited for testing and small data sets. 
+ """ + + flush_interval = 1000 + + def __init__(self, client: DatabendClient) -> None: + """ + :param client: Databend SDK connection class with established connection + to the databse. + """ + super().__init__(client) + + def _flush(self) -> None: + """ + Intermediate data flush that's triggered during the + buffering operation. Writes data stored in memory via SQL commands. + databend connector insert into table using stage + """ + cursor = self.cursor + # id, written_at, data + for table, data in self._buffer.items(): + cursor.execute(f"INSERT INTO _airbyte_raw_{table} (_airbyte_ab_id,_airbyte_emitted_at,_airbyte_data) VALUES (%, %, %)", + list(chain.from_iterable(data))) + self._buffer.clear() + self._values = 0 + + def flush(self) -> None: + """ + Final data flush after all data has been written to memory. + """ + self._flush() + + +def create_databend_wirter(client: DatabendClient, logger: AirbyteLogger) -> DatabendWriter: + logger.info("Using the SQL writing strategy") + writer = DatabendSQLWriter(client) + return writer diff --git a/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py new file mode 100644 index 000000000000..aa937dcb4802 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py @@ -0,0 +1,143 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import json +from typing import Any, Dict, List, Mapping + +import pytest +from airbyte_cdk import AirbyteLogger +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStateMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + Status, + SyncMode, + Type, +) +from destination_databend import DestinationDatabend +from destination_databend.client import DatabendClient + + +@pytest.fixture(name="config") +def config_fixture() -> Mapping[str, Any]: + with open("secrets/config.json", "r") as f: + return json.loads(f.read()) + + +@pytest.fixture(name="configured_catalog") +def configured_catalog_fixture() -> ConfiguredAirbyteCatalog: + stream_schema = {"type": "object", "properties": {"string_col": {"type": "str"}, "int_col": {"type": "integer"}}} + + append_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="append_stream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + + overwrite_stream = ConfiguredAirbyteStream( + stream=AirbyteStream(name="overwrite_stream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental]), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + + return ConfiguredAirbyteCatalog(streams=[append_stream, overwrite_stream]) + + +@pytest.fixture(autouse=True) +def teardown(config: Mapping): + yield + client = DatabendClient(**config) + cursor = client.open() + cursor.close() + + +@pytest.fixture(name="client") +def client_fixture(config) -> DatabendClient: + return DatabendClient(**config) + + +def test_check_valid_config(config: Mapping): + outcome = DestinationDatabend().check(AirbyteLogger(), config) + assert outcome.status == Status.SUCCEEDED + + +def test_check_invalid_config(): + outcome = DestinationDatabend().check(AirbyteLogger(), {"bucket_id": "not_a_real_id"}) + assert outcome.status == Status.FAILED + + +def _state(data: Dict[str, Any]) 
-> AirbyteMessage: + return AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage(data=data)) + + +def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage: + return AirbyteMessage( + type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0) + ) + + +def retrieve_all_records(stream_name: str, client: DatabendClient) -> List[AirbyteRecordMessage]: + """retrieves and formats all records in databend as Airbyte messages""" + cursor = client.open() + cursor.execute(f"select * from _airbyte_raw_{stream_name}") + all_records = cursor.fetchall() + out = [] + for record in all_records: + # key = record[0] + # stream = key.split("__ab__")[0] + value = json.loads(record[2]) + out.append(_record(stream_name, value["str_col"], value["int_col"])) + return out + + +def test_write(config: Mapping, configured_catalog: ConfiguredAirbyteCatalog, client: DatabendClient): + """ + This test verifies that: + 1. writing a stream in "overwrite" mode overwrites any existing data for that stream + 2. writing a stream in "append" mode appends new records without deleting the old ones + 3. The correct state message is output by the connector at the end of the sync + """ + append_stream, overwrite_stream = configured_catalog.streams[0].stream.name, configured_catalog.streams[1].stream.name + first_state_message = _state({"state": "1"}) + first_record_chunk = [_record(append_stream, str(i), i) for i in range(5)] + [_record(overwrite_stream, str(i), i) for i in range(5)] + + second_state_message = _state({"state": "2"}) + second_record_chunk = [_record(append_stream, str(i), i) for i in range(5, 10)] + [ + _record(overwrite_stream, str(i), i) for i in range(5, 10) + ] + + destination = DestinationDatabend() + + expected_states = [first_state_message, second_state_message] + output_states = list( + destination.write( + config, configured_catalog, [*first_record_chunk, first_state_message, *second_record_chunk, second_state_message] + ) + ) + assert expected_states == output_states, "Checkpoint state messages were expected from the destination" + + expected_records = [_record(append_stream, str(i), i) for i in range(10)] + [_record(overwrite_stream, str(i), i) for i in range(10)] + records_in_destination = retrieve_all_records("append_stream", client) + print("des", records_in_destination) + print("expect", expected_records) + assert expected_records == records_in_destination, "Records in destination should match records expected" + + # After this sync we expect the append stream to have 15 messages and the overwrite stream to have 5 + third_state_message = _state({"state": "3"}) + third_record_chunk = [_record(append_stream, str(i), i) for i in range(10, 15)] + [ + _record(overwrite_stream, str(i), i) for i in range(10, 15) + ] + + output_states = list(destination.write(config, configured_catalog, [*third_record_chunk, third_state_message])) + assert [third_state_message] == output_states + + records_in_destination = retrieve_all_records('append_stream', client) + expected_records = [_record(append_stream, str(i), i) for i in range(15)] + [ + _record(overwrite_stream, str(i), i) for i in range(10, 15) + ] + assert expected_records == records_in_destination diff --git a/airbyte-integrations/connectors/destination-databend/main.py b/airbyte-integrations/connectors/destination-databend/main.py new file mode 100644 index 000000000000..17cced87eeb9 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/main.py @@ -0,0 
+1,11 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import sys + +from destination_databend import DestinationDatabend + +if __name__ == "__main__": + DestinationDatabend().run(sys.argv[1:]) diff --git a/airbyte-integrations/connectors/destination-databend/requirements.txt b/airbyte-integrations/connectors/destination-databend/requirements.txt new file mode 100644 index 000000000000..d6e1198b1ab1 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/requirements.txt @@ -0,0 +1 @@ +-e . diff --git a/airbyte-integrations/connectors/destination-databend/setup.py b/airbyte-integrations/connectors/destination-databend/setup.py new file mode 100644 index 000000000000..462605e00103 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/setup.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from setuptools import find_packages, setup + +MAIN_REQUIREMENTS = [ + "airbyte-cdk", + "requests", + "databend-sqlalchemy", + "databend-py" +] + +TEST_REQUIREMENTS = ["pytest~=6.1"] +setup( + name="destination_databend", + description="Destination implementation for Databend.", + author="Airbyte", + author_email="contact@airbyte.io", + packages=find_packages(), + install_requires=MAIN_REQUIREMENTS, + package_data={"": ["*.json"]}, + extras_require={ + "tests": TEST_REQUIREMENTS, + }, +) diff --git a/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py b/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py new file mode 100644 index 000000000000..c20039df9de1 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py @@ -0,0 +1,163 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from datetime import datetime +from typing import Any, Dict +from unittest.mock import AsyncMock, MagicMock, call, patch + +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteRecordMessage, + AirbyteStream, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + Status, + SyncMode, + Type, +) +from destination_databend.destination import DestinationDatabend, DatabendClient +from pytest import fixture, mark + + +@fixture +def logger() -> MagicMock: + return MagicMock() + + +@fixture +def config() -> Dict[str, str]: + args = { + "database": "default", + "username": "root", + "password": "root", + "host": "localhost", + "protocol": "http", + "port": 8081, + "table": "default", + } + return args + + +@fixture(name="mock_connection") +def async_connection_cursor_mock(): + connection = MagicMock() + cursor = AsyncMock() + connection.cursor.return_value = cursor + return connection, cursor + + +@fixture +def configured_stream1() -> ConfiguredAirbyteStream: + return ConfiguredAirbyteStream( + stream=AirbyteStream( + name="table1", + json_schema={ + "type": "object", + "properties": {"col1": {"type": "string"}, "col2": {"type": "integer"}}, + }, + supported_sync_modes=[SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + + +@fixture +def configured_stream2() -> ConfiguredAirbyteStream: + return ConfiguredAirbyteStream( + stream=AirbyteStream( + name="table2", + json_schema={ + "type": "object", + "properties": {"col1": {"type": "string"}, "col2": {"type": "integer"}}, + }, + supported_sync_modes=[SyncMode.incremental], + ), + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append, + ) + + +@fixture +def airbyte_message1() -> AirbyteMessage: + return AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage( + stream="table1", + data={"key1": "value1", "key2": 2}, + emitted_at=int(datetime.now().timestamp()) * 1000, + ), + ) + + +@fixture +def airbyte_message2() -> AirbyteMessage: + return AirbyteMessage( + type=Type.RECORD, + record=AirbyteRecordMessage( + stream="table2", + data={"key1": "value2", "key2": 3}, + emitted_at=int(datetime.now().timestamp()) * 1000, + ), + ) + + +@fixture +def airbyte_state_message() -> AirbyteMessage: + return AirbyteMessage(type=Type.STATE) + + +@patch("destination_databend.client.DatabendClient", MagicMock()) +def test_connection(config: Dict[str, str], logger: MagicMock) -> None: + # Check no log object + DatabendClient(**config) + + +@patch("destination_databend.writer.DatabendSQLWriter") +@patch("destination_databend.client.DatabendClient") +def test_sql_write_append( + mock_connection: MagicMock, + mock_writer: MagicMock, + config: Dict[str, str], + configured_stream1: ConfiguredAirbyteStream, + configured_stream2: ConfiguredAirbyteStream, + airbyte_message1: AirbyteMessage, + airbyte_message2: AirbyteMessage, + airbyte_state_message: AirbyteMessage, +) -> None: + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream1, configured_stream2]) + + destination = DestinationDatabend() + result = destination.write(config, catalog, [airbyte_message1, airbyte_state_message, airbyte_message2]) + + assert list(result) == [airbyte_state_message] + mock_writer.return_value.delete_table.assert_not_called() + mock_writer.return_value.create_raw_table.mock_calls = [call(mock_connection, "table1"), call(mock_connection, "table2")] + assert len(mock_writer.return_value.queue_write_data.mock_calls) == 2 + 
mock_writer.return_value.flush.assert_called_once() + + +@patch("destination_databend.writer.DatabendSQLWriter") +@patch("destination_databend.client.DatabendClient") +def test_sql_write_overwrite( + mock_connection: MagicMock, + mock_writer: MagicMock, + config: Dict[str, str], + configured_stream1: ConfiguredAirbyteStream, + configured_stream2: ConfiguredAirbyteStream, + airbyte_message1: AirbyteMessage, + airbyte_message2: AirbyteMessage, + airbyte_state_message: AirbyteMessage, +): + # Overwrite triggers a delete + configured_stream1.destination_sync_mode = DestinationSyncMode.overwrite + catalog = ConfiguredAirbyteCatalog(streams=[configured_stream1, configured_stream2]) + + destination = DestinationDatabend() + result = destination.write(config, catalog, [airbyte_message1, airbyte_state_message, airbyte_message2]) + + assert list(result) == [airbyte_state_message] + mock_writer.return_value.delete_table.assert_called_once_with("table1") + mock_writer.return_value.create_raw_table.mock_calls = [call(mock_connection, "table1"), call(mock_connection, "table2")] diff --git a/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py b/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py new file mode 100644 index 000000000000..25ba388053e8 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py @@ -0,0 +1,56 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# +from typing import Any, Union +from unittest.mock import ANY, MagicMock, call, patch + +from destination_databend.writer import DatabendSQLWriter +from pytest import fixture, mark + + +@fixture +def client() -> MagicMock: + return MagicMock() + + +@fixture +def sql_writer(client: MagicMock) -> DatabendSQLWriter: + return DatabendSQLWriter(client) + + +def test_sql_default(sql_writer: DatabendSQLWriter) -> None: + assert len(sql_writer._buffer) == 0 + assert sql_writer.flush_interval == 1000 + + +@mark.parametrize("writer", ["sql_writer"]) +def test_sql_create(client: MagicMock, writer: Union[DatabendSQLWriter], request: Any) -> None: + writer = request.getfixturevalue(writer) + expected_query = """ + CREATE FACT TABLE IF NOT EXISTS _airbyte_raw_dummy ( + _airbyte_ab_id TEXT, + _airbyte_emitted_at TIMESTAMP, + _airbyte_data TEXT + ) + PRIMARY INDEX _airbyte_ab_id + """ + writer.create_raw_table("dummy") + + +def test_data_buffering(sql_writer: DatabendSQLWriter) -> None: + sql_writer.queue_write_data("dummy", "id1", 20200101, '{"key": "value"}') + sql_writer._buffer["dummy"][0] == ("id1", 20200101, '{"key": "value"}') + assert len(sql_writer._buffer["dummy"]) == 1 + assert len(sql_writer._buffer.keys()) == 1 + sql_writer.queue_write_data("dummy", "id2", 20200102, '{"key2": "value2"}') + sql_writer._buffer["dummy"][0] == ("id2", 20200102, '{"key2": "value2"}') + assert len(sql_writer._buffer["dummy"]) == 2 + assert len(sql_writer._buffer.keys()) == 1 + sql_writer.queue_write_data("dummy2", "id3", 20200103, '{"key3": "value3"}') + sql_writer._buffer["dummy"][0] == ("id3", 20200103, '{"key3": "value3"}') + assert len(sql_writer._buffer["dummy"]) == 2 + assert len(sql_writer._buffer["dummy2"]) == 1 + assert len(sql_writer._buffer.keys()) == 2 + + + diff --git a/docs/integrations/destinations/databend.md b/docs/integrations/destinations/databend.md new file mode 100644 index 000000000000..b11c9fc41529 --- /dev/null +++ b/docs/integrations/destinations/databend.md @@ -0,0 +1,54 @@ +# Databend + +This page guides you through the process of 
setting up the Databend destination connector. + +## Features + +| Feature | Supported?\(Yes/No\) | Notes | +| :--- | :--- | :--- | +| Full Refresh Sync | Yes | | +| Incremental - Append Sync | Yes | | + + +#### Output Schema + +Each stream will be output into its own table in ClickHouse. Each table will contain 3 columns: + +* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in ClickHouse is `String`. +* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in ClickHouse is `DateTime64`. +* `_airbyte_data`: a json blob representing with the event data. The column type in ClickHouse is `String`. + +## Getting Started +You can follow the [Connecting to a Warehouse docs](https://docs.databend.com/using-databend-cloud/warehouses/connecting-a-warehouse) to get the user, password, host etc. + +Or You can create such a user by running: + +``` +GRANT CREATE ON * TO airbyte_user; +``` + +Make sure the Databend user with the following permissions: + +* can create tables and write rows. +* can create databases e.g: + +You can also use a pre-existing user but we highly recommend creating a dedicated user for Airbyte. + + +#### Target Database + +You will need to choose an existing database or create a new database that will be used to store synced data from Airbyte. + +### Setup the ClickHouse Destination in Airbyte + +You should now have all the requirements needed to configure Databend as a destination in the UI. You'll need the following information to configure the Databend destination: + +* **Host** +* **Port** +* **Username** +* **Password** +* **Database** + + +## Changelog +######TODO: more info \ No newline at end of file From 9d397102e335306498b7af3af3e7b4283294ada0 Mon Sep 17 00:00:00 2001 From: hantmac Date: Sun, 27 Nov 2022 13:32:15 +0800 Subject: [PATCH 02/34] docs --- docs/integrations/destinations/databend.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/destinations/databend.md b/docs/integrations/destinations/databend.md index b11c9fc41529..26fb0f4d5ae5 100644 --- a/docs/integrations/destinations/databend.md +++ b/docs/integrations/destinations/databend.md @@ -1,6 +1,6 @@ # Databend -This page guides you through the process of setting up the Databend destination connector. +This page guides you through the process of setting up the [Databend](https://databend.rs/) destination connector. 
## Features From 48414c391ba7b425ddeb05b2251c156536a6eb8b Mon Sep 17 00:00:00 2001 From: hantmac Date: Sun, 27 Nov 2022 13:51:19 +0800 Subject: [PATCH 03/34] fix --- .../6045ffa4-d502-40fe-807f-d21e71b74ab5.csv | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv diff --git a/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv b/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv deleted file mode 100644 index 6dc78f6b56da..000000000000 --- a/airbyte-integrations/connectors/destination-databend/6045ffa4-d502-40fe-807f-d21e71b74ab5.csv +++ /dev/null @@ -1,10 +0,0 @@ -904c3361-a443-4985-9c81-f1331a5b7ba2,2022-11-27 12:28:25.070203,"{""str_col"": ""0"", ""int_col"": 0}" -7d47f811-da9f-4a2c-9df3-82f14d0cd232,2022-11-27 12:28:25.070218,"{""str_col"": ""1"", ""int_col"": 1}" -f8546f7a-18b2-46f7-b184-bd9b9dc76e41,2022-11-27 12:28:25.070237,"{""str_col"": ""2"", ""int_col"": 2}" -7cbb0577-e6d3-485d-9671-0088d0913cfb,2022-11-27 12:28:25.070250,"{""str_col"": ""3"", ""int_col"": 3}" -3bef3e8d-aaf4-467f-93dd-683249f9c1d0,2022-11-27 12:28:25.070263,"{""str_col"": ""4"", ""int_col"": 4}" -f3f57c21-d980-4cd0-aa1a-3e6f0092fd95,2022-11-27 12:28:25.070340,"{""str_col"": ""5"", ""int_col"": 5}" -7a6525a0-451c-45a8-841f-5d8ba160aa1c,2022-11-27 12:28:25.070353,"{""str_col"": ""6"", ""int_col"": 6}" -b0994bdb-90db-4c23-bce0-25654eb8175c,2022-11-27 12:28:25.070367,"{""str_col"": ""7"", ""int_col"": 7}" -93dc6599-43f3-4567-980b-448f2d383f97,2022-11-27 12:28:25.070380,"{""str_col"": ""8"", ""int_col"": 8}" -645b1c0a-516c-4740-998f-08cd0c162578,2022-11-27 12:28:25.070392,"{""str_col"": ""9"", ""int_col"": 9}" From a6c3ce1461d3593a335f0e9582a4ea5042eb3206 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 28 Nov 2022 10:16:38 +0800 Subject: [PATCH 04/34] fix --- docs/integrations/destinations/databend.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/integrations/destinations/databend.md b/docs/integrations/destinations/databend.md index 26fb0f4d5ae5..921c1045a2e0 100644 --- a/docs/integrations/destinations/databend.md +++ b/docs/integrations/destinations/databend.md @@ -12,11 +12,11 @@ This page guides you through the process of setting up the [Databend](https://da #### Output Schema -Each stream will be output into its own table in ClickHouse. Each table will contain 3 columns: +Each stream will be output into its own table in Databend. Each table will contain 3 columns: -* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in ClickHouse is `String`. -* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in ClickHouse is `DateTime64`. -* `_airbyte_data`: a json blob representing with the event data. The column type in ClickHouse is `String`. +* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in Databend is `String`. +* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in Databend is `Timestamp`. +* `_airbyte_data`: a json blob representing with the event data. The column type in Databend is `String`. 
## Getting Started You can follow the [Connecting to a Warehouse docs](https://docs.databend.com/using-databend-cloud/warehouses/connecting-a-warehouse) to get the user, password, host etc. @@ -39,7 +39,7 @@ You can also use a pre-existing user but we highly recommend creating a dedicate You will need to choose an existing database or create a new database that will be used to store synced data from Airbyte. -### Setup the ClickHouse Destination in Airbyte +### Setup the Databend Destination in Airbyte You should now have all the requirements needed to configure Databend as a destination in the UI. You'll need the following information to configure the Databend destination: From 74ddc14a2d2420b4df2cbd5f20d7bfef2c0ffbfd Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 28 Nov 2022 17:07:34 +0800 Subject: [PATCH 05/34] fix tests --- .../integration_tests/integration_test.py | 38 +++++++++++++------ docs/integrations/destinations/databend.md | 6 +-- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py index aa937dcb4802..87d74732a4ac 100644 --- a/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py +++ b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py @@ -2,7 +2,7 @@ # Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -import json +import json, logging from typing import Any, Dict, List, Mapping import pytest @@ -62,12 +62,12 @@ def client_fixture(config) -> DatabendClient: def test_check_valid_config(config: Mapping): - outcome = DestinationDatabend().check(AirbyteLogger(), config) + outcome = DestinationDatabend().check(logging.getLogger('airbyte'), config) assert outcome.status == Status.SUCCEEDED def test_check_invalid_config(): - outcome = DestinationDatabend().check(AirbyteLogger(), {"bucket_id": "not_a_real_id"}) + outcome = DestinationDatabend().check(logging.getLogger('airbyte'), {"bucket_id": "not_a_real_id"}) assert outcome.status == Status.FAILED @@ -81,8 +81,7 @@ def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage: ) -def retrieve_all_records(stream_name: str, client: DatabendClient) -> List[AirbyteRecordMessage]: - """retrieves and formats all records in databend as Airbyte messages""" +def retrieve_records(stream_name: str, client: DatabendClient) -> List[AirbyteRecordMessage]: cursor = client.open() cursor.execute(f"select * from _airbyte_raw_{stream_name}") all_records = cursor.fetchall() @@ -95,6 +94,15 @@ def retrieve_all_records(stream_name: str, client: DatabendClient) -> List[Airby return out +def retrieve_all_records(client: DatabendClient) -> List[AirbyteRecordMessage]: + """retrieves and formats all records in databend as Airbyte messages""" + overwrite_stream = "overwrite_stream" + append_stream = "append_stream" + overwrite_out = retrieve_records(overwrite_stream, client) + append_out = retrieve_records(append_stream, client) + return overwrite_out + append_out + + def test_write(config: Mapping, configured_catalog: ConfiguredAirbyteCatalog, client: DatabendClient): """ This test verifies that: @@ -122,10 +130,8 @@ def test_write(config: Mapping, configured_catalog: ConfiguredAirbyteCatalog, cl assert expected_states == output_states, "Checkpoint state messages were expected from the destination" expected_records = [_record(append_stream, str(i), i) for i in 
range(10)] + [_record(overwrite_stream, str(i), i) for i in range(10)] - records_in_destination = retrieve_all_records("append_stream", client) - print("des", records_in_destination) - print("expect", expected_records) - assert expected_records == records_in_destination, "Records in destination should match records expected" + records_in_destination = retrieve_all_records(client) + assert len(expected_records) == len(records_in_destination), "Records in destination should match records expected" # After this sync we expect the append stream to have 15 messages and the overwrite stream to have 5 third_state_message = _state({"state": "3"}) third_record_chunk = [_record(append_stream, str(i), i) for i in range(10, 15)] + [ @@ -136,8 +142,18 @@ def test_write(config: Mapping, configured_catalog: ConfiguredAirbyteCatalog, cl output_states = list(destination.write(config, configured_catalog, [*third_record_chunk, third_state_message])) assert [third_state_message] == output_states - records_in_destination = retrieve_all_records('append_stream', client) + records_in_destination = retrieve_all_records(client) expected_records = [_record(append_stream, str(i), i) for i in range(15)] + [ _record(overwrite_stream, str(i), i) for i in range(10, 15) ] - assert expected_records == records_in_destination + assert len(expected_records) == len(records_in_destination) + + tear_down(client) + + +def tear_down(client: DatabendClient): + overwrite_stream = "overwrite_stream" + append_stream = "append_stream" + cursor = client.open() + cursor.execute(f"DROP table _airbyte_raw_{overwrite_stream}") + cursor.execute(f"DROP table _airbyte_raw_{append_stream}") diff --git a/docs/integrations/destinations/databend.md b/docs/integrations/destinations/databend.md index 921c1045a2e0..08339cf0ab32 100644 --- a/docs/integrations/destinations/databend.md +++ b/docs/integrations/destinations/databend.md @@ -14,9 +14,9 @@ This page guides you through the process of setting up the [Databend](https://da Each stream will be output into its own table in Databend. Each table will contain 3 columns: -* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in Databend is `String`. -* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in Databend is `Timestamp`. -* `_airbyte_data`: a json blob representing with the event data. The column type in Databend is `String`. +* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in Databend is `VARCHAR`. +* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in Databend is `TIMESTAMP`. +* `_airbyte_data`: a json blob representing the event data. The column type in Databend is `VARCHAR`. ## Getting Started You can follow the [Connecting to a Warehouse docs](https://docs.databend.com/using-databend-cloud/warehouses/connecting-a-warehouse) to get the user, password, host etc.
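The raw tables described in the docs above correspond directly to the SQL that `destination_databend/writer.py` (added earlier in this series) issues per stream. A rough sketch of those statements — `<stream>` and the inlined values are placeholders; the writer itself binds values through the cursor rather than formatting them into the query:

```sql
-- Overwrite sync mode first drops any previous raw table for the stream
DROP TABLE IF EXISTS _airbyte_raw_<stream>;

-- Every configured stream gets a raw table with the three Airbyte columns
CREATE TABLE IF NOT EXISTS _airbyte_raw_<stream> (
    _airbyte_ab_id TEXT,
    _airbyte_emitted_at TIMESTAMP,
    _airbyte_data TEXT
);

-- Buffered records are flushed once flush_interval (1000) rows have been queued
INSERT INTO _airbyte_raw_<stream> (_airbyte_ab_id, _airbyte_emitted_at, _airbyte_data)
VALUES ('<uuid>', '<emitted_at>', '<json record>');
```

Normalization, enabled for Databend in the following commit, then builds typed tables on top of these raw tables.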
From ee83f67064e82d62206db5477662e4e38e0dde85 Mon Sep 17 00:00:00 2001 From: hantmac Date: Thu, 1 Dec 2022 15:01:41 +0800 Subject: [PATCH 06/34] support normalization for databend --- .../bases/base-normalization/README.md | 1 + .../bases/base-normalization/build.gradle | 8 +++ .../base-normalization/databend.Dockerfile | 32 +++++++++ .../dbt_project.yml | 65 +++++++++++++++++++ .../packages.yml | 5 ++ .../macros/cross_db_utils/array.sql | 5 ++ .../macros/cross_db_utils/datatypes.sql | 52 +++++++++++++++ .../macros/cross_db_utils/json_operations.sql | 25 +++++++ .../macros/cross_db_utils/quote.sql | 4 ++ .../docker-compose.build.yaml | 7 ++ .../base-normalization/docker-compose.yaml | 2 + .../integration_tests/dbt_integration_test.py | 4 ++ .../integration_tests/test_normalization.py | 10 ++- .../normalization/destination_type.py | 1 + .../destination_name_transformer.py | 9 +++ .../transform_catalog/reserved_keywords.py | 4 +- .../transform_config/transform.py | 21 +++++- .../test_destination_name_transformer.py | 11 ++++ docs/cloud/core-concepts.md | 1 + .../basic-normalization.md | 1 + 20 files changed, 263 insertions(+), 5 deletions(-) create mode 100644 airbyte-integrations/bases/base-normalization/databend.Dockerfile create mode 100644 airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml create mode 100644 airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml diff --git a/airbyte-integrations/bases/base-normalization/README.md b/airbyte-integrations/bases/base-normalization/README.md index bfa9ada93db4..99644d9f82fa 100644 --- a/airbyte-integrations/bases/base-normalization/README.md +++ b/airbyte-integrations/bases/base-normalization/README.md @@ -235,6 +235,7 @@ allowed characters, if quotes are needed or not, and the length limitations: * [mysql](../../../docs/integrations/destinations/mysql.md) * [oracle](../../../docs/integrations/destinations/oracle.md) * [mssql](../../../docs/integrations/destinations/mssql.md) +* [databend](../../../docs/integrations/destinations/databend.md) Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit: * `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii` diff --git a/airbyte-integrations/bases/base-normalization/build.gradle b/airbyte-integrations/bases/base-normalization/build.gradle index 5030b7264723..5c5b50598568 100644 --- a/airbyte-integrations/bases/base-normalization/build.gradle +++ b/airbyte-integrations/bases/base-normalization/build.gradle @@ -84,6 +84,11 @@ task airbyteDockerTiDB(type: Exec, dependsOn: checkSshScriptCopy) { dependsOn assemble } +task airbyteDockerDatabend(type: Exec, dependsOn: checkSshScriptCopy) { + configure buildAirbyteDocker('databend') + dependsOn assemble +} + airbyteDocker.dependsOn(airbyteDockerMSSql) airbyteDocker.dependsOn(airbyteDockerMySql) airbyteDocker.dependsOn(airbyteDockerOracle) @@ -91,6 +96,7 @@ airbyteDocker.dependsOn(airbyteDockerClickhouse) airbyteDocker.dependsOn(airbyteDockerSnowflake) airbyteDocker.dependsOn(airbyteDockerRedshift) airbyteDocker.dependsOn(airbyteDockerTiDB) +airbyteDocker.dependsOn(airbyteDockerDatabend) task("customIntegrationTestPython", type: PythonTask, dependsOn: installTestReqs) { module = "pytest" @@ -106,6 +112,7 @@ task("customIntegrationTestPython", type: PythonTask, dependsOn: installTestReqs dependsOn ':airbyte-integrations:connectors:destination-mssql:airbyteDocker' dependsOn 
':airbyte-integrations:connectors:destination-clickhouse:airbyteDocker' dependsOn ':airbyte-integrations:connectors:destination-tidb:airbyteDocker' + dependsOn ':airbyte-integrations:connectors:destination-databend:airbyteDocker' } // not really sure what this task does differently from customIntegrationTestPython, but it seems to also run integration tests @@ -120,6 +127,7 @@ project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte- project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-mssql:airbyteDocker' project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-clickhouse:airbyteDocker' project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-tidb:airbyteDocker' +project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-databend:airbyteDocker' // DATs have some additional tests that exercise normalization code paths, // so we want to run these in addition to the base-normalization integration tests. diff --git a/airbyte-integrations/bases/base-normalization/databend.Dockerfile b/airbyte-integrations/bases/base-normalization/databend.Dockerfile new file mode 100644 index 000000000000..deddbc0c2190 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/databend.Dockerfile @@ -0,0 +1,32 @@ +FROM fishtownanalytics/dbt:0.21.0 +COPY --from=airbyte/base-airbyte-protocol-python:0.1.1 /airbyte /airbyte + +# Install SSH Tunneling dependencies +RUN apt-get update && apt-get install -y jq sshpass +WORKDIR /airbyte +COPY entrypoint.sh . +COPY build/sshtunneling.sh . + +WORKDIR /airbyte/normalization_code +COPY normalization ./normalization +COPY setup.py . +COPY dbt-project-template/ ./dbt-template/ + +# Install python dependencies +WORKDIR /airbyte/base_python_structs +RUN pip install . + +WORKDIR /airbyte/normalization_code +RUN pip install . + +WORKDIR /airbyte/normalization_code/dbt-template/ +#RUN pip install dbt-databend-cloud +RUN pip install git+https://github.com/databendcloud/dbt-databend.git +# Download external dbt dependencies +RUN dbt deps + +WORKDIR /airbyte +ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh" +ENTRYPOINT ["/airbyte/entrypoint.sh"] + +LABEL io.airbyte.name=airbyte/normalization-databend diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml new file mode 100644 index 000000000000..58d58e7e1104 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml @@ -0,0 +1,65 @@ +# This file is necessary to install dbt-utils with dbt deps +# the content will be overwritten by the transform function + +# Name your package! Package names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: "airbyte_utils" +version: "1.0" +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. Profiles contain +# database connection information, and should be configured in the ~/.dbt/profiles.yml file +profile: "normalize" + +# These configurations specify where dbt should look for different types of files. 
+# The `model-paths` config, for example, states that source models can be found +# in the "models/" directory. You probably won't need to change these! +model-paths: ["models"] +docs-paths: ["docs"] +analysis-paths: ["analysis"] +test-paths: ["tests"] +seed-paths: ["data"] +macro-paths: ["macros"] + +target-path: "../build" # directory which will store compiled SQL files +log-path: "../logs" # directory which will store DBT logs +packages-install-path: "/dbt" # directory which will store external DBT dependencies + +clean-targets: # directories to be removed by `dbt clean` + - "build" + - "dbt_modules" + +quoting: + database: true + # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) + # all schemas should be unquoted + schema: true + identifier: true + +# You can define configurations for models in the `model-paths` directory here. +# Using these configurations, you can enable or disable models, change how they +# are materialized, and more! +models: + airbyte_utils: + +materialized: table + generated: + airbyte_ctes: + +tags: airbyte_internal_cte + # ephemeral materialization isn't supported in Databend yet + +materialized: view + airbyte_incremental: + +tags: incremental_tables + +materialized: incremental + # schema change test isn't supported in Databend yet + +on_schema_change: "ignore" + airbyte_tables: + +tags: normalized_tables + +materialized: table + airbyte_views: + +tags: airbyte_internal_views + +materialized: view + +dispatch: + - macro_namespace: dbt_utils + search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml new file mode 100644 index 000000000000..33b4edd58c8c --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml @@ -0,0 +1,5 @@ +# add dependencies. these will get pulled during the `dbt deps` process. 
+ +packages: + - git: "https://github.com/fishtown-analytics/dbt-utils.git" + revision: 0.8.2 diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql index 56ab17ce9af6..e2ab50385d4a 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql @@ -6,6 +6,7 @@ - postgres: unnest() -> https://www.postgresqltutorial.com/postgresql-array/ - MSSQL: openjson() –> https://docs.microsoft.com/en-us/sql/relational-databases/json/validate-query-and-change-json-data-with-built-in-functions-sql-server?view=sql-server-ver15 - ClickHouse: ARRAY JOIN –> https://clickhouse.com/docs/zh/sql-reference/statements/select/array-join/ + - Databend: unnest() -> https://databend.rs/doc/sql-reference/data-types/data-type-array-types/ #} {# cross_join_unnest ------------------------------------------------- #} @@ -26,6 +27,10 @@ ARRAY JOIN {{ array_col }} {%- endmacro %} +{% macro databend__cross_join_unnest(stream_name, array_col) -%} + unnest({{ array_col }}) +{%- endmacro %} + {% macro oracle__cross_join_unnest(stream_name, array_col) -%} {% do exceptions.warn("Normalization does not support unnesting for Oracle yet.") %} {%- endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql index 42f5312b054f..d4cf2f20e361 100755 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql @@ -44,6 +44,10 @@ json {%- endmacro -%} +{%- macro databend__type_json() -%} + json +{%- endmacro -%} + {# string ------------------------------------------------- #} @@ -72,6 +76,10 @@ char(1000) {%- endmacro -%} +{%- macro databend__type_string() -%} + String +{%- endmacro -%} + {# float ------------------------------------------------- #} {% macro mysql__type_float() %} float @@ -89,6 +97,10 @@ float {% endmacro %} +{% macro databend__type_float() %} + float +{% endmacro %} + {# int ------------------------------------------------- #} {% macro default__type_int() %} int @@ -110,6 +122,10 @@ signed {% endmacro %} +{% macro databend__type_int() %} + INT +{% endmacro %} + {# bigint ------------------------------------------------- #} {% macro mysql__type_bigint() %} signed @@ -127,6 +143,10 @@ signed {% endmacro %} +{% macro databend__type_bigint() %} + BIGINT +{% endmacro %} + {# numeric ------------------------------------------------- --#} {% macro mysql__type_numeric() %} float @@ -140,6 +160,10 @@ float {% endmacro %} +{% macro databend__type_numeric() %} + DOUBLE +{% endmacro %} + {# very_large_integer --------------------------------------- --#} {# Most databases don't have a true unbounded numeric datatype, so we use a really big numeric field. @@ -170,6 +194,10 @@ so this macro needs to be called very_large_integer. 
decimal(38, 0) {% endmacro %} +{% macro databend__type_very_large_integer() %} + numeric +{% endmacro %} + {# timestamp ------------------------------------------------- --#} {% macro mysql__type_timestamp() %} time @@ -189,6 +217,10 @@ so this macro needs to be called very_large_integer. time {% endmacro %} +{% macro databend__type_timestamp() %} + timestamp +{% endmacro %} + {# timestamp with time zone ------------------------------------------------- #} {%- macro type_timestamp_with_timezone() -%} @@ -229,6 +261,10 @@ so this macro needs to be called very_large_integer. char(1000) {%- endmacro -%} +{% macro databend__type_timestamp_with_timezone() %} + TIMESTAMP +{% endmacro %} + {# timestamp without time zone ------------------------------------------------- #} {%- macro type_timestamp_without_timezone() -%} @@ -261,6 +297,10 @@ so this macro needs to be called very_large_integer. datetime {% endmacro %} +{% macro databend__type_timestamp_without_timezone() %} + timestamp +{% endmacro %} + {# time without time zone ------------------------------------------------- #} {%- macro type_time_without_timezone() -%} @@ -287,6 +327,10 @@ so this macro needs to be called very_large_integer. time {% endmacro %} +{% macro databend__type_time_without_timezone() %} + String +{% endmacro %} + {# time with time zone ------------------------------------------------- #} @@ -330,6 +374,10 @@ so this macro needs to be called very_large_integer. char(1000) {%- endmacro -%} +{% macro databend__type_time_with_timezone() %} + String +{% endmacro %} + {# date ------------------------------------------------- #} {%- macro type_date() -%} @@ -351,3 +399,7 @@ so this macro needs to be called very_large_integer. {% macro clickhouse__type_date() %} Date32 {% endmacro %} + +{% macro databend__type_date() %} + DATE +{% endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql index 0b76f5f49a29..c8fd28df396e 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql @@ -7,6 +7,7 @@ - MySQL: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) -> https://dev.mysql.com/doc/refman/8.0/en/json-search-functions.html - ClickHouse: JSONExtractString(json_doc, 'path' [, 'path'] ...) -> https://clickhouse.com/docs/en/sql-reference/functions/json-functions/ - TiDB: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) 
-> https://docs.pingcap.com/tidb/stable/json-functions + - Databend: json_extract_path_text( , ) -> https://databend.rs/doc/sql-functions/semi-structured-functions/json_extract_path_text #} {# format_json_path -------------------------------------------------- #} @@ -103,6 +104,14 @@ {{ "'$.\"" ~ json_path_list|join(".") ~ "\"'" }} {%- endmacro %} +{% macro databend__format_json_path(json_path_list) -%} + {%- set str_list = [] -%} + {%- for json_path in json_path_list -%} + {%- if str_list.append(json_path.replace("'", "''").replace('"', '""')) -%} {%- endif -%} + {%- endfor -%} + {{ "'\"" ~ str_list|join('"."') ~ "\"'" }} +{%- endmacro %} + {# json_extract ------------------------------------------------- #} {% macro json_extract(from_table, json_column, json_path_list, normalized_json_path) -%} @@ -180,6 +189,14 @@ {% endif -%} {%- endmacro %} +{% macro databend__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%} + {%- if from_table|string() == '' %} + get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }}) + {% else %} + get_path(parse_json({{ from_table }}.{{ json_column }}), {{ format_json_path(json_path_list) }}) + {% endif -%} +{%- endmacro %} + {# json_extract_scalar ------------------------------------------------- #} {% macro json_extract_scalar(json_column, json_path_list, normalized_json_path) -%} @@ -234,6 +251,10 @@ ) {%- endmacro %} +{% macro databend__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%} + to_varchar(get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }})) +{%- endmacro %} + {# json_extract_array ------------------------------------------------- #} {% macro json_extract_array(json_column, json_path_list, normalized_json_path) -%} @@ -284,6 +305,10 @@ json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }}) {%- endmacro %} +{% macro databend__json_extract_array(json_column, json_path_list, normalized_json_path) -%} + get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }}) +{%- endmacro %} + {# json_extract_string_array ------------------------------------------------- #} {% macro json_extract_string_array(json_column, json_path_list, normalized_json_path) -%} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql index 87862498cfc5..d82a005d5c91 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql @@ -14,3 +14,7 @@ {% macro clickhouse__quote(column_name) -%} {{ '\"' ~ column_name ~ '\"'}} {%- endmacro %} + +{% macro databend__quote(column_name) -%} + {{ '\"' ~ column_name ~ '\"'}} +{%- endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml b/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml index 4f95cb7a4720..79fec482c45b 100644 --- a/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml +++ b/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml @@ -57,3 +57,10 @@ services: context: . labels: io.airbyte.git-revision: ${GIT_REVISION} + normalization-databend: + image: airbyte/normalization-databend:${VERSION} + build: + dockerfile: databend.Dockerfile + context: . 
+ labels: + io.airbyte.git-revision: ${GIT_REVISION} diff --git a/airbyte-integrations/bases/base-normalization/docker-compose.yaml b/airbyte-integrations/bases/base-normalization/docker-compose.yaml index ae29237b5149..0fb017e11204 100644 --- a/airbyte-integrations/bases/base-normalization/docker-compose.yaml +++ b/airbyte-integrations/bases/base-normalization/docker-compose.yaml @@ -18,3 +18,5 @@ services: image: airbyte/normalization-redshift:${VERSION} normalization-tidb: image: airbyte/normalization-tidb:${VERSION} + normalization-databend: + image: airbyte/normalization-databend:${VERSION} diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py b/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py index 7cb25ea39ad9..28c940916b23 100644 --- a/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py +++ b/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py @@ -387,6 +387,8 @@ def generate_profile_yaml_file( } elif destination_type.value == DestinationType.MYSQL.value: profiles_config["database"] = self.target_schema + elif destination_type.value == DestinationType.DATABEND.value: + profiles_config["database"] = self.target_schema elif destination_type.value == DestinationType.REDSHIFT.value: profiles_config["schema"] = self.target_schema if random_schema: @@ -443,6 +445,8 @@ def get_normalization_image(destination_type: DestinationType) -> str: return "airbyte/normalization-redshift:dev" elif DestinationType.TIDB.value == destination_type.value: return "airbyte/normalization-tidb:dev" + elif DestinationType.DATABEND.value == destination_type.value: + return "airbyte/normalization-databend:dev" else: return "airbyte/normalization:dev" diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py b/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py index 0163cd128151..73262514df38 100644 --- a/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py +++ b/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py @@ -140,7 +140,12 @@ def run_schema_change_normalization(destination_type: DestinationType, test_reso if destination_type.value in [DestinationType.MYSQL.value, DestinationType.ORACLE.value]: # TODO: upgrade dbt-adapter repositories to work with dbt 0.21.0+ (outside airbyte's control) pytest.skip(f"{destination_type} does not support schema change in incremental yet (requires dbt 0.21.0+)") - if destination_type.value in [DestinationType.SNOWFLAKE.value, DestinationType.CLICKHOUSE.value, DestinationType.TIDB.value]: + if destination_type.value in [ + DestinationType.SNOWFLAKE.value, + DestinationType.CLICKHOUSE.value, + DestinationType.TIDB.value, + DestinationType.DATABEND.value, + ]: pytest.skip(f"{destination_type} is disabled as it doesnt support schema change in incremental yet (column type changes)") if destination_type.value in [DestinationType.MSSQL.value, DestinationType.SNOWFLAKE.value]: # TODO: create/fix github issue in corresponding dbt-adapter repository to handle schema changes (outside airbyte's control) @@ -213,6 +218,9 @@ def setup_test_dir(destination_type: DestinationType, test_resource_name: str) - elif destination_type.value == DestinationType.TIDB.value: copy_tree("../dbt-project-template-tidb", test_root_dir) dbt_project_yaml = "../dbt-project-template-tidb/dbt_project.yml" + elif 
destination_type.value == DestinationType.DATABEND.value: + copy_tree("../dbt-project-template-databend", test_root_dir) + dbt_project_yaml = "../dbt-project-template-databend/dbt_project.yml" dbt_test_utils.copy_replace(dbt_project_yaml, os.path.join(test_root_dir, "dbt_project.yml")) return test_root_dir diff --git a/airbyte-integrations/bases/base-normalization/normalization/destination_type.py b/airbyte-integrations/bases/base-normalization/normalization/destination_type.py index 3f1d154f52ce..7fc2e5db597e 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/destination_type.py +++ b/airbyte-integrations/bases/base-normalization/normalization/destination_type.py @@ -16,6 +16,7 @@ class DestinationType(Enum): REDSHIFT = "redshift" SNOWFLAKE = "snowflake" TIDB = "tidb" + DATABEND = "databend" @classmethod def from_string(cls, string_value: str) -> "DestinationType": diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py index b65c5545e56e..abde0cda3933 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py @@ -29,6 +29,7 @@ DestinationType.CLICKHOUSE.value: 63, # https://docs.pingcap.com/tidb/stable/tidb-limitations DestinationType.TIDB.value: 64, + DestinationType.DATABEND.value: 255, } # DBT also needs to generate suffix to table names, so we need to make sure it has enough characters to do so... @@ -172,6 +173,10 @@ def __normalize_identifier_name( result = result.replace("'", "_") elif self.destination_type.value != DestinationType.MYSQL.value and self.destination_type.value != DestinationType.TIDB.value: result = result.replace('"', '""') + elif self.destination_type.value == DestinationType.DATABEND.value: + result = result.replace('"', "_") + result = result.replace("`", "_") + result = result.replace("'", "_") else: result = result.replace("`", "_") result = result.replace("'", "\\'") @@ -239,6 +244,8 @@ def __normalize_identifier_case(self, input_name: str, is_quoted: bool = False) elif self.destination_type.value == DestinationType.TIDB.value: if not is_quoted and not self.needs_quotes(input_name): result = input_name.lower() + elif self.destination_type.value == DestinationType.DATABEND.value: + pass else: raise KeyError(f"Unknown destination type {self.destination_type}") return result @@ -279,6 +286,8 @@ def normalize_column_identifier_case_for_lookup(self, input_name: str, is_quoted pass elif self.destination_type.value == DestinationType.TIDB.value: result = input_name.lower() + elif self.destination_type.value == DestinationType.DATABEND.value: + pass else: raise KeyError(f"Unknown destination type {self.destination_type}") return result diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py index 0931b4f29c29..2525a11da057 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py @@ -2048,7 +2048,6 @@ "WITH", } - # 
https://docs.microsoft.com/en-us/sql/t-sql/language-elements/reserved-keywords-transact-sql?view=sql-server-ver15 MSSQL = { "ADD", @@ -3111,6 +3110,8 @@ "ZEROFILL", } +DATABEND: Set[str] = set() + RESERVED_KEYWORDS = { DestinationType.BIGQUERY.value: BIGQUERY, DestinationType.POSTGRES.value: POSTGRES, @@ -3121,6 +3122,7 @@ DestinationType.MSSQL.value: MSSQL, DestinationType.CLICKHOUSE.value: CLICKHOUSE, DestinationType.TIDB.value: TIDB, + DestinationType.DATABEND.value: DATABEND, } diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py index a762b39f1a45..295aa5a8a5ae 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py @@ -59,6 +59,7 @@ def transform(self, integration_type: DestinationType, config: Dict[str, Any]): DestinationType.MSSQL.value: self.transform_mssql, DestinationType.CLICKHOUSE.value: self.transform_clickhouse, DestinationType.TIDB.value: self.transform_tidb, + DestinationType.DATABEND.value: self.transform_databend, }[integration_type.value](config) # merge pre-populated base_profile with destination-specific configuration. @@ -77,9 +78,9 @@ def create_file(name, content): def is_ssh_tunnelling(config: Dict[str, Any]) -> bool: tunnel_methods = ["SSH_KEY_AUTH", "SSH_PASSWORD_AUTH"] if ( - "tunnel_method" in config.keys() - and "tunnel_method" in config["tunnel_method"] - and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods + "tunnel_method" in config.keys() + and "tunnel_method" in config["tunnel_method"] + and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods ): return True else: @@ -345,6 +346,20 @@ def transform_tidb(config: Dict[str, Any]): } return dbt_config + @staticmethod + def transform_databend(config: Dict[str, Any]): + print("transform_databend") + dbt_config = { + "type": "databend", + "host": config["host"], + "port": config["port"], + "schema": config["database"], + "user": config["username"], + } + if "pass" in config: + dbt_config["pass"] = config["password"] + return dbt_config + @staticmethod def read_json_config(input_path: str): with open(input_path, "r") as file: diff --git a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py index bcb750df766b..d33156347966 100644 --- a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py @@ -38,6 +38,7 @@ def before_tests(request): ("Hello World", "MySQL", True), ("Hello World", "MSSQL", True), ("Hello World", "TiDB", True), + ("Helllo World", "Databend", True), # Reserved Word for BigQuery and MySQL only ("Groups", "Postgres", False), ("Groups", "BigQuery", True), @@ -46,6 +47,7 @@ def before_tests(request): ("Groups", "MySQL", True), ("Groups", "MSSQL", False), ("Groups", "TiDB", True), + ("Groups", "Databend", True), # Doesnt start with alpha or underscore ("100x200", "Postgres", True), ("100x200", "BigQuery", False), @@ -54,6 +56,7 @@ def before_tests(request): ("100x200", "MySQL", True), ("100x200", "MSSQL", True), ("100x200", "TiDB", True), + ("100x200", "Databend", True), # Contains non alpha numeric ("post.wall", 
"Postgres", True), ("post.wall", "BigQuery", False), @@ -62,6 +65,7 @@ def before_tests(request): ("post.wall", "MySQL", True), ("post.wall", "MSSQL", True), ("post.wall", "TiDB", True), + ("post.wall", "Databend", True), ], ) def test_needs_quote(input_str: str, destination_type: str, expected: bool): @@ -113,6 +117,7 @@ def test_transform_standard_naming(input_str: str, expected: str): ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), ("Identifier Name", "TiDB", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), + ("Identifier Name", "Databend", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), # Reserved Word for BigQuery and MySQL only ("Groups", "Postgres", "groups", "'groups'"), ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), @@ -121,6 +126,7 @@ def test_transform_standard_naming(input_str: str, expected: str): ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), ("Groups", "MSSQL", "groups", "'groups'"), ("Groups", "TiDB", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), + ("Groups", "Databend", "Groups", "'Groups'"), ], ) def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str): @@ -171,6 +177,7 @@ def test_truncate_identifier(input_str: str, expected: str): ("Identifier Name5", "MySQL", "identifier_name5", "{{ adapter.quote('Identifier Name5') }}"), ("Identifier Name6", "MSSQL", "identifier_name6", "{{ adapter.quote('Identifier Name6') }}"), ("Identifier Name7", "TiDB", "identifier_name7", "{{ adapter.quote('Identifier Name7') }}"), + ("Identifier Name8", "Databend", "identifier_name8", "{{ adapter.quote('Identifier Name8') }}"), # Unicode ("a-Unicode_name_文1", "Postgres", "a_unicode_name__1", "{{ adapter.quote('a-Unicode_name_文1') }}"), ("a-Unicode_name_文2", "BigQuery", "a_Unicode_name__2", "a_Unicode_name__2"), @@ -179,6 +186,7 @@ def test_truncate_identifier(input_str: str, expected: str): ("a-Unicode_name_文5", "MySQL", "a_unicode_name__5", "{{ adapter.quote('a-Unicode_name_文5') }}"), ("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"), ("a-Unicode_name_文7", "TiDB", "a_unicode_name__7", "{{ adapter.quote('a-Unicode_name_文7') }}"), + ("a-Unicode_name_文8", "Databend", "a_unicode_name__8", "{{ adapter.quote('a-Unicode_name_文8') }}"), # Doesnt start with alpha or underscore ("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"), ("100x2002", "BigQuery", "100x2002", "_100x2002"), @@ -188,6 +196,7 @@ def test_truncate_identifier(input_str: str, expected: str): ("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"), ("100x2006", "MSSQL", "_100x2006", "{{ adapter.quote('100x2006') }}"), ("100x2007", "TiDB", "100x2007", "{{ adapter.quote('100x2007') }}"), + ("100x2008", "Databend", "100x2008", "{{ adapter.quote('100x2008') }}"), # Reserved Keywords in BQ and MySQL ("Groups", "Postgres", "groups", "groups"), ("Groups", "BigQuery", "Groups", "{{ adapter.quote('Groups') }}"), @@ -196,6 +205,7 @@ def test_truncate_identifier(input_str: str, expected: str): ("Groups", "MySQL", "Groups", "{{ adapter.quote('Groups') }}"), ("Groups", "MSSQL", "groups", "groups"), ("Groups", "TiDB", "Groups", "{{ adapter.quote('Groups') }}"), + ("Groups", "Databend", "Groups", "{{ 
adapter.quote('Groups') }}"), # Reserved Keywords ("DisTincT", "Postgres", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "BigQuery", "DisTincT", "{{ adapter.quote('DisTincT') }}"), @@ -204,6 +214,7 @@ def test_truncate_identifier(input_str: str, expected: str): ("DisTincT", "MySQL", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "MSSQL", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "TiDB", "DisTincT", "{{ adapter.quote('DisTincT') }}"), + ("DisTincT", "Databend", "DisTincT", "{{ adapter.quote('DisTincT') }}"), # Quoted identifiers ("'QuoTed1 IdenTifiER'", "Postgres", "_quoted1_identifier_", "{{ adapter.quote('\\'QuoTed1 IdenTifiER\\'') }}"), ("'QuoTed2 IdenTifiER'", "BigQuery", "_QuoTed2_IdenTifiER_", "_QuoTed2_IdenTifiER_"), diff --git a/docs/cloud/core-concepts.md b/docs/cloud/core-concepts.md index c53c3189712a..3f7781d8315d 100644 --- a/docs/cloud/core-concepts.md +++ b/docs/cloud/core-concepts.md @@ -139,6 +139,7 @@ Note that normalization is only relevant for the following relational database & * Oracle * MySQL * MSSQL +* DATABEND Other destinations do not support normalization as described in this section, though they may normalize data in a format that makes sense for them. For example, the S3 destination connector offers the option of writing JSON files in S3, but also offers the option of writing statically typed files such as Parquet or Avro. diff --git a/docs/understanding-airbyte/basic-normalization.md b/docs/understanding-airbyte/basic-normalization.md index 305e382f75c6..7857a408cc13 100644 --- a/docs/understanding-airbyte/basic-normalization.md +++ b/docs/understanding-airbyte/basic-normalization.md @@ -102,6 +102,7 @@ In Airbyte, the current normalization option is implemented using a dbt Transfor * [Postgres](../integrations/destinations/postgres.md) * [Redshift](../integrations/destinations/redshift.md) * [Snowflake](../integrations/destinations/snowflake.md) +* [Databend](../integrations/destinations/databend.md) Basic Normalization can be configured when you're creating the connection between your Connection Setup and after in the Transformation Tab. Select the option: **Normalized tabular data**. From 5b3eee5ddecb697f3ecb00cf61f9fec731fce830 Mon Sep 17 00:00:00 2001 From: hantmac Date: Thu, 1 Dec 2022 16:11:54 +0800 Subject: [PATCH 07/34] update --- .../test_nested_streams/dbt_project.yml | 65 +++++++++++++++++++ .../test_simple_streams/dbt_project.yml | 65 +++++++++++++++++++ .../integration_tests/test_ephemeral.py | 2 + 3 files changed, 132 insertions(+) create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml new file mode 100644 index 000000000000..58d58e7e1104 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml @@ -0,0 +1,65 @@ +# This file is necessary to install dbt-utils with dbt deps +# the content will be overwritten by the transform function + +# Name your package! 
Package names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: "airbyte_utils" +version: "1.0" +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. Profiles contain +# database connection information, and should be configured in the ~/.dbt/profiles.yml file +profile: "normalize" + +# These configurations specify where dbt should look for different types of files. +# The `model-paths` config, for example, states that source models can be found +# in the "models/" directory. You probably won't need to change these! +model-paths: ["models"] +docs-paths: ["docs"] +analysis-paths: ["analysis"] +test-paths: ["tests"] +seed-paths: ["data"] +macro-paths: ["macros"] + +target-path: "../build" # directory which will store compiled SQL files +log-path: "../logs" # directory which will store DBT logs +packages-install-path: "/dbt" # directory which will store external DBT dependencies + +clean-targets: # directories to be removed by `dbt clean` + - "build" + - "dbt_modules" + +quoting: + database: true + # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) + # all schemas should be unquoted + schema: true + identifier: true + +# You can define configurations for models in the `model-paths` directory here. +# Using these configurations, you can enable or disable models, change how they +# are materialized, and more! +models: + airbyte_utils: + +materialized: table + generated: + airbyte_ctes: + +tags: airbyte_internal_cte + # ephemeral materialization isn't supported in Databend yet + +materialized: view + airbyte_incremental: + +tags: incremental_tables + +materialized: incremental + # schema change test isn't supported in Databend yet + +on_schema_change: "ignore" + airbyte_tables: + +tags: normalized_tables + +materialized: table + airbyte_views: + +tags: airbyte_internal_views + +materialized: view + +dispatch: + - macro_namespace: dbt_utils + search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml new file mode 100644 index 000000000000..58d58e7e1104 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml @@ -0,0 +1,65 @@ +# This file is necessary to install dbt-utils with dbt deps +# the content will be overwritten by the transform function + +# Name your package! Package names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: "airbyte_utils" +version: "1.0" +config-version: 2 + +# This setting configures which "profile" dbt uses for this project. Profiles contain +# database connection information, and should be configured in the ~/.dbt/profiles.yml file +profile: "normalize" + +# These configurations specify where dbt should look for different types of files. +# The `model-paths` config, for example, states that source models can be found +# in the "models/" directory. You probably won't need to change these! 
+model-paths: ["models"] +docs-paths: ["docs"] +analysis-paths: ["analysis"] +test-paths: ["tests"] +seed-paths: ["data"] +macro-paths: ["macros"] + +target-path: "../build" # directory which will store compiled SQL files +log-path: "../logs" # directory which will store DBT logs +packages-install-path: "/dbt" # directory which will store external DBT dependencies + +clean-targets: # directories to be removed by `dbt clean` + - "build" + - "dbt_modules" + +quoting: + database: true + # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) + # all schemas should be unquoted + schema: true + identifier: true + +# You can define configurations for models in the `model-paths` directory here. +# Using these configurations, you can enable or disable models, change how they +# are materialized, and more! +models: + airbyte_utils: + +materialized: table + generated: + airbyte_ctes: + +tags: airbyte_internal_cte + # ephemeral materialization isn't supported in Databend yet + +materialized: view + airbyte_incremental: + +tags: incremental_tables + +materialized: incremental + # schema change test isn't supported in Databend yet + +on_schema_change: "ignore" + airbyte_tables: + +tags: normalized_tables + +materialized: table + airbyte_views: + +tags: airbyte_internal_views + +materialized: view + +dispatch: + - macro_namespace: dbt_utils + search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py b/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py index f459f5faecd6..9114e38cb137 100644 --- a/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py +++ b/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py @@ -106,6 +106,8 @@ def run_test(destination_type: DestinationType, column_count: int, expected_exce elif destination_type.value == DestinationType.REDSHIFT.value: # set unique schema for Redshift test dbt_test_utils.set_target_schema(dbt_test_utils.generate_random_string("test_ephemeral_")) + if destination_type.value == DestinationType.DATABEND.value: + pytest.skip("ephemeral materialization isn't supported in Databend yet") else: dbt_test_utils.set_target_schema("test_ephemeral") print(f"Testing ephemeral for destination {destination_type.value} with column count {column_count}") From 06ba91162531cdaaee7ce71133fa21026bb365d9 Mon Sep 17 00:00:00 2001 From: hantmac Date: Thu, 1 Dec 2022 21:13:25 +0800 Subject: [PATCH 08/34] fix gradle build --- .../normalization/transform_config/transform.py | 6 +++--- .../test_destination_name_transformer.py | 11 ----------- .../destination_databend/client.py | 8 +++++--- .../destination_databend/destination.py | 15 ++++++++------- .../destination_databend/writer.py | 15 ++++++++++----- .../integration_tests/integration_test.py | 4 ++-- .../connectors/destination-databend/setup.py | 7 +------ .../unit_tests/test_databend_destination.py | 7 +++---- .../destination-databend/unit_tests/unit_test.py | 14 ++------------ 9 files changed, 34 insertions(+), 53 deletions(-) diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py index 295aa5a8a5ae..c5908cfb7313 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py +++ 
b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py @@ -78,9 +78,9 @@ def create_file(name, content): def is_ssh_tunnelling(config: Dict[str, Any]) -> bool: tunnel_methods = ["SSH_KEY_AUTH", "SSH_PASSWORD_AUTH"] if ( - "tunnel_method" in config.keys() - and "tunnel_method" in config["tunnel_method"] - and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods + "tunnel_method" in config.keys() + and "tunnel_method" in config["tunnel_method"] + and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods ): return True else: diff --git a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py index d33156347966..bcb750df766b 100644 --- a/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py @@ -38,7 +38,6 @@ def before_tests(request): ("Hello World", "MySQL", True), ("Hello World", "MSSQL", True), ("Hello World", "TiDB", True), - ("Helllo World", "Databend", True), # Reserved Word for BigQuery and MySQL only ("Groups", "Postgres", False), ("Groups", "BigQuery", True), @@ -47,7 +46,6 @@ def before_tests(request): ("Groups", "MySQL", True), ("Groups", "MSSQL", False), ("Groups", "TiDB", True), - ("Groups", "Databend", True), # Doesnt start with alpha or underscore ("100x200", "Postgres", True), ("100x200", "BigQuery", False), @@ -56,7 +54,6 @@ def before_tests(request): ("100x200", "MySQL", True), ("100x200", "MSSQL", True), ("100x200", "TiDB", True), - ("100x200", "Databend", True), # Contains non alpha numeric ("post.wall", "Postgres", True), ("post.wall", "BigQuery", False), @@ -65,7 +62,6 @@ def before_tests(request): ("post.wall", "MySQL", True), ("post.wall", "MSSQL", True), ("post.wall", "TiDB", True), - ("post.wall", "Databend", True), ], ) def test_needs_quote(input_str: str, destination_type: str, expected: bool): @@ -117,7 +113,6 @@ def test_transform_standard_naming(input_str: str, expected: str): ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), ("Identifier Name", "TiDB", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), - ("Identifier Name", "Databend", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"), # Reserved Word for BigQuery and MySQL only ("Groups", "Postgres", "groups", "'groups'"), ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), @@ -126,7 +121,6 @@ def test_transform_standard_naming(input_str: str, expected: str): ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), ("Groups", "MSSQL", "groups", "'groups'"), ("Groups", "TiDB", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"), - ("Groups", "Databend", "Groups", "'Groups'"), ], ) def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str): @@ -177,7 +171,6 @@ def test_truncate_identifier(input_str: str, expected: str): ("Identifier Name5", "MySQL", "identifier_name5", "{{ adapter.quote('Identifier Name5') }}"), ("Identifier Name6", "MSSQL", "identifier_name6", "{{ adapter.quote('Identifier Name6') }}"), ("Identifier Name7", "TiDB", 
"identifier_name7", "{{ adapter.quote('Identifier Name7') }}"), - ("Identifier Name8", "Databend", "identifier_name8", "{{ adapter.quote('Identifier Name8') }}"), # Unicode ("a-Unicode_name_文1", "Postgres", "a_unicode_name__1", "{{ adapter.quote('a-Unicode_name_文1') }}"), ("a-Unicode_name_文2", "BigQuery", "a_Unicode_name__2", "a_Unicode_name__2"), @@ -186,7 +179,6 @@ def test_truncate_identifier(input_str: str, expected: str): ("a-Unicode_name_文5", "MySQL", "a_unicode_name__5", "{{ adapter.quote('a-Unicode_name_文5') }}"), ("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"), ("a-Unicode_name_文7", "TiDB", "a_unicode_name__7", "{{ adapter.quote('a-Unicode_name_文7') }}"), - ("a-Unicode_name_文8", "Databend", "a_unicode_name__8", "{{ adapter.quote('a-Unicode_name_文8') }}"), # Doesnt start with alpha or underscore ("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"), ("100x2002", "BigQuery", "100x2002", "_100x2002"), @@ -196,7 +188,6 @@ def test_truncate_identifier(input_str: str, expected: str): ("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"), ("100x2006", "MSSQL", "_100x2006", "{{ adapter.quote('100x2006') }}"), ("100x2007", "TiDB", "100x2007", "{{ adapter.quote('100x2007') }}"), - ("100x2008", "Databend", "100x2008", "{{ adapter.quote('100x2008') }}"), # Reserved Keywords in BQ and MySQL ("Groups", "Postgres", "groups", "groups"), ("Groups", "BigQuery", "Groups", "{{ adapter.quote('Groups') }}"), @@ -205,7 +196,6 @@ def test_truncate_identifier(input_str: str, expected: str): ("Groups", "MySQL", "Groups", "{{ adapter.quote('Groups') }}"), ("Groups", "MSSQL", "groups", "groups"), ("Groups", "TiDB", "Groups", "{{ adapter.quote('Groups') }}"), - ("Groups", "Databend", "Groups", "{{ adapter.quote('Groups') }}"), # Reserved Keywords ("DisTincT", "Postgres", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "BigQuery", "DisTincT", "{{ adapter.quote('DisTincT') }}"), @@ -214,7 +204,6 @@ def test_truncate_identifier(input_str: str, expected: str): ("DisTincT", "MySQL", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "MSSQL", "DisTincT", "{{ adapter.quote('DisTincT') }}"), ("DisTincT", "TiDB", "DisTincT", "{{ adapter.quote('DisTincT') }}"), - ("DisTincT", "Databend", "DisTincT", "{{ adapter.quote('DisTincT') }}"), # Quoted identifiers ("'QuoTed1 IdenTifiER'", "Postgres", "_quoted1_identifier_", "{{ adapter.quote('\\'QuoTed1 IdenTifiER\\'') }}"), ("'QuoTed2 IdenTifiER'", "BigQuery", "_QuoTed2_IdenTifiER_", "_QuoTed2_IdenTifiER_"), diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/client.py b/airbyte-integrations/connectors/destination-databend/destination_databend/client.py index 161c6f807162..989cd1dd1059 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/client.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/client.py @@ -1,8 +1,11 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + from databend_sqlalchemy import connector class DatabendClient: - def __init__(self, protocol: str, host: str, port: int, database: str, table: str, username: str, password: str = None): self.protocol = protocol self.host = host @@ -13,7 +16,6 @@ def __init__(self, protocol: str, host: str, port: int, database: str, table: st self.password = password def open(self): - handle = connector.connect( - f'{self.protocol}://{self.username}:{self.password}@{self.host}:{self.port}').cursor() + handle = connector.connect(f"{self.protocol}://{self.username}:{self.password}@{self.host}:{self.port}").cursor() return handle diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py index 45c20ba9c05f..8fcfebb6ed3b 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py @@ -3,24 +3,25 @@ # -from typing import Any, Iterable, Mapping -from logging import getLogger import json from datetime import datetime +from logging import getLogger +from typing import Any, Iterable, Mapping +from uuid import uuid4 + from airbyte_cdk import AirbyteLogger -from destination_databend.client import DatabendClient from airbyte_cdk.destinations import Destination from airbyte_cdk.models import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, DestinationSyncMode, Status, Type +from destination_databend.client import DatabendClient + from .writer import create_databend_wirter -from typing import Any, Dict, Iterable, Mapping, Optional -from uuid import uuid4 logger = getLogger("airbyte") class DestinationDatabend(Destination): def write( - self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] + self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] ) -> Iterable[AirbyteMessage]: """ @@ -79,7 +80,7 @@ def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConn try: client = DatabendClient(**config) cursor = client.open() - cursor.execute('select 1') + cursor.execute("select 1") return AirbyteConnectionStatus(status=Status.SUCCEEDED) except Exception as e: return AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {repr(e)}") diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py index 9afaebc14372..88cb6069d90a 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py @@ -1,9 +1,12 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + from collections import defaultdict -import json from datetime import datetime -from time import time -from airbyte_cdk import AirbyteLogger from itertools import chain + +from airbyte_cdk import AirbyteLogger from destination_databend.client import DatabendClient @@ -104,8 +107,10 @@ def _flush(self) -> None: cursor = self.cursor # id, written_at, data for table, data in self._buffer.items(): - cursor.execute(f"INSERT INTO _airbyte_raw_{table} (_airbyte_ab_id,_airbyte_emitted_at,_airbyte_data) VALUES (%, %, %)", - list(chain.from_iterable(data))) + cursor.execute( + f"INSERT INTO _airbyte_raw_{table} (_airbyte_ab_id,_airbyte_emitted_at,_airbyte_data) VALUES (%, %, %)", + list(chain.from_iterable(data)), + ) self._buffer.clear() self._values = 0 diff --git a/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py index 87d74732a4ac..913a0909366a 100644 --- a/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py +++ b/airbyte-integrations/connectors/destination-databend/integration_tests/integration_test.py @@ -2,11 +2,11 @@ # Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -import json, logging +import json +import logging from typing import Any, Dict, List, Mapping import pytest -from airbyte_cdk import AirbyteLogger from airbyte_cdk.models import ( AirbyteMessage, AirbyteRecordMessage, diff --git a/airbyte-integrations/connectors/destination-databend/setup.py b/airbyte-integrations/connectors/destination-databend/setup.py index 462605e00103..65671d22383a 100644 --- a/airbyte-integrations/connectors/destination-databend/setup.py +++ b/airbyte-integrations/connectors/destination-databend/setup.py @@ -5,12 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = [ - "airbyte-cdk", - "requests", - "databend-sqlalchemy", - "databend-py" -] +MAIN_REQUIREMENTS = ["airbyte-cdk", "requests", "databend-sqlalchemy", "databend-py"] TEST_REQUIREMENTS = ["pytest~=6.1"] setup( diff --git a/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py b/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py index c20039df9de1..eb6bbbffe616 100644 --- a/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py +++ b/airbyte-integrations/connectors/destination-databend/unit_tests/test_databend_destination.py @@ -3,7 +3,7 @@ # from datetime import datetime -from typing import Any, Dict +from typing import Dict from unittest.mock import AsyncMock, MagicMock, call, patch from airbyte_cdk.models import ( @@ -13,12 +13,11 @@ ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode, - Status, SyncMode, Type, ) -from destination_databend.destination import DestinationDatabend, DatabendClient -from pytest import fixture, mark +from destination_databend.destination import DatabendClient, DestinationDatabend +from pytest import fixture @fixture diff --git a/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py b/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py index 25ba388053e8..5412bd26e36b 100644 --- a/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py +++ b/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py @@ -1,8 +1,9 @@ # # Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# + from typing import Any, Union -from unittest.mock import ANY, MagicMock, call, patch +from unittest.mock import MagicMock from destination_databend.writer import DatabendSQLWriter from pytest import fixture, mark @@ -26,14 +27,6 @@ def test_sql_default(sql_writer: DatabendSQLWriter) -> None: @mark.parametrize("writer", ["sql_writer"]) def test_sql_create(client: MagicMock, writer: Union[DatabendSQLWriter], request: Any) -> None: writer = request.getfixturevalue(writer) - expected_query = """ - CREATE FACT TABLE IF NOT EXISTS _airbyte_raw_dummy ( - _airbyte_ab_id TEXT, - _airbyte_emitted_at TIMESTAMP, - _airbyte_data TEXT - ) - PRIMARY INDEX _airbyte_ab_id - """ writer.create_raw_table("dummy") @@ -51,6 +44,3 @@ def test_data_buffering(sql_writer: DatabendSQLWriter) -> None: assert len(sql_writer._buffer["dummy"]) == 2 assert len(sql_writer._buffer["dummy2"]) == 1 assert len(sql_writer._buffer.keys()) == 2 - - - From 748bee54ee1e11998b03ab7ccbddbcb67cdbc750 Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 2 Dec 2022 10:55:24 +0800 Subject: [PATCH 09/34] fix --- build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/build.gradle b/build.gradle index 8b11def5547a..940e55ea5298 100644 --- a/build.gradle +++ b/build.gradle @@ -118,6 +118,7 @@ def createSpotlessTarget = { pattern -> 'dbt-project-template-mysql', 'dbt-project-template-oracle', 'dbt-project-template-clickhouse', + 'dbt-project-template-databend', 'dbt-project-template-snowflake', 'dbt-project-template-tidb', 'dbt_test_config', From d326285f70c2fb0e2091bc2c67c8fe50e1df2731 Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 2 Dec 2022 15:29:28 +0800 Subject: [PATCH 10/34] update --- .../base-normalization/databend.Dockerfile | 2 +- ..._columns_resulting_into_long_names_ab1.sql | 19 ++ ..._columns_resulting_into_long_names_ab2.sql | 19 ++ ...ing_into_long_names_partition_DATA_ab1.sql | 20 +++ ...esulting_into_long_names_partition_ab1.sql | 19 ++ ..._names_partition_double_array_data_ab1.sql | 20 +++ ..._columns_resulting_into_long_names_scd.sql | 162 ++++++++++++++++++ ...plex_columns_resulting_into_long_names.sql | 22 +++ ...ns_resulting_into_long_names_partition.sql | 19 ++ ...sulting_into_long_names_partition_DATA.sql | 18 ++ ...long_names_partition_double_array_data.sql | 18 ++ .../models/generated/sources.yml | 23 +++ 12 files changed, 360 insertions(+), 1 deletion(-) create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql create mode 100644 
airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql create mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml diff --git a/airbyte-integrations/bases/base-normalization/databend.Dockerfile b/airbyte-integrations/bases/base-normalization/databend.Dockerfile index deddbc0c2190..ed271403e89b 100644 --- a/airbyte-integrations/bases/base-normalization/databend.Dockerfile +++ b/airbyte-integrations/bases/base-normalization/databend.Dockerfile @@ -21,7 +21,7 @@ RUN pip install . 
WORKDIR /airbyte/normalization_code/dbt-template/ #RUN pip install dbt-databend-cloud -RUN pip install git+https://github.com/databendcloud/dbt-databend.git +RUN pip install dbt-databend-cloud==1.2.8 # Download external dbt dependencies RUN dbt deps diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql new file mode 100644 index 000000000000..527abd7fec29 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql @@ -0,0 +1,19 @@ +{{ config( + unique_key = '_airbyte_ab_id', + schema = "_airbyte_test_normalization", + tags = [ "top-level-intermediate" ] +) }} +-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema +-- depends_on: {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} +select + {{ json_extract_scalar('_airbyte_data', ['id'], ['id']) }} as id, + {{ json_extract_scalar('_airbyte_data', ['date'], ['date']) }} as date, + {{ json_extract('table_alias', '_airbyte_data', ['partition'], ['partition']) }} as partition, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at +from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} as table_alias +-- nested_stream_with_complex_columns_resulting_into_long_names +where 1 = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql new file mode 100644 index 000000000000..40974116420c --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql @@ -0,0 +1,19 @@ +{{ config( + unique_key = '_airbyte_ab_id', + schema = "_airbyte_test_normalization", + tags = [ "top-level-intermediate" ] +) }} +-- SQL model to cast each column to its adequate SQL type converted from the JSON schema type +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }} +select + cast(id as {{ dbt_utils.type_string() }}) as id, + cast(date as {{ dbt_utils.type_string() }}) as date, + cast(partition as {{ type_json() }}) as partition, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }} +-- 
nested_stream_with_complex_columns_resulting_into_long_names +where 1 = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql new file mode 100644 index 000000000000..f21e68df4476 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql @@ -0,0 +1,20 @@ +{{ config( + schema = "_airbyte_test_normalization", + tags = [ "nested-intermediate" ] +) }} +-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} +{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'DATA') }} +select + _airbyte_partition_hashid, + {{ json_extract_scalar(unnested_column_value('DATA'), ['currency'], ['currency']) }} as currency, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias +-- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA +{{ cross_join_unnest('partition', 'DATA') }} +where 1 = 1 +and DATA is not null +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql new file mode 100644 index 000000000000..cb91f0f5a201 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql @@ -0,0 +1,19 @@ +{{ config( + schema = "_airbyte_test_normalization", + tags = [ "nested-intermediate" ] +) }} +-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} +select + _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid, + {{ json_extract_array('partition', ['double_array_data'], ['double_array_data']) }} as double_array_data, + {{ json_extract_array('partition', ['DATA'], ['DATA']) }} as DATA, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at +from {{ 
ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} as table_alias +-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition +where 1 = 1 +and partition is not null +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql new file mode 100644 index 000000000000..5f5a0066bb92 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql @@ -0,0 +1,20 @@ +{{ config( + schema = "_airbyte_test_normalization", + tags = [ "nested-intermediate" ] +) }} +-- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} +{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'double_array_data') }} +select + _airbyte_partition_hashid, + {{ json_extract_scalar(unnested_column_value('double_array_data'), ['id'], ['id']) }} as id, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias +-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data +{{ cross_join_unnest('partition', 'double_array_data') }} +where 1 = 1 +and double_array_data is not null +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql new file mode 100644 index 000000000000..3990b534b1b3 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql @@ -0,0 +1,162 @@ +{{ config( + unique_key = "_airbyte_unique_key_scd", + schema = "test_normalization", + post_hook = [" + {% + set final_table_relation = adapter.get_relation( + database=this.database, + schema=this.schema, + identifier='nested_stream_with_complex_columns_resulting_into_long_names' + ) + %} + {# + If the final table doesn't exist, then obviously we can't delete anything from it. 
+ Also, after a reset, the final table is created without the _airbyte_unique_key column (this column is created during the first sync) + So skip this deletion if the column doesn't exist. (in this case, the table is guaranteed to be empty anyway) + #} + {% + if final_table_relation is not none and '_airbyte_unique_key' in adapter.get_columns_in_relation(final_table_relation)|map(attribute='name') + %} + -- Delete records which are no longer active: + -- This query is equivalent, but the left join version is more performant: + -- delete from final_table where unique_key in ( + -- select unique_key from scd_table where 1 = 1 + -- ) and unique_key not in ( + -- select unique_key from scd_table where active_row = 1 + -- ) + -- We're incremental against normalized_at rather than emitted_at because we need to fetch the SCD + -- entries that were _updated_ recently. This is because a deleted record will have an SCD record + -- which was emitted a long time ago, but recently re-normalized to have active_row = 0. + delete from {{ final_table_relation }} where {{ final_table_relation }}._airbyte_unique_key in ( + select recent_records.unique_key + from ( + select distinct _airbyte_unique_key as unique_key + from {{ this }} + where 1=1 {{ incremental_clause('_airbyte_normalized_at', adapter.quote(this.schema) + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }} + ) recent_records + left join ( + select _airbyte_unique_key as unique_key, count(_airbyte_unique_key) as active_count + from {{ this }} + where _airbyte_active_row = 1 {{ incremental_clause('_airbyte_normalized_at', adapter.quote(this.schema) + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }} + group by _airbyte_unique_key + ) active_counts + on recent_records.unique_key = active_counts.unique_key + where active_count is null or active_count = 0 + ) + {% else %} + -- We have to have a non-empty query, so just do a noop delete + delete from {{ this }} where 1=0 + {% endif %} + ","drop view _airbyte_test_normalization.nested_stream_with_complex_columns_resulting_into_long_names_stg"], + tags = [ "top-level" ] +) }} +-- depends_on: ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') +with +{% if is_incremental() %} +new_data as ( + -- retrieve incremental "new" data + select + * + from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }} + -- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} + where 1 = 1 + {{ incremental_clause('_airbyte_emitted_at', this) }} +), +new_data_ids as ( + -- build a subset of _airbyte_unique_key from rows that are new + select distinct + {{ dbt_utils.surrogate_key([ + 'id', + ]) }} as _airbyte_unique_key + from new_data +), +empty_new_data as ( + -- build an empty table to only keep the table's column types + select * from new_data where 1 = 0 +), +previous_active_scd_data as ( + -- retrieve "incomplete old" data that needs to be updated with an end date because of new changes + select + {{ star_intersect(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg'), this, from_alias='inc_data', intersect_alias='this_data') }} + from {{ this }} as this_data + -- make a join with new_data using primary key to filter active data that need to be updated only + join new_data_ids on this_data._airbyte_unique_key = new_data_ids._airbyte_unique_key + -- force left 
join to NULL values (we just need to transfer column types only for the star_intersect macro on schema changes) + left join empty_new_data as inc_data on this_data._airbyte_ab_id = inc_data._airbyte_ab_id + where _airbyte_active_row = 1 +), +input_data as ( + select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from new_data + union all + select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from previous_active_scd_data +), +{% else %} +input_data as ( + select * + from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }} + -- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} +), +{% endif %} +scd_data as ( + -- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key + select + {{ dbt_utils.surrogate_key([ + 'id', + ]) }} as _airbyte_unique_key, + id, + date, + partition, + date as _airbyte_start_at, + lag(date) over ( + partition by id + order by + date is null asc, + date desc, + _airbyte_emitted_at desc + ) as _airbyte_end_at, + case when row_number() over ( + partition by id + order by + date is null asc, + date desc, + _airbyte_emitted_at desc + ) = 1 then 1 else 0 end as _airbyte_active_row, + _airbyte_ab_id, + _airbyte_emitted_at, + _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid + from input_data +), +dedup_data as ( + select + -- we need to ensure de-duplicated rows for merge/update queries + -- additionally, we generate a unique key for the scd table + row_number() over ( + partition by + _airbyte_unique_key, + _airbyte_start_at, + _airbyte_emitted_at + order by _airbyte_active_row desc, _airbyte_ab_id + ) as _airbyte_row_num, + {{ dbt_utils.surrogate_key([ + '_airbyte_unique_key', + '_airbyte_start_at', + '_airbyte_emitted_at' + ]) }} as _airbyte_unique_key_scd, + scd_data.* + from scd_data +) +select + _airbyte_unique_key, + _airbyte_unique_key_scd, + id, + date, + partition, + _airbyte_start_at, + _airbyte_end_at, + _airbyte_active_row, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at, + _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid +from dedup_data where _airbyte_row_num = 1 + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql new file mode 100644 index 000000000000..16e9999a1767 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql @@ -0,0 +1,22 @@ +{{ config( + unique_key = "_airbyte_unique_key", + schema = "test_normalization", + tags = [ "top-level" ] +) }} +-- Final base SQL model +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} +select + _airbyte_unique_key, + id, + date, + 
partition, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at, + _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} +-- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} +where 1 = 1 +and _airbyte_active_row = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql new file mode 100644 index 000000000000..c2f68acb99c1 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql @@ -0,0 +1,19 @@ +{{ config( + schema = "test_normalization", + tags = [ "nested" ] +) }} +-- Final base SQL model +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }} +select + _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid, + double_array_data, + DATA, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at, + _airbyte_partition_hashid +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }} +-- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} +where 1 = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql new file mode 100644 index 000000000000..c2ad0964d06d --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql @@ -0,0 +1,18 @@ +{{ config( + schema = "test_normalization", + tags = [ "nested" ] +) }} +-- Final base SQL model +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }} +select + _airbyte_partition_hashid, + currency, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at, + _airbyte_DATA_hashid +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }} +-- DATA at 
nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} +where 1 = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql new file mode 100644 index 000000000000..0d3f5190d847 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql @@ -0,0 +1,18 @@ +{{ config( + schema = "test_normalization", + tags = [ "nested" ] +) }} +-- Final base SQL model +-- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }} +select + _airbyte_partition_hashid, + id, + _airbyte_ab_id, + _airbyte_emitted_at, + {{ current_timestamp() }} as _airbyte_normalized_at, + _airbyte_double_array_data_hashid +from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }} +-- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} +where 1 = 1 +{{ incremental_clause('_airbyte_emitted_at', this) }} + diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml new file mode 100644 index 000000000000..29bae1b4b510 --- /dev/null +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml @@ -0,0 +1,23 @@ +version: 2 +sources: +- name: test_normalization + quoting: + database: true + schema: false + identifier: false + tables: + - name: _airbyte_raw_arrays + - name: _airbyte_raw_conflict_stream_array + - name: _airbyte_raw_conflict_stream_name + - name: _airbyte_raw_conflict_stream_scalar + - name: _airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + - name: _airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names + - name: _airbyte_raw_some_stream_that_was_empty + - name: _airbyte_raw_unnest_alias +- name: test_normalization_namespace + quoting: + database: true + schema: false + identifier: false + tables: + - name: _airbyte_raw_simple_stream_with_namespace_resulting_into_long_names From 936c02b0225b53ba20dff17c3b97907d913dea3d Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 2 Dec 2022 17:13:21 +0800 Subject: [PATCH 11/34] fix transform --- .../test_nested_streams/dbt_project.yml | 146 ++++++++++++------ .../transform_config/transform.py | 9 +- 2 files changed, 107 insertions(+), 48 deletions(-) diff --git 
a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml index 58d58e7e1104..c54a0bf34350 100644 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml +++ b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml @@ -1,65 +1,125 @@ -# This file is necessary to install dbt-utils with dbt deps -# the content will be overwritten by the transform function - -# Name your package! Package names should contain only lowercase characters -# and underscores. A good package name should reflect your organization's -# name or the intended use of these models -name: "airbyte_utils" -version: "1.0" +name: airbyte_utils +version: '1.0' config-version: 2 - -# This setting configures which "profile" dbt uses for this project. Profiles contain -# database connection information, and should be configured in the ~/.dbt/profiles.yml file -profile: "normalize" - -# These configurations specify where dbt should look for different types of files. -# The `model-paths` config, for example, states that source models can be found -# in the "models/" directory. You probably won't need to change these! -model-paths: ["models"] -docs-paths: ["docs"] -analysis-paths: ["analysis"] -test-paths: ["tests"] -seed-paths: ["data"] -macro-paths: ["macros"] - -target-path: "../build" # directory which will store compiled SQL files -log-path: "../logs" # directory which will store DBT logs -packages-install-path: "/dbt" # directory which will store external DBT dependencies - -clean-targets: # directories to be removed by `dbt clean` - - "build" - - "dbt_modules" - +profile: normalize +model-paths: +- models +docs-paths: +- docs +analysis-paths: +- analysis +test-paths: +- tests +seed-paths: +- data +macro-paths: +- macros +target-path: ../build +log-path: ../logs +packages-install-path: /dbt +clean-targets: +- build +- dbt_modules quoting: database: true - # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) - # all schemas should be unquoted schema: true identifier: true - -# You can define configurations for models in the `model-paths` directory here. -# Using these configurations, you can enable or disable models, change how they -# are materialized, and more! 
models: airbyte_utils: +materialized: table generated: airbyte_ctes: +tags: airbyte_internal_cte - # ephemeral materialization isn't supported in Databend yet +materialized: view airbyte_incremental: +tags: incremental_tables +materialized: incremental - # schema change test isn't supported in Databend yet - +on_schema_change: "ignore" + +on_schema_change: ignore airbyte_tables: +tags: normalized_tables +materialized: table airbyte_views: +tags: airbyte_internal_views +materialized: view - dispatch: - - macro_namespace: dbt_utils - search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file +- macro_namespace: dbt_utils + search_order: + - airbyte_utils + - dbt_utils +vars: + json_column: _airbyte_data + models_to_source: + nested_stream_with_complex_columns_resulting_into_long_names_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_stg: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_scd: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + non_nested_stream_without_namespace_resulting_into_long_names_ab1: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names + non_nested_stream_without_namespace_resulting_into_long_names_ab2: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names + non_nested_stream_without_namespace_resulting_into_long_names_ab3: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names + non_nested_stream_without_namespace_resulting_into_long_names: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names + some_stream_that_was_empty_ab1: test_normalization._airbyte_raw_some_stream_that_was_empty + some_stream_that_was_empty_ab2: test_normalization._airbyte_raw_some_stream_that_was_empty + some_stream_that_was_empty_stg: test_normalization._airbyte_raw_some_stream_that_was_empty + some_stream_that_was_empty_scd: test_normalization._airbyte_raw_some_stream_that_was_empty + some_stream_that_was_empty: test_normalization._airbyte_raw_some_stream_that_was_empty + simple_stream_with_namespace_resulting_into_long_names_ab1: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names + simple_stream_with_namespace_resulting_into_long_names_ab2: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names + simple_stream_with_namespace_resulting_into_long_names_ab3: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names + simple_stream_with_namespace_resulting_into_long_names: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names + conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name: 
test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_scalar_ab1: test_normalization._airbyte_raw_conflict_stream_scalar + conflict_stream_scalar_ab2: test_normalization._airbyte_raw_conflict_stream_scalar + conflict_stream_scalar_ab3: test_normalization._airbyte_raw_conflict_stream_scalar + conflict_stream_scalar: test_normalization._airbyte_raw_conflict_stream_scalar + conflict_stream_array_ab1: test_normalization._airbyte_raw_conflict_stream_array + conflict_stream_array_ab2: test_normalization._airbyte_raw_conflict_stream_array + conflict_stream_array_ab3: test_normalization._airbyte_raw_conflict_stream_array + conflict_stream_array: test_normalization._airbyte_raw_conflict_stream_array + unnest_alias_ab1: test_normalization._airbyte_raw_unnest_alias + unnest_alias_ab2: test_normalization._airbyte_raw_unnest_alias + unnest_alias_ab3: test_normalization._airbyte_raw_unnest_alias + unnest_alias: test_normalization._airbyte_raw_unnest_alias + arrays_ab1: test_normalization._airbyte_raw_arrays + arrays_ab2: test_normalization._airbyte_raw_arrays + arrays_ab3: test_normalization._airbyte_raw_arrays + arrays: test_normalization._airbyte_raw_arrays + nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name + unnest_alias_children_ab1: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_ab2: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_ab3: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children: test_normalization._airbyte_raw_unnest_alias + arrays_nested_array_parent_ab1: test_normalization._airbyte_raw_arrays + arrays_nested_array_parent_ab2: test_normalization._airbyte_raw_arrays + arrays_nested_array_parent_ab3: test_normalization._airbyte_raw_arrays + arrays_nested_array_parent: test_normalization._airbyte_raw_arrays + nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data: 
test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names + conflict_stream_name_conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name + conflict_stream_name_conflict_stream_name_conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name + unnest_alias_children_owner_ab1: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_ab2: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_ab3: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_column___with__quotes_ab1: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_column___with__quotes_ab2: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_column___with__quotes_ab3: test_normalization._airbyte_raw_unnest_alias + unnest_alias_children_owner_column___with__quotes: test_normalization._airbyte_raw_unnest_alias diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py index c5908cfb7313..3e92df5edcea 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py @@ -78,9 +78,9 @@ def create_file(name, content): def is_ssh_tunnelling(config: Dict[str, Any]) -> bool: tunnel_methods = ["SSH_KEY_AUTH", "SSH_PASSWORD_AUTH"] if ( - "tunnel_method" in config.keys() - and "tunnel_method" in config["tunnel_method"] - and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods + "tunnel_method" in config.keys() + and "tunnel_method" in config["tunnel_method"] + and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods ): return True else: @@ -355,9 +355,8 @@ def transform_databend(config: Dict[str, Any]): "port": config["port"], "schema": config["database"], "user": config["username"], + "pass": config.get("password", ""), } - if "pass" in config: - dbt_config["pass"] = config["password"] return dbt_config @staticmethod From 34efd07aba2eda1093d54efe02bebc5fba31f6a2 Mon Sep 17 00:00:00 2001 From: hantmac Date: Mon, 5 Dec 2022 15:35:29 +0800 Subject: [PATCH 12/34] fix spec.json --- .../destination-databend/destination_databend/spec.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json index 13f842129d6b..4e630900664e 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json @@ -53,7 +53,7 @@ "type" : "string", "order" : 3 }, - "schema" : { + "table" : { "title" : "Default Table", "description" : "The default table was written to.", "type" : "string", From 75b336734038e8700a1c8b29c99ee2cf39104e67 Mon Sep 17 00:00:00 2001 From: hantmac Date: Wed, 14 Dec 2022 14:11:51 +0800 Subject: [PATCH 13/34] fix --- .../bases/base-normalization/databend.Dockerfile | 2 +- .../transform_catalog/destination_name_transformer.py | 10 +++++----- .../connectors/destination-databend/setup.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/airbyte-integrations/bases/base-normalization/databend.Dockerfile b/airbyte-integrations/bases/base-normalization/databend.Dockerfile index ed271403e89b..24695edbab12 100644 --- a/airbyte-integrations/bases/base-normalization/databend.Dockerfile +++ b/airbyte-integrations/bases/base-normalization/databend.Dockerfile @@ -21,7 +21,7 @@ RUN pip install . WORKDIR /airbyte/normalization_code/dbt-template/ #RUN pip install dbt-databend-cloud -RUN pip install dbt-databend-cloud==1.2.8 +RUN pip install dbt-databend-cloud==1.3.2 # Download external dbt dependencies RUN dbt deps diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py index abde0cda3933..abf5e1f6f7cc 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py @@ -81,7 +81,7 @@ def normalize_schema_name(self, schema_name: str, in_jinja: bool = False, trunca return self.__normalize_non_column_identifier_name(input_name=schema_name, in_jinja=in_jinja, truncate=truncate) def normalize_table_name( - self, table_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, table_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: """ @param table_name is the table to normalize @@ -98,7 +98,7 @@ def normalize_table_name( ) def normalize_column_name( - self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: """ @param column_name is the column to normalize @@ -125,7 +125,7 @@ def truncate_identifier_name(self, input_name: str, custom_limit: int = -1, conf middle = round(limit / 2) # truncate in the middle to preserve prefix/suffix instead prefix = input_name[: limit - middle - 1] - suffix = input_name[1 - middle :] + suffix = input_name[1 - middle:] # Add extra characters '__', signaling a truncate in identifier print(f"Truncating {input_name} (#{len(input_name)}) to {prefix}_{suffix} (#{2 + len(prefix) + len(suffix)})") mid = "__" @@ -145,7 +145,7 @@ def get_name_max_length(self): # Private methods def 
__normalize_non_column_identifier_name( - self, input_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, input_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: # We force standard naming for non column names (see issue #1785) result = transform_standard_naming(input_name) @@ -161,7 +161,7 @@ def __normalize_non_column_identifier_name( return result def __normalize_identifier_name( - self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: result = self.__normalize_naming_conventions(column_name, is_column=True) if truncate: diff --git a/airbyte-integrations/connectors/destination-databend/setup.py b/airbyte-integrations/connectors/destination-databend/setup.py index 65671d22383a..7aa721c49fdd 100644 --- a/airbyte-integrations/connectors/destination-databend/setup.py +++ b/airbyte-integrations/connectors/destination-databend/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -MAIN_REQUIREMENTS = ["airbyte-cdk", "requests", "databend-sqlalchemy", "databend-py"] +MAIN_REQUIREMENTS = ["airbyte-cdk", "requests", "databend-sqlalchemy"] TEST_REQUIREMENTS = ["pytest~=6.1"] setup( From 7d41ae863d46005e0e49cc2996f483eb5fbcc640 Mon Sep 17 00:00:00 2001 From: hantmac Date: Wed, 14 Dec 2022 14:19:23 +0800 Subject: [PATCH 14/34] more check --- .../destination_databend/destination.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py index 8fcfebb6ed3b..2ee1552b5a50 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py @@ -21,7 +21,7 @@ class DestinationDatabend(Destination): def write( - self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] + self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] ) -> Iterable[AirbyteMessage]: """ @@ -80,7 +80,10 @@ def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConn try: client = DatabendClient(**config) cursor = client.open() - cursor.execute("select 1") + cursor.execute("DROP TABLE IF EXISTS test") + cursor.execute('CREATE TABLE if not exists test (x Int32,y VARCHAR)') + cursor.execute('INSERT INTO test (x,y) VALUES (%,%)', [1, 'yy', 2, 'xx']) + cursor.execute("DROP TABLE IF EXISTS test") return AirbyteConnectionStatus(status=Status.SUCCEEDED) except Exception as e: return AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {repr(e)}") From ef35195c85f409bb21c78c5075f74ac42045a701 Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 16 Dec 2022 11:14:26 +0800 Subject: [PATCH 15/34] add finally for connector --- .../destination_databend/writer.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py index 
88cb6069d90a..53d86cd4ba82 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py @@ -7,6 +7,7 @@ from itertools import chain from airbyte_cdk import AirbyteLogger +from airbyte_cdk.models import AirbyteConnectionStatus,Status from destination_databend.client import DatabendClient @@ -22,10 +23,18 @@ def __init__(self, client: DatabendClient) -> None: :param client: Databend SDK connection class with established connection to the database. """ - self.client = client - self.cursor = client.open() - self._buffer = defaultdict(list) - self._values = 0 + try: + # open a cursor and do some work with it + self.client = client + self.cursor = client.open() + self._buffer = defaultdict(list) + self._values = 0 + except Exception as e: + # handle the exception + raise AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {repr(e)}") + finally: + # close the cursor + self.cursor.close() def delete_table(self, name: str) -> None: """ From 5ddc5d905053525138056fe98bd8ef9f9d6fc829 Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 16 Dec 2022 13:09:42 +0800 Subject: [PATCH 16/34] fix comment --- .../destination-databend/destination_databend/writer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py index 53d86cd4ba82..c70e4eda6918 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py @@ -7,7 +7,7 @@ from itertools import chain from airbyte_cdk import AirbyteLogger -from airbyte_cdk.models import AirbyteConnectionStatus,Status +from airbyte_cdk.models import AirbyteConnectionStatus, Status from destination_databend.client import DatabendClient @@ -94,8 +94,7 @@ class DatabendSQLWriter(DatabendWriter): """ Data writer using the SQL writing strategy. Data is buffered in memory - and flushed using INSERT INTO SQL statement. This is less effective strategy - better suited for testing and small data sets. + and flushed using INSERT INTO SQL statement. """ flush_interval = 1000 From 28081fdc09276f4a34960e0980d347134f2290cf Mon Sep 17 00:00:00 2001 From: hantmac Date: Fri, 16 Dec 2022 13:11:41 +0800 Subject: [PATCH 17/34] fix --- .../destination-databend/destination_databend/writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py index c70e4eda6918..a9c4dfe57da1 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/writer.py @@ -43,8 +43,7 @@ def delete_table(self, name: str) -> None: :param name: table name to delete. 
""" - cursor = self.cursor - cursor.execute(f"DROP TABLE IF EXISTS _airbyte_raw_{name}") + self.cursor.execute(f"DROP TABLE IF EXISTS _airbyte_raw_{name}") def create_raw_table(self, name: str): """ From d324ef33df1e573a84bd15f4ed902ebf38fc0a9a Mon Sep 17 00:00:00 2001 From: hantmac Date: Tue, 20 Dec 2022 13:02:37 +0800 Subject: [PATCH 18/34] add section to destination_definitions --- .../main/resources/seed/destination_definitions.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index 22a3651376b9..5c62bd86354a 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -418,3 +418,14 @@ documentationUrl: https://docs.airbyte.com/integrations/destinations/yugabytedb icon: yugabytedb.svg releaseStage: alpha +- name: Databend + destinationDefinitionId: 302e4d8e-08d3-4098-acd4-ac67ca365b88 + dockerRepository: airbyte/destination-databend + dockerImageTag: 0.1.0 + documentationUrl: https://docs.airbyte.com/integrations/destinations/databend + normalizationConfig: + normalizationRepository: airbyte/normalization-databend + normalizationTag: 0.2.25 + normalizationIntegrationType: databend + supportsDbt: true + releaseStage: alpha From cee19c3c07222fd0943648d50da1090fbb00d110 Mon Sep 17 00:00:00 2001 From: hantmac Date: Tue, 20 Dec 2022 14:39:04 +0800 Subject: [PATCH 19/34] default port --- .../destination-databend/destination_databend/spec.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json index 4e630900664e..441b7c65d9ba 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json @@ -41,9 +41,9 @@ "type" : "integer", "minimum" : 0, "maximum" : 65536, - "default" : 8081, + "default" : 443, "examples" : [ - "8081" + "443" ], "order" : 2 }, From 20ef8fca76b497125417139fcf68b49148377258 Mon Sep 17 00:00:00 2001 From: hantmac Date: Tue, 20 Dec 2022 16:29:21 +0800 Subject: [PATCH 20/34] add databend icon --- airbyte-config/init/src/main/resources/icons/databend.svg | 1 + .../init/src/main/resources/seed/destination_definitions.yaml | 1 + airbyte-webapp/src/utils/connectors/destinations.json | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 airbyte-config/init/src/main/resources/icons/databend.svg diff --git a/airbyte-config/init/src/main/resources/icons/databend.svg b/airbyte-config/init/src/main/resources/icons/databend.svg new file mode 100644 index 000000000000..2761a45c04b4 --- /dev/null +++ b/airbyte-config/init/src/main/resources/icons/databend.svg @@ -0,0 +1 @@ +databend logo \ No newline at end of file diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index 5c62bd86354a..e37a726f0132 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -422,6 +422,7 @@ destinationDefinitionId: 302e4d8e-08d3-4098-acd4-ac67ca365b88 dockerRepository: airbyte/destination-databend 
dockerImageTag: 0.1.0 + icon: databend.svg documentationUrl: https://docs.airbyte.com/integrations/destinations/databend normalizationConfig: normalizationRepository: airbyte/normalization-databend diff --git a/airbyte-webapp/src/utils/connectors/destinations.json b/airbyte-webapp/src/utils/connectors/destinations.json index a3189be25cc0..d938c4e96a04 100644 --- a/airbyte-webapp/src/utils/connectors/destinations.json +++ b/airbyte-webapp/src/utils/connectors/destinations.json @@ -46,5 +46,6 @@ "LocalSqLite": "b76be0a6-27dc-4560-95f6-2623da0bd7b6", "TiDb": "06ec60c7-7468-45c0-91ac-174f6e1a788b", "Typesense": "36be8dc6-9851-49af-b776-9d4c30e4ab6a", - "YugabyteDb": "2300fdcf-a532-419f-9f24-a014336e7966" + "YugabyteDb": "2300fdcf-a532-419f-9f24-a014336e7966", + "Databend": "302e4d8e-08d3-4098-acd4-ac67ca365b88" } From 687aa9b2fe26ec38b205534db4e5b6792bda1c5f Mon Sep 17 00:00:00 2001 From: hantmac Date: Wed, 21 Dec 2022 08:58:53 +0800 Subject: [PATCH 21/34] add sample_config.json --- .../integration_tests/sample_config.json | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 airbyte-integrations/connectors/destination-databend/integration_tests/sample_config.json diff --git a/airbyte-integrations/connectors/destination-databend/integration_tests/sample_config.json b/airbyte-integrations/connectors/destination-databend/integration_tests/sample_config.json new file mode 100644 index 000000000000..cc8ac8584d94 --- /dev/null +++ b/airbyte-integrations/connectors/destination-databend/integration_tests/sample_config.json @@ -0,0 +1,9 @@ +{ + "protocol" : "https", + "host" : "tnc7yee14--xxxx.ch.datafusecloud.com", + "port" : 443, + "username" : "username", + "password" : "password", + "database" : "default", + "table" : "default" +} From 7d91bff355aab8923cccb56fb7b3162ace5a81b3 Mon Sep 17 00:00:00 2001 From: grishick Date: Fri, 30 Dec 2022 12:32:54 -0800 Subject: [PATCH 22/34] Fix formatting --- .../transform_catalog/destination_name_transformer.py | 10 +++++----- .../normalization/transform_config/transform.py | 6 +++--- .../destination_databend/destination.py | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py index abf5e1f6f7cc..abde0cda3933 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py @@ -81,7 +81,7 @@ def normalize_schema_name(self, schema_name: str, in_jinja: bool = False, trunca return self.__normalize_non_column_identifier_name(input_name=schema_name, in_jinja=in_jinja, truncate=truncate) def normalize_table_name( - self, table_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, table_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: """ @param table_name is the table to normalize @@ -98,7 +98,7 @@ def normalize_table_name( ) def normalize_column_name( - self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: """ @param column_name is 
the column to normalize @@ -125,7 +125,7 @@ def truncate_identifier_name(self, input_name: str, custom_limit: int = -1, conf middle = round(limit / 2) # truncate in the middle to preserve prefix/suffix instead prefix = input_name[: limit - middle - 1] - suffix = input_name[1 - middle:] + suffix = input_name[1 - middle :] # Add extra characters '__', signaling a truncate in identifier print(f"Truncating {input_name} (#{len(input_name)}) to {prefix}_{suffix} (#{2 + len(prefix) + len(suffix)})") mid = "__" @@ -145,7 +145,7 @@ def get_name_max_length(self): # Private methods def __normalize_non_column_identifier_name( - self, input_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, input_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: # We force standard naming for non column names (see issue #1785) result = transform_standard_naming(input_name) @@ -161,7 +161,7 @@ def __normalize_non_column_identifier_name( return result def __normalize_identifier_name( - self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 + self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0 ) -> str: result = self.__normalize_naming_conventions(column_name, is_column=True) if truncate: diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py index 3e92df5edcea..842dae0afc59 100644 --- a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py +++ b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py @@ -78,9 +78,9 @@ def create_file(name, content): def is_ssh_tunnelling(config: Dict[str, Any]) -> bool: tunnel_methods = ["SSH_KEY_AUTH", "SSH_PASSWORD_AUTH"] if ( - "tunnel_method" in config.keys() - and "tunnel_method" in config["tunnel_method"] - and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods + "tunnel_method" in config.keys() + and "tunnel_method" in config["tunnel_method"] + and config["tunnel_method"]["tunnel_method"].upper() in tunnel_methods ): return True else: diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py index 2ee1552b5a50..2629ff54983a 100644 --- a/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py +++ b/airbyte-integrations/connectors/destination-databend/destination_databend/destination.py @@ -21,7 +21,7 @@ class DestinationDatabend(Destination): def write( - self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] + self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage] ) -> Iterable[AirbyteMessage]: """ @@ -81,8 +81,8 @@ def check(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteConn client = DatabendClient(**config) cursor = client.open() cursor.execute("DROP TABLE IF EXISTS test") - cursor.execute('CREATE TABLE if not exists test (x Int32,y VARCHAR)') - cursor.execute('INSERT INTO test (x,y) VALUES (%,%)', [1, 'yy', 2, 'xx']) + cursor.execute("CREATE TABLE if not exists test (x Int32,y VARCHAR)") + 
cursor.execute("INSERT INTO test (x,y) VALUES (%,%)", [1, "yy", 2, "xx"]) cursor.execute("DROP TABLE IF EXISTS test") return AirbyteConnectionStatus(status=Status.SUCCEEDED) except Exception as e: From a9ec2564b3b5a6eb16f610bca3a4bfdf3a6244b2 Mon Sep 17 00:00:00 2001 From: grishick Date: Fri, 30 Dec 2022 19:43:05 -0800 Subject: [PATCH 23/34] Remove databend from seed file and UI (this will have to be a separate PR) --- .../main/resources/seed/destination_definitions.yaml | 12 ------------ .../unit_tests/{unit_test.py => test_writer.py} | 0 .../src/utils/connectors/destinations.json | 3 +-- 3 files changed, 1 insertion(+), 14 deletions(-) rename airbyte-integrations/connectors/destination-databend/unit_tests/{unit_test.py => test_writer.py} (100%) diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml index e37a726f0132..22a3651376b9 100644 --- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml @@ -418,15 +418,3 @@ documentationUrl: https://docs.airbyte.com/integrations/destinations/yugabytedb icon: yugabytedb.svg releaseStage: alpha -- name: Databend - destinationDefinitionId: 302e4d8e-08d3-4098-acd4-ac67ca365b88 - dockerRepository: airbyte/destination-databend - dockerImageTag: 0.1.0 - icon: databend.svg - documentationUrl: https://docs.airbyte.com/integrations/destinations/databend - normalizationConfig: - normalizationRepository: airbyte/normalization-databend - normalizationTag: 0.2.25 - normalizationIntegrationType: databend - supportsDbt: true - releaseStage: alpha diff --git a/airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py b/airbyte-integrations/connectors/destination-databend/unit_tests/test_writer.py similarity index 100% rename from airbyte-integrations/connectors/destination-databend/unit_tests/unit_test.py rename to airbyte-integrations/connectors/destination-databend/unit_tests/test_writer.py diff --git a/airbyte-webapp/src/utils/connectors/destinations.json b/airbyte-webapp/src/utils/connectors/destinations.json index d938c4e96a04..a3189be25cc0 100644 --- a/airbyte-webapp/src/utils/connectors/destinations.json +++ b/airbyte-webapp/src/utils/connectors/destinations.json @@ -46,6 +46,5 @@ "LocalSqLite": "b76be0a6-27dc-4560-95f6-2623da0bd7b6", "TiDb": "06ec60c7-7468-45c0-91ac-174f6e1a788b", "Typesense": "36be8dc6-9851-49af-b776-9d4c30e4ab6a", - "YugabyteDb": "2300fdcf-a532-419f-9f24-a014336e7966", - "Databend": "302e4d8e-08d3-4098-acd4-ac67ca365b88" + "YugabyteDb": "2300fdcf-a532-419f-9f24-a014336e7966" } From 6cecea8f9fa647a89978b7b80c241b37d9f5cb56 Mon Sep 17 00:00:00 2001 From: grishick Date: Wed, 4 Jan 2023 08:30:01 -0800 Subject: [PATCH 24/34] Add databend dependency --- settings.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/settings.gradle b/settings.gradle index ba9e30251976..8b594d803305 100644 --- a/settings.gradle +++ b/settings.gradle @@ -146,6 +146,7 @@ if (!System.getenv().containsKey("SUB_BUILD") || System.getenv().get("SUB_BUILD" include ':airbyte-integrations:connectors:destination-mssql' include ':airbyte-integrations:connectors:destination-clickhouse' include ':airbyte-integrations:connectors:destination-tidb' + include ':airbyte-integrations:connectors:destination-databend' //Needed by destination-bigquery include ':airbyte-integrations:connectors:destination-s3' From 7238ce811dddf2c0c9ccc9521095891125091cef Mon 
Sep 17 00:00:00 2001 From: grishick Date: Thu, 5 Jan 2023 11:36:21 -0800 Subject: [PATCH 25/34] remove normalization from databend destination --- .../bases/base-normalization/README.md | 1 - .../bases/base-normalization/build.gradle | 10 -- .../dbt_project.yml | 65 ------- .../packages.yml | 5 - .../macros/cross_db_utils/array.sql | 5 - .../macros/cross_db_utils/datatypes.sql | 36 ---- .../macros/cross_db_utils/json_operations.sql | 25 --- .../macros/cross_db_utils/quote.sql | 4 - .../docker-compose.build.yaml | 7 - .../base-normalization/docker-compose.yaml | 2 - .../test_nested_streams/dbt_project.yml | 125 -------------- ..._columns_resulting_into_long_names_ab1.sql | 19 -- ..._columns_resulting_into_long_names_ab2.sql | 19 -- ...ing_into_long_names_partition_DATA_ab1.sql | 20 --- ...esulting_into_long_names_partition_ab1.sql | 19 -- ..._names_partition_double_array_data_ab1.sql | 20 --- ..._columns_resulting_into_long_names_scd.sql | 162 ------------------ ...plex_columns_resulting_into_long_names.sql | 22 --- ...ns_resulting_into_long_names_partition.sql | 19 -- ...sulting_into_long_names_partition_DATA.sql | 18 -- ...long_names_partition_double_array_data.sql | 18 -- .../models/generated/sources.yml | 23 --- .../test_simple_streams/dbt_project.yml | 65 ------- build.gradle | 1 - docs/cloud/core-concepts.md | 1 - .../basic-normalization.md | 1 - settings.gradle | 1 - 27 files changed, 713 deletions(-) delete mode 100644 airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml delete mode 100644 airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql delete mode 100644 
airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml delete mode 100644 airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml diff --git a/airbyte-integrations/bases/base-normalization/README.md b/airbyte-integrations/bases/base-normalization/README.md index 99644d9f82fa..bfa9ada93db4 100644 --- a/airbyte-integrations/bases/base-normalization/README.md +++ b/airbyte-integrations/bases/base-normalization/README.md @@ -235,7 +235,6 @@ allowed characters, if quotes are needed or not, and the length limitations: * [mysql](../../../docs/integrations/destinations/mysql.md) * [oracle](../../../docs/integrations/destinations/oracle.md) * [mssql](../../../docs/integrations/destinations/mssql.md) -* [databend](../../../docs/integrations/destinations/databend.md) Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit: * `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii` diff --git a/airbyte-integrations/bases/base-normalization/build.gradle b/airbyte-integrations/bases/base-normalization/build.gradle index 5c5b50598568..a95816c2d0b9 100644 --- a/airbyte-integrations/bases/base-normalization/build.gradle +++ b/airbyte-integrations/bases/base-normalization/build.gradle @@ -1,5 +1,3 @@ -import java.nio.file.Paths - plugins { id 'airbyte-docker' id 'airbyte-python' @@ -84,11 +82,6 @@ task airbyteDockerTiDB(type: Exec, dependsOn: checkSshScriptCopy) { dependsOn assemble } -task airbyteDockerDatabend(type: Exec, dependsOn: checkSshScriptCopy) { - configure buildAirbyteDocker('databend') - dependsOn assemble -} - airbyteDocker.dependsOn(airbyteDockerMSSql) airbyteDocker.dependsOn(airbyteDockerMySql) airbyteDocker.dependsOn(airbyteDockerOracle) @@ -96,7 +89,6 @@ airbyteDocker.dependsOn(airbyteDockerClickhouse) airbyteDocker.dependsOn(airbyteDockerSnowflake) airbyteDocker.dependsOn(airbyteDockerRedshift) airbyteDocker.dependsOn(airbyteDockerTiDB) -airbyteDocker.dependsOn(airbyteDockerDatabend) task("customIntegrationTestPython", type: PythonTask, dependsOn: installTestReqs) { module = "pytest" @@ -112,7 +104,6 @@ task("customIntegrationTestPython", type: PythonTask, dependsOn: installTestReqs dependsOn ':airbyte-integrations:connectors:destination-mssql:airbyteDocker' dependsOn 
':airbyte-integrations:connectors:destination-clickhouse:airbyteDocker' dependsOn ':airbyte-integrations:connectors:destination-tidb:airbyteDocker' - dependsOn ':airbyte-integrations:connectors:destination-databend:airbyteDocker' } // not really sure what this task does differently from customIntegrationTestPython, but it seems to also run integration tests @@ -127,7 +118,6 @@ project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte- project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-mssql:airbyteDocker' project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-clickhouse:airbyteDocker' project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-tidb:airbyteDocker' -project.tasks.findByName('_customIntegrationTestsCoverage').dependsOn ':airbyte-integrations:connectors:destination-databend:airbyteDocker' // DATs have some additional tests that exercise normalization code paths, // so we want to run these in addition to the base-normalization integration tests. diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml deleted file mode 100644 index 58d58e7e1104..000000000000 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/dbt_project.yml +++ /dev/null @@ -1,65 +0,0 @@ -# This file is necessary to install dbt-utils with dbt deps -# the content will be overwritten by the transform function - -# Name your package! Package names should contain only lowercase characters -# and underscores. A good package name should reflect your organization's -# name or the intended use of these models -name: "airbyte_utils" -version: "1.0" -config-version: 2 - -# This setting configures which "profile" dbt uses for this project. Profiles contain -# database connection information, and should be configured in the ~/.dbt/profiles.yml file -profile: "normalize" - -# These configurations specify where dbt should look for different types of files. -# The `model-paths` config, for example, states that source models can be found -# in the "models/" directory. You probably won't need to change these! -model-paths: ["models"] -docs-paths: ["docs"] -analysis-paths: ["analysis"] -test-paths: ["tests"] -seed-paths: ["data"] -macro-paths: ["macros"] - -target-path: "../build" # directory which will store compiled SQL files -log-path: "../logs" # directory which will store DBT logs -packages-install-path: "/dbt" # directory which will store external DBT dependencies - -clean-targets: # directories to be removed by `dbt clean` - - "build" - - "dbt_modules" - -quoting: - database: true - # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) - # all schemas should be unquoted - schema: true - identifier: true - -# You can define configurations for models in the `model-paths` directory here. -# Using these configurations, you can enable or disable models, change how they -# are materialized, and more! 
-models: - airbyte_utils: - +materialized: table - generated: - airbyte_ctes: - +tags: airbyte_internal_cte - # ephemeral materialization isn't supported in Databend yet - +materialized: view - airbyte_incremental: - +tags: incremental_tables - +materialized: incremental - # schema change test isn't supported in Databend yet - +on_schema_change: "ignore" - airbyte_tables: - +tags: normalized_tables - +materialized: table - airbyte_views: - +tags: airbyte_internal_views - +materialized: view - -dispatch: - - macro_namespace: dbt_utils - search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml b/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml deleted file mode 100644 index 33b4edd58c8c..000000000000 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template-databend/packages.yml +++ /dev/null @@ -1,5 +0,0 @@ -# add dependencies. these will get pulled during the `dbt deps` process. - -packages: - - git: "https://github.com/fishtown-analytics/dbt-utils.git" - revision: 0.8.2 diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql index e2ab50385d4a..56ab17ce9af6 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/array.sql @@ -6,7 +6,6 @@ - postgres: unnest() -> https://www.postgresqltutorial.com/postgresql-array/ - MSSQL: openjson() –> https://docs.microsoft.com/en-us/sql/relational-databases/json/validate-query-and-change-json-data-with-built-in-functions-sql-server?view=sql-server-ver15 - ClickHouse: ARRAY JOIN –> https://clickhouse.com/docs/zh/sql-reference/statements/select/array-join/ - - Databend: unnest() -> https://databend.rs/doc/sql-reference/data-types/data-type-array-types/ #} {# cross_join_unnest ------------------------------------------------- #} @@ -27,10 +26,6 @@ ARRAY JOIN {{ array_col }} {%- endmacro %} -{% macro databend__cross_join_unnest(stream_name, array_col) -%} - unnest({{ array_col }}) -{%- endmacro %} - {% macro oracle__cross_join_unnest(stream_name, array_col) -%} {% do exceptions.warn("Normalization does not support unnesting for Oracle yet.") %} {%- endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql index d4cf2f20e361..a3c66782eefd 100755 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql @@ -44,10 +44,6 @@ json {%- endmacro -%} -{%- macro databend__type_json() -%} - json -{%- endmacro -%} - {# string ------------------------------------------------- #} @@ -160,10 +156,6 @@ float {% endmacro %} -{% macro databend__type_numeric() %} - DOUBLE -{% endmacro %} - {# very_large_integer --------------------------------------- --#} {# Most databases don't have a true unbounded numeric datatype, so we use a really big numeric field. @@ -194,10 +186,6 @@ so this macro needs to be called very_large_integer. 
decimal(38, 0) {% endmacro %} -{% macro databend__type_very_large_integer() %} - numeric -{% endmacro %} - {# timestamp ------------------------------------------------- --#} {% macro mysql__type_timestamp() %} time @@ -217,10 +205,6 @@ so this macro needs to be called very_large_integer. time {% endmacro %} -{% macro databend__type_timestamp() %} - timestamp -{% endmacro %} - {# timestamp with time zone ------------------------------------------------- #} {%- macro type_timestamp_with_timezone() -%} @@ -261,10 +245,6 @@ so this macro needs to be called very_large_integer. char(1000) {%- endmacro -%} -{% macro databend__type_timestamp_with_timezone() %} - TIMESTAMP -{% endmacro %} - {# timestamp without time zone ------------------------------------------------- #} {%- macro type_timestamp_without_timezone() -%} @@ -297,10 +277,6 @@ so this macro needs to be called very_large_integer. datetime {% endmacro %} -{% macro databend__type_timestamp_without_timezone() %} - timestamp -{% endmacro %} - {# time without time zone ------------------------------------------------- #} {%- macro type_time_without_timezone() -%} @@ -327,10 +303,6 @@ so this macro needs to be called very_large_integer. time {% endmacro %} -{% macro databend__type_time_without_timezone() %} - String -{% endmacro %} - {# time with time zone ------------------------------------------------- #} @@ -374,10 +346,6 @@ so this macro needs to be called very_large_integer. char(1000) {%- endmacro -%} -{% macro databend__type_time_with_timezone() %} - String -{% endmacro %} - {# date ------------------------------------------------- #} {%- macro type_date() -%} @@ -399,7 +367,3 @@ so this macro needs to be called very_large_integer. {% macro clickhouse__type_date() %} Date32 {% endmacro %} - -{% macro databend__type_date() %} - DATE -{% endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql index c8fd28df396e..0b76f5f49a29 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/json_operations.sql @@ -7,7 +7,6 @@ - MySQL: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) -> https://dev.mysql.com/doc/refman/8.0/en/json-search-functions.html - ClickHouse: JSONExtractString(json_doc, 'path' [, 'path'] ...) -> https://clickhouse.com/docs/en/sql-reference/functions/json-functions/ - TiDB: JSON_EXTRACT(json_doc, 'path' [, 'path'] ...) 
-> https://docs.pingcap.com/tidb/stable/json-functions - - Databend: json_extract_path_text( , ) -> https://databend.rs/doc/sql-functions/semi-structured-functions/json_extract_path_text #} {# format_json_path -------------------------------------------------- #} @@ -104,14 +103,6 @@ {{ "'$.\"" ~ json_path_list|join(".") ~ "\"'" }} {%- endmacro %} -{% macro databend__format_json_path(json_path_list) -%} - {%- set str_list = [] -%} - {%- for json_path in json_path_list -%} - {%- if str_list.append(json_path.replace("'", "''").replace('"', '""')) -%} {%- endif -%} - {%- endfor -%} - {{ "'\"" ~ str_list|join('"."') ~ "\"'" }} -{%- endmacro %} - {# json_extract ------------------------------------------------- #} {% macro json_extract(from_table, json_column, json_path_list, normalized_json_path) -%} @@ -189,14 +180,6 @@ {% endif -%} {%- endmacro %} -{% macro databend__json_extract(from_table, json_column, json_path_list, normalized_json_path) -%} - {%- if from_table|string() == '' %} - get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }}) - {% else %} - get_path(parse_json({{ from_table }}.{{ json_column }}), {{ format_json_path(json_path_list) }}) - {% endif -%} -{%- endmacro %} - {# json_extract_scalar ------------------------------------------------- #} {% macro json_extract_scalar(json_column, json_path_list, normalized_json_path) -%} @@ -251,10 +234,6 @@ ) {%- endmacro %} -{% macro databend__json_extract_scalar(json_column, json_path_list, normalized_json_path) -%} - to_varchar(get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }})) -{%- endmacro %} - {# json_extract_array ------------------------------------------------- #} {% macro json_extract_array(json_column, json_path_list, normalized_json_path) -%} @@ -305,10 +284,6 @@ json_extract({{ json_column }}, {{ format_json_path(normalized_json_path) }}) {%- endmacro %} -{% macro databend__json_extract_array(json_column, json_path_list, normalized_json_path) -%} - get_path(parse_json({{ json_column }}), {{ format_json_path(json_path_list) }}) -{%- endmacro %} - {# json_extract_string_array ------------------------------------------------- #} {% macro json_extract_string_array(json_column, json_path_list, normalized_json_path) -%} diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql index d82a005d5c91..87862498cfc5 100644 --- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql +++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/quote.sql @@ -14,7 +14,3 @@ {% macro clickhouse__quote(column_name) -%} {{ '\"' ~ column_name ~ '\"'}} {%- endmacro %} - -{% macro databend__quote(column_name) -%} - {{ '\"' ~ column_name ~ '\"'}} -{%- endmacro %} diff --git a/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml b/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml index 79fec482c45b..4f95cb7a4720 100644 --- a/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml +++ b/airbyte-integrations/bases/base-normalization/docker-compose.build.yaml @@ -57,10 +57,3 @@ services: context: . labels: io.airbyte.git-revision: ${GIT_REVISION} - normalization-databend: - image: airbyte/normalization-databend:${VERSION} - build: - dockerfile: databend.Dockerfile - context: . 
- labels: - io.airbyte.git-revision: ${GIT_REVISION} diff --git a/airbyte-integrations/bases/base-normalization/docker-compose.yaml b/airbyte-integrations/bases/base-normalization/docker-compose.yaml index 0fb017e11204..ae29237b5149 100644 --- a/airbyte-integrations/bases/base-normalization/docker-compose.yaml +++ b/airbyte-integrations/bases/base-normalization/docker-compose.yaml @@ -18,5 +18,3 @@ services: image: airbyte/normalization-redshift:${VERSION} normalization-tidb: image: airbyte/normalization-tidb:${VERSION} - normalization-databend: - image: airbyte/normalization-databend:${VERSION} diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml deleted file mode 100644 index c54a0bf34350..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/dbt_project.yml +++ /dev/null @@ -1,125 +0,0 @@ -name: airbyte_utils -version: '1.0' -config-version: 2 -profile: normalize -model-paths: -- models -docs-paths: -- docs -analysis-paths: -- analysis -test-paths: -- tests -seed-paths: -- data -macro-paths: -- macros -target-path: ../build -log-path: ../logs -packages-install-path: /dbt -clean-targets: -- build -- dbt_modules -quoting: - database: true - schema: true - identifier: true -models: - airbyte_utils: - +materialized: table - generated: - airbyte_ctes: - +tags: airbyte_internal_cte - +materialized: view - airbyte_incremental: - +tags: incremental_tables - +materialized: incremental - +on_schema_change: ignore - airbyte_tables: - +tags: normalized_tables - +materialized: table - airbyte_views: - +tags: airbyte_internal_views - +materialized: view -dispatch: -- macro_namespace: dbt_utils - search_order: - - airbyte_utils - - dbt_utils -vars: - json_column: _airbyte_data - models_to_source: - nested_stream_with_complex_columns_resulting_into_long_names_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_stg: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_scd: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - non_nested_stream_without_namespace_resulting_into_long_names_ab1: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names - non_nested_stream_without_namespace_resulting_into_long_names_ab2: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names - non_nested_stream_without_namespace_resulting_into_long_names_ab3: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names - non_nested_stream_without_namespace_resulting_into_long_names: test_normalization._airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names - some_stream_that_was_empty_ab1: 
test_normalization._airbyte_raw_some_stream_that_was_empty - some_stream_that_was_empty_ab2: test_normalization._airbyte_raw_some_stream_that_was_empty - some_stream_that_was_empty_stg: test_normalization._airbyte_raw_some_stream_that_was_empty - some_stream_that_was_empty_scd: test_normalization._airbyte_raw_some_stream_that_was_empty - some_stream_that_was_empty: test_normalization._airbyte_raw_some_stream_that_was_empty - simple_stream_with_namespace_resulting_into_long_names_ab1: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names - simple_stream_with_namespace_resulting_into_long_names_ab2: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names - simple_stream_with_namespace_resulting_into_long_names_ab3: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names - simple_stream_with_namespace_resulting_into_long_names: test_normalization_namespace._airbyte_raw_simple_stream_with_namespace_resulting_into_long_names - conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_scalar_ab1: test_normalization._airbyte_raw_conflict_stream_scalar - conflict_stream_scalar_ab2: test_normalization._airbyte_raw_conflict_stream_scalar - conflict_stream_scalar_ab3: test_normalization._airbyte_raw_conflict_stream_scalar - conflict_stream_scalar: test_normalization._airbyte_raw_conflict_stream_scalar - conflict_stream_array_ab1: test_normalization._airbyte_raw_conflict_stream_array - conflict_stream_array_ab2: test_normalization._airbyte_raw_conflict_stream_array - conflict_stream_array_ab3: test_normalization._airbyte_raw_conflict_stream_array - conflict_stream_array: test_normalization._airbyte_raw_conflict_stream_array - unnest_alias_ab1: test_normalization._airbyte_raw_unnest_alias - unnest_alias_ab2: test_normalization._airbyte_raw_unnest_alias - unnest_alias_ab3: test_normalization._airbyte_raw_unnest_alias - unnest_alias: test_normalization._airbyte_raw_unnest_alias - arrays_ab1: test_normalization._airbyte_raw_arrays - arrays_ab2: test_normalization._airbyte_raw_arrays - arrays_ab3: test_normalization._airbyte_raw_arrays - arrays: test_normalization._airbyte_raw_arrays - nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name: 
test_normalization._airbyte_raw_conflict_stream_name - unnest_alias_children_ab1: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_ab2: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_ab3: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children: test_normalization._airbyte_raw_unnest_alias - arrays_nested_array_parent_ab1: test_normalization._airbyte_raw_arrays - arrays_nested_array_parent_ab2: test_normalization._airbyte_raw_arrays - arrays_nested_array_parent_ab3: test_normalization._airbyte_raw_arrays - arrays_nested_array_parent: test_normalization._airbyte_raw_arrays - nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab2: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA: test_normalization._airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - conflict_stream_name_conflict_stream_name_conflict_stream_name_ab1: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name_conflict_stream_name_ab2: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name_conflict_stream_name_ab3: test_normalization._airbyte_raw_conflict_stream_name - conflict_stream_name_conflict_stream_name_conflict_stream_name: test_normalization._airbyte_raw_conflict_stream_name - unnest_alias_children_owner_ab1: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_ab2: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_ab3: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_column___with__quotes_ab1: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_column___with__quotes_ab2: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_column___with__quotes_ab3: test_normalization._airbyte_raw_unnest_alias - unnest_alias_children_owner_column___with__quotes: test_normalization._airbyte_raw_unnest_alias diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql 
b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql deleted file mode 100644 index 527abd7fec29..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab1.sql +++ /dev/null @@ -1,19 +0,0 @@ -{{ config( - unique_key = '_airbyte_ab_id', - schema = "_airbyte_test_normalization", - tags = [ "top-level-intermediate" ] -) }} --- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema --- depends_on: {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} -select - {{ json_extract_scalar('_airbyte_data', ['id'], ['id']) }} as id, - {{ json_extract_scalar('_airbyte_data', ['date'], ['date']) }} as date, - {{ json_extract('table_alias', '_airbyte_data', ['partition'], ['partition']) }} as partition, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at -from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} as table_alias --- nested_stream_with_complex_columns_resulting_into_long_names -where 1 = 1 -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql deleted file mode 100644 index 40974116420c..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_ab2.sql +++ /dev/null @@ -1,19 +0,0 @@ -{{ config( - unique_key = '_airbyte_ab_id', - schema = "_airbyte_test_normalization", - tags = [ "top-level-intermediate" ] -) }} --- SQL model to cast each column to its adequate SQL type converted from the JSON schema type --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }} -select - cast(id as {{ dbt_utils.type_string() }}) as id, - cast(date as {{ dbt_utils.type_string() }}) as date, - cast(partition as {{ type_json() }}) as partition, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_ab1') }} --- nested_stream_with_complex_columns_resulting_into_long_names -where 1 = 1 -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql 
b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql deleted file mode 100644 index f21e68df4476..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab1.sql +++ /dev/null @@ -1,20 +0,0 @@ -{{ config( - schema = "_airbyte_test_normalization", - tags = [ "nested-intermediate" ] -) }} --- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} -{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'DATA') }} -select - _airbyte_partition_hashid, - {{ json_extract_scalar(unnested_column_value('DATA'), ['currency'], ['currency']) }} as currency, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias --- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA -{{ cross_join_unnest('partition', 'DATA') }} -where 1 = 1 -and DATA is not null -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql deleted file mode 100644 index cb91f0f5a201..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_ab1.sql +++ /dev/null @@ -1,19 +0,0 @@ -{{ config( - schema = "_airbyte_test_normalization", - tags = [ "nested-intermediate" ] -) }} --- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} -select - _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid, - {{ json_extract_array('partition', ['double_array_data'], ['double_array_data']) }} as double_array_data, - {{ json_extract_array('partition', ['DATA'], ['DATA']) }} as DATA, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} as table_alias --- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition -where 1 = 1 -and partition is not null -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git 
a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql deleted file mode 100644 index 5f5a0066bb92..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_ctes/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab1.sql +++ /dev/null @@ -1,20 +0,0 @@ -{{ config( - schema = "_airbyte_test_normalization", - tags = [ "nested-intermediate" ] -) }} --- SQL model to parse JSON blob stored in a single column and extract into separated field columns as described by the JSON Schema --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} -{{ unnest_cte(ref('nested_stream_with_complex_columns_resulting_into_long_names_partition'), 'partition', 'double_array_data') }} -select - _airbyte_partition_hashid, - {{ json_extract_scalar(unnested_column_value('double_array_data'), ['id'], ['id']) }} as id, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} as table_alias --- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data -{{ cross_join_unnest('partition', 'double_array_data') }} -where 1 = 1 -and double_array_data is not null -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql deleted file mode 100644 index 3990b534b1b3..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/scd/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_scd.sql +++ /dev/null @@ -1,162 +0,0 @@ -{{ config( - unique_key = "_airbyte_unique_key_scd", - schema = "test_normalization", - post_hook = [" - {% - set final_table_relation = adapter.get_relation( - database=this.database, - schema=this.schema, - identifier='nested_stream_with_complex_columns_resulting_into_long_names' - ) - %} - {# - If the final table doesn't exist, then obviously we can't delete anything from it. - Also, after a reset, the final table is created without the _airbyte_unique_key column (this column is created during the first sync) - So skip this deletion if the column doesn't exist. 
(in this case, the table is guaranteed to be empty anyway) - #} - {% - if final_table_relation is not none and '_airbyte_unique_key' in adapter.get_columns_in_relation(final_table_relation)|map(attribute='name') - %} - -- Delete records which are no longer active: - -- This query is equivalent, but the left join version is more performant: - -- delete from final_table where unique_key in ( - -- select unique_key from scd_table where 1 = 1 - -- ) and unique_key not in ( - -- select unique_key from scd_table where active_row = 1 - -- ) - -- We're incremental against normalized_at rather than emitted_at because we need to fetch the SCD - -- entries that were _updated_ recently. This is because a deleted record will have an SCD record - -- which was emitted a long time ago, but recently re-normalized to have active_row = 0. - delete from {{ final_table_relation }} where {{ final_table_relation }}._airbyte_unique_key in ( - select recent_records.unique_key - from ( - select distinct _airbyte_unique_key as unique_key - from {{ this }} - where 1=1 {{ incremental_clause('_airbyte_normalized_at', adapter.quote(this.schema) + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }} - ) recent_records - left join ( - select _airbyte_unique_key as unique_key, count(_airbyte_unique_key) as active_count - from {{ this }} - where _airbyte_active_row = 1 {{ incremental_clause('_airbyte_normalized_at', adapter.quote(this.schema) + '.' + adapter.quote('nested_stream_with_complex_columns_resulting_into_long_names')) }} - group by _airbyte_unique_key - ) active_counts - on recent_records.unique_key = active_counts.unique_key - where active_count is null or active_count = 0 - ) - {% else %} - -- We have to have a non-empty query, so just do a noop delete - delete from {{ this }} where 1=0 - {% endif %} - ","drop view _airbyte_test_normalization.nested_stream_with_complex_columns_resulting_into_long_names_stg"], - tags = [ "top-level" ] -) }} --- depends_on: ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') -with -{% if is_incremental() %} -new_data as ( - -- retrieve incremental "new" data - select - * - from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }} - -- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} - where 1 = 1 - {{ incremental_clause('_airbyte_emitted_at', this) }} -), -new_data_ids as ( - -- build a subset of _airbyte_unique_key from rows that are new - select distinct - {{ dbt_utils.surrogate_key([ - 'id', - ]) }} as _airbyte_unique_key - from new_data -), -empty_new_data as ( - -- build an empty table to only keep the table's column types - select * from new_data where 1 = 0 -), -previous_active_scd_data as ( - -- retrieve "incomplete old" data that needs to be updated with an end date because of new changes - select - {{ star_intersect(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg'), this, from_alias='inc_data', intersect_alias='this_data') }} - from {{ this }} as this_data - -- make a join with new_data using primary key to filter active data that need to be updated only - join new_data_ids on this_data._airbyte_unique_key = new_data_ids._airbyte_unique_key - -- force left join to NULL values (we just need to transfer column types only for the star_intersect macro on schema changes) - left join empty_new_data as inc_data on this_data._airbyte_ab_id = 
inc_data._airbyte_ab_id - where _airbyte_active_row = 1 -), -input_data as ( - select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from new_data - union all - select {{ dbt_utils.star(ref('nested_stream_with_complex_columns_resulting_into_long_names_stg')) }} from previous_active_scd_data -), -{% else %} -input_data as ( - select * - from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_stg') }} - -- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} -), -{% endif %} -scd_data as ( - -- SQL model to build a Type 2 Slowly Changing Dimension (SCD) table for each record identified by their primary key - select - {{ dbt_utils.surrogate_key([ - 'id', - ]) }} as _airbyte_unique_key, - id, - date, - partition, - date as _airbyte_start_at, - lag(date) over ( - partition by id - order by - date is null asc, - date desc, - _airbyte_emitted_at desc - ) as _airbyte_end_at, - case when row_number() over ( - partition by id - order by - date is null asc, - date desc, - _airbyte_emitted_at desc - ) = 1 then 1 else 0 end as _airbyte_active_row, - _airbyte_ab_id, - _airbyte_emitted_at, - _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid - from input_data -), -dedup_data as ( - select - -- we need to ensure de-duplicated rows for merge/update queries - -- additionally, we generate a unique key for the scd table - row_number() over ( - partition by - _airbyte_unique_key, - _airbyte_start_at, - _airbyte_emitted_at - order by _airbyte_active_row desc, _airbyte_ab_id - ) as _airbyte_row_num, - {{ dbt_utils.surrogate_key([ - '_airbyte_unique_key', - '_airbyte_start_at', - '_airbyte_emitted_at' - ]) }} as _airbyte_unique_key_scd, - scd_data.* - from scd_data -) -select - _airbyte_unique_key, - _airbyte_unique_key_scd, - id, - date, - partition, - _airbyte_start_at, - _airbyte_end_at, - _airbyte_active_row, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at, - _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid -from dedup_data where _airbyte_row_num = 1 - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql deleted file mode 100644 index 16e9999a1767..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names.sql +++ /dev/null @@ -1,22 +0,0 @@ -{{ config( - unique_key = "_airbyte_unique_key", - schema = "test_normalization", - tags = [ "top-level" ] -) }} --- Final base SQL model --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} -select - _airbyte_unique_key, - id, - date, - partition, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at, - _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid 
-from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} --- nested_stream_with_complex_columns_resulting_into_long_names from {{ source('test_normalization', '_airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names') }} -where 1 = 1 -and _airbyte_active_row = 1 -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql deleted file mode 100644 index c2f68acb99c1..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition.sql +++ /dev/null @@ -1,19 +0,0 @@ -{{ config( - schema = "test_normalization", - tags = [ "nested" ] -) }} --- Final base SQL model --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }} -select - _airbyte_nested_stream_with_complex_columns_resulting_into_long_names_hashid, - double_array_data, - DATA, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at, - _airbyte_partition_hashid -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_ab3') }} --- partition at nested_stream_with_complex_columns_resulting_into_long_names/partition from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_scd') }} -where 1 = 1 -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql deleted file mode 100644 index c2ad0964d06d..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA.sql +++ /dev/null @@ -1,18 +0,0 @@ -{{ config( - schema = "test_normalization", - tags = [ "nested" ] -) }} --- Final base SQL model --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }} -select - _airbyte_partition_hashid, - currency, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at, - _airbyte_DATA_hashid -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_DATA_ab3') }} --- DATA at nested_stream_with_complex_columns_resulting_into_long_names/partition/DATA from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} -where 1 = 1 -{{ 
incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql deleted file mode 100644 index 0d3f5190d847..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/airbyte_incremental/test_normalization/nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data.sql +++ /dev/null @@ -1,18 +0,0 @@ -{{ config( - schema = "test_normalization", - tags = [ "nested" ] -) }} --- Final base SQL model --- depends_on: {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }} -select - _airbyte_partition_hashid, - id, - _airbyte_ab_id, - _airbyte_emitted_at, - {{ current_timestamp() }} as _airbyte_normalized_at, - _airbyte_double_array_data_hashid -from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition_double_array_data_ab3') }} --- double_array_data at nested_stream_with_complex_columns_resulting_into_long_names/partition/double_array_data from {{ ref('nested_stream_with_complex_columns_resulting_into_long_names_partition') }} -where 1 = 1 -{{ incremental_clause('_airbyte_emitted_at', this) }} - diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml deleted file mode 100644 index 29bae1b4b510..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_nested_streams/models/generated/sources.yml +++ /dev/null @@ -1,23 +0,0 @@ -version: 2 -sources: -- name: test_normalization - quoting: - database: true - schema: false - identifier: false - tables: - - name: _airbyte_raw_arrays - - name: _airbyte_raw_conflict_stream_array - - name: _airbyte_raw_conflict_stream_name - - name: _airbyte_raw_conflict_stream_scalar - - name: _airbyte_raw_nested_stream_with_complex_columns_resulting_into_long_names - - name: _airbyte_raw_non_nested_stream_without_namespace_resulting_into_long_names - - name: _airbyte_raw_some_stream_that_was_empty - - name: _airbyte_raw_unnest_alias -- name: test_normalization_namespace - quoting: - database: true - schema: false - identifier: false - tables: - - name: _airbyte_raw_simple_stream_with_namespace_resulting_into_long_names diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml b/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml deleted file mode 100644 index 58d58e7e1104..000000000000 --- a/airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output/databend/test_simple_streams/dbt_project.yml +++ /dev/null @@ -1,65 +0,0 @@ -# 
This file is necessary to install dbt-utils with dbt deps -# the content will be overwritten by the transform function - -# Name your package! Package names should contain only lowercase characters -# and underscores. A good package name should reflect your organization's -# name or the intended use of these models -name: "airbyte_utils" -version: "1.0" -config-version: 2 - -# This setting configures which "profile" dbt uses for this project. Profiles contain -# database connection information, and should be configured in the ~/.dbt/profiles.yml file -profile: "normalize" - -# These configurations specify where dbt should look for different types of files. -# The `model-paths` config, for example, states that source models can be found -# in the "models/" directory. You probably won't need to change these! -model-paths: ["models"] -docs-paths: ["docs"] -analysis-paths: ["analysis"] -test-paths: ["tests"] -seed-paths: ["data"] -macro-paths: ["macros"] - -target-path: "../build" # directory which will store compiled SQL files -log-path: "../logs" # directory which will store DBT logs -packages-install-path: "/dbt" # directory which will store external DBT dependencies - -clean-targets: # directories to be removed by `dbt clean` - - "build" - - "dbt_modules" - -quoting: - database: true - # Temporarily disabling the behavior of the ExtendedNameTransformer on table/schema names, see (issue #1785) - # all schemas should be unquoted - schema: true - identifier: true - -# You can define configurations for models in the `model-paths` directory here. -# Using these configurations, you can enable or disable models, change how they -# are materialized, and more! -models: - airbyte_utils: - +materialized: table - generated: - airbyte_ctes: - +tags: airbyte_internal_cte - # ephemeral materialization isn't supported in Databend yet - +materialized: view - airbyte_incremental: - +tags: incremental_tables - +materialized: incremental - # schema change test isn't supported in Databend yet - +on_schema_change: "ignore" - airbyte_tables: - +tags: normalized_tables - +materialized: table - airbyte_views: - +tags: airbyte_internal_views - +materialized: view - -dispatch: - - macro_namespace: dbt_utils - search_order: ["airbyte_utils", "dbt_utils"] \ No newline at end of file diff --git a/build.gradle b/build.gradle index 940e55ea5298..8b11def5547a 100644 --- a/build.gradle +++ b/build.gradle @@ -118,7 +118,6 @@ def createSpotlessTarget = { pattern -> 'dbt-project-template-mysql', 'dbt-project-template-oracle', 'dbt-project-template-clickhouse', - 'dbt-project-template-databend', 'dbt-project-template-snowflake', 'dbt-project-template-tidb', 'dbt_test_config', diff --git a/docs/cloud/core-concepts.md b/docs/cloud/core-concepts.md index 3f7781d8315d..c53c3189712a 100644 --- a/docs/cloud/core-concepts.md +++ b/docs/cloud/core-concepts.md @@ -139,7 +139,6 @@ Note that normalization is only relevant for the following relational database & * Oracle * MySQL * MSSQL -* DATABEND Other destinations do not support normalization as described in this section, though they may normalize data in a format that makes sense for them. For example, the S3 destination connector offers the option of writing JSON files in S3, but also offers the option of writing statically typed files such as Parquet or Avro. 
diff --git a/docs/understanding-airbyte/basic-normalization.md b/docs/understanding-airbyte/basic-normalization.md
index 7857a408cc13..305e382f75c6 100644
--- a/docs/understanding-airbyte/basic-normalization.md
+++ b/docs/understanding-airbyte/basic-normalization.md
@@ -102,7 +102,6 @@ In Airbyte, the current normalization option is implemented using a dbt Transfor
 * [Postgres](../integrations/destinations/postgres.md)
 * [Redshift](../integrations/destinations/redshift.md)
 * [Snowflake](../integrations/destinations/snowflake.md)
-* [Databend](../integrations/destinations/databend.md)
 
 Basic Normalization can be configured when you're creating the connection between your Connection Setup and after in the Transformation Tab. Select the option: **Normalized tabular data**.

diff --git a/settings.gradle b/settings.gradle
index 8b594d803305..ba9e30251976 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -146,7 +146,6 @@ if (!System.getenv().containsKey("SUB_BUILD") || System.getenv().get("SUB_BUILD"
     include ':airbyte-integrations:connectors:destination-mssql'
     include ':airbyte-integrations:connectors:destination-clickhouse'
     include ':airbyte-integrations:connectors:destination-tidb'
-    include ':airbyte-integrations:connectors:destination-databend'
     //Needed by destination-bigquery
     include ':airbyte-integrations:connectors:destination-s3'

From 16779515be8527f251ed628e45e93062ff46ba79 Mon Sep 17 00:00:00 2001
From: grishick
Date: Thu, 5 Jan 2023 11:36:48 -0800
Subject: [PATCH 26/34] remove normalization and dedupe from databend destination spec

---
 .../destination-databend/destination_databend/spec.json | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json
index 441b7c65d9ba..9605df680ada 100644
--- a/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json
+++ b/airbyte-integrations/connectors/destination-databend/destination_databend/spec.json
@@ -2,12 +2,11 @@
   "documentationUrl" : "https://docs.airbyte.com/integrations/destinations/databend",
   "supported_destination_sync_modes" : [
     "overwrite",
-    "append",
-    "append_dedup"
+    "append"
   ],
   "supportsIncremental" : true,
-  "supportsDBT" : true,
-  "supportsNormalization" : true,
+  "supportsDBT" : false,
+  "supportsNormalization" : false,
   "connectionSpecification" : {
     "$schema" : "http://json-schema.org/draft-07/schema#",
     "title" : "Destination Databend",

From b0076183f3da05560e0330db65f2ad4dc57777f0 Mon Sep 17 00:00:00 2001
From: grishick
Date: Thu, 5 Jan 2023 11:40:54 -0800
Subject: [PATCH 27/34] removing normalization from databend destination

---
 .../macros/cross_db_utils/datatypes.sql | 16 ----------------
 .../integration_tests/dbt_integration_test.py | 4 ----
 .../integration_tests/test_ephemeral.py | 2 --
 .../integration_tests/test_normalization.py | 10 +---------
 .../normalization/destination_type.py | 1 -
 .../destination_name_transformer.py | 7 -------
 .../normalization/transform_config/transform.py | 14 --------------
 7 files changed, 1 insertion(+), 53 deletions(-)

diff --git a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql
index a3c66782eefd..42f5312b054f 100755
--- a/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql
+++ b/airbyte-integrations/bases/base-normalization/dbt-project-template/macros/cross_db_utils/datatypes.sql
@@ -72,10 +72,6 @@
     char(1000)
 {%- endmacro -%}
 
-{%- macro databend__type_string() -%}
-    String
-{%- endmacro -%}
-
 {# float ------------------------------------------------- #}
 {% macro mysql__type_float() %}
     float
@@ -93,10 +89,6 @@
     float
 {% endmacro %}
 
-{% macro databend__type_float() %}
-    float
-{% endmacro %}
-
 {# int ------------------------------------------------- #}
 {% macro default__type_int() %}
     int
@@ -118,10 +110,6 @@
     signed
 {% endmacro %}
 
-{% macro databend__type_int() %}
-    INT
-{% endmacro %}
-
 {# bigint ------------------------------------------------- #}
 {% macro mysql__type_bigint() %}
     signed
@@ -139,10 +127,6 @@
     signed
 {% endmacro %}
 
-{% macro databend__type_bigint() %}
-    BIGINT
-{% endmacro %}
-
 {# numeric ------------------------------------------------- --#}
 {% macro mysql__type_numeric() %}
     float
diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py b/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py
index 28c940916b23..7cb25ea39ad9 100644
--- a/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py
+++ b/airbyte-integrations/bases/base-normalization/integration_tests/dbt_integration_test.py
@@ -387,8 +387,6 @@ def generate_profile_yaml_file(
             }
         elif destination_type.value == DestinationType.MYSQL.value:
             profiles_config["database"] = self.target_schema
-        elif destination_type.value == DestinationType.DATABEND.value:
-            profiles_config["database"] = self.target_schema
         elif destination_type.value == DestinationType.REDSHIFT.value:
             profiles_config["schema"] = self.target_schema
             if random_schema:
@@ -445,8 +443,6 @@ def get_normalization_image(destination_type: DestinationType) -> str:
             return "airbyte/normalization-redshift:dev"
         elif DestinationType.TIDB.value == destination_type.value:
             return "airbyte/normalization-tidb:dev"
-        elif DestinationType.DATABEND.value == destination_type.value:
-            return "airbyte/normalization-databend:dev"
         else:
             return "airbyte/normalization:dev"
diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py b/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py
index 9114e38cb137..f459f5faecd6 100644
--- a/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py
+++ b/airbyte-integrations/bases/base-normalization/integration_tests/test_ephemeral.py
@@ -106,8 +106,6 @@ def run_test(destination_type: DestinationType, column_count: int, expected_exce
     elif destination_type.value == DestinationType.REDSHIFT.value:
         # set unique schema for Redshift test
         dbt_test_utils.set_target_schema(dbt_test_utils.generate_random_string("test_ephemeral_"))
-    if destination_type.value == DestinationType.DATABEND.value:
-        pytest.skip("ephemeral materialization isn't supported in Databend yet")
     else:
         dbt_test_utils.set_target_schema("test_ephemeral")
     print(f"Testing ephemeral for destination {destination_type.value} with column count {column_count}")
diff --git a/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py b/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py
index 73262514df38..0163cd128151 100644
--- a/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py
+++ b/airbyte-integrations/bases/base-normalization/integration_tests/test_normalization.py
@@ -140,12 +140,7 @@ def run_schema_change_normalization(destination_type: DestinationType, test_reso
     if destination_type.value in [DestinationType.MYSQL.value, DestinationType.ORACLE.value]:
         # TODO: upgrade dbt-adapter repositories to work with dbt 0.21.0+ (outside airbyte's control)
         pytest.skip(f"{destination_type} does not support schema change in incremental yet (requires dbt 0.21.0+)")
-    if destination_type.value in [
-        DestinationType.SNOWFLAKE.value,
-        DestinationType.CLICKHOUSE.value,
-        DestinationType.TIDB.value,
-        DestinationType.DATABEND.value,
-    ]:
+    if destination_type.value in [DestinationType.SNOWFLAKE.value, DestinationType.CLICKHOUSE.value, DestinationType.TIDB.value]:
         pytest.skip(f"{destination_type} is disabled as it doesnt support schema change in incremental yet (column type changes)")
     if destination_type.value in [DestinationType.MSSQL.value, DestinationType.SNOWFLAKE.value]:
         # TODO: create/fix github issue in corresponding dbt-adapter repository to handle schema changes (outside airbyte's control)
@@ -218,9 +213,6 @@ def setup_test_dir(destination_type: DestinationType, test_resource_name: str) -
     elif destination_type.value == DestinationType.TIDB.value:
         copy_tree("../dbt-project-template-tidb", test_root_dir)
         dbt_project_yaml = "../dbt-project-template-tidb/dbt_project.yml"
-    elif destination_type.value == DestinationType.DATABEND.value:
-        copy_tree("../dbt-project-template-databend", test_root_dir)
-        dbt_project_yaml = "../dbt-project-template-databend/dbt_project.yml"
     dbt_test_utils.copy_replace(dbt_project_yaml, os.path.join(test_root_dir, "dbt_project.yml"))
     return test_root_dir
diff --git a/airbyte-integrations/bases/base-normalization/normalization/destination_type.py b/airbyte-integrations/bases/base-normalization/normalization/destination_type.py
index 7fc2e5db597e..3f1d154f52ce 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/destination_type.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/destination_type.py
@@ -16,7 +16,6 @@ class DestinationType(Enum):
     REDSHIFT = "redshift"
     SNOWFLAKE = "snowflake"
     TIDB = "tidb"
-    DATABEND = "databend"
 
     @classmethod
     def from_string(cls, string_value: str) -> "DestinationType":
diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
index abde0cda3933..f5d3d710f3ce 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
@@ -29,7 +29,6 @@
     DestinationType.CLICKHOUSE.value: 63,
     # https://docs.pingcap.com/tidb/stable/tidb-limitations
     DestinationType.TIDB.value: 64,
-    DestinationType.DATABEND.value: 255,
 }
 
 # DBT also needs to generate suffix to table names, so we need to make sure it has enough characters to do so...
@@ -173,10 +172,6 @@ def __normalize_identifier_name(
             result = result.replace("'", "_")
         elif self.destination_type.value != DestinationType.MYSQL.value and self.destination_type.value != DestinationType.TIDB.value:
             result = result.replace('"', '""')
-        elif self.destination_type.value == DestinationType.DATABEND.value:
-            result = result.replace('"', "_")
-            result = result.replace("`", "_")
-            result = result.replace("'", "_")
         else:
             result = result.replace("`", "_")
             result = result.replace("'", "\\'")
@@ -286,8 +281,6 @@ def normalize_column_identifier_case_for_lookup(self, input_name: str, is_quoted
             pass
         elif self.destination_type.value == DestinationType.TIDB.value:
             result = input_name.lower()
-        elif self.destination_type.value == DestinationType.DATABEND.value:
-            pass
         else:
             raise KeyError(f"Unknown destination type {self.destination_type}")
         return result
diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py
index 842dae0afc59..a762b39f1a45 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/transform_config/transform.py
@@ -59,7 +59,6 @@ def transform(self, integration_type: DestinationType, config: Dict[str, Any]):
             DestinationType.MSSQL.value: self.transform_mssql,
             DestinationType.CLICKHOUSE.value: self.transform_clickhouse,
             DestinationType.TIDB.value: self.transform_tidb,
-            DestinationType.DATABEND.value: self.transform_databend,
         }[integration_type.value](config)
 
         # merge pre-populated base_profile with destination-specific configuration.
@@ -346,19 +345,6 @@ def transform_tidb(config: Dict[str, Any]):
         }
         return dbt_config
 
-    @staticmethod
-    def transform_databend(config: Dict[str, Any]):
-        print("transform_databend")
-        dbt_config = {
-            "type": "databend",
-            "host": config["host"],
-            "port": config["port"],
-            "schema": config["database"],
-            "user": config["username"],
-            "pass": config.get("password", ""),
-        }
-        return dbt_config
-
     @staticmethod
     def read_json_config(input_path: str):
         with open(input_path, "r") as file:

From f1fee8478e10fe6d0a66ea1c24b556b4d33f6a80 Mon Sep 17 00:00:00 2001
From: grishick
Date: Thu, 5 Jan 2023 17:54:49 -0800
Subject: [PATCH 28/34] Update logo

---
 airbyte-config/init/src/main/resources/icons/databend.svg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/airbyte-config/init/src/main/resources/icons/databend.svg b/airbyte-config/init/src/main/resources/icons/databend.svg
index 2761a45c04b4..bf78f7655a12 100644
--- a/airbyte-config/init/src/main/resources/icons/databend.svg
+++ b/airbyte-config/init/src/main/resources/icons/databend.svg
@@ -1 +1 @@
-databend logo
\ No newline at end of file
+databend logo
\ No newline at end of file

From 104de0ac25d4ac0e7e3e999753d3b20397182bfe Mon Sep 17 00:00:00 2001
From: josephkmh
Date: Fri, 6 Jan 2023 13:09:37 +0100
Subject: [PATCH 29/34] make databend logo square

---
 airbyte-config/init/src/main/resources/icons/databend.svg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/airbyte-config/init/src/main/resources/icons/databend.svg b/airbyte-config/init/src/main/resources/icons/databend.svg
index bf78f7655a12..b6afca7ea9eb 100644
--- a/airbyte-config/init/src/main/resources/icons/databend.svg
+++ b/airbyte-config/init/src/main/resources/icons/databend.svg
@@ -1 +1 @@
-databend logo
\ No newline at end of file
+
\ No newline at end of file

From 61d727ff25522fdbac29b118361d4ce17bf0972e Mon Sep 17 00:00:00 2001
From: grishick
Date: Fri, 6 Jan 2023 10:21:36 -0800
Subject: [PATCH 30/34] removing normalization artifacts

---
 .../base-normalization/databend.Dockerfile | 32 -------------------
 .../destination_name_transformer.py | 2 --
 .../transform_catalog/reserved_keywords.py | 5 +---
 3 files changed, 1 insertion(+), 38 deletions(-)
 delete mode 100644 airbyte-integrations/bases/base-normalization/databend.Dockerfile

diff --git a/airbyte-integrations/bases/base-normalization/databend.Dockerfile b/airbyte-integrations/bases/base-normalization/databend.Dockerfile
deleted file mode 100644
index 24695edbab12..000000000000
--- a/airbyte-integrations/bases/base-normalization/databend.Dockerfile
+++ /dev/null
@@ -1,32 +0,0 @@
-FROM fishtownanalytics/dbt:0.21.0
-COPY --from=airbyte/base-airbyte-protocol-python:0.1.1 /airbyte /airbyte
-
-# Install SSH Tunneling dependencies
-RUN apt-get update && apt-get install -y jq sshpass
-WORKDIR /airbyte
-COPY entrypoint.sh .
-COPY build/sshtunneling.sh .
-
-WORKDIR /airbyte/normalization_code
-COPY normalization ./normalization
-COPY setup.py .
-COPY dbt-project-template/ ./dbt-template/
-
-# Install python dependencies
-WORKDIR /airbyte/base_python_structs
-RUN pip install .
-
-WORKDIR /airbyte/normalization_code
-RUN pip install .
-
-WORKDIR /airbyte/normalization_code/dbt-template/
-#RUN pip install dbt-databend-cloud
-RUN pip install dbt-databend-cloud==1.3.2
-# Download external dbt dependencies
-RUN dbt deps
-
-WORKDIR /airbyte
-ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh"
-ENTRYPOINT ["/airbyte/entrypoint.sh"]
-
-LABEL io.airbyte.name=airbyte/normalization-databend
diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
index f5d3d710f3ce..b65c5545e56e 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py
@@ -239,8 +239,6 @@ def __normalize_identifier_case(self, input_name: str, is_quoted: bool = False)
         elif self.destination_type.value == DestinationType.TIDB.value:
             if not is_quoted and not self.needs_quotes(input_name):
                 result = input_name.lower()
-        elif self.destination_type.value == DestinationType.DATABEND.value:
-            pass
         else:
             raise KeyError(f"Unknown destination type {self.destination_type}")
         return result
diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
index 2525a11da057..2e85ee29189b 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
@@ -3110,8 +3110,6 @@
     "ZEROFILL",
 }
 
-DATABEND: Set[str] = set()
-
 RESERVED_KEYWORDS = {
     DestinationType.BIGQUERY.value: BIGQUERY,
     DestinationType.POSTGRES.value: POSTGRES,
@@ -3121,8 +3119,7 @@
     DestinationType.ORACLE.value: ORACLE,
     DestinationType.MSSQL.value: MSSQL,
     DestinationType.CLICKHOUSE.value: CLICKHOUSE,
-    DestinationType.TIDB.value: TIDB,
-    DestinationType.DATABEND.value: DATABEND,
+    DestinationType.TIDB.value: TIDB
 }

From 131be25e26ba3b0297a69f4ba12440b7a50f3074 Mon Sep 17 00:00:00 2001
From: grishick
Date: Fri, 6 Jan 2023 10:38:41 -0800
Subject: [PATCH 31/34] format fix

---
 .../normalization/transform_catalog/reserved_keywords.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
index 2e85ee29189b..0931b4f29c29 100644
--- a/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
+++ b/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/reserved_keywords.py
@@ -2048,6 +2048,7 @@
     "WITH",
 }
 
+
 # https://docs.microsoft.com/en-us/sql/t-sql/language-elements/reserved-keywords-transact-sql?view=sql-server-ver15
 MSSQL = {
     "ADD",
@@ -3119,7 +3120,7 @@
     DestinationType.ORACLE.value: ORACLE,
     DestinationType.MSSQL.value: MSSQL,
     DestinationType.CLICKHOUSE.value: CLICKHOUSE,
-    DestinationType.TIDB.value: TIDB
+    DestinationType.TIDB.value: TIDB,
 }

From 4e920d7b75a050ec55e467c836b8fd43aab060e1 Mon Sep 17 00:00:00 2001
From: grishick
Date: Fri, 6 Jan 2023 16:03:45 -0800
Subject: [PATCH 32/34] Generate seed files for databend destination

---
 .../seed/destination_definitions.yaml | 7 ++
 .../resources/seed/destination_specs.yaml | 66 +++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
index 22a3651376b9..ddcb7ea44efc 100644
--- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
+++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
@@ -418,3 +418,10 @@
   documentationUrl: https://docs.airbyte.com/integrations/destinations/yugabytedb
   icon: yugabytedb.svg
   releaseStage: alpha
+- name: Databend
+  destinationDefinitionId: 302e4d8e-08d3-4098-acd4-ac67ca365b88
+  dockerRepository: airbyte/destination-databend
+  dockerImageTag: 0.1.0
+  icon: databend.svg
+  documentationUrl: https://docs.airbyte.com/integrations/destinations/databend
+  releaseStage: alpha
\ No newline at end of file
diff --git a/airbyte-config/init/src/main/resources/seed/destination_specs.yaml b/airbyte-config/init/src/main/resources/seed/destination_specs.yaml
index faf403561ba4..bc47b88d1148 100644
--- a/airbyte-config/init/src/main/resources/seed/destination_specs.yaml
+++ b/airbyte-config/init/src/main/resources/seed/destination_specs.yaml
@@ -7140,3 +7140,69 @@
     supported_destination_sync_modes:
     - "overwrite"
    - "append"
+- dockerImage: "airbyte/destination-databend:0.1.0"
+  spec:
+    documentationUrl: "https://docs.airbyte.com/integrations/destinations/databend"
+    connectionSpecification:
+      $schema: "http://json-schema.org/draft-07/schema#"
+      title: "Destination Databend"
+      type: "object"
+      required:
+      - "host"
+      - "username"
+      - "database"
+      additionalProperties: true
+      properties:
+        host:
+          title: "Host"
+          description: "Hostname of the database."
+          type: "string"
+          order: 0
+        protocol:
+          title: "Protocol"
+          description: "Protocol of the host."
+          type: "string"
+          examples:
+          - "https"
+          default: "https"
+          order: 1
+        port:
+          title: "Port"
+          description: "Port of the database."
+          type: "integer"
+          minimum: 0
+          maximum: 65536
+          default: 443
+          examples:
+          - "443"
+          order: 2
+        database:
+          title: "DB Name"
+          description: "Name of the database."
+          type: "string"
+          order: 3
+        table:
+          title: "Default Table"
+          description: "The default table was written to."
+          type: "string"
+          examples:
+          - "default"
+          default: "default"
+          order: 4
+        username:
+          title: "User"
+          description: "Username to use to access the database."
+          type: "string"
+          order: 5
+        password:
+          title: "Password"
+          description: "Password associated with the username."
+          type: "string"
+          airbyte_secret: true
+          order: 6
+    supportsIncremental: true
+    supportsNormalization: false
+    supportsDBT: false
+    supported_destination_sync_modes:
+    - "overwrite"
+    - "append"

From 421692a8b7df43aa4bf6c229bd26db03e6a1c5d1 Mon Sep 17 00:00:00 2001
From: grishick
Date: Fri, 6 Jan 2023 16:13:15 -0800
Subject: [PATCH 33/34] Trying to fix git diff quirkiness

---
 .../init/src/main/resources/seed/destination_definitions.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
index ddcb7ea44efc..5226faa4c6b0 100644
--- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
+++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
@@ -424,4 +424,5 @@
   dockerImageTag: 0.1.0
   icon: databend.svg
   documentationUrl: https://docs.airbyte.com/integrations/destinations/databend
-  releaseStage: alpha
\ No newline at end of file
+  releaseStage: alpha
+

From a630700e94e6ad8711f0513b6d3bdcd6a3c77190 Mon Sep 17 00:00:00 2001
From: grishick
Date: Fri, 6 Jan 2023 16:14:19 -0800
Subject: [PATCH 34/34] run format

---
 .../init/src/main/resources/seed/destination_definitions.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
index 5226faa4c6b0..60632d14bbaf 100644
--- a/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
+++ b/airbyte-config/init/src/main/resources/seed/destination_definitions.yaml
@@ -425,4 +425,3 @@
   icon: databend.svg
   documentationUrl: https://docs.airbyte.com/integrations/destinations/databend
   releaseStage: alpha
-
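For reviewers who want to sanity-check the new seed entry by hand, a minimal sketch of a connection config shaped like the spec above is shown below. This is illustrative only and not part of the patch series: the field names, defaults, and required keys come from the `connectionSpecification` added to `destination_specs.yaml`, while the host and credential values are made-up placeholders.

```python
# Minimal sketch (not part of this patch): a config shaped like the Databend
# destination spec above. Host and credential values are placeholders.
config = {
    "host": "databend.example.com",  # hypothetical hostname
    "protocol": "https",             # spec default
    "port": 443,                     # spec default
    "database": "default",
    "table": "default",              # spec default target table
    "username": "airbyte",           # placeholder
    "password": "secret",            # placeholder; marked airbyte_secret in the spec
}

# The spec lists host, username, and database as the required fields.
required = ("host", "username", "database")
missing = [key for key in required if key not in config]
assert not missing, f"missing required config keys: {missing}"
```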