From 9d66402e8319b31e1bab4c61d5c5b8ba13e58243 Mon Sep 17 00:00:00 2001 From: Yevhenii <34103125+yevhenii-ldv@users.noreply.github.com> Date: Fri, 29 Oct 2021 13:15:13 +0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20Source=20File:=20Migrate=20File?= =?UTF-8?q?=20source=20to=20CDK=20structure=20(#7387)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate File source to CDK structure * fix .dockerignore file * remove SAT requirements * update Dockerfile * change Dockerfile to base images python:3.7-slim * add SAT tests * update tests * add secret/config.json for source-file * update changelogs --- .../connectors/source-file/.dockerignore | 9 ++++-- .../connectors/source-file/Dockerfile | 15 ++++----- .../source-file/acceptance-test-config.yml | 19 +++++++++++ .../source-file/acceptance-test-docker.sh | 15 +++++++++ .../connectors/source-file/build.gradle | 24 +------------- .../source-file/integration_tests/__init__.py | 3 ++ .../integration_tests/acceptance.py | 16 ++++++++++ .../client_storage_providers_test.py | 8 ++--- .../integration_tests/configured_catalog.json | 32 ++++++++++++------- .../integration_tests/file_formats_test.py | 2 +- .../integration_tests/invalid_config.json | 10 ++++++ .../source-file/{main_dev.py => main.py} | 2 +- .../connectors/source-file/requirements.txt | 3 +- .../connectors/source-file/setup.py | 9 ++---- .../source-file/source_file/client.py | 11 +++---- .../source-file/source_file/source.py | 5 +-- .../source-file/source_file/spec.json | 24 +++++++++----- docs/integrations/sources/file.md | 1 + tools/bin/ci_credentials.sh | 1 + 19 files changed, 134 insertions(+), 75 deletions(-) create mode 100644 airbyte-integrations/connectors/source-file/acceptance-test-config.yml create mode 100644 airbyte-integrations/connectors/source-file/acceptance-test-docker.sh create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/__init__.py create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/acceptance.py create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/invalid_config.json rename airbyte-integrations/connectors/source-file/{main_dev.py => main.py} (82%) diff --git a/airbyte-integrations/connectors/source-file/.dockerignore b/airbyte-integrations/connectors/source-file/.dockerignore index 9ef96044faba..959635b7ae3d 100644 --- a/airbyte-integrations/connectors/source-file/.dockerignore +++ b/airbyte-integrations/connectors/source-file/.dockerignore @@ -1,2 +1,7 @@ -build - +* +!Dockerfile +!main.py +!source_file +!setup.py +!integration_tests +!secrets diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index f63f23c865a1..ac258f3f902b 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -1,17 +1,16 @@ -FROM airbyte/integration-base-python:0.1.1 +FROM python:3.7-slim +# Bash is installed for more convenient debugging. RUN apt-get update && apt-get install -y jq curl bash && rm -rf /var/lib/apt/lists/* -ENV CODE_PATH="source_file" -ENV AIRBYTE_IMPL_MODULE="source_file" -ENV AIRBYTE_IMPL_PATH="SourceFile" - WORKDIR /airbyte/integration_code -COPY $CODE_PATH ./$CODE_PATH +COPY source_file ./source_file +COPY main.py ./ COPY setup.py ./ RUN pip install . -ENV AIRBYTE_ENTRYPOINT "/airbyte/base.sh" +ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" +ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.6 +LABEL io.airbyte.version=0.2.7 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/acceptance-test-config.yml b/airbyte-integrations/connectors/source-file/acceptance-test-config.yml new file mode 100644 index 000000000000..e769a740cc3d --- /dev/null +++ b/airbyte-integrations/connectors/source-file/acceptance-test-config.yml @@ -0,0 +1,19 @@ +# See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) +# for more information about how to configure these tests +connector_image: airbyte/source-file:dev +tests: + spec: + - spec_path: "source_file/spec.json" + connection: + - config_path: "integration_tests/config.json" + status: "succeed" + - config_path: "integration_tests/invalid_config.json" + status: "failed" + discovery: + - config_path: "integration_tests/config.json" + basic_read: + - config_path: "integration_tests/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + full_refresh: + - config_path: "integration_tests/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" diff --git a/airbyte-integrations/connectors/source-file/acceptance-test-docker.sh b/airbyte-integrations/connectors/source-file/acceptance-test-docker.sh new file mode 100644 index 000000000000..c522eebbd94e --- /dev/null +++ b/airbyte-integrations/connectors/source-file/acceptance-test-docker.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env sh + +# Build latest connector image +docker build . -t $(cat acceptance-test-config.yml | grep "connector_image" | head -n 1 | cut -d: -f2) + +# Pull latest acctest image +docker pull airbyte/source-acceptance-test:latest + +# Run +docker run --rm -it \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp:/tmp \ + -v $(pwd):/test_input \ + airbyte/source-acceptance-test \ + --acceptance-test-config /test_input diff --git a/airbyte-integrations/connectors/source-file/build.gradle b/airbyte-integrations/connectors/source-file/build.gradle index ddcf4ba0e9e6..a3917d1f07cd 100644 --- a/airbyte-integrations/connectors/source-file/build.gradle +++ b/airbyte-integrations/connectors/source-file/build.gradle @@ -1,31 +1,9 @@ -import ru.vyarus.gradle.plugin.python.task.PythonTask - plugins { id 'airbyte-python' id 'airbyte-docker' - id 'airbyte-standard-source-test-file' + id 'airbyte-source-acceptance-test' } airbytePython { moduleDirectory 'source_file' } - - -airbyteStandardSourceTestFile { - specPath = "source_file/spec.json" - configPath = "integration_tests/config.json" - configuredCatalogPath = "integration_tests/configured_catalog.json" -} - -task("customIntegrationTestPython", type: PythonTask, dependsOn: installTestReqs){ - module = "pytest" - command = "-s integration_tests" -} - -integrationTest.dependsOn("customIntegrationTestPython") - - -dependencies { - implementation files(project(':airbyte-integrations:bases:base-standard-source-test-file').airbyteDocker.outputs) - implementation files(project(':airbyte-integrations:bases:base-python').airbyteDocker.outputs) -} diff --git a/airbyte-integrations/connectors/source-file/integration_tests/__init__.py b/airbyte-integrations/connectors/source-file/integration_tests/__init__.py new file mode 100644 index 000000000000..46b7376756ec --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-integrations/connectors/source-file/integration_tests/acceptance.py b/airbyte-integrations/connectors/source-file/integration_tests/acceptance.py new file mode 100644 index 000000000000..4b6c44dfc613 --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/acceptance.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# + + +import pytest + +pytest_plugins = ("source_acceptance_test.plugin",) + + +@pytest.fixture(scope="session", autouse=True) +def connector_setup(): + """This fixture is a placeholder for external resources that acceptance test might require.""" + # TODO: setup test dependencies + yield + # TODO: clean up test dependencies diff --git a/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py b/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py index 6ee66a99d68b..604420e9c252 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py +++ b/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py @@ -62,10 +62,10 @@ def test__streams_from_ssh_providers(provider_config, provider_name, file_path, streams = list(client.streams) assert len(streams) == 1 assert streams[0].json_schema["properties"] == { - "header1": {"type": "string"}, - "header2": {"type": "number"}, - "header3": {"type": "number"}, - "header4": {"type": "boolean"}, + "header1": {"type": ["string", "null"]}, + "header2": {"type": ["number", "null"]}, + "header3": {"type": ["number", "null"]}, + "header4": {"type": ["boolean", "null"]}, } diff --git a/airbyte-integrations/connectors/source-file/integration_tests/configured_catalog.json b/airbyte-integrations/connectors/source-file/integration_tests/configured_catalog.json index 183b1a0b9d53..904ec0c20a9e 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/configured_catalog.json +++ b/airbyte-integrations/connectors/source-file/integration_tests/configured_catalog.json @@ -2,34 +2,44 @@ "streams": [ { "stream": { - "name": "my_own_data_sample/my_file.csv", + "name": "integrationTestFile", "json_schema": { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "properties": { "date": { - "type": "string" + "type": ["string", "null"] }, "key": { - "type": "string" + "type": ["string", "null"] }, - "total_confirmed": { - "type": "number" + "new_confirmed": { + "type": ["number", "null"] + }, + "new_deceased": { + "type": ["number", "null"] + }, + "new_recovered": { + "type": ["number", "null"] }, - "total_healed": { - "type": "number" + "new_tested": { + "type": ["number", "null"] + }, + "total_confirmed": { + "type": ["number", "null"] }, "total_deceased": { - "type": "number" + "type": ["number", "null"] }, "total_recovered": { - "type": "number" + "type": ["number", "null"] }, "total_tested": { - "type": "number" + "type": ["number", "null"] } } - } + }, + "supported_sync_modes": ["full_refresh"] }, "sync_mode": "full_refresh", "destination_sync_mode": "overwrite" diff --git a/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py index 3bd5c9a10966..75da00168ff0 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py +++ b/airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py @@ -6,7 +6,7 @@ from pathlib import Path import pytest -from base_python import AirbyteLogger +from airbyte_cdk import AirbyteLogger from source_file import SourceFile from source_file.client import Client diff --git a/airbyte-integrations/connectors/source-file/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-file/integration_tests/invalid_config.json new file mode 100644 index 000000000000..3674a2e844c0 --- /dev/null +++ b/airbyte-integrations/connectors/source-file/integration_tests/invalid_config.json @@ -0,0 +1,10 @@ +{ + "dataset_name": "fake_csv", + "format": "csv", + "reader_options": "{\"sep\": \",\", \"nrows\": 20}", + "url": "https://test.fakr.com/cfake_data.csv", + "provider": { + "storage": "HTTPS", + "reader_impl": "fake" + } +} diff --git a/airbyte-integrations/connectors/source-file/main_dev.py b/airbyte-integrations/connectors/source-file/main.py similarity index 82% rename from airbyte-integrations/connectors/source-file/main_dev.py rename to airbyte-integrations/connectors/source-file/main.py index 6e4c21c74488..59723e5e4f1c 100644 --- a/airbyte-integrations/connectors/source-file/main_dev.py +++ b/airbyte-integrations/connectors/source-file/main.py @@ -5,7 +5,7 @@ import sys -from base_python.entrypoint import launch +from airbyte_cdk.entrypoint import launch from source_file import SourceFile if __name__ == "__main__": diff --git a/airbyte-integrations/connectors/source-file/requirements.txt b/airbyte-integrations/connectors/source-file/requirements.txt index dd447512e620..7be17a56d745 100644 --- a/airbyte-integrations/connectors/source-file/requirements.txt +++ b/airbyte-integrations/connectors/source-file/requirements.txt @@ -1,4 +1,3 @@ # This file is autogenerated -- only edit if you know what you are doing. Use setup.py for declaring dependencies. --e ../../bases/airbyte-protocol --e ../../bases/base-python +-e ../../bases/source-acceptance-test -e . diff --git a/airbyte-integrations/connectors/source-file/setup.py b/airbyte-integrations/connectors/source-file/setup.py index 15636060002a..0575ae93fd17 100644 --- a/airbyte-integrations/connectors/source-file/setup.py +++ b/airbyte-integrations/connectors/source-file/setup.py @@ -6,8 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-protocol", - "base-python", + "airbyte-cdk~=0.1", "gcsfs==0.7.1", "genson==1.2.2", "google-cloud-storage==1.35.0", @@ -24,11 +23,7 @@ "pyxlsb==1.0.8", ] -TEST_REQUIREMENTS = [ - "boto3==1.16.57", - "pytest==6.1.2", - "pytest-docker==0.10.1", -] +TEST_REQUIREMENTS = ["boto3==1.16.57", "pytest==6.1.2", "pytest-docker==0.10.1"] setup( name="source_file", diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index aeb08bb3aa82..0ece40b60286 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -9,12 +9,11 @@ from urllib.parse import urlparse import google -import numpy as np import pandas as pd import smart_open -from airbyte_protocol import AirbyteStream +from airbyte_cdk.entrypoint import logger +from airbyte_cdk.models import AirbyteStream, SyncMode from azure.storage.blob import BlobServiceClient -from base_python.entrypoint import logger from botocore import UNSIGNED from botocore.config import Config from genson import SchemaBuilder @@ -339,7 +338,7 @@ def read(self, fields: Iterable = None) -> Iterable[dict]: fields = set(fields) if fields else None for df in self.load_dataframes(fp): columns = fields.intersection(set(df.columns)) if fields else df.columns - df = df.replace(np.nan, "NaN", regex=True) + df = df.where(pd.notnull(df), None) yield from df[columns].to_dict(orient="records") def _stream_properties(self): @@ -352,7 +351,7 @@ def _stream_properties(self): for df in df_list: for col in df.columns: fields[col] = self.dtype_to_json_type(df[col].dtype) - return {field: {"type": fields[field]} for field in fields} + return {field: {"type": [fields[field], "null"]} for field in fields} @property def streams(self) -> Iterable: @@ -363,4 +362,4 @@ def streams(self) -> Iterable: "type": "object", "properties": self._stream_properties(), } - yield AirbyteStream(name=self.stream_name, json_schema=json_schema) + yield AirbyteStream(name=self.stream_name, json_schema=json_schema, supported_sync_modes=[SyncMode.full_refresh]) diff --git a/airbyte-integrations/connectors/source-file/source_file/source.py b/airbyte-integrations/connectors/source-file/source_file/source.py index 6df6b531a4e3..6a8405925c08 100644 --- a/airbyte-integrations/connectors/source-file/source_file/source.py +++ b/airbyte-integrations/connectors/source-file/source_file/source.py @@ -7,7 +7,8 @@ from datetime import datetime from typing import Generator, Iterable, Mapping -from airbyte_protocol import ( +from airbyte_cdk import AirbyteLogger +from airbyte_cdk.models import ( AirbyteCatalog, AirbyteConnectionStatus, AirbyteMessage, @@ -16,7 +17,7 @@ Status, Type, ) -from base_python import AirbyteLogger, Source +from airbyte_cdk.sources import Source from .client import Client diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json index 1ebc2d915952..4ddd79af9d7c 100644 --- a/airbyte-integrations/connectors/source-file/source_file/spec.json +++ b/airbyte-integrations/connectors/source-file/source_file/spec.json @@ -39,7 +39,8 @@ "storage": { "type": "string", "enum": ["HTTPS"], - "default": "HTTPS" + "default": "HTTPS", + "const": "HTTPS" } } }, @@ -50,7 +51,8 @@ "storage": { "type": "string", "enum": ["GCS"], - "default": "GCS" + "default": "GCS", + "const": "GCS" }, "service_account_json": { "type": "string", @@ -65,7 +67,8 @@ "storage": { "type": "string", "enum": ["S3"], - "default": "S3" + "default": "S3", + "const": "S3" }, "aws_access_key_id": { "type": "string", @@ -85,7 +88,8 @@ "storage": { "type": "string", "enum": ["AzBlob"], - "default": "AzBlob" + "default": "AzBlob", + "const": "AzBlob" }, "storage_account": { "type": "string", @@ -110,7 +114,8 @@ "storage": { "type": "string", "enum": ["SSH"], - "default": "SSH" + "default": "SSH", + "const": "SSH" }, "user": { "type": "string" @@ -135,7 +140,8 @@ "storage": { "type": "string", "enum": ["SCP"], - "default": "SCP" + "default": "SCP", + "const": "SCP" }, "user": { "type": "string" @@ -160,7 +166,8 @@ "storage": { "type": "string", "enum": ["SFTP"], - "default": "SFTP" + "default": "SFTP", + "const": "SFTP" }, "user": { "type": "string" @@ -186,7 +193,8 @@ "type": "string", "description": "WARNING: Note that local storage URL available for read must start with the local mount \"/local/\" at the moment until we implement more advanced docker mounting options...", "enum": ["local"], - "default": "local" + "default": "local", + "const": "local" } } } diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 835cdc86c612..3170dcffdbe4 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -124,6 +124,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | | :--- | :--- | :--- | :--- | +| 0.2.7 | 2021-10-28 | [7387](https://github.com/airbytehq/airbyte/pull/7387) | Migrate source to CDK structure, add SAT testing. | | 0.2.6 | 2021-08-26 | [5613](https://github.com/airbytehq/airbyte/pull/5613) | Add support to xlsb format | | 0.2.5 | 2021-07-26 | [4953](https://github.com/airbytehq/airbyte/pull/4953) | Allow non-default port for SFTP type | | 0.2.4 | 2021-06-09 | [3973](https://github.com/airbytehq/airbyte/pull/3973) | Add AIRBYTE\_ENTRYPOINT for Kubernetes support | diff --git a/tools/bin/ci_credentials.sh b/tools/bin/ci_credentials.sh index 69147692d89f..9a5cc15b0414 100755 --- a/tools/bin/ci_credentials.sh +++ b/tools/bin/ci_credentials.sh @@ -70,6 +70,7 @@ write_standard_creds source-exchange-rates "$EXCHANGE_RATES_TEST_CREDS" write_standard_creds source-file "$GOOGLE_CLOUD_STORAGE_TEST_CREDS" "gcs.json" write_standard_creds source-file "$AWS_S3_INTEGRATION_TEST_CREDS" "aws.json" write_standard_creds source-file "$AZURE_STORAGE_INTEGRATION_TEST_CREDS" "azblob.json" +write_standard_creds source-file "$FILE_SECURE_HTTPS_TEST_CREDS" write_standard_creds source-file-secure "$FILE_SECURE_HTTPS_TEST_CREDS" write_standard_creds source-freshdesk "$FRESHDESK_TEST_CREDS" write_standard_creds source-freshservice "$SOURCE_FRESHSERVICE_TEST_CREDS"