diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 8472cdbf8431..90a6096aba8d 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -310,7 +310,7 @@ - name: File sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.23 + dockerImageTag: 0.2.24 documentationUrl: https://docs.airbyte.io/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index a85c9952d724..6897c88acedd 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -3092,7 +3092,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.23" +- dockerImage: "airbyte/source-file:0.2.24" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/file" connectionSpecification: diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index c3546538f215..5af1b98180a3 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.23 +LABEL io.airbyte.version=0.2.24 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/README.md b/airbyte-integrations/connectors/source-file/README.md index 134f687606cb..290229c680c2 100644 --- 
a/airbyte-integrations/connectors/source-file/README.md +++ b/airbyte-integrations/connectors/source-file/README.md @@ -91,10 +91,10 @@ and place them into `secrets/config.json`. ### Locally running the connector ``` -python main_dev.py spec -python main_dev.py check --config secrets/config.json -python main_dev.py discover --config secrets/config.json -python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json +python main.py spec +python main.py check --config secrets/config.json +python main.py discover --config secrets/config.json +python main.py read --config secrets/config.json --catalog sample_files/configured_catalog.json ``` ### Unit Tests diff --git a/airbyte-integrations/connectors/source-file/source_file/source.py b/airbyte-integrations/connectors/source-file/source_file/source.py index a178f5524cdf..ae0e58497f0f 100644 --- a/airbyte-integrations/connectors/source-file/source_file/source.py +++ b/airbyte-integrations/connectors/source-file/source_file/source.py @@ -19,6 +19,7 @@ Type, ) from airbyte_cdk.sources import Source +from pandas.errors import ParserError from .client import Client @@ -84,8 +85,21 @@ def check(self, logger, config: Mapping) -> AirbyteConnectionStatus: client = self._get_client(config) logger.info(f"Checking access to {client.reader.full_url}...") try: - with client.reader.open(): + with client.reader.open() as f: + if config.get("provider").get("storage") == "HTTPS": + # related to https://github.com/airbytehq/alpha-beta-issues/issues/224 + # some providers like Dropbox create Shared Public URLs with the ?dl=0 query param, + # which requires user interaction before the file can be accessed, + # so we validate this at the Check Connection stage to avoid sync issues later.
+ client.CSV_CHUNK_SIZE = 2 + next(client.load_dataframes(f)) + return AirbyteConnectionStatus(status=Status.SUCCEEDED) + # for all other formats and storage providers return AirbyteConnectionStatus(status=Status.SUCCEEDED) + except ParserError: + reason = f"Failed to load {client.reader.full_url}, check the URL is valid and allows to download file directly" + logger.error(reason) + return AirbyteConnectionStatus(status=Status.FAILED, message=reason) except Exception as err: reason = f"Failed to load {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}" logger.error(reason) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py index 6846e5f6fa4a..20a1b25560de 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py @@ -34,6 +34,19 @@ def invalid_config(read_file): } +@pytest.fixture +def non_direct_url_provided_config(): + return { + "dataset_name": "test", + "format": "csv", + "url": "https://www.dropbox.com/s/tcxj6fzwuwyfusq/CSV_Test.csv?dl=0", + "provider": { + "storage": "HTTPS", + "user_agent": False, + }, + } + + @pytest.fixture def client(): return Client( diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py index 5c29acddc287..41f3b0027ac3 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py @@ -128,6 +128,12 @@ def test_check_invalid_config(source, invalid_config): assert actual.status == expected.status +def test_check_non_direct_url_provided_config(source, non_direct_url_provided_config): + expected = AirbyteConnectionStatus(status=Status.FAILED) + actual = source.check(logger=logger, config=non_direct_url_provided_config) + assert actual.status == 
expected.status + + def test_discover(source, config, client): catalog = source.discover(logger=logger, config=config) catalog = AirbyteMessage(type=Type.CATALOG, catalog=catalog).dict(exclude_unset=True) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 785b239d8bec..147daec75a63 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -129,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | | ------- | ---------- | -------------------------------------------------------- | -------------------------------------------------------- | +| 0.2.24 | 2022-10-03 | [17504](https://github.com/airbytehq/airbyte/pull/17504) | Validate data for `HTTPS` while `check_connection` | | 0.2.23 | 2022-09-28 | [17304](https://github.com/airbytehq/airbyte/pull/17304) | Migrate to per-stream state. | | 0.2.22 | 2022-09-15 | [16772](https://github.com/airbytehq/airbyte/pull/16772) | Fix schema generation for JSON files containing arrays | | 0.2.21 | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files |