diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 0fd90b3c4e73..56678863edf2 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -271,7 +271,7 @@ - name: File sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.12 + dockerImageTag: 0.2.13 documentationUrl: https://docs.airbyte.io/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 10772f5ba612..f8eb8b5ab5ff 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2261,7 +2261,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.12" +- dockerImage: "airbyte/source-file:0.2.13" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/file" connectionSpecification: @@ -2320,6 +2320,11 @@ storage: type: "string" const: "HTTPS" + user_agent: + type: "boolean" + title: "User-Agent" + default: false + description: "Add User-Agent to request" - title: "GCS: Google Cloud Storage" required: - "storage" diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index 8c0759cb6bba..8737dd173e34 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.12 +LABEL io.airbyte.version=0.2.13 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py b/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py index 9ae3febf47fc..69086321f238 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py +++ b/airbyte-integrations/connectors/source-file/integration_tests/client_storage_providers_test.py @@ -91,7 +91,7 @@ def test__read_from_public_provider(download_gcs_public_data, storage_provider, "format": "csv", "dataset_name": "output", "reader_options": json.dumps({"sep": separator, "nrows": 42}), - "provider": {"storage": storage_provider}, + "provider": {"storage": storage_provider, "user_agent": False}, "url": url, } diff --git a/airbyte-integrations/connectors/source-file/integration_tests/config.json b/airbyte-integrations/connectors/source-file/integration_tests/config.json index 95b85a01504c..6edc01a35017 100644 --- a/airbyte-integrations/connectors/source-file/integration_tests/config.json +++ b/airbyte-integrations/connectors/source-file/integration_tests/config.json @@ -5,6 +5,7 @@ "url": "https://storage.googleapis.com/covid19-open-data/v2/latest/epidemiology.csv", "provider": { "storage": "HTTPS", - "reader_impl": "gcsfs" + "reader_impl": "gcsfs", + "user_agent": false } } diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index 1ed4bc764329..c96e73261854 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -5,6 +5,7 @@ import json import traceback +from os import environ from typing import Iterable from urllib.parse import urlparse @@ -114,6 +115,17 @@ def _open(self, binary): else: uri = f"{storage}{user}@{host}:{port}/{url}" return smart_open.open(uri, transport_params=transport_params, mode=mode) + elif storage in ("https://", "http://"): + transport_params = None + if self._provider["user_agent"]: + airbyte_version = environ.get("AIRBYTE_VERSION", "0.0") + transport_params = {"headers": {"Accept-Encoding": "identity", "User-Agent": f"Airbyte/{airbyte_version}"}} + logger.info(f"TransportParams: {transport_params}") + return smart_open.open( + self.full_url, + mode=mode, + transport_params=transport_params, + ) return smart_open.open(self.full_url, mode=mode) @property diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json index 777001fe8035..1081ad2f93e9 100644 --- a/airbyte-integrations/connectors/source-file/source_file/spec.json +++ b/airbyte-integrations/connectors/source-file/source_file/spec.json @@ -44,6 +44,12 @@ "storage": { "type": "string", "const": "HTTPS" + }, + "user_agent": { + "type": "boolean", + "title": "User-Agent", + "default": false, + "description": "Add User-Agent to request" } } },