From 8bc0ef9e02f93aec390770ad2ac5ffd5c255c8c8 Mon Sep 17 00:00:00 2001 From: Sergey Chvalyuk Date: Wed, 10 Aug 2022 09:34:38 +0300 Subject: [PATCH 1/6] _cache_stream added Signed-off-by: Sergey Chvalyuk --- .../connectors/source-file-secure/Dockerfile | 4 ++-- .../connectors/source-file/Dockerfile | 2 +- .../connectors/source-file/source_file/client.py | 12 +++++++++++- docs/integrations/sources/file.md | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile index 256d21bdf9b8..dcb20379a8b8 100644 --- a/airbyte-integrations/connectors/source-file-secure/Dockerfile +++ b/airbyte-integrations/connectors/source-file-secure/Dockerfile @@ -1,4 +1,4 @@ -FROM airbyte/source-file:0.2.15 +FROM airbyte/source-file:0.2.16 WORKDIR /airbyte/integration_code COPY source_file_secure ./source_file_secure @@ -9,5 +9,5 @@ RUN pip install . ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.15 +LABEL io.airbyte.version=0.2.16 LABEL io.airbyte.name=airbyte/source-file-secure diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index 1a0769dd0cc6..9cbf49648e62 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.15 +LABEL io.airbyte.version=0.2.16 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index cda6b8c40db0..f6d04cb1887d 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -4,6 +4,7 @@ import json +import tempfile import traceback from os import environ from typing import Iterable @@ -356,10 +357,19 @@ def read(self, fields: Iterable = None) -> Iterable[dict]: yield from df[columns].to_dict(orient="records") else: fields = set(fields) if fields else None + fp = self._cache_stream(fp) for df in self.load_dataframes(fp): columns = fields.intersection(set(df.columns)) if fields else df.columns df = df.where(pd.notnull(df), None) - yield from df[columns].to_dict(orient="records") + yield from df[list(columns)].to_dict(orient="records") + + def _cache_stream(self, fp): + """cache stream to file""" + fp_tmp = tempfile.TemporaryFile(mode="w+b") + fp_tmp.write(fp.read()) + fp_tmp.seek(0) + fp.close() + return fp_tmp def _stream_properties(self, fp): if self._reader_format == "yaml": diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 34511122f630..7bc53f6a21e0 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -127,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------| ------------------------------------------------- | +| 0.2.16 | 2022-08-10 | [](https://github.com/airbytehq/airbyte/pull/) | Cache stream to file | | 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 | | 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files | | 0.2.11 | 2022-07-12 | [9974](https://github.com/airbytehq/airbyte/pull/14588) | Add support to YAML format | From 27c9d6a374f22beb0207596bbc39a058b88eccb8 Mon Sep 17 00:00:00 2001 From: Sergey Chvalyuk Date: Wed, 10 Aug 2022 10:44:37 +0300 Subject: [PATCH 2/6] cache only for binary Signed-off-by: Sergey Chvalyuk --- .../connectors/source-file/source_file/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index f6d04cb1887d..e2241ce261eb 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -357,7 +357,8 @@ def read(self, fields: Iterable = None) -> Iterable[dict]: yield from df[columns].to_dict(orient="records") else: fields = set(fields) if fields else None - fp = self._cache_stream(fp) + if self.binary_source: + fp = self._cache_stream(fp) for df in self.load_dataframes(fp): columns = fields.intersection(set(df.columns)) if fields else df.columns df = df.where(pd.notnull(df), None) From 52d90bc252c92f23c233fa93a248958a5276b005 Mon Sep 17 00:00:00 2001 From: Sergey Chvalyuk Date: Wed, 10 Aug 2022 10:58:54 +0300 Subject: [PATCH 3/6] revert back pytest 7.1 -> 6.2, pytest-mock 3.8 -> 3.6 to be compatible with SAT tests move boto3==1.21.21 to main reqs Signed-off-by: Sergey Chvalyuk --- airbyte-integrations/connectors/source-file/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-file/setup.py b/airbyte-integrations/connectors/source-file/setup.py index 352340b965f0..8ec1aba3f4b7 100644 --- a/airbyte-integrations/connectors/source-file/setup.py +++ b/airbyte-integrations/connectors/source-file/setup.py @@ -13,6 +13,7 @@ "pandas==1.4.3", "paramiko==2.11.0", "s3fs==2022.7.1", + "boto3==1.21.21", "smart-open[all]==6.0.0", "lxml==4.9.1", "html5lib==1.1", @@ -23,7 +24,7 @@ "pyxlsb==1.0.9", ] -TEST_REQUIREMENTS = ["boto3==1.21.21", "pytest==7.1.2", "pytest-docker==1.0.0", "pytest-mock~=3.8.2"] +TEST_REQUIREMENTS = ["pytest~=6.2", "pytest-docker==1.0.0", "pytest-mock~=3.6.1"] setup( name="source_file", From bc1ab54ae0e8f54df7612582b933cde313344372 Mon Sep 17 00:00:00 2001 From: Sergey Chvalyuk Date: Wed, 10 Aug 2022 11:01:20 +0300 Subject: [PATCH 4/6] file.md - fixed Signed-off-by: Sergey Chvalyuk --- docs/integrations/sources/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 7bc53f6a21e0..19adc12caf97 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -127,7 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------| ------------------------------------------------- | -| 0.2.16 | 2022-08-10 | [](https://github.com/airbytehq/airbyte/pull/) | Cache stream to file | +| 0.2.16 | 2022-08-10 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache stream to file | | 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 | | 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files | | 0.2.11 | 2022-07-12 | [9974](https://github.com/airbytehq/airbyte/pull/14588) | Add support to YAML format | From b4bb135a69e632799522f37bea795ae41567821e Mon Sep 17 00:00:00 2001 From: Sergey Chvalyuk Date: Wed, 10 Aug 2022 11:49:09 +0300 Subject: [PATCH 5/6] file.md updated Signed-off-by: Sergey Chvalyuk --- docs/integrations/sources/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 19adc12caf97..aa909b3bbf71 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -127,7 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------| ------------------------------------------------- | -| 0.2.16 | 2022-08-10 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache stream to file | +| 0.2.16 | 2022-08-10 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache binary stream to file | | 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 | | 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files | | 0.2.11 | 2022-07-12 | [9974](https://github.com/airbytehq/airbyte/pull/14588) | Add support to YAML format | From f2a1fe3f4886b69e5b2c4b7566fa2b1773cf607f Mon Sep 17 00:00:00 2001 From: Octavia Squidington III Date: Thu, 11 Aug 2022 08:11:03 +0000 Subject: [PATCH 6/6] auto-bump connector version [ci skip] --- .../init/src/main/resources/seed/source_definitions.yaml | 2 +- airbyte-config/init/src/main/resources/seed/source_specs.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 785db71a4241..ee2395a9c285 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -271,7 +271,7 @@ - name: File sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.16 + dockerImageTag: 0.2.17 documentationUrl: https://docs.airbyte.io/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 079dad4ead84..4c45cc95ad2c 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2255,7 +2255,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.16" +- dockerImage: "airbyte/source-file:0.2.17" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/file" connectionSpecification: