diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 785db71a4241..ee2395a9c285 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -271,7 +271,7 @@ - name: File sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 dockerRepository: airbyte/source-file - dockerImageTag: 0.2.16 + dockerImageTag: 0.2.17 documentationUrl: https://docs.airbyte.io/integrations/sources/file icon: file.svg sourceType: file diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 079dad4ead84..4c45cc95ad2c 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -2255,7 +2255,7 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] -- dockerImage: "airbyte/source-file:0.2.16" +- dockerImage: "airbyte/source-file:0.2.17" spec: documentationUrl: "https://docs.airbyte.io/integrations/sources/file" connectionSpecification: diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile index dcb20379a8b8..17f2d54f0d66 100644 --- a/airbyte-integrations/connectors/source-file-secure/Dockerfile +++ b/airbyte-integrations/connectors/source-file-secure/Dockerfile @@ -1,4 +1,4 @@ -FROM airbyte/source-file:0.2.16 +FROM airbyte/source-file:0.2.17 WORKDIR /airbyte/integration_code COPY source_file_secure ./source_file_secure @@ -9,5 +9,5 @@ RUN pip install . ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.16 +LABEL io.airbyte.version=0.2.17 LABEL io.airbyte.name=airbyte/source-file-secure diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index 9cbf49648e62..f91a6e7171b5 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.16 +LABEL io.airbyte.version=0.2.17 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/setup.py b/airbyte-integrations/connectors/source-file/setup.py index 352340b965f0..8ec1aba3f4b7 100644 --- a/airbyte-integrations/connectors/source-file/setup.py +++ b/airbyte-integrations/connectors/source-file/setup.py @@ -13,6 +13,7 @@ "pandas==1.4.3", "paramiko==2.11.0", "s3fs==2022.7.1", + "boto3==1.21.21", "smart-open[all]==6.0.0", "lxml==4.9.1", "html5lib==1.1", @@ -23,7 +24,7 @@ "pyxlsb==1.0.9", ] -TEST_REQUIREMENTS = ["boto3==1.21.21", "pytest==7.1.2", "pytest-docker==1.0.0", "pytest-mock~=3.8.2"] +TEST_REQUIREMENTS = ["pytest~=6.2", "pytest-docker==1.0.0", "pytest-mock~=3.6.1"] setup( name="source_file", diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index 02e256cb5b35..5e0c68ed3139 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -4,6 +4,7 @@ import json +import tempfile import traceback from os import environ from typing import Iterable @@ -352,10 +353,20 @@ def read(self, fields: Iterable = None) -> Iterable[dict]: yield from df[columns].to_dict(orient="records") else: fields = set(fields) if fields else None + if self.binary_source: + fp = self._cache_stream(fp) for df in self.load_dataframes(fp): columns = fields.intersection(set(df.columns)) if fields else df.columns df = df.where(pd.notnull(df), None) - yield from df[columns].to_dict(orient="records") + yield from df[list(columns)].to_dict(orient="records") + + def _cache_stream(self, fp): + """cache stream to file""" + fp_tmp = tempfile.TemporaryFile(mode="w+b") + fp_tmp.write(fp.read()) + fp_tmp.seek(0) + fp.close() + return fp_tmp def _stream_properties(self, fp): if self._reader_format == "yaml": diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 77f7ef18ea9e..2578eacc5bd0 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -127,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------|---------------------------------------------------| +| 0.2.17 | 2022-08-11 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache binary stream to file | | 0.2.16 | 2022-08-10 | [15293](https://github.com/airbytehq/airbyte/pull/15293) | added support for encoding reader option | | 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 | | 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files |