diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile index 33c8fae0d313..5f46ec28cf82 100644 --- a/airbyte-integrations/connectors/source-file-secure/Dockerfile +++ b/airbyte-integrations/connectors/source-file-secure/Dockerfile @@ -1,4 +1,4 @@ -FROM airbyte/source-file:0.2.20 +FROM airbyte/source-file:0.2.21 WORKDIR /airbyte/integration_code COPY source_file_secure ./source_file_secure @@ -9,5 +9,5 @@ RUN pip install . ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.20 +LABEL io.airbyte.version=0.2.21 LABEL io.airbyte.name=airbyte/source-file-secure diff --git a/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json b/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json index cc4872eb81dd..9dc5be7a95e3 100644 --- a/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json @@ -14,7 +14,7 @@ }, "format": { "type": "string", - "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"], + "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"], "default": "csv", "title": "File Format", "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)." 
diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index a00507cb5195..3e630de72c37 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.20 +LABEL io.airbyte.version=0.2.21 LABEL io.airbyte.name=airbyte/source-file diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb new file mode 100644 index 000000000000..f0f3082f8adb Binary files /dev/null and b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb differ diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index 99e6f31877ba..ddb857403903 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -228,7 +228,7 @@ class Client: CSV_CHUNK_SIZE = 10_000 reader_class = URLFile - binary_formats = {"excel", "feather", "parquet", "orc", "pickle"} + binary_formats = {"excel", "excel_binary", "feather", "parquet", "orc", "pickle"} def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None): self._dataset_name = dataset_name @@ -299,6 +299,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: "flat_json": pd.read_json, "html": pd.read_html, "excel": pd.read_excel, + "excel_binary": pd.read_excel, "feather": pd.read_feather, "parquet": pd.read_parquet, "orc": pd.read_orc, @@ -319,6 +320,9 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: 
reader_options["nrows"] = 0 reader_options["index_col"] = 0 yield from reader(fp, **reader_options) + elif self._reader_format == "excel_binary": + reader_options["engine"] = "pyxlsb" + yield reader(fp, **reader_options) else: yield reader(fp, **reader_options) diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json index 161a1219e15a..9af744dc32f6 100644 --- a/airbyte-integrations/connectors/source-file/source_file/spec.json +++ b/airbyte-integrations/connectors/source-file/source_file/spec.json @@ -15,7 +15,7 @@ }, "format": { "type": "string", - "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"], + "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"], "default": "csv", "title": "File Format", "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)." diff --git a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py index 6df81f0fc4ed..6846e5f6fa4a 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py @@ -3,6 +3,7 @@ # from pathlib import Path + import pytest from source_file.client import Client @@ -13,6 +14,7 @@ def _read_file(file_name): parent_location = Path(__file__).absolute().parent file = open(parent_location / file_name).read() return file + return _read_file @@ -23,7 +25,13 @@ def config(): @pytest.fixture def invalid_config(read_file): - return {"dataset_name": "test", "format": "jsonl", "url": "https://airbyte.com", "reader_options":'{"encoding": "encoding"}', "provider": {"storage": "HTTPS"}} + return { + "dataset_name": "test", + "format": "jsonl", + "url": "https://airbyte.com", + "reader_options": '{"encoding": "encoding"}', + "provider":
{"storage": "HTTPS"}, + } @pytest.fixture diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index 81463116fc2a..d62c2b005760 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -2,18 +2,10 @@ # Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -import pytest -from pandas import read_csv -from source_file.client import Client, URLFile, ConfigurationError - -@pytest.fixture -def client(): - return Client( - dataset_name="test_dataset", - url="scp://test_dataset", - provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}}, - ) +import pytest +from pandas import read_csv, read_excel +from source_file.client import Client, ConfigurationError, URLFile @pytest.fixture @@ -22,7 +14,7 @@ def wrong_format_client(): dataset_name="test_dataset", url="scp://test_dataset", provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}}, - format="wrong" + format="wrong", ) @@ -58,9 +50,18 @@ def test_load_dataframes(client, wrong_format_client, absolute_path, test_files) next(client.load_dataframes(fp=f, skip_data=True)) +def test_load_dataframes_xlsb(config, absolute_path, test_files): + config["format"] = "excel_binary" + client = Client(**config) + f = f"{absolute_path}/{test_files}/test.xlsb" + read_file = next(client.load_dataframes(fp=f)) + expected = read_excel(f, engine="pyxlsb") + assert read_file.equals(expected) + + def test_load_nested_json(client, absolute_path, test_files): f = f"{absolute_path}/{test_files}/formats/json/demo.json" - with open(f, mode='rb') as file: + with open(f, mode="rb") as file: assert client.load_nested_json(fp=file) @@ -80,7 +81,7 @@ def test_dtype_to_json_type(client, current_type, dtype, expected): def test_cache_stream(client, absolute_path, test_files): f = 
f"{absolute_path}/{test_files}/test.csv" - with open(f, mode='rb') as file: + with open(f, mode="rb") as file: assert client._cache_stream(file) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py index 0ed715e30503..5c29acddc287 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py @@ -20,7 +20,6 @@ SyncMode, Type, ) - from source_file.source import SourceFile logger = logging.getLogger("airbyte") diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 67dc3a9e1569..a222e78c4224 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -86,6 +86,8 @@ In case you select `JSON` format, then options from the [read\_json](https://pan For example, you can use the `{"orient" : "records"}` to change how orientation of data is loaded (if data is `[{column -> value}, … , {column -> value}]`) +To read an Excel Binary Workbook (`.xlsb`) file, select `excel_binary` in the `File Format` field. + #### Changing data types of source columns Normally, Airbyte tries to infer the data type from the source, but you can use `reader_options` to force specific data types. If you input `{"dtype":"string"}`, all columns will be forced to be parsed as strings. If you only want a specific column to be parsed as a string, simply use `{"dtype" : {"column name": "string"}}`. 
@@ -127,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------|---------------------------------------------------| +| 0.2.21 | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files | | 0.2.20 | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery | | 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' | | 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover |