airbytehq · arsenlosenko · Aug 26, 2022 · Aug 10, 2022 · Aug 26, 2022 · Aug 26, 2022
diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile
@@ -1,4 +1,4 @@
-FROM airbyte/source-file:0.2.20
+FROM airbyte/source-file:0.2.21
 
 WORKDIR /airbyte/integration_code
 COPY source_file_secure ./source_file_secure
@@ -9,5 +9,5 @@ RUN pip install .
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.2.20
+LABEL io.airbyte.version=0.2.21
 LABEL io.airbyte.name=airbyte/source-file-secure
diff --git a/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json b/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json
@@ -14,7 +14,7 @@
       },
       "format": {
         "type": "string",
-        "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
+        "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"],
         "default": "csv",
         "title": "File Format",
         "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."

diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile
@@ -17,5 +17,5 @@ COPY source_file ./source_file
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.2.20
+LABEL io.airbyte.version=0.2.21
 LABEL io.airbyte.name=airbyte/source-file
diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb
diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py
@@ -228,7 +228,7 @@ class Client:
 
     CSV_CHUNK_SIZE = 10_000
     reader_class = URLFile
-    binary_formats = {"excel", "feather", "parquet", "orc", "pickle"}
+    binary_formats = {"excel", "excel_binary", "feather", "parquet", "orc", "pickle"}
 
     def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None):
         self._dataset_name = dataset_name
@@ -299,6 +299,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
             "flat_json": pd.read_json,
             "html": pd.read_html,
             "excel": pd.read_excel,
+            "excel_binary": pd.read_excel,
             "feather": pd.read_feather,
             "parquet": pd.read_parquet,
             "orc": pd.read_orc,
@@ -319,6 +320,11 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
                 reader_options["nrows"] = 0
                 reader_options["index_col"] = 0
 
+            yield from reader(fp, **reader_options)
+        elif self._reader_format == "excel_binary":
+            # .xlsb workbooks must be parsed with the pyxlsb engine. read_excel returns a
+            # single DataFrame, so yield it whole ("yield from" would emit column labels).
+            reader_options["engine"] = "pyxlsb"
-            yield from reader(fp, **reader_options)
+            yield reader(fp, **reader_options)
         else:
             yield reader(fp, **reader_options)

diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json
@@ -15,7 +15,7 @@
       },
       "format": {
         "type": "string",
-        "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
+        "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"],
         "default": "csv",
         "title": "File Format",
         "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."

diff --git a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py
@@ -3,6 +3,7 @@
 #
 
 from pathlib import Path
+
 import pytest
 from source_file.client import Client
 
@@ -13,6 +14,7 @@ def _read_file(file_name):
         parent_location = Path(__file__).absolute().parent
         file = open(parent_location / file_name).read()
         return file
+
     return _read_file
 
 
@@ -23,7 +25,13 @@ def config():
 
 @pytest.fixture
 def invalid_config(read_file):
-    return {"dataset_name": "test", "format": "jsonl", "url": "https://airbyte.com", "reader_options":'{"encoding": "encoding"}', "provider": {"storage": "HTTPS"}}
+    return {
+        "dataset_name": "test",
+        "format": "jsonl",
+        "url": "https://airbyte.com",
+        "reader_options": '{"encoding": "encoding"}',
+        "provider": {"storage": "HTTPS"},
+    }
 
 
 @pytest.fixture

diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py
@@ -2,18 +2,10 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
-import pytest
-from pandas import read_csv
-from source_file.client import Client, URLFile, ConfigurationError
-
 
-@pytest.fixture
-def client():
-    return Client(
-        dataset_name="test_dataset",
-        url="scp://test_dataset",
-        provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}},
-    )
+import pytest
+from pandas import read_csv, read_excel
+from source_file.client import Client, ConfigurationError, URLFile
 
 
 @pytest.fixture
@@ -22,7 +14,7 @@ def wrong_format_client():
         dataset_name="test_dataset",
         url="scp://test_dataset",
         provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}},
-        format="wrong"
+        format="wrong",
     )
 
 
@@ -58,9 +50,19 @@ def test_load_dataframes(client, wrong_format_client, absolute_path, test_files)
         next(client.load_dataframes(fp=f, skip_data=True))
 
 
+def test_load_dataframes_xlsb(config, absolute_path, test_files):
+    """An `excel_binary` client loads test.xlsb into the same frame as read_excel with pyxlsb."""
+    config["format"] = "excel_binary"
+    xlsb_path = f"{absolute_path}/{test_files}/test.xlsb"
+    expected_frame = read_excel(xlsb_path, engine="pyxlsb")
+    xlsb_client = Client(**config)
+    loaded_frame = next(xlsb_client.load_dataframes(fp=xlsb_path))
+    assert loaded_frame.equals(expected_frame)
+
+
 def test_load_nested_json(client, absolute_path, test_files):
     f = f"{absolute_path}/{test_files}/formats/json/demo.json"
-    with open(f, mode='rb') as file:
+    with open(f, mode="rb") as file:
         assert client.load_nested_json(fp=file)
 
 
@@ -80,7 +81,7 @@ def test_dtype_to_json_type(client, current_type, dtype, expected):
 
 def test_cache_stream(client, absolute_path, test_files):
     f = f"{absolute_path}/{test_files}/test.csv"
-    with open(f, mode='rb') as file:
+    with open(f, mode="rb") as file:
         assert client._cache_stream(file)
 
 

diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py
@@ -20,7 +20,6 @@
     SyncMode,
     Type,
 )
-
 from source_file.source import SourceFile
 
 logger = logging.getLogger("airbyte")

diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md
@@ -86,6 +86,8 @@ In case you select `JSON` format, then options from the [read\_json](https://pan
 
 For example, you can use the `{"orient" : "records"}` to change how orientation of data is loaded (if data is `[{column -> value}, … , {column -> value}]`)
 
+If you need to read an Excel Binary Workbook (`.xlsb`) file, select the `excel_binary` format in the `File Format` dropdown.
+
 #### Changing data types of source columns
 
 Normally, Airbyte tries to infer the data type from the source, but you can use `reader_options` to force specific data types. If you input `{"dtype":"string"}`, all columns will be forced to be parsed as strings. If you only want a specific column to be parsed as a string, simply use `{"dtype" : {"column name": "string"}}`.
@@ -127,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm
 
 | Version | Date       | Pull Request                                             | Subject                                           |
 |---------|------------|----------------------------------------------------------|---------------------------------------------------|
+| 0.2.21  | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files |
 | 0.2.20  | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery                          |
 | 0.2.19  | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null'                           |
 | 0.2.18  | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover          |