Skip to content

Commit

Permalink
Source File: specify pyxlsb as engine for Excel file parsing (#15568)
Browse files Browse the repository at this point in the history
* Specify engine for excel files

* Add test for the change related to pyxlsb

* Update changelog and bump connector version

* Update PR link and number

* Update spec to include 'excel_binary' option, formatted files

* Update spec for source-file-secure to match changes made in source-file spec
  • Loading branch information
arsenlosenko authored Aug 26, 2022
1 parent 630f67b commit 35257a0
Show file tree
Hide file tree
Showing 10 changed files with 37 additions and 22 deletions.
4 changes: 2 additions & 2 deletions airbyte-integrations/connectors/source-file-secure/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM airbyte/source-file:0.2.20
FROM airbyte/source-file:0.2.21

WORKDIR /airbyte/integration_code
COPY source_file_secure ./source_file_secure
Expand All @@ -9,5 +9,5 @@ RUN pip install .
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.2.20
LABEL io.airbyte.version=0.2.21
LABEL io.airbyte.name=airbyte/source-file-secure
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
},
"format": {
"type": "string",
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
"enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"],
"default": "csv",
"title": "File Format",
"description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COPY source_file ./source_file
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.2.20
LABEL io.airbyte.version=0.2.21
LABEL io.airbyte.name=airbyte/source-file
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ class Client:

CSV_CHUNK_SIZE = 10_000
reader_class = URLFile
binary_formats = {"excel", "feather", "parquet", "orc", "pickle"}
binary_formats = {"excel", "excel_binary", "feather", "parquet", "orc", "pickle"}

def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None):
self._dataset_name = dataset_name
Expand Down Expand Up @@ -299,6 +299,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
"flat_json": pd.read_json,
"html": pd.read_html,
"excel": pd.read_excel,
"excel_binary": pd.read_excel,
"feather": pd.read_feather,
"parquet": pd.read_parquet,
"orc": pd.read_orc,
Expand All @@ -319,6 +320,9 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
reader_options["nrows"] = 0
reader_options["index_col"] = 0

yield from reader(fp, **reader_options)
elif self._reader_options == "excel_binary":
reader_options["engine"] = "pyxlsb"
yield from reader(fp, **reader_options)
else:
yield reader(fp, **reader_options)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
},
"format": {
"type": "string",
"enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"],
"enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"],
"default": "csv",
"title": "File Format",
"description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

from pathlib import Path

import pytest
from source_file.client import Client

Expand All @@ -13,6 +14,7 @@ def _read_file(file_name):
parent_location = Path(__file__).absolute().parent
file = open(parent_location / file_name).read()
return file

return _read_file


Expand All @@ -23,7 +25,13 @@ def config():

@pytest.fixture
def invalid_config(read_file):
return {"dataset_name": "test", "format": "jsonl", "url": "https://airbyte.com", "reader_options":'{"encoding": "encoding"}', "provider": {"storage": "HTTPS"}}
return {
"dataset_name": "test",
"format": "jsonl",
"url": "https://airbyte.com",
"reader_options": '{"encoding": "encoding"}',
"provider": {"storage": "HTTPS"},
}


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,10 @@
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import pytest
from pandas import read_csv
from source_file.client import Client, URLFile, ConfigurationError


@pytest.fixture
def client():
return Client(
dataset_name="test_dataset",
url="scp://test_dataset",
provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}},
)
import pytest
from pandas import read_csv, read_excel
from source_file.client import Client, ConfigurationError, URLFile


@pytest.fixture
Expand All @@ -22,7 +14,7 @@ def wrong_format_client():
dataset_name="test_dataset",
url="scp://test_dataset",
provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}},
format="wrong"
format="wrong",
)


Expand Down Expand Up @@ -58,9 +50,18 @@ def test_load_dataframes(client, wrong_format_client, absolute_path, test_files)
next(client.load_dataframes(fp=f, skip_data=True))


def test_load_dataframes_xlsb(config, absolute_path, test_files):
    """Verify that the ``excel_binary`` format loads an ``.xlsb`` workbook.

    The first dataframe yielded by ``Client.load_dataframes`` must equal the
    frame pandas produces when reading the same file with the ``pyxlsb`` engine.
    """
    # Switch the shared config fixture over to the binary-workbook format.
    config["format"] = "excel_binary"
    xlsb_client = Client(**config)
    workbook_path = f"{absolute_path}/{test_files}/test.xlsb"

    # load_dataframes is a generator; only its first frame is compared.
    frames = xlsb_client.load_dataframes(fp=workbook_path)
    loaded = next(frames)

    reference = read_excel(workbook_path, engine="pyxlsb")
    assert loaded.equals(reference)


def test_load_nested_json(client, absolute_path, test_files):
f = f"{absolute_path}/{test_files}/formats/json/demo.json"
with open(f, mode='rb') as file:
with open(f, mode="rb") as file:
assert client.load_nested_json(fp=file)


Expand All @@ -80,7 +81,7 @@ def test_dtype_to_json_type(client, current_type, dtype, expected):

def test_cache_stream(client, absolute_path, test_files):
f = f"{absolute_path}/{test_files}/test.csv"
with open(f, mode='rb') as file:
with open(f, mode="rb") as file:
assert client._cache_stream(file)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
SyncMode,
Type,
)

from source_file.source import SourceFile

logger = logging.getLogger("airbyte")
Expand Down
3 changes: 3 additions & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ In case you select `JSON` format, then options from the [read\_json](https://pan

For example, you can use `{"orient" : "records"}` to change the orientation in which the data is loaded (if the data is `[{column -> value}, … , {column -> value}]`).

If you need to read an Excel Binary Workbook (`.xlsb`) file, select the `excel_binary` format in the `File Format` dropdown.

#### Changing data types of source columns

Normally, Airbyte tries to infer the data type from the source, but you can use `reader_options` to force specific data types. If you input `{"dtype":"string"}`, all columns will be forced to be parsed as strings. If you only want a specific column to be parsed as a string, simply use `{"dtype" : {"column name": "string"}}`.
Expand Down Expand Up @@ -127,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
|---------|------------|----------------------------------------------------------|---------------------------------------------------|
| 0.2.21 | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files |
| 0.2.20 | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery |
| 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' |
| 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover |
Expand Down

0 comments on commit 35257a0

Please sign in to comment.