From 0217b693909ec30a2688699426e8745322f80118 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 20 Sep 2024 09:23:25 -0600 Subject: [PATCH 1/6] feat(datasets): create separate `ibis.FileDataset` Signed-off-by: Deepyaman Datta --- .../kedro_datasets/ibis/__init__.py | 4 +- .../kedro_datasets/ibis/file_dataset.py | 160 ++++++++++++++++++ .../tests/ibis/test_file_dataset.py | 97 +++++++++++ 3 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 kedro-datasets/kedro_datasets/ibis/file_dataset.py create mode 100644 kedro-datasets/tests/ibis/test_file_dataset.py diff --git a/kedro-datasets/kedro_datasets/ibis/__init__.py b/kedro-datasets/kedro_datasets/ibis/__init__.py index 7e793c4e0..47867f657 100644 --- a/kedro-datasets/kedro_datasets/ibis/__init__.py +++ b/kedro-datasets/kedro_datasets/ibis/__init__.py @@ -4,8 +4,10 @@ import lazy_loader as lazy # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +FileDataset: Any TableDataset: Any __getattr__, __dir__, __all__ = lazy.attach( - __name__, submod_attrs={"table_dataset": ["TableDataset"]} + __name__, + submod_attrs={"file_dataset": ["FileDataset"], "table_dataset": ["TableDataset"]}, ) diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py new file mode 100644 index 000000000..d596439aa --- /dev/null +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -0,0 +1,160 @@ +"""Provide file loading and saving functionality for Ibis's backends.""" +from __future__ import annotations + +from copy import deepcopy +from typing import TYPE_CHECKING, Any, ClassVar + +import ibis.expr.types as ir +from kedro.io import AbstractDataset + +if TYPE_CHECKING: + from ibis import BaseBackend + + +class FileDataset(AbstractDataset[ir.Table, ir.Table]): + """``FileDataset`` loads/saves data from/to a specified file format. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: ibis.FileDataset + filepath: data/01_raw/company/cars.csv + file_format: csv + table_name: cars + connection: + backend: duckdb + database: company.db + load_args: + sep: "," + nullstr: "#NA" + save_args: + sep: "," + nullstr: "#NA" + + Example usage for the + `Python API `_: + + .. code-block:: pycon + + >>> import ibis + >>> from kedro_datasets.ibis import FileDataset + >>> + >>> data = ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + >>> + >>> dataset = FileDataset( + ... filepath=tmp_path / "test.csv", + ... file_format="csv", + ... table_name="test", + ... connection={"backend": "duckdb", "database": tmp_path / "file.db"}, + ... ) + >>> dataset.save(data) + >>> reloaded = dataset.load() + >>> assert data.execute().equals(reloaded.execute()) + + """ + + DEFAULT_LOAD_ARGS: ClassVar[dict[str, Any]] = {} + DEFAULT_SAVE_ARGS: ClassVar[dict[str, Any]] = {} + + _connections: ClassVar[dict[tuple[tuple[str, str]], BaseBackend]] = {} + + def __init__( # noqa: PLR0913 + self, + filepath: str, + file_format: str, + *, + table_name: str | None = None, + connection: dict[str, Any] | None = None, + load_args: dict[str, Any] | None = None, + save_args: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + """Creates a new ``FileDataset`` pointing to the given filepath. + + ``FileDataset`` connects to the Ibis backend object constructed + from the connection configuration. The `backend` key provided in + the config can be any of the `supported backends `_. The remaining dictionary entries will be + passed as arguments to the underlying ``connect()`` method (e.g. + `ibis.duckdb.connect() `_). + + The read method corresponding to the given ``file_format`` (e.g. + `read_csv() `_) is used to load + the file with the backend. Note that only the data is loaded; no + link to the underlying file exists past ``FileDataset.load()``. + + Args: + filepath: Path to a file to register as a table. 
Most useful + for loading data into your data warehouse (for testing). + On save, the backend exports data to the specified path. + file_format: String specifying the file format for the file. + table_name: The name to use for the created table (on load). + connection: Configuration for connecting to an Ibis backend. + load_args: Additional arguments passed to the Ibis backend's + `read_{file_format}` method. + save_args: Additional arguments passed to the Ibis backend's + `to_{file_format}` method. + metadata: Any arbitrary metadata. This is ignored by Kedro, + but may be consumed by users or external plugins. + """ + self._filepath = filepath + self._file_format = file_format + self._table_name = table_name + self._connection_config = connection + self.metadata = metadata + + # Set load and save arguments, overwriting defaults if provided. + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + @property + def connection(self) -> BaseBackend: + def hashable(value): + if isinstance(value, dict): + return tuple((k, hashable(v)) for k, v in sorted(value.items())) + if isinstance(value, list): + return tuple(hashable(x) for x in value) + return value + + cls = type(self) + key = hashable(self._connection_config) + if key not in cls._connections: + import ibis + + config = deepcopy(self._connection_config) + backend_attr = config.pop("backend") if config else None + backend = getattr(ibis, backend_attr) + cls._connections[key] = backend.connect(**config) + + return cls._connections[key] + + def _load(self) -> ir.Table: + reader = getattr(self.connection, f"read_{self._file_format}") + return reader(self._filepath, self._table_name, **self._load_args) + + def _save(self, data: ir.Table) -> None: + writer = getattr(self.connection, f"to_{self._file_format}") + writer(data, self._filepath, 
**self._save_args) + + def _describe(self) -> dict[str, Any]: + return { + "filepath": self._filepath, + "file_format": self._file_format, + "table_name": self._table_name, + "backend": self._connection_config.get("backend") + if self._connection_config + else None, + "load_args": self._load_args, + "save_args": self._save_args, + } diff --git a/kedro-datasets/tests/ibis/test_file_dataset.py b/kedro-datasets/tests/ibis/test_file_dataset.py new file mode 100644 index 000000000..d18c38528 --- /dev/null +++ b/kedro-datasets/tests/ibis/test_file_dataset.py @@ -0,0 +1,97 @@ +import ibis +import pytest +from pandas.testing import assert_frame_equal + +from kedro_datasets.ibis import FileDataset + + +@pytest.fixture +def filepath_csv(tmp_path): + return (tmp_path / "test.csv").as_posix() + + +@pytest.fixture +def database(tmp_path): + return (tmp_path / "file.db").as_posix() + + +@pytest.fixture(params=[None]) +def connection_config(request, database): + return request.param or {"backend": "duckdb", "database": database} + + +@pytest.fixture +def file_dataset(filepath_csv, connection_config, load_args, save_args): + return FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + load_args=load_args, + save_args=save_args, + ) + + +@pytest.fixture +def dummy_table(): + return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +class TestFileDataset: + def test_save_and_load(self, file_dataset, dummy_table, database): + """Test saving and reloading the data set.""" + file_dataset.save(dummy_table) + reloaded = file_dataset.load() + assert_frame_equal(dummy_table.execute(), reloaded.execute()) + + @pytest.mark.parametrize("load_args", [{"filename": True}], indirect=True) + def test_load_extra_params(self, file_dataset, load_args, dummy_table): + """Test overriding the default load arguments.""" + file_dataset.save(dummy_table) + assert "filename" in file_dataset.load() + + @pytest.mark.parametrize("save_args", [{"sep": 
"|"}], indirect=True) + def test_save_extra_params( + self, file_dataset, save_args, dummy_table, filepath_csv + ): + """Test overriding the default save arguments.""" + file_dataset.save(dummy_table) + + # Verify that the delimiter character from `save_args` was used. + with open(filepath_csv) as f: + for line in f: + assert save_args["sep"] in line + + @pytest.mark.parametrize( + ("connection_config", "key"), + [ + ( + {"backend": "duckdb", "database": "file.db", "extensions": ["spatial"]}, + ( + ("backend", "duckdb"), + ("database", "file.db"), + ("extensions", ("spatial",)), + ), + ), + # https://github.com/kedro-org/kedro-plugins/pull/560#discussion_r1536083525 + ( + { + "host": "xxx.sql.azuresynapse.net", + "database": "xxx", + "query": {"driver": "ODBC Driver 17 for SQL Server"}, + "backend": "mssql", + }, + ( + ("backend", "mssql"), + ("database", "xxx"), + ("host", "xxx.sql.azuresynapse.net"), + ("query", (("driver", "ODBC Driver 17 for SQL Server"),)), + ), + ), + ], + indirect=["connection_config"], + ) + def test_connection_config(self, mocker, file_dataset, connection_config, key): + """Test hashing of more complicated connection configuration.""" + mocker.patch(f"ibis.{connection_config['backend']}") + file_dataset.load() + assert key in file_dataset._connections From 77e1430d6b94b1057f2d83b821439dd65ae73988 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 20 Sep 2024 22:43:10 -0600 Subject: [PATCH 2/6] chore(datasets): deprecate `TableDataset` file I/O Signed-off-by: Deepyaman Datta --- .../kedro_datasets/ibis/table_dataset.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kedro-datasets/kedro_datasets/ibis/table_dataset.py b/kedro-datasets/kedro_datasets/ibis/table_dataset.py index 9839876c3..def1c2482 100644 --- a/kedro-datasets/kedro_datasets/ibis/table_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/table_dataset.py @@ -1,12 +1,15 @@ """Provide data loading and saving functionality for Ibis's 
backends.""" from __future__ import annotations +import warnings from copy import deepcopy from typing import TYPE_CHECKING, Any, ClassVar import ibis.expr.types as ir from kedro.io import AbstractDataset, DatasetError +from kedro_datasets import KedroDeprecationWarning + if TYPE_CHECKING: from ibis import BaseBackend @@ -21,15 +24,10 @@ class TableDataset(AbstractDataset[ir.Table, ir.Table]): cars: type: ibis.TableDataset - filepath: data/01_raw/company/cars.csv - file_format: csv table_name: cars connection: backend: duckdb database: company.db - load_args: - sep: "," - nullstr: "#NA" save_args: materialized: table @@ -91,12 +89,6 @@ def __init__( # noqa: PLR0913 `ibis.duckdb.connect() `_). - If ``filepath`` and ``file_format`` are given, the corresponding - read method (e.g. `read_csv() `_) is used to load - the file with the backend. Note that only the data is loaded; no - link to the underlying file exists past ``TableDataset.load()``. - If ``table_name`` is given (and ``filepath`` isn't), the dataset establishes a connection to the relevant table for the execution backend. Therefore, Ibis doesn't fetch data on load; all compute @@ -105,9 +97,6 @@ def __init__( # noqa: PLR0913 is saved, after running code defined across one more more nodes. Args: - filepath: Path to a file to register as a table. Most useful - for loading data into your data warehouse (for testing). - file_format: Specifies the input file format for `filepath`. table_name: The name of the table or view to read or create. connection: Configuration for connecting to an Ibis backend. load_args: Additional arguments passed to the Ibis backend's @@ -125,6 +114,15 @@ def __init__( # noqa: PLR0913 "Must provide at least one of `filepath` or `table_name`." 
) + if filepath is not None or file_format is not None: + warnings.warn( + "Use 'FileDataset' to load and save files with an Ibis " + "backend; the functionality will be removed from 'Table" + "Dataset' in Kedro-Datasets 6.0.0", + KedroDeprecationWarning, + stacklevel=2, + ) + self._filepath = filepath self._file_format = file_format self._table_name = table_name From f45f4084695ebe5f77a6ef2424382e5b90514fce Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 23 Sep 2024 10:52:50 -0600 Subject: [PATCH 3/6] feat(datasets): implement `FileDataset` versioning Signed-off-by: Deepyaman Datta --- .../kedro_datasets/ibis/file_dataset.py | 32 +++- .../tests/ibis/test_file_dataset.py | 175 ++++++++++++++++++ 2 files changed, 203 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py index d596439aa..a55e92a68 100644 --- a/kedro-datasets/kedro_datasets/ibis/file_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -2,16 +2,17 @@ from __future__ import annotations from copy import deepcopy +from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING, Any, ClassVar import ibis.expr.types as ir -from kedro.io import AbstractDataset +from kedro.io import AbstractVersionedDataset, DatasetError, Version if TYPE_CHECKING: from ibis import BaseBackend -class FileDataset(AbstractDataset[ir.Table, ir.Table]): +class FileDataset(AbstractVersionedDataset[ir.Table, ir.Table]): """``FileDataset`` loads/saves data from/to a specified file format. Example usage for the @@ -71,6 +72,7 @@ def __init__( # noqa: PLR0913 connection: dict[str, Any] | None = None, load_args: dict[str, Any] | None = None, save_args: dict[str, Any] | None = None, + version: Version | None = None, metadata: dict[str, Any] | None = None, ) -> None: """Creates a new ``FileDataset`` pointing to the given filepath. @@ -100,6 +102,10 @@ def __init__( # noqa: PLR0913 `read_{file_format}` method. 
save_args: Additional arguments passed to the Ibis backend's `to_{file_format}` method. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ @@ -109,6 +115,12 @@ def __init__( # noqa: PLR0913 self._connection_config = connection self.metadata = metadata + super().__init__( + filepath=PurePosixPath(filepath), + version=version, + exists_function=lambda filepath: Path(filepath).exists(), + ) + # Set load and save arguments, overwriting defaults if provided. self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: @@ -140,12 +152,15 @@ def hashable(value): return cls._connections[key] def _load(self) -> ir.Table: + load_path = self._get_load_path() reader = getattr(self.connection, f"read_{self._file_format}") - return reader(self._filepath, self._table_name, **self._load_args) + return reader(load_path, self._table_name, **self._load_args) def _save(self, data: ir.Table) -> None: + save_path = self._get_save_path() + Path(save_path).parent.mkdir(parents=True, exist_ok=True) writer = getattr(self.connection, f"to_{self._file_format}") - writer(data, self._filepath, **self._save_args) + writer(data, save_path, **self._save_args) def _describe(self) -> dict[str, Any]: return { @@ -157,4 +172,13 @@ def _describe(self) -> dict[str, Any]: else None, "load_args": self._load_args, "save_args": self._save_args, + "version": self._version, } + + def _exists(self) -> bool: + try: + load_path = self._get_load_path() + except DatasetError: + return False + + return Path(load_path).exists() diff --git a/kedro-datasets/tests/ibis/test_file_dataset.py b/kedro-datasets/tests/ibis/test_file_dataset.py index d18c38528..7a41b2576 100644 --- 
a/kedro-datasets/tests/ibis/test_file_dataset.py +++ b/kedro-datasets/tests/ibis/test_file_dataset.py @@ -1,5 +1,10 @@ +from pathlib import Path +from time import sleep + import ibis import pytest +from kedro.io import DatasetError, Version +from kedro.io.core import generate_timestamp from pandas.testing import assert_frame_equal from kedro_datasets.ibis import FileDataset @@ -31,6 +36,16 @@ def file_dataset(filepath_csv, connection_config, load_args, save_args): ) +@pytest.fixture +def versioned_file_dataset(filepath_csv, connection_config, load_version, save_version): + return FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(load_version, save_version), + ) + + @pytest.fixture def dummy_table(): return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) @@ -95,3 +110,163 @@ def test_connection_config(self, mocker, file_dataset, connection_config, key): mocker.patch(f"ibis.{connection_config['backend']}") file_dataset.load() assert key in file_dataset._connections + + +class TestFileDatasetVersioned: + def test_version_str_repr(self, connection_config, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.csv" + ds = FileDataset( + filepath=filepath, file_format="csv", connection=connection_config + ) + ds_versioned = FileDataset( + filepath=filepath, + file_format="csv", + connection=connection_config, + version=Version(load_version, save_version), + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "FileDataset" in str(ds_versioned) + assert "FileDataset" in str(ds) + # Default save_args + assert "save_args={}" in str(ds) + assert "save_args={}" in str(ds_versioned) + + def test_save_and_load(self, versioned_file_dataset, 
dummy_table): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_file_dataset.save(dummy_table) + reloaded = versioned_file_dataset.load() + assert_frame_equal(dummy_table.execute(), reloaded.execute()) + + def test_multiple_loads( + self, versioned_file_dataset, dummy_table, filepath_csv, connection_config + ): + """Test that if a new version is created mid-run, by an + external system, it won't be loaded in the current run.""" + versioned_file_dataset.save(dummy_table) + versioned_file_dataset.load() + v1 = versioned_file_dataset.resolve_load_version() + + sleep(0.5) + # force-drop a newer version into the same location + v_new = generate_timestamp() + FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(v_new, v_new), + ).save(dummy_table) + + versioned_file_dataset.load() + v2 = versioned_file_dataset.resolve_load_version() + + assert v2 == v1 # v2 should not be v_new! + ds_new = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + assert ( + ds_new.resolve_load_version() == v_new + ) # new version is discoverable by a new instance + + def test_multiple_saves(self, dummy_table, filepath_csv, connection_config): + """Test multiple cycles of save followed by load for the same dataset""" + ds_versioned = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + + # first save + ds_versioned.save(dummy_table) + first_save_version = ds_versioned.resolve_save_version() + first_load_version = ds_versioned.resolve_load_version() + assert first_load_version == first_save_version + + # second save + sleep(0.5) + ds_versioned.save(dummy_table) + second_save_version = ds_versioned.resolve_save_version() + second_load_version = ds_versioned.resolve_load_version() + assert second_load_version == second_save_version + 
assert second_load_version > first_load_version + + # another dataset + ds_new = FileDataset( + filepath=filepath_csv, + file_format="csv", + connection=connection_config, + version=Version(None, None), + ) + assert ds_new.resolve_load_version() == second_load_version + + def test_no_versions(self, versioned_file_dataset): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for FileDataset\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.load() + + def test_exists(self, versioned_file_dataset, dummy_table): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_file_dataset.exists() + versioned_file_dataset.save(dummy_table) + assert versioned_file_dataset.exists() + + def test_prevent_overwrite(self, versioned_file_dataset, dummy_table): + """Check the error when attempting to override the data set if the + corresponding CSV file for a given save version already exists.""" + versioned_file_dataset.save(dummy_table) + pattern = ( + r"Save path \'.+\' for FileDataset\(.+\) must " + r"not exist if versioning is enabled\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.save(dummy_table) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_file_dataset, load_version, save_version, dummy_table + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for FileDataset\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_file_dataset.save(dummy_table) + + def test_versioning_existing_dataset( + self, file_dataset, versioned_file_dataset, dummy_table + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + file_dataset.save(dummy_table) + assert file_dataset.exists() + assert file_dataset._filepath == versioned_file_dataset._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_file_dataset._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_file_dataset.save(dummy_table) + + # Remove non-versioned dataset and try again + Path(file_dataset._filepath.as_posix()).unlink() + versioned_file_dataset.save(dummy_table) + assert versioned_file_dataset.exists() From b7ff0c7db952213b729918068cc1eee76b006286 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 23 Sep 2024 16:08:03 -0600 Subject: [PATCH 4/6] chore(datasets): try `os.path.exists`, for Windows Signed-off-by: Deepyaman Datta --- kedro-datasets/kedro_datasets/ibis/file_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py index a55e92a68..72b53d2b6 100644 
--- a/kedro-datasets/kedro_datasets/ibis/file_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -1,6 +1,7 @@ """Provide file loading and saving functionality for Ibis's backends.""" from __future__ import annotations +import os from copy import deepcopy from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING, Any, ClassVar @@ -118,7 +119,7 @@ def __init__( # noqa: PLR0913 super().__init__( filepath=PurePosixPath(filepath), version=version, - exists_function=lambda filepath: Path(filepath).exists(), + exists_function=os.path.exists, ) # Set load and save arguments, overwriting defaults if provided. From c8f076e2aa27fb968634c0c598f4adab2371eb0b Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 23 Sep 2024 16:40:07 -0600 Subject: [PATCH 5/6] revert(datasets): use pathlib, ignore Windows test Refs: b7ff0c7 Signed-off-by: Deepyaman Datta --- kedro-datasets/kedro_datasets/ibis/file_dataset.py | 3 +-- kedro-datasets/tests/ibis/test_file_dataset.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py index 72b53d2b6..a55e92a68 100644 --- a/kedro-datasets/kedro_datasets/ibis/file_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -1,7 +1,6 @@ """Provide file loading and saving functionality for Ibis's backends.""" from __future__ import annotations -import os from copy import deepcopy from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING, Any, ClassVar @@ -119,7 +118,7 @@ def __init__( # noqa: PLR0913 super().__init__( filepath=PurePosixPath(filepath), version=version, - exists_function=os.path.exists, + exists_function=lambda filepath: Path(filepath).exists(), ) # Set load and save arguments, overwriting defaults if provided. 
diff --git a/kedro-datasets/tests/ibis/test_file_dataset.py b/kedro-datasets/tests/ibis/test_file_dataset.py index 7a41b2576..e598bffff 100644 --- a/kedro-datasets/tests/ibis/test_file_dataset.py +++ b/kedro-datasets/tests/ibis/test_file_dataset.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path from time import sleep @@ -251,6 +252,7 @@ def test_save_version_warning( with pytest.warns(UserWarning, match=pattern): versioned_file_dataset.save(dummy_table) + @pytest.mark.skipif(sys.platform == "win32", reason="different error on windows") def test_versioning_existing_dataset( self, file_dataset, versioned_file_dataset, dummy_table ): From 2f9b804d02d76e8f22b548e0adcf5fdb50f2a9bc Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 23 Sep 2024 23:02:37 -0600 Subject: [PATCH 6/6] docs(datasets): add `ibis.FileDataset` to contents Signed-off-by: Deepyaman Datta --- kedro-datasets/docs/source/api/kedro_datasets.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst index 669378b7b..06c5d1388 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets.rst @@ -21,6 +21,7 @@ kedro_datasets holoviews.HoloviewsWriter huggingface.HFDataset huggingface.HFTransformerPipelineDataset + ibis.FileDataset ibis.TableDataset json.JSONDataset matlab.MatlabDataset