diff --git a/CHANGELOG.md b/CHANGELOG.md index 482ec974..d0b87a79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Fixed + +- :bug: `MlflowArtifactDataSet.load()` now correctly loads the artifact when both `artifact_path` and `run_id` arguments are specified. Previous fix in ``0.11.4`` did not work because when the file already exists locally, mlflow did not download it again so tests were incorrectly passing. ([#362](https://github.com/Galileo-Galilei/kedro-mlflow/issues/362)) + ## [0.11.4] - 2022-10-04 ### Fixed diff --git a/kedro_mlflow/io/artifacts/mlflow_artifact_dataset.py b/kedro_mlflow/io/artifacts/mlflow_artifact_dataset.py index 855df13d..14ffe468 100644 --- a/kedro_mlflow/io/artifacts/mlflow_artifact_dataset.py +++ b/kedro_mlflow/io/artifacts/mlflow_artifact_dataset.py @@ -1,3 +1,4 @@ +import shutil from pathlib import Path from typing import Any, Dict, Union @@ -95,18 +96,31 @@ def _load(self) -> Any: # pragma: no cover # special datasets with a folder instead of a specifi files like PartitionedDataSet local_path = Path(self._path) + # BEWARE: we must enforce Path(local_path) because it is a PurePosixPath which fails on windows + # this is very weird: if you assign the value, it is converted to a PurePosixPath again, e.g.: + # this fails: + # local_path = Path(local_path) + # local_path.name # local_path has been converted back to PurePosixPath on windows on this 2nd row + # but this works as a one liner: + # filename = Path(local_path).name + + filename = Path(local_path).name artifact_path = ( - (self.artifact_path / Path(local_path.name)).as_posix() + (self.artifact_path / Path(filename)).as_posix() if self.artifact_path - else local_path.name + else filename ) - mlflow_client.download_artifacts( + # we cannot use dst_path, because it downloads the file to "local_path / artifact_path / filename.pkl" + # the artifact_path suffix prevents loading when we call super._load() + temp_download_filepath = 
mlflow_client.download_artifacts( run_id=self.run_id, path=artifact_path, - dst_path=local_path.parent.as_posix(), # must be a **local** **directory** + # dst_path=local_path.parent.as_posix(), ) + shutil.copy(src=temp_download_filepath, dst=local_path) + # finally, read locally return super()._load() diff --git a/tests/io/artifacts/test_mlflow_artifact_dataset.py b/tests/io/artifacts/test_mlflow_artifact_dataset.py index 0c03d4e6..fddb7ed7 100644 --- a/tests/io/artifacts/test_mlflow_artifact_dataset.py +++ b/tests/io/artifacts/test_mlflow_artifact_dataset.py @@ -243,7 +243,9 @@ def test_artifact_dataset_load_with_run_id_and_artifact_path( with mlflow.start_run(): mlflow_csv_dataset1.save(df1) first_run_id = mlflow.active_run().info.run_id - + ( + tmp_path / "df1.csv" + ).unlink() # we need to delete the data, else it is automatically reused instead of downloading # same as before, but a closed run_id is specified mlflow_csv_dataset2 = MlflowArtifactDataSet( data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()), @@ -252,6 +254,9 @@ ) assert df1.equals(mlflow_csv_dataset2.load()) + assert df1.equals( + mlflow_csv_dataset2.load() + ) # we check idempotency. When using a local tracking uri, mlflow does not download in a temp folder, so moving the folder alters the mlflow run @pytest.mark.parametrize("artifact_path", [None, "partitioned_data"])