From 37e85bd179b6bea9d8ec92ece3443407c5c38499 Mon Sep 17 00:00:00 2001
From: Galileo Galilei
Date: Thu, 1 Oct 2020 23:27:50 +0200
Subject: [PATCH] FIX #74 - Convert MlflowDataSet local _filepath to a string to enable remote storage

---
 CHANGELOG.md                      | 3 ++-
 kedro_mlflow/io/mlflow_dataset.py | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95737bbb..4e79ec32 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,10 +13,11 @@
 - Change the test in `_generate_kedro_command` to accept both empty `Iterable`s(default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
 - Force to close all mlflow runs when a pipeline fails. It prevents further execution of the pipeline to be logged within the same mlflow run_id as the failing pipeline. ([#10](https://github.com/Galileo-Galilei/kedro-mlflow/issues/10))
 - Fix various documentation typos ([#34](https://github.com/Galileo-Galilei/kedro-mlflow/pull/34), [#35](https://github.com/Galileo-Galilei/kedro-mlflow/pull/35), [#36](https://github.com/Galileo-Galilei/kedro-mlflow/pull/36) and more)
+- Fix a bug in `MlflowDataSet` which sometimes failed to log on remote storage (S3, Azure Blob Storage) with the underlying `log_artifacts` call when Kedro's `AbstractDataset._filepath` was a `pathlib.PurePosixPath` object instead of a string ([#74](https://github.com/Galileo-Galilei/kedro-mlflow/issues/74)).
 
 ### Changed
 
-- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in future release and it enables having multiple `PipelineML` in the same project. [#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)
+- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for the upcoming hook auto-discovery in a future release and it enables having multiple `PipelineML` objects in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making the `conda_env` path absolute for Airflow support.
 - `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
 
 ### Deprecated
diff --git a/kedro_mlflow/io/mlflow_dataset.py b/kedro_mlflow/io/mlflow_dataset.py
index 5ee611b2..eb58cab5 100644
--- a/kedro_mlflow/io/mlflow_dataset.py
+++ b/kedro_mlflow/io/mlflow_dataset.py
@@ -23,7 +23,7 @@ def __new__(
 
         data_set, data_set_args = parse_dataset_definition(config=data_set)
 
-        # fake inheritance : this mlfow class should be a mother class which wraps
+        # fake inheritance : this mlflow class should be a mother class which wraps
         # all dataset (i.e. it should replace AbstractVersionedDataSet)
         # instead and since we can't modify the core package,
         # we create a subclass which inherits dynamically from the data_set class
@@ -41,6 +41,9 @@ def _save(self, data: Any):
                     if hasattr(self, "_version")
                     else self._filepath
                 )
+                # the path must be converted to a string with as_posix()
+                # for logging on remote storage such as S3 or Azure Blob Storage
+                local_path = local_path.as_posix()
 
                 super()._save(data)
                 if self.run_id:
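
A minimal standalone sketch (not the plugin's actual class) of what the added `as_posix()` conversion buys: per the changelog entry above, some remote artifact stores (S3, Azure Blob Storage) could fail when the underlying mlflow call received a `pathlib.PurePosixPath` object instead of a string, so the path is turned into a plain string first. The temporary directory and the `example.txt` file name are illustrative only; the snippet assumes `mlflow` is installed and logs to the default local tracking store.

    import tempfile
    from pathlib import Path, PurePosixPath

    import mlflow

    # create a throwaway file to stand in for a dataset artifact
    tmp_dir = Path(tempfile.mkdtemp())
    artifact_file = tmp_dir / "example.txt"
    artifact_file.write_text("hello")

    # kedro datasets typically keep their _filepath as a PurePosixPath,
    # which is the object that triggered the bug described in #74
    local_path = PurePosixPath(artifact_file.as_posix())

    with mlflow.start_run():
        # mirror the patch: hand mlflow a plain string rather than the path object
        mlflow.log_artifact(local_path.as_posix())

In the patched `_save`, the same `local_path.as_posix()` conversion happens just before `super()._save(data)`, so the string form is presumably what later reaches the mlflow logging call once `self.run_id` is set.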