From 3acebd815f67bfb4ca52b73a16e192bfc78557ff Mon Sep 17 00:00:00 2001
From: Galileo Galilei
Date: Sun, 4 Oct 2020 11:54:41 +0200
Subject: [PATCH] Fix #63 - Deprecate 'MlflowDataSet' and rename it
 'MlflowArtifactDataSet'

---
 CHANGELOG.md                                   |  4 ++
 README.md                                      |  2 +-
 docs/source/01_introduction/02_motivation.md   |  2 +-
 .../02_hello_world_example/02_first_steps.md   |  2 +-
 docs/source/03_tutorial/05_version_datasets.md | 26 +++++------
 docs/source/05_python_objects/01_DataSets.md   | 12 ++---
 kedro_mlflow/io/__init__.py                    |  2 +-
 kedro_mlflow/io/mlflow_dataset.py              | 45 +++++++++++++++----
 tests/io/test_mlflow_dataset.py                | 15 +++++--
 9 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 68e10106..95737bbb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,10 @@
 - Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in a future release and it enables having multiple `PipelineML` in the same project. [#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)
 - `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
 
+### Deprecated
+
+- Deprecate `MlflowDataSet`, which is renamed to `MlflowArtifactDataSet` for consistency with the other datasets. It will raise a `DeprecationWarning` in this release, and will be removed entirely in the next minor release. Please update your `catalog.yml` entries accordingly as soon as possible. ([#63](https://github.com/Galileo-Galilei/kedro-mlflow/issues/63))
+
 ## [0.2.1] - 2020-08-06
 
 ### Added
diff --git a/README.md b/README.md
index 0309b874..38231546 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ The documentation contains:
 Some frequently asked questions on more advanced features:
 
 - You want to log additional metrics to the run? -> [Try ``MlflowMetricsDataSet``](https://kedro-mlflow.readthedocs.io/en/latest/source/03_tutorial/07_version_metrics.html)!
-- You want to log nice dataviz of your pipeline that you register with ``MatplotlibWriter``? -> [Try ``MlflowDataSet`` to log any local files (.png, .pkl, .csv...) *automagically*](https://kedro-mlflow.readthedocs.io/en/latest/source/02_hello_world_example/02_first_steps.html#artifacts)!
+- You want to log nice dataviz of your pipeline that you register with ``MatplotlibWriter``? -> [Try ``MlflowArtifactDataSet`` to log any local files (.png, .pkl, .csv...) *automagically*](https://kedro-mlflow.readthedocs.io/en/latest/source/02_hello_world_example/02_first_steps.html#artifacts)!
 - You want to easily create an API to share your awesome model with anyone? -> [See if ``pipeline_ml`` can fit your needs](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16)
 - You want to do something that is not straightforward with the current implementation? Open an issue, and let's see what happens!
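The changelog entry above asks users to update their ``catalog.yml``. As a concrete illustration, here is a minimal before/after sketch of the migration; the entry name and filepath are hypothetical placeholders, not taken from this patch:

```yaml
# Before: deprecated spelling, raises a DeprecationWarning in 0.3.0
# my_dataset_to_version:
#     type: kedro_mlflow.io.MlflowDataSet
#     data_set:
#         type: pandas.CSVDataSet
#         filepath: data/01_raw/my_file.csv  # hypothetical local path

# After: new spelling, the only one kept after the next minor release
my_dataset_to_version:
    type: kedro_mlflow.io.MlflowArtifactDataSet
    data_set:
        type: pandas.CSVDataSet
        filepath: data/01_raw/my_file.csv  # hypothetical local path
```

Only the top-level ``type`` key changes; the nested ``data_set`` definition is untouched.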
diff --git a/docs/source/01_introduction/02_motivation.md b/docs/source/01_introduction/02_motivation.md
index 60f292c3..2d222b78 100644
--- a/docs/source/01_introduction/02_motivation.md
+++ b/docs/source/01_introduction/02_motivation.md
@@ -33,7 +33,7 @@ Above implementations have the advantage of being very straightforward and *mlfl
 |:----------------------------|:-----------------------|:-----------------------|
 |Set up configuration         |``mlflow.yml``          |``MlflowPipelineHook``  |
 |Logging parameters           |``run.py``              |``MlflowNodeHook``      |
-|Logging artifacts            |``catalog.yml``         |``MlflowDataSet``       |
+|Logging artifacts            |``catalog.yml``         |``MlflowArtifactDataSet``|
 |Logging models               |NA                      |NA                      |
 |Logging metrics              |``catalog.yml``         |``MlflowMetricsDataSet``|
 |Logging Pipeline as model    |``pipeline.py``         |``KedroPipelineModel`` and ``pipeline_ml``|
diff --git a/docs/source/02_hello_world_example/02_first_steps.md b/docs/source/02_hello_world_example/02_first_steps.md
index 8ab6f969..bed26045 100644
--- a/docs/source/02_hello_world_example/02_first_steps.md
+++ b/docs/source/02_hello_world_example/02_first_steps.md
@@ -112,7 +112,7 @@ First, open the ``catalog.yml`` file, which should look like this:
 
 ![](../imgs/default_catalog.png)
 
-And persist the model as a pickle with the ``MlflowDataSet`` class:
+And persist the model as a pickle with the ``MlflowArtifactDataSet`` class:
 
 ![](../imgs/updated_catalog.png)
 
diff --git a/docs/source/03_tutorial/05_version_datasets.md b/docs/source/03_tutorial/05_version_datasets.md
index 04f50dc4..958a131e 100644
--- a/docs/source/03_tutorial/05_version_datasets.md
+++ b/docs/source/03_tutorial/05_version_datasets.md
@@ -12,7 +12,7 @@ Mlflow process for such binding is to :
 
 ## How to version data in a kedro project?
 
-kedro-mlflow introduces a new ``AbstractDataSet`` called ``MlflowDataSet``. It is a wrapper for any ``AbstractDataSet`` which decorates the underlying dataset ``save`` method and logs the file automatically in mlflow as an artifact each time the ``save`` method is called.
+kedro-mlflow introduces a new ``AbstractDataSet`` called ``MlflowArtifactDataSet``. It is a wrapper for any ``AbstractDataSet`` which decorates the underlying dataset's ``save`` method and logs the file automatically in mlflow as an artifact each time the ``save`` method is called.
 
 Since it is an ``AbstractDataSet``, it can be used with the YAML API. Assume that you have the following entry in the ``catalog.yml``:
 
@@ -26,7 +26,7 @@
 You can change it to:
 
 ```yaml
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/LOCAL/destination/file.csv # must be a local file, wherever you want to log the data in the end
 ```
 
 and this dataset will be automatically versioned in each pipeline execution.
 
 ## Frequently asked questions
-### Can I pass extra parameters to the ``MlflowDataSet`` for finer control?
-The ``MlflowDataSet`` takes a ``data_set`` argument which is a python dictionary passed to the ``__init__`` method of the dataset declared in ``type``. It means that you can pass any arguments accepted by the underlying dataset in this dictionary. If you want to pass ``load_args`` and ``save_args`` in the previous example, add them in the ``data_set`` argument:
+### Can I pass extra parameters to the ``MlflowArtifactDataSet`` for finer control?
+The ``MlflowArtifactDataSet`` takes a ``data_set`` argument which is a Python dictionary passed to the ``__init__`` method of the dataset declared in ``type``. It means that you can pass any arguments accepted by the underlying dataset in this dictionary. If you want to pass ``load_args`` and ``save_args`` in the previous example, add them in the ``data_set`` argument:
 
 ```yaml
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/local/destination/file.csv
         load_args:
             sep: ;
         save_args:
             sep: ;
         # ... any other valid arguments for data_set
 ```
 
-### Can I use the ``MlflowDataSet`` in interactive mode?
-Like all Kedro ``AbstractDataSet``, ``MlflowDataSet`` is callable in the python API:
+### Can I use the ``MlflowArtifactDataSet`` in interactive mode?
+Like all Kedro ``AbstractDataSet``, ``MlflowArtifactDataSet`` is callable in the Python API:
 
 ```python
-from kedro_mlflow.io import MlflowDataSet
+from kedro_mlflow.io import MlflowArtifactDataSet
 from kedro.extras.datasets.pandas import CSVDataSet
+import pandas as pd
-csv_dataset = MlflowDataSet(data_set={"type": CSVDataSet, # either a string "pandas.CSVDataSet" or the class
+csv_dataset = MlflowArtifactDataSet(data_set={"type": CSVDataSet, # either a string "pandas.CSVDataSet" or the class
                                       "filepath": r"/path/to/a/local/destination/file.csv"})
 csv_dataset.save(data=pd.DataFrame({"a":[1,2], "b": [3,4]}))
 ```
@@ -70,10 +70,10 @@ You can also refer to [this issue](https://github.com/Galileo-Galilei/kedro-mlfl
 In ``kedro-mlflow==0.2.0`` you must configure these elements by yourself. Further releases will introduce helpers for configuration.
 
 ### Can I log an artifact in a specific run?
-The ``MlflowDataSet`` has an extra argument ``run_id`` which specifies the run in which the artifact will be logged. **Be cautious, because this argument will take precedence over the current run** when you call ``kedro run``, causing the artifact to be logged in another run that all the other data of the run.
+The ``MlflowArtifactDataSet`` has an extra argument ``run_id`` which specifies the run in which the artifact will be logged. **Be cautious, because this argument will take precedence over the current run** when you call ``kedro run``, causing the artifact to be logged in a different run than all the other data of the pipeline execution.
 
 ```yaml
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/local/destination/file.csv
     run_id: 13245678910111213 # a valid mlflow run ID
 ```
 
 ### Can I create a remote folder/subfolders architecture to organize the artifacts?
-The ``MlflowDataSet`` has an extra argument ``run_id`` which specifies a remote subfolder where the artifact will be logged. It must be a relative path.
+The ``MlflowArtifactDataSet`` has an extra argument ``artifact_path`` which specifies a remote subfolder where the artifact will be logged. It must be a relative path.
 ```yaml
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/local/destination/file.csv
 ```
diff --git a/docs/source/05_python_objects/01_DataSets.md b/docs/source/05_python_objects/01_DataSets.md
index 6bac2c68..e33515b6 100644
--- a/docs/source/05_python_objects/01_DataSets.md
+++ b/docs/source/05_python_objects/01_DataSets.md
@@ -1,9 +1,9 @@
 # New ``DataSet``:
-## ``MlflowDataSet``
-``MlflowDataSet`` is a wrapper for any ``AbstractDataSet`` which logs the dataset automatically in mlflow as an artifact when its ``save`` method is called. It can be used both with the YAML API:
+## ``MlflowArtifactDataSet``
+``MlflowArtifactDataSet`` is a wrapper for any ``AbstractDataSet`` which logs the dataset automatically in mlflow as an artifact when its ``save`` method is called. It can be used both with the YAML API:
 ```
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/local/destination/file.csv
@@ -11,7 +11,7 @@ my_dataset_to_version:
 or with additional parameters:
 ```
 my_dataset_to_version:
-    type: kedro_mlflow.io.MlflowDataSet
+    type: kedro_mlflow.io.MlflowArtifactDataSet
     data_set:
         type: pandas.CSVDataSet # or any valid kedro DataSet
         filepath: /path/to/a/local/destination/file.csv
         load_args:
             sep: ;
         save_args:
             sep: ;
         # ... any other valid arguments for data_set
 ```
@@ -25,9 +25,9 @@ my_dataset_to_version:
 or with the Python API:
 ```
-from kedro_mlflow.io import MlflowDataSet
+from kedro_mlflow.io import MlflowArtifactDataSet
 from kedro.extras.datasets.pandas import CSVDataSet
+import pandas as pd
-csv_dataset = MlflowDataSet(data_set={"type": CSVDataSet,
+csv_dataset = MlflowArtifactDataSet(data_set={"type": CSVDataSet,
                                       "filepath": r"/path/to/a/local/destination/file.csv"})
 csv_dataset.save(data=pd.DataFrame({"a":[1,2], "b": [3,4]}))
 ```
diff --git a/kedro_mlflow/io/__init__.py b/kedro_mlflow/io/__init__.py
index 39b30a5d..1c37403b 100644
--- a/kedro_mlflow/io/__init__.py
+++ b/kedro_mlflow/io/__init__.py
@@ -1,2 +1,2 @@
-from .mlflow_dataset import MlflowDataSet
+from .mlflow_dataset import MlflowArtifactDataSet, MlflowDataSet
 from .mlflow_metrics_dataset import MlflowMetricsDataSet
diff --git a/kedro_mlflow/io/mlflow_dataset.py b/kedro_mlflow/io/mlflow_dataset.py
index 1030c0a1..5ee611b2 100644
--- a/kedro_mlflow/io/mlflow_dataset.py
+++ b/kedro_mlflow/io/mlflow_dataset.py
@@ -1,4 +1,5 @@
 from typing import Any, Dict, Union
+from warnings import warn
 
 import mlflow
 from kedro.io import AbstractVersionedDataSet
@@ -6,7 +7,7 @@
 from mlflow.tracking import MlflowClient
 
 
-class MlflowDataSet(AbstractVersionedDataSet):
+class MlflowArtifactDataSet(AbstractVersionedDataSet):
     """This class is a wrapper for any kedro AbstractDataSet.
     It decorates their ``save`` method to log the dataset in mlflow when ``save`` is called.
 
@@ -26,7 +27,7 @@ def __new__(
         # all dataset (i.e. it should replace AbstractVersionedDataSet)
         # instead and since we can't modify the core package,
         # we create a subclass which inherits dynamically from the data_set class
-        class MlflowDataSetChildren(data_set):
+        class MlflowArtifactDataSetChildren(data_set):
             def __init__(self, run_id, artifact_path):
                 super().__init__(**data_set_args)
                 self.run_id = run_id
                 self.artifact_path = artifact_path
@@ -56,31 +57,59 @@ def _save(self, data: Any):
 
         # rename the class
         parent_name = data_set.__name__
-        MlflowDataSetChildren.__name__ = f"Mlflow{parent_name}"
-        MlflowDataSetChildren.__qualname__ = f"{parent_name}.Mlflow{parent_name}"
+        MlflowArtifactDataSetChildren.__name__ = f"Mlflow{parent_name}"
+        MlflowArtifactDataSetChildren.__qualname__ = (
+            f"{parent_name}.Mlflow{parent_name}"
+        )
 
-        mlflow_dataset_instance = MlflowDataSetChildren(
+        mlflow_dataset_instance = MlflowArtifactDataSetChildren(
             run_id=run_id, artifact_path=artifact_path
         )
         return mlflow_dataset_instance
 
     def _load(self) -> Any:  # pragma: no cover
         """
-        MlowDataSet is a factory for DataSet
-        and consequently does not implements abtracts methods
+        MlflowArtifactDataSet is a factory for DataSet
+        and consequently does not implement abstract methods
         """
         pass
 
     def _save(self, data: Any) -> None:  # pragma: no cover
         """
-        MlowDataSet is a factory for DataSet
-        and consequently does not implements abtracts methods
+        MlflowArtifactDataSet is a factory for DataSet
+        and consequently does not implement abstract methods
         """
         pass
 
     def _describe(self) -> Dict[str, Any]:  # pragma: no cover
         """
-        MlowDataSet is a factory for DataSet
-        and consequently does not implements abtracts methods
+        MlflowArtifactDataSet is a factory for DataSet
+        and consequently does not implement abstract methods
         """
         pass
+
+
+class MlflowDataSet(MlflowArtifactDataSet):
+    def __new__(
+        cls,
+        data_set: Union[str, Dict],
+        run_id: str = None,
+        artifact_path: str = None,
+        credentials: Dict[str, Any] = None,
+    ):
+        deprecation_msg = (
+            "'MlflowDataSet' is now deprecated and "
+            "has been renamed 'MlflowArtifactDataSet' "
+            "in 'kedro-mlflow>=0.3.0'. "
+            "\nPlease change your 'catalog.yml' entries accordingly, "
+            "since 'MlflowDataSet' will be removed in the next release."
+        )
+
+        warn(deprecation_msg, DeprecationWarning, stacklevel=2)
+        # __new__ must return the created instance, otherwise the
+        # constructor would silently return None
+        return super().__new__(
+            cls,
+            data_set=data_set,
+            run_id=run_id,
+            artifact_path=artifact_path,
+            credentials=credentials,
+        )
diff --git a/tests/io/test_mlflow_dataset.py b/tests/io/test_mlflow_dataset.py
index 422c543f..e05961d9 100644
--- a/tests/io/test_mlflow_dataset.py
+++ b/tests/io/test_mlflow_dataset.py
@@ -8,7 +8,7 @@
 from mlflow.tracking import MlflowClient
 from pytest_lazyfixture import lazy_fixture
 
-from kedro_mlflow.io import MlflowDataSet
+from kedro_mlflow.io import MlflowArtifactDataSet, MlflowDataSet
 
 
 @pytest.fixture
@@ -46,7 +46,7 @@ def test_mlflow_csv_data_set_save_reload(
     mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
     filepath = (tmp_path / "data").with_suffix(extension)
 
-    mlflow_csv_dataset = MlflowDataSet(
+    mlflow_csv_dataset = MlflowArtifactDataSet(
         artifact_path=artifact_path,
         data_set=dict(type=CSVDataSet, filepath=filepath.as_posix()),
     )
@@ -91,7 +91,7 @@ def test_mlflow_data_set_save_with_run_id(
     nb_runs += 1
 
     # then same scenario but the run_id where data is saved is specified
-    mlflow_csv_dataset = MlflowDataSet(
+    mlflow_csv_dataset = MlflowArtifactDataSet(
         data_set=dict(type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix()),
         run_id=run_id,
     )
@@ -129,7 +129,7 @@ def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri,
     run_id = mlflow.active_run().info.run_id
     active_run_id = mlflow.active_run().info.run_id
 
-    mlflow_csv_dataset = MlflowDataSet(
+    mlflow_csv_dataset = MlflowArtifactDataSet(
         data_set=dict(
             type=CSVDataSet, filepath=(tmp_path / "df1.csv").as_posix(), versioned=True
         ),
@@ -159,3 +159,10 @@ def test_is_versioned_dataset_logged_correctly_in_mlflow(tmp_path, tracking_uri,
     assert df1.equals(mlflow_csv_dataset.load())  # and must be loadable
 
     mlflow.end_run()
+
+
+def test_raise_deprecation_warning_mlflow_dataset():
+    with pytest.deprecated_call():
+        MlflowDataSet(
+            data_set=dict(type="pandas.CSVDataSet", filepath="fake/path/to/file.csv"),
+        )
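The new test relies on `pytest.deprecated_call()`; the shim's behaviour can also be observed with the standard library alone. A minimal sketch, assuming a kedro environment with this patch applied (the catalog values are the same placeholders as in the test above):

```python
import warnings

from kedro_mlflow.io import MlflowDataSet  # deprecated alias kept for one more release

with warnings.catch_warnings(record=True) as caught:
    # make sure the DeprecationWarning is not swallowed by the default filters
    warnings.simplefilter("always")
    MlflowDataSet(
        data_set=dict(type="pandas.CSVDataSet", filepath="fake/path/to/file.csv"),
    )

# the shim emitted a DeprecationWarning pointing users to MlflowArtifactDataSet
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```

This mirrors `test_raise_deprecation_warning_mlflow_dataset` above, and it only works once the `return` is present in the shim's `__new__`; without it, calling `MlflowDataSet(...)` would warn but return `None` instead of a usable dataset.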