From 9ca45854aeb451157be78f3d3ccaefba40f379e0 Mon Sep 17 00:00:00 2001
From: Galileo Galilei
Date: Sat, 24 Oct 2020 00:46:19 +0200
Subject: [PATCH] PARTIAL #73 - Make MLflowMetricsDataSet coverage 100%

---
 CHANGELOG.md                              | 103 +++++++++++-----------
 kedro_mlflow/io/mlflow_metrics_dataset.py |  50 ++++++-----
 tests/conftest.py                         |   9 ++
 tests/io/test_mlflow_metrics_dataset.py   |  64 ++++++++++++++
 4 files changed, 152 insertions(+), 74 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2266df40..3fc7ae86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,7 +22,6 @@
 - The test coverage now excludes `tests` and `setup.py` (#99).
 - The `KedroPipelineModel` now unpacks the result of the `inference` pipeline and no longer returns a dictionary with the name in the `DataCatalog` but only the predicted value (#93).
-
 ### Removed

 - `kedro mlflow init` command is no longer declaring hooks in `run.py`. You must now [register your hooks manually](docs/source/03_tutorial/02_setup.md#declaring-kedro-mlflow-hooks) in the ``run.py`` (kedro > 0.16.0), ``.kedro.yml`` (kedro >= 0.16.5) or ``pyproject.toml`` (kedro >= 0.16.5)

@@ -31,33 +30,33 @@

 ### Added

-- Add dataset `MlflowMetricsDataSet` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
-- Add CI workflow `create-release` to ensure release consistency and up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Add dataset `MlflowMetricsDataSet` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
+- Add CI workflow `create-release` to ensure release consistency and an up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))

 ### Fixed

-- Versioned datasets artifacts logging are handled correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
-- MlflowDataSet handles correctly datasets which are inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
-- Change the test in `_generate_kedro_command` to accept both empty `Iterable`s(default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
-- Force to close all mlflow runs when a pipeline fails. It prevents further execution of the pipeline to be logged within the same mlflow run_id as the failing pipeline. ([#10](https://github.com/Galileo-Galilei/kedro-mlflow/issues/10))
-- Fix various documentation typos ([#34](https://github.com/Galileo-Galilei/kedro-mlflow/pull/34), [#35](https://github.com/Galileo-Galilei/kedro-mlflow/pull/35), [#36](https://github.com/Galileo-Galilei/kedro-mlflow/pull/36) and more)
-- Update README (add badges for readibility, add a "main contributors" section to give credit, fix typo in install command, link to milestones for more up-to-date priorities) ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Fix bug in CI deployment workflow and rename it to `publish` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Fix a bug in `MlflowDataSet` which sometimes failed to log on remote storage (S3, Azure Blob storage) with underlying `log_artifacts` when the kedro's `AbstractDataset._filepath` was a `pathlib.PurePosixPath` object instead of a string ([#74](https://github.com/Galileo-Galilei/kedro-mlflow/issues/74)).
-- Add a CI for release candidate creation and use actions to enforce semantic versioning and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
+- Versioned dataset artifacts are now logged correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
+- MlflowDataSet now correctly handles datasets inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
+- Change the test in `_generate_kedro_command` to accept both empty `Iterable`s (default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
+- Force all mlflow runs to close when a pipeline fails. This prevents further executions of the pipeline from being logged within the same mlflow run_id as the failing pipeline. ([#10](https://github.com/Galileo-Galilei/kedro-mlflow/issues/10))
+- Fix various documentation typos ([#34](https://github.com/Galileo-Galilei/kedro-mlflow/pull/34), [#35](https://github.com/Galileo-Galilei/kedro-mlflow/pull/35), [#36](https://github.com/Galileo-Galilei/kedro-mlflow/pull/36) and more)
+- Update README (add badges for readability, add a "main contributors" section to give credit, fix a typo in the install command, link to milestones for more up-to-date priorities) ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Fix a bug in the CI deployment workflow and rename it to `publish` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Fix a bug in `MlflowDataSet` which sometimes failed to log on remote storage (S3, Azure Blob storage) with the underlying `log_artifacts` when kedro's `AbstractDataset._filepath` was a `pathlib.PurePosixPath` object instead of a string ([#74](https://github.com/Galileo-Galilei/kedro-mlflow/issues/74)).
+- Add a CI for release candidate creation and use actions to enforce semantic versioning and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

 ### Changed

-- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making `conda_env` path absolute for airflow suppport.
-- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
-- Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- The `input_name` attributes of `PipelineML` is now a python property and makes a check at setting time to prevent setting an invalid value. The check ensures that `input_name` is a valid input of the `inference` pipeline.
+- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in a future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making the `conda_env` path absolute for airflow support.
+- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
+- Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- The `input_name` attribute of `PipelineML` is now a python property which is checked at setting time to prevent setting an invalid value. The check ensures that `input_name` is a valid input of the `inference` pipeline.

 ### Deprecated

-- Deprecate `MlflowDataSet` which is renamed as `MlflowArtifactDataSet` for consistency with the other datasets. It will raise a `DeprecationWarning` in this realease, and will be totally supressed in next minor release. Please update your `catalog.yml` entries accordingly as soon as possible. ([#63](https://github.com/Galileo-Galilei/kedro-mlflow/issues/63))
-- Deprecate `pipeline_ml` which is renamed as `pipeline_ml_factory` to avoid confusion between a `PipelineML` instance and the helper function to create `PipelineMl` instances from Kedro `Pipeline`s.
+- Deprecate `MlflowDataSet` which is renamed as `MlflowArtifactDataSet` for consistency with the other datasets. It will raise a `DeprecationWarning` in this release, and will be totally suppressed in the next minor release. Please update your `catalog.yml` entries accordingly as soon as possible. ([#63](https://github.com/Galileo-Galilei/kedro-mlflow/issues/63))
+- Deprecate `pipeline_ml` which is renamed as `pipeline_ml_factory` to avoid confusion between a `PipelineML` instance and the helper function to create `PipelineML` instances from Kedro `Pipeline`s.
 ## [0.2.1] - 2020-08-06

@@ -65,58 +64,58 @@

 ### Added

 Many documentation improvements:

-- Add a Code of conduct
-- Add a Contributing guide
-- Refactor README.md to separate clearly from documentation
-- Fix broken links
-- Fix bad markdown rendering
-- Split old README.md information in dedicated sections
+- Add a Code of conduct
+- Add a Contributing guide
+- Refactor README.md to separate it clearly from the documentation
+- Fix broken links
+- Fix bad markdown rendering
+- Split old README.md information into dedicated sections

 ### Changed

-- Enable `pipeline_ml` to accept artifacts (encoder, binarizer...) to be "intermediary" outputs of the pipeline and not only "terminal" outputs (i.e. node outputs which are not re-used as another node input). This closes a bug discovered in a more general discussion in [#16](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16).
-- Only non-empty CLI arguments and options are logged as tags in MLflow ([#32](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16))
+- Enable `pipeline_ml` to accept artifacts (encoder, binarizer...) to be "intermediary" outputs of the pipeline and not only "terminal" outputs (i.e. node outputs which are not re-used as another node input). This closes a bug discovered in a more general discussion in [#16](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16).
+- Only non-empty CLI arguments and options are logged as tags in MLflow ([#32](https://github.com/Galileo-Galilei/kedro-mlflow/issues/32))

 ## [0.2.0] - 2020-07-18

 ### Added

-- Bump the codebase test coverage to 100% to improve stability
-- Improve rendering of template with a trailing newline to make them `black`-valid
-- Add a `PipelineML.extract_pipeline_artifacts` methods to make artifacts retrieving easier for a given pipeline
-- Use an official kedro release (`>0.16.0, <0.17.0`) instead of the development branch
+- Bump the codebase test coverage to 100% to improve stability
+- Improve rendering of templates with a trailing newline to make them `black`-valid
+- Add a `PipelineML.extract_pipeline_artifacts` method to make artifact retrieval easier for a given pipeline
+- Use an official kedro release (`>0.16.0, <0.17.0`) instead of the development branch

 ### Changed

-- `hooks`, `context` and `cli` folders are moved to `framework` to fit kedro new folder architecture
-- Rename `get_mlflow_conf` in `get_mlflow_config` for consistency (with `ConfigLoader`, `KedroMlflowConfig`...)
-- Rename keys of `KedroMlflowConfig.to_dict()` to remove the "\_opts" suffix for consistency with the `KedroMlflowConfig.from_dict` method
+- `hooks`, `context` and `cli` folders are moved to `framework` to fit kedro's new folder architecture
+- Rename `get_mlflow_conf` to `get_mlflow_config` for consistency (with `ConfigLoader`, `KedroMlflowConfig`...)
+- Rename keys of `KedroMlflowConfig.to_dict()` to remove the "\_opts" suffix for consistency with the `KedroMlflowConfig.from_dict` method

 ### Fixed

-- Add `debug` folder to gitignore for to avoid involuntary data leakage
-- Remove the inadequate warning _"You have not initialized your project yet"_ when calling `kedro mlflow init`
-- Remove useless check to see if the commands are called inside a Kedro project since the commands are dynamically displayed based on whether the call is made inside a kedro project or not
-- Fix typos in error messages
-- Fix hardcoded path to the `run.py` template
-- Make not implemented function raise a `NotImplementError` instead of failing silently
-- Fix wrong parsing when the `mlflow_tracking_uri` key of the `mlflow.yml` configuration file was an absolute local path
-- Remove unused `KedroMlflowContextClass`
-- Force the `MlflowPipelineHook.before_pipeline_run` method to set the `mlflow_tracking_uri` to the one from the configuration to enforce configuration file to be prevalent on environment variables or current active tracking uri in interactive mode
-- Fix wrong environment parsing case when passing a conda environment as a python dictionary in `MlflowPipelineHook`
-- Fix wrong artifact logging of `MlflowDataSet` when a run was already active and the save method was called in an interactive python session.
-- Force the user to declare an `input_name` for a `PipelineMl` object to fix difficult inference of what is the pipeline input
-- Update `run.py` template to fit kedro new one.
+- Add `debug` folder to gitignore to avoid involuntary data leakage
+- Remove the inadequate warning _"You have not initialized your project yet"_ when calling `kedro mlflow init`
+- Remove the useless check to see if the commands are called inside a Kedro project since the commands are dynamically displayed based on whether the call is made inside a kedro project or not
+- Fix typos in error messages
+- Fix hardcoded path to the `run.py` template
+- Make not implemented functions raise a `NotImplementedError` instead of failing silently
+- Fix wrong parsing when the `mlflow_tracking_uri` key of the `mlflow.yml` configuration file was an absolute local path
+- Remove unused `KedroMlflowContextClass`
+- Force the `MlflowPipelineHook.before_pipeline_run` method to set the `mlflow_tracking_uri` to the one from the configuration, so that the configuration file takes precedence over environment variables or the currently active tracking uri in interactive mode
+- Fix wrong environment parsing when passing a conda environment as a python dictionary in `MlflowPipelineHook`
+- Fix wrong artifact logging of `MlflowDataSet` when a run was already active and the save method was called in an interactive python session.
+- Force the user to declare an `input_name` for a `PipelineML` object to avoid unreliable inference of the pipeline input
+- Update `run.py` template to fit the new kedro one.
+- Force `_generate_kedro_commands` to separate an option and its arguments with a "=" sign for readability

 ## [0.1.0] - 2020-04-18

 ### Added

-- Add cli `kedro mlflow init` to udpdate the template and `kedro mlflow ui` to open `mlflow` user interface with your project configuration
-- Add hooks `MlflowNodeHook` and `MlflowPipelineHook` for parameters autologging and model autologging
-- Add `MlflowDataSet` for artifacts autologging
-- Add `PipelineMl` class and its `pipeline_ml` factory for pipeline packaging and service
+- Add CLI commands `kedro mlflow init` to update the template and `kedro mlflow ui` to open the `mlflow` user interface with your project configuration
+- Add hooks `MlflowNodeHook` and `MlflowPipelineHook` for parameters autologging and model autologging
+- Add `MlflowDataSet` for artifacts autologging
+- Add `PipelineML` class and its `pipeline_ml` factory for pipeline packaging and serving

 [unreleased]: https://github.com/Galileo-Galilei/kedro-mlflow/compare/0.2.1...HEAD

diff --git a/kedro_mlflow/io/mlflow_metrics_dataset.py b/kedro_mlflow/io/mlflow_metrics_dataset.py
index 3b6e961a..2160900b 100644
--- a/kedro_mlflow/io/mlflow_metrics_dataset.py
+++ b/kedro_mlflow/io/mlflow_metrics_dataset.py
@@ -24,6 +24,28 @@ def __init__(
             run_id (str): ID of MLflow run.
         """
         self._prefix = prefix
+        self.run_id = run_id
+
+    @property
+    def run_id(self):
+        """Get the run id.
+
+        If no run id was set explicitly, fall back to the currently active mlflow run.
+
+        Raise a `DataSetError` if no run id can be found.
+
+        Returns:
+            str: The run id.
+        """
+        if self._run_id is not None:
+            return self._run_id
+        run = mlflow.active_run()
+        if run:
+            return run.info.run_id
+        raise DataSetError("Cannot find run id.")
+
+    @run_id.setter
+    def run_id(self, run_id):
         self._run_id = run_id

     def _load(self) -> MetricsDict:
@@ -33,7 +55,7 @@
             Dict[str, Union[int, float]]: Dictionary with MLflow metrics dataset.
         """
         client = MlflowClient()
-        run_id = self._get_run_id()
+        run_id = self.run_id
         all_metrics = client._tracking_client.store.get_all_metrics(run_uuid=run_id)
         dataset_metrics = filter(self._is_dataset_metric, all_metrics)
         dataset = reduce(
@@ -56,7 +78,7 @@
         """
         client = MlflowClient()
         try:
-            run_id = self._get_run_id()
+            run_id = self.run_id
         except DataSetError:
             # If run_id can't be found log_metric would create new run.
             run_id = None
@@ -67,7 +89,7 @@
             else mlflow.log_metric
         )
         metrics = (
-            self._build_args_list_from_metric_item(k, v) for k, v, in data.items()
+            self._build_args_list_from_metric_item(k, v) for k, v in data.items()
         )
         for k, v, i in chain.from_iterable(metrics):
             log_metric(k, v, step=i)
@@ -79,8 +101,9 @@
             bool: Is MLflow metrics dataset exists?
         """
         client = MlflowClient()
-        run_id = self._get_run_id()
-        all_metrics = client._tracking_client.store.get_all_metrics(run_uuid=run_id)
+        all_metrics = client._tracking_client.store.get_all_metrics(
+            run_uuid=self.run_id
+        )
         return any(self._is_dataset_metric(x) for x in all_metrics)

     def _describe(self) -> Dict[str, Any]:
@@ -94,23 +117,6 @@
             "prefix": self._prefix,
         }

-    def _get_run_id(self) -> str:
-        """Get run id.
-
-        If active run is not found, tries to find last experiment.
-
-        Raise `DataSetError` exception if run id can't be found.
-
-        Returns:
-            str: String contains run_id.
-        """
-        if self._run_id is not None:
-            return self._run_id
-        run = mlflow.active_run()
-        if run:
-            return run.info.run_id
-        raise DataSetError("Cannot find run id.")
-
     def _is_dataset_metric(self, metric: mlflow.entities.Metric) -> bool:
         """Check if given metric belongs to dataset.

diff --git a/tests/conftest.py b/tests/conftest.py
index d465e875..41fa845b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,10 +2,19 @@
 from pathlib import Path
 from typing import Dict

+import mlflow
 import pytest
 import yaml


+@pytest.fixture(autouse=True)
+def cleanup_mlflow_after_runs():
+    # run the test, then close any mlflow run it may have left open
+    yield
+    while mlflow.active_run():
+        mlflow.end_run()
+
+
 def _write_yaml(filepath: Path, config: Dict):
     filepath.parent.mkdir(parents=True, exist_ok=True)
     yaml_str = yaml.dump(config)

diff --git a/tests/io/test_mlflow_metrics_dataset.py b/tests/io/test_mlflow_metrics_dataset.py
index 16fd557e..cdda54d9 100644
--- a/tests/io/test_mlflow_metrics_dataset.py
+++ b/tests/io/test_mlflow_metrics_dataset.py
@@ -2,6 +2,7 @@

 import mlflow
 import pytest
+from kedro.io import DataSetError
 from mlflow.tracking import MlflowClient
 from pytest_lazyfixture import lazy_fixture

@@ -105,3 +106,66 @@ def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, prefix):
     for k in catalog_metrics.keys():
         data_key = k.split(".")[-1] if prefix is not None else k
         assert data[data_key] == catalog_metrics[k]
+
+
+def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet can be saved in the catalog when no run_id
+    is given, and if the metrics are logged in mlflow.
+    """
+    prefix = "test_metric"
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix=prefix)
+
+    # a mlflow run_id is automatically created
+    mlflow_metrics_dataset.save(metrics3)
+    run_id = mlflow.active_run().info.run_id
+
+    assert_are_metrics_logged(metrics3, mlflow_client, run_id, prefix)
+
+
+def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet is well identified as
+    existing if it has already been saved.
+    """
+    prefix = "test_metric"
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix=prefix)
+
+    # a mlflow run_id is automatically created
+    mlflow_metrics_dataset.save(metrics3)
+    assert mlflow_metrics_dataset.exists()
+
+
+def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet is well identified as
+    not existing if it has never been saved.
+    """
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow.start_run()  # start a run so that mlflow_metrics_dataset knows where to search
+    run_id = mlflow.active_run().info.run_id
+    mlflow.end_run()
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="test_metric", run_id=run_id)
+    # nothing was ever saved under this run_id
+    assert not mlflow_metrics_dataset.exists()
+
+
+def test_mlflow_metrics_dataset_fails_with_invalid_metric(
+    tmp_path, tracking_uri, metrics3
+):
+    """Check that saving a metric with an invalid format
+    raises a DataSetError.
+    """
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")
+
+    with pytest.raises(
+        DataSetError, match="Unexpected metric value. Should be of type"
+    ):
+        mlflow_metrics_dataset.save(
+            {"metric1": 1}
+        )  # {"metric1": 1} is not valid: each metric must be given as {key: {value, step}}
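
For reviewers, a minimal sketch of the behaviour this patch introduces: the new `run_id` property returns an explicitly set run id, falls back to the currently active mlflow run otherwise, and raises a `DataSetError` when neither exists. The tracking uri, the `fake_metric` prefix and the exact metric payload are illustrative assumptions; the `{key: {value, step}}` shape follows the invalid-metric test above, and the import path follows the diff (the public API may re-export the class elsewhere).

```python
import mlflow
from kedro.io import DataSetError

from kedro_mlflow.io.mlflow_metrics_dataset import MlflowMetricsDataSet

mlflow.set_tracking_uri("file:///tmp/mlruns")  # assumed local tracking store

# 1. An explicitly set run_id is returned unchanged by the property.
with mlflow.start_run() as run:
    dataset = MlflowMetricsDataSet(prefix="fake_metric", run_id=run.info.run_id)
    assert dataset.run_id == run.info.run_id

# 2. Without an explicit run_id, the property resolves to the active run,
#    so save() logs the metrics under that run.
with mlflow.start_run() as run:
    dataset = MlflowMetricsDataSet(prefix="fake_metric")
    dataset.save({"accuracy": {"value": 0.95, "step": 0}})  # assumed valid payload
    assert dataset.run_id == run.info.run_id

# 3. Outside any run and without an explicit run_id, accessing run_id raises.
dataset = MlflowMetricsDataSet(prefix="fake_metric")
try:
    _ = dataset.run_id
except DataSetError as err:
    print(err)  # "Cannot find run id."
```

The autouse `cleanup_mlflow_after_runs` fixture added to `conftest.py` enforces the matching invariant in the test suite: any run a test opens, implicitly via `save()` or explicitly via `start_run()`, is closed before the next test starts.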