From 9ca45854aeb451157be78f3d3ccaefba40f379e0 Mon Sep 17 00:00:00 2001
From: Galileo Galilei
Date: Sat, 24 Oct 2020 00:46:19 +0200
Subject: [PATCH] PARTIAL #73 - Make MLflowMetricsDataSet coverage 100%

---
 CHANGELOG.md                              | 103 +++++++++++-----------
 kedro_mlflow/io/mlflow_metrics_dataset.py |  50 ++++++-----
 tests/conftest.py                         |   9 ++
 tests/io/test_mlflow_metrics_dataset.py   |  64 ++++++++++++++
 4 files changed, 152 insertions(+), 74 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2266df40..3fc7ae86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,7 +22,6 @@
 - The test coverage now excludes `tests` and `setup.py` (#99).
 - The `KedroPipelineModel` now unpacks the result of the `inference` pipeline and no longer returns a dictionary with the name in the `DataCatalog` but only the predicted value (#93).
-
 ### Removed

 - `kedro mlflow init` command is no longer declaring hooks in `run.py`. You must now [register your hooks manually](docs/source/03_tutorial/02_setup.md#declaring-kedro-mlflow-hooks) in the ``run.py`` (kedro > 0.16.0), ``.kedro.yml`` (kedro >= 0.16.5) or ``pyproject.toml`` (kedro >= 0.16.5)

@@ -31,33 +30,33 @@

 ### Added

-- Add dataset `MlflowMetricsDataSet` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
-- Add CI workflow `create-release` to ensure release consistency and up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Add dataset `MlflowMetricsDataSet` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
+- Add CI workflow `create-release` to ensure release consistency and an up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))

 ### Fixed

-- Versioned datasets artifacts logging are handled correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
-- MlflowDataSet handles correctly datasets which are inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
-- Change the test in `_generate_kedro_command` to accept both empty `Iterable`s(default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
-- Force to close all mlflow runs when a pipeline fails. It prevents further execution of the pipeline to be logged within the same mlflow run_id as the failing pipeline. ([#10](https://github.com/Galileo-Galilei/kedro-mlflow/issues/10))
-- Fix various documentation typos ([#34](https://github.com/Galileo-Galilei/kedro-mlflow/pull/34), [#35](https://github.com/Galileo-Galilei/kedro-mlflow/pull/35), [#36](https://github.com/Galileo-Galilei/kedro-mlflow/pull/36) and more)
-- Update README (add badges for readibility, add a "main contributors" section to give credit, fix typo in install command, link to milestones for more up-to-date priorities) ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Fix bug in CI deployment workflow and rename it to `publish` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- Fix a bug in `MlflowDataSet` which sometimes failed to log on remote storage (S3, Azure Blob storage) with underlying `log_artifacts` when the kedro's `AbstractDataset._filepath` was a `pathlib.PurePosixPath` object instead of a string ([#74](https://github.com/Galileo-Galilei/kedro-mlflow/issues/74)).
-- Add a CI for release candidate creation and use actions to enforce semantic versioning and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
+- Versioned dataset artifacts are now logged correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
+- MlflowDataSet now correctly handles datasets inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
+- Change the test in `_generate_kedro_command` to accept both empty `Iterable`s (default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
+- Force all mlflow runs to close when a pipeline fails. This prevents further executions of the pipeline from being logged within the same mlflow run_id as the failing pipeline. ([#10](https://github.com/Galileo-Galilei/kedro-mlflow/issues/10))
+- Fix various documentation typos ([#34](https://github.com/Galileo-Galilei/kedro-mlflow/pull/34), [#35](https://github.com/Galileo-Galilei/kedro-mlflow/pull/35), [#36](https://github.com/Galileo-Galilei/kedro-mlflow/pull/36) and more)
+- Update README (add badges for readability, add a "main contributors" section to give credit, fix a typo in the install command, link to milestones for more up-to-date priorities) ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Fix a bug in the CI deployment workflow and rename it to `publish` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- Fix a bug in `MlflowDataSet` which sometimes failed to log on remote storage (S3, Azure Blob storage) with the underlying `log_artifacts` when kedro's `AbstractDataset._filepath` was a `pathlib.PurePosixPath` object instead of a string ([#74](https://github.com/Galileo-Galilei/kedro-mlflow/issues/74)).
+- Add a CI for release candidate creation and use actions to enforce semantic versioning and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

 ### Changed

-- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making `conda_env` path absolute for airflow suppport.
-- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
-- Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
-- The `input_name` attributes of `PipelineML` is now a python property and makes a check at setting time to prevent setting an invalid value. The check ensures that `input_name` is a valid input of the `inference` pipeline.
+- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in a future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making the `conda_env` path absolute for airflow support.
+- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
+- Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
+- The `input_name` attribute of `PipelineML` is now a python property which is checked at setting time to prevent setting an invalid value. The check ensures that `input_name` is a valid input of the `inference` pipeline.

 ### Deprecated

-- Deprecate `MlflowDataSet` which is renamed as `MlflowArtifactDataSet` for consistency with the other datasets. It will raise a `DeprecationWarning` in this realease, and will be totally supressed in next minor release. Please update your `catalog.yml` entries accordingly as soon as possible. ([#63](https://github.com/Galileo-Galilei/kedro-mlflow/issues/63))
-- Deprecate `pipeline_ml` which is renamed as `pipeline_ml_factory` to avoid confusion between a `PipelineML` instance and the helper function to create `PipelineMl` instances from Kedro `Pipeline`s.
+- Deprecate `MlflowDataSet` which is renamed as `MlflowArtifactDataSet` for consistency with the other datasets. It will raise a `DeprecationWarning` in this release, and will be totally suppressed in the next minor release. Please update your `catalog.yml` entries accordingly as soon as possible. ([#63](https://github.com/Galileo-Galilei/kedro-mlflow/issues/63))
+- Deprecate `pipeline_ml` which is renamed as `pipeline_ml_factory` to avoid confusion between a `PipelineML` instance and the helper function to create `PipelineML` instances from Kedro `Pipeline`s.
 ## [0.2.1] - 2020-08-06

@@ -65,58 +64,58 @@

 ### Added

 Many documentation improvements:

-- Add a Code of conduct
-- Add a Contributing guide
-- Refactor README.md to separate clearly from documentation
-- Fix broken links
-- Fix bad markdown rendering
-- Split old README.md information in dedicated sections
+- Add a Code of conduct
+- Add a Contributing guide
+- Refactor README.md to separate it clearly from the documentation
+- Fix broken links
+- Fix bad markdown rendering
+- Split old README.md information into dedicated sections

 ### Changed

-- Enable `pipeline_ml` to accept artifacts (encoder, binarizer...) to be "intermediary" outputs of the pipeline and not only "terminal" outputs (i.e. node outputs which are not re-used as another node input). This closes a bug discovered in a more general discussion in [#16](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16).
-- Only non-empty CLI arguments and options are logged as tags in MLflow ([#32](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16))
+- Enable `pipeline_ml` to accept artifacts (encoder, binarizer...) to be "intermediary" outputs of the pipeline and not only "terminal" outputs (i.e. node outputs which are not re-used as another node input). This closes a bug discovered in a more general discussion in [#16](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16).
+- Only non-empty CLI arguments and options are logged as tags in MLflow ([#32](https://github.com/Galileo-Galilei/kedro-mlflow/issues/32))

 ## [0.2.0] - 2020-07-18

 ### Added

-- Bump the codebase test coverage to 100% to improve stability
-- Improve rendering of template with a trailing newline to make them `black`-valid
-- Add a `PipelineML.extract_pipeline_artifacts` methods to make artifacts retrieving easier for a given pipeline
-- Use an official kedro release (`>0.16.0, <0.17.0`) instead of the development branch
+- Bump the codebase test coverage to 100% to improve stability
+- Improve rendering of templates with a trailing newline to make them `black`-valid
+- Add a `PipelineML.extract_pipeline_artifacts` method to make artifact retrieval easier for a given pipeline
+- Use an official kedro release (`>0.16.0, <0.17.0`) instead of the development branch

 ### Changed

-- `hooks`, `context` and `cli` folders are moved to `framework` to fit kedro new folder architecture
-- Rename `get_mlflow_conf` in `get_mlflow_config` for consistency (with `ConfigLoader`, `KedroMlflowConfig`...)
-- Rename keys of `KedroMlflowConfig.to_dict()` to remove the "\_opts" suffix for consistency with the `KedroMlflowConfig.from_dict` method
+- `hooks`, `context` and `cli` folders are moved to `framework` to fit kedro's new folder architecture
+- Rename `get_mlflow_conf` to `get_mlflow_config` for consistency (with `ConfigLoader`, `KedroMlflowConfig`...)
+- Rename keys of `KedroMlflowConfig.to_dict()` to remove the "\_opts" suffix for consistency with the `KedroMlflowConfig.from_dict` method

 ### Fixed

-- Add `debug` folder to gitignore for to avoid involuntary data leakage
-- Remove the inadequate warning _"You have not initialized your project yet"_ when calling `kedro mlflow init`
-- Remove useless check to see if the commands are called inside a Kedro project since the commands are dynamically displayed based on whether the call is made inside a kedro project or not
-- Fix typos in error messages
-- Fix hardcoded path to the `run.py` template
-- Make not implemented function raise a `NotImplementError` instead of failing silently
-- Fix wrong parsing when the `mlflow_tracking_uri` key of the `mlflow.yml` configuration file was an absolute local path
-- Remove unused `KedroMlflowContextClass`
-- Force the `MlflowPipelineHook.before_pipeline_run` method to set the `mlflow_tracking_uri` to the one from the configuration to enforce configuration file to be prevalent on environment variables or current active tracking uri in interactive mode
-- Fix wrong environment parsing case when passing a conda environment as a python dictionary in `MlflowPipelineHook`
-- Fix wrong artifact logging of `MlflowDataSet` when a run was already active and the save method was called in an interactive python session.
-- Force the user to declare an `input_name` for a `PipelineMl` object to fix difficult inference of what is the pipeline input
-- Update `run.py` template to fit kedro new one.
+- Add `debug` folder to gitignore to avoid involuntary data leakage
+- Remove the inadequate warning _"You have not initialized your project yet"_ when calling `kedro mlflow init`
+- Remove the useless check to see if the commands are called inside a Kedro project since the commands are dynamically displayed based on whether the call is made inside a kedro project or not
+- Fix typos in error messages
+- Fix hardcoded path to the `run.py` template
+- Make not implemented functions raise a `NotImplementedError` instead of failing silently
+- Fix wrong parsing when the `mlflow_tracking_uri` key of the `mlflow.yml` configuration file was an absolute local path
+- Remove unused `KedroMlflowContextClass`
+- Force the `MlflowPipelineHook.before_pipeline_run` method to set the `mlflow_tracking_uri` to the one from the configuration, so that the configuration file takes precedence over environment variables or the currently active tracking uri in interactive mode
+- Fix wrong environment parsing when passing a conda environment as a python dictionary in `MlflowPipelineHook`
+- Fix wrong artifact logging of `MlflowDataSet` when a run was already active and the save method was called in an interactive python session.
+- Force the user to declare an `input_name` for a `PipelineML` object to avoid unreliable inference of the pipeline input
+- Update `run.py` template to fit the new kedro one.
+- Force `_generate_kedro_commands` to separate an option and its arguments with a "=" sign for readability

 ## [0.1.0] - 2020-04-18

 ### Added

-- Add cli `kedro mlflow init` to udpdate the template and `kedro mlflow ui` to open `mlflow` user interface with your project configuration
-- Add hooks `MlflowNodeHook` and `MlflowPipelineHook` for parameters autologging and model autologging
-- Add `MlflowDataSet` for artifacts autologging
-- Add `PipelineMl` class and its `pipeline_ml` factory for pipeline packaging and service
+- Add CLI commands `kedro mlflow init` to update the template and `kedro mlflow ui` to open the `mlflow` user interface with your project configuration
+- Add hooks `MlflowNodeHook` and `MlflowPipelineHook` for parameters autologging and model autologging
+- Add `MlflowDataSet` for artifacts autologging
+- Add `PipelineML` class and its `pipeline_ml` factory for pipeline packaging and serving

 [unreleased]: https://github.com/Galileo-Galilei/kedro-mlflow/compare/0.2.1...HEAD

diff --git a/kedro_mlflow/io/mlflow_metrics_dataset.py b/kedro_mlflow/io/mlflow_metrics_dataset.py
index 3b6e961a..2160900b 100644
--- a/kedro_mlflow/io/mlflow_metrics_dataset.py
+++ b/kedro_mlflow/io/mlflow_metrics_dataset.py
@@ -24,6 +24,28 @@ def __init__(
             run_id (str): ID of MLflow run.
         """
         self._prefix = prefix
+        self.run_id = run_id
+
+    @property
+    def run_id(self):
+        """Get the run id.
+
+        If no run id was set explicitly, fall back to the currently active mlflow run.
+
+        Raise a `DataSetError` if no run id can be found.
+
+        Returns:
+            str: The run id.
+        """
+        if self._run_id is not None:
+            return self._run_id
+        run = mlflow.active_run()
+        if run:
+            return run.info.run_id
+        raise DataSetError("Cannot find run id.")
+
+    @run_id.setter
+    def run_id(self, run_id):
         self._run_id = run_id

     def _load(self) -> MetricsDict:
@@ -33,7 +55,7 @@
             Dict[str, Union[int, float]]: Dictionary with MLflow metrics dataset.
         """
         client = MlflowClient()
-        run_id = self._get_run_id()
+        run_id = self.run_id
         all_metrics = client._tracking_client.store.get_all_metrics(run_uuid=run_id)
         dataset_metrics = filter(self._is_dataset_metric, all_metrics)
         dataset = reduce(
@@ -56,7 +78,7 @@
         """
         client = MlflowClient()
         try:
-            run_id = self._get_run_id()
+            run_id = self.run_id
         except DataSetError:
             # If run_id can't be found log_metric would create new run.
             run_id = None
@@ -67,7 +89,7 @@
             else mlflow.log_metric
         )
         metrics = (
-            self._build_args_list_from_metric_item(k, v) for k, v, in data.items()
+            self._build_args_list_from_metric_item(k, v) for k, v in data.items()
         )
         for k, v, i in chain.from_iterable(metrics):
             log_metric(k, v, step=i)
@@ -79,8 +101,9 @@
             bool: Is MLflow metrics dataset exists?
         """
         client = MlflowClient()
-        run_id = self._get_run_id()
-        all_metrics = client._tracking_client.store.get_all_metrics(run_uuid=run_id)
+        all_metrics = client._tracking_client.store.get_all_metrics(
+            run_uuid=self.run_id
+        )
         return any(self._is_dataset_metric(x) for x in all_metrics)

     def _describe(self) -> Dict[str, Any]:
@@ -94,23 +117,6 @@
             "prefix": self._prefix,
         }

-    def _get_run_id(self) -> str:
-        """Get run id.
-
-        If active run is not found, tries to find last experiment.
-
-        Raise `DataSetError` exception if run id can't be found.
-
-        Returns:
-            str: String contains run_id.
-        """
-        if self._run_id is not None:
-            return self._run_id
-        run = mlflow.active_run()
-        if run:
-            return run.info.run_id
-        raise DataSetError("Cannot find run id.")
-
     def _is_dataset_metric(self, metric: mlflow.entities.Metric) -> bool:
         """Check if given metric belongs to dataset.

diff --git a/tests/conftest.py b/tests/conftest.py
index d465e875..41fa845b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,10 +2,19 @@
 from pathlib import Path
 from typing import Dict

+import mlflow
 import pytest
 import yaml


+@pytest.fixture(autouse=True)
+def cleanup_mlflow_after_runs():
+    # run the test, then close any mlflow run it may have left open
+    yield
+    while mlflow.active_run():
+        mlflow.end_run()
+
+
 def _write_yaml(filepath: Path, config: Dict):
     filepath.parent.mkdir(parents=True, exist_ok=True)
     yaml_str = yaml.dump(config)

diff --git a/tests/io/test_mlflow_metrics_dataset.py b/tests/io/test_mlflow_metrics_dataset.py
index 16fd557e..cdda54d9 100644
--- a/tests/io/test_mlflow_metrics_dataset.py
+++ b/tests/io/test_mlflow_metrics_dataset.py
@@ -2,6 +2,7 @@

 import mlflow
 import pytest
+from kedro.io import DataSetError
 from mlflow.tracking import MlflowClient
 from pytest_lazyfixture import lazy_fixture

@@ -105,3 +106,66 @@ def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, prefix):
     for k in catalog_metrics.keys():
         data_key = k.split(".")[-1] if prefix is not None else k
         assert data[data_key] == catalog_metrics[k]
+
+
+def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet can be saved in the catalog when no run_id
+    is given, and if the metrics are logged in mlflow.
+    """
+    prefix = "test_metric"
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix=prefix)
+
+    # a mlflow run_id is automatically created
+    mlflow_metrics_dataset.save(metrics3)
+    run_id = mlflow.active_run().info.run_id
+
+    assert_are_metrics_logged(metrics3, mlflow_client, run_id, prefix)
+
+
+def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet is well identified as
+    existing if it has already been saved.
+    """
+    prefix = "test_metric"
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix=prefix)
+
+    # a mlflow run_id is automatically created
+    mlflow_metrics_dataset.save(metrics3)
+    assert mlflow_metrics_dataset.exists()
+
+
+def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3):
+    """Check if MlflowMetricsDataSet is well identified as
+    not existing if it has never been saved.
+    """
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow.start_run()  # start a run so that mlflow_metrics_dataset knows where to search
+    run_id = mlflow.active_run().info.run_id
+    mlflow.end_run()
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="test_metric", run_id=run_id)
+    # nothing was ever saved under this run_id
+    assert not mlflow_metrics_dataset.exists()
+
+
+def test_mlflow_metrics_dataset_fails_with_invalid_metric(
+    tmp_path, tracking_uri, metrics3
+):
+    """Check that saving a metric with an invalid format
+    raises a DataSetError.
+    """
+
+    mlflow.set_tracking_uri(tracking_uri.as_uri())
+    mlflow_metrics_dataset = MlflowMetricsDataSet(prefix="test_metric")
+
+    with pytest.raises(
+        DataSetError, match="Unexpected metric value. Should be of type"
+    ):
+        mlflow_metrics_dataset.save(
+            {"metric1": 1}
+        )  # {"metric1": 1} is not valid: each metric must be given as {key: {value, step}}
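
For reviewers, a minimal sketch of the behaviour this patch introduces: the new `run_id` property returns an explicitly set run id, falls back to the currently active mlflow run otherwise, and raises a `DataSetError` when neither exists. The tracking uri, the `fake_metric` prefix and the exact metric payload are illustrative assumptions; the `{key: {value, step}}` shape follows the invalid-metric test above, and the import path follows the diff (the public API may re-export the class elsewhere).

```python
import mlflow
from kedro.io import DataSetError

from kedro_mlflow.io.mlflow_metrics_dataset import MlflowMetricsDataSet

mlflow.set_tracking_uri("file:///tmp/mlruns")  # assumed local tracking store

# 1. An explicitly set run_id is returned unchanged by the property.
with mlflow.start_run() as run:
    dataset = MlflowMetricsDataSet(prefix="fake_metric", run_id=run.info.run_id)
    assert dataset.run_id == run.info.run_id

# 2. Without an explicit run_id, the property resolves to the active run,
#    so save() logs the metrics under that run.
with mlflow.start_run() as run:
    dataset = MlflowMetricsDataSet(prefix="fake_metric")
    dataset.save({"accuracy": {"value": 0.95, "step": 0}})  # assumed valid payload
    assert dataset.run_id == run.info.run_id

# 3. Outside any run and without an explicit run_id, accessing run_id raises.
dataset = MlflowMetricsDataSet(prefix="fake_metric")
try:
    _ = dataset.run_id
except DataSetError as err:
    print(err)  # "Cannot find run id."
```

The autouse `cleanup_mlflow_after_runs` fixture added to `conftest.py` enforces the matching invariant in the test suite: any run a test opens, implicitly via `save()` or explicitly via `start_run()`, is closed before the next test starts.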