FIX #69 - Handle parameters which exceed mlflow limit of 250 characters

Galileo-Galilei · Dec 18, 2020 · 7195abc · 7195abc
1 parent c030ce9
commit 7195abc
Show file tree

Hide file tree

Showing 8 changed files with 306 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Added
+
+- A new key `long_parameters_strategy` is added in the `mlflow.yml` (under in the hook/node section). You can specify different strategies (`fail`, `truncate` or `tag`) to handle parameters over 250 characters which cause crashes for some mlflow backend. ([#69](https://github.com/Galileo-Galilei/kedro-mlflow/issues/69))
+
 ## [0.4.1] - 2020-12-03
 
 ### Added

diff --git a/docs/source/04_experimentation_tracking/01_configuration.md b/docs/source/04_experimentation_tracking/01_configuration.md
@@ -89,10 +89,17 @@ hooks:
     flatten_dict_params: False  # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.
     recursive: True  # Should the dictionary flattening be applied recursively (i.e for nested dictionaries)? Not use if `flatten_dict_params` is False.
     sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperaparam1: {p1:1, p2:2}}will be logged as hyperaparam1.p1 and hyperaparam1.p2 oin mlflow.
+    long_parameters_strategy: fail  # One of ["fail", "tag", "truncate" ] If a parameter is above mlflow limit (currently 250), what should kedro-mlflow do? -> fail, set as a tag instead of a parameter, or truncate it to its 250 first letters?
 ```
 
 If you set `flatten_dict_params` to `True`, each key of the dictionary will be logged as a mlflow parameters, instead of a single parameter for the whole dictionary. Note that it is recommended to facilitate run comparison.
 
+The `long_parameters_strategy` key enable to define different way to handle parameters over the mlflow limit (currently 250 characters):
+
+- `fail`: no special management of characters above the limit. They will be send to mlflow and as a result, in some backend they will be stored normally ([e.g. for FileStore backend](https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425)) and for some others logging will fail.
+- `truncate`: All parameters above the limit will be automatically truncated to a 250-character length to make sure logging will pass for all mlflow backend.
+- `tag`: Any parameter above the limit will be registered as a tag instead of a parameter as it seems to be the [recommended mlflow way to deal with long parameters](https://github.com/mlflow/mlflow/issues/1976).
+
 ### Configure the user interface
 
 You can configure mlflow user interface default params inside the `mlflow.yml`:

diff --git a/kedro_mlflow/framework/context/config.py b/kedro_mlflow/framework/context/config.py
@@ -18,7 +18,14 @@ class KedroMlflowConfig:
 
     UI_OPTS = {"port": None, "host": None}
 
-    NODE_HOOK_OPTS = {"flatten_dict_params": False, "recursive": True, "sep": "."}
+    NODE_HOOK_OPTS = {
+        "flatten_dict_params": False,
+        "recursive": True,
+        "sep": ".",
+        "long_parameters_strategy": "fail",
+    }
+
+    AVAILABLE_LONG_PARAMETERS_STRATEGY = ["fail", "truncate", "tag"]
 
     def __init__(
         self,
@@ -134,8 +141,21 @@ def from_dict(self, configuration: Dict[str, str]):
             opts=node_hook_opts, default=self.NODE_HOOK_OPTS
         )
 
+        # this parameter validation should likely be elsewhere
+        # when refactoring KedroMlflowConfig, we shoudl use an object
+        # to validate data with a @property
+        if (
+            self.node_hook_opts["long_parameters_strategy"]
+            not in self.AVAILABLE_LONG_PARAMETERS_STRATEGY
+        ):
+            strategy_list = ", ".join(self.AVAILABLE_LONG_PARAMETERS_STRATEGY)
+            raise KedroMlflowConfigError(
+                f"'long_parameters_strategy' must be one of [{strategy_list}], "
+                f"got '{self.node_hook_opts['long_parameters_strategy']}'"
+            )
+
         # instantiate mlflow objects to interact with the database
-        # the client must not be create dbefore carefully checking the uri,
+        # the client must not be created before carefully checking the uri,
         # otherwise mlflow creates a mlruns folder to the current location
         self.mlflow_client = mlflow.tracking.MlflowClient(
             tracking_uri=self.mlflow_tracking_uri

diff --git a/kedro_mlflow/framework/hooks/node_hook.py b/kedro_mlflow/framework/hooks/node_hook.py
@@ -1,11 +1,13 @@
-from typing import Any, Dict
+import logging
+from typing import Any, Dict, Union
 
 import mlflow
 from kedro.framework.context import load_context
 from kedro.framework.hooks import hook_impl
 from kedro.io import DataCatalog
 from kedro.pipeline import Pipeline
 from kedro.pipeline.node import Node
+from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH
 
 from kedro_mlflow.framework.context import get_mlflow_config
 
@@ -16,6 +18,11 @@ def __init__(self):
         self.flatten = False
         self.recursive = True
         self.sep = "."
+        self.long_parameters_strategy = "fail"
+
+    @property
+    def _logger(self) -> logging.Logger:
+        return logging.getLogger(__name__)
 
     @hook_impl
     def before_pipeline_run(
@@ -50,9 +57,13 @@ def before_pipeline_run(
             extra_params=run_params["extra_params"],
         )
         config = get_mlflow_config(self.context)
+
         self.flatten = config.node_hook_opts["flatten_dict_params"]
         self.recursive = config.node_hook_opts["recursive"]
         self.sep = config.node_hook_opts["sep"]
+        self.long_parameters_strategy = config.node_hook_opts[
+            "long_parameters_strategy"
+        ]
 
     @hook_impl
     def before_node_run(
@@ -76,6 +87,7 @@ def before_node_run(
         # only parameters will be logged. Artifacts must be declared manually in the catalog
         params_inputs = {}
         for k, v in inputs.items():
+            # detect parameters automatically based on kedro reserved names
             if k.startswith("params:"):
                 params_inputs[k[7:]] = v
             elif k == "parameters":
@@ -87,9 +99,36 @@ def before_node_run(
                 d=params_inputs, recursive=self.recursive, sep=self.sep
             )
 
-        mlflow.log_params(params_inputs)
-
+        # logging parameters based on defined strategy
+        for k, v in params_inputs.items():
+            self.log_param(k, v)
 
+    def log_param(self, name: str, value: Union[Dict, int, bool, str]) -> None:
+        str_value = str(value)
+        str_value_length = len(str_value)
+        if str_value_length <= MAX_PARAM_VAL_LENGTH:
+            return mlflow.log_param(name, value)
+        else:
+            if self.long_parameters_strategy == "fail":
+                raise ValueError(
+                    f"Parameter '{name}' length is {str_value_length}, "
+                    f"while mlflow forces it to be lower than '{MAX_PARAM_VAL_LENGTH}'. "
+                    "If you want to bypass it, try to change 'long_parameters_strategy' to"
+                    " 'tag' or 'truncate' in the 'mlflow.yml'configuration file."
+                )
+            elif self.long_parameters_strategy == "tag":
+                self._logger.warning(
+                    f"Parameter '{name}' (value length {str_value_length}) is set as a tag."
+                )
+                mlflow.set_tag(name, value)
+            elif self.long_parameters_strategy == "truncate":
+                self._logger.warning(
+                    f"Parameter '{name}' (value length {str_value_length}) is truncated to its {MAX_PARAM_VAL_LENGTH} first characters."
+                )
+                mlflow.log_param(name, str_value[0:MAX_PARAM_VAL_LENGTH])
+
+
+# this hooks instaitation is necessary for auto-registration
 mlflow_node_hook = MlflowNodeHook()
 
 

diff --git a/kedro_mlflow/template/project/mlflow.yml b/kedro_mlflow/template/project/mlflow.yml
@@ -41,6 +41,7 @@ hooks:
     flatten_dict_params: False  # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.
     recursive: True  # Should the dictionary flattening be applied recursively (i.e for nested dictionaries)? Not use if `flatten_dict_params` is False.
     sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperaparam1: {p1:1, p2:2}}will be logged as hyperaparam1.p1 and hyperaparam1.p2 oin mlflow.
+    long_parameters_strategy: fail # One of ["fail", "tag", "truncate" ] If a parameter is above mlflow limit (currently 250), what should kedro-mlflow do? -> fail, set as a tag instead of a parameter, or truncate it to its 250 first letters?
 
 
 # UI-RELATED PARAMETERS -----------------

diff --git a/tests/framework/context/test_config.py b/tests/framework/context/test_config.py
@@ -64,6 +64,18 @@ def test_kedro_mlflow_config_init(tmp_path):
     )
 
 
+def test_kedro_mlflow_config_bad_long_parameters_strategy(tmp_path):
+    # create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project
+    (tmp_path / ".kedro.yml").write_text(yaml.dump(dict(context_path="fake/path")))
+    with pytest.raises(
+        KedroMlflowConfigError, match="'long_parameters_strategy' must be one of "
+    ):
+        KedroMlflowConfig(
+            project_path=tmp_path,
+            node_hook_opts=dict(long_parameters_strategy="does_not_exist"),
+        )
+
+
 def test_kedro_mlflow_config_new_experiment_does_not_exists(
     mocker, tmp_path, config_dir
 ):

diff --git a/tests/framework/context/test_mlflow_context.py b/tests/framework/context/test_mlflow_context.py
@@ -27,7 +27,14 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
             experiment=dict(name="fake_package", create=True),
             run=dict(id="123456789", name="my_run", nested=True),
             ui=dict(port="5151", host="localhost"),
-            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
+            hooks=dict(
+                node=dict(
+                    flatten_dict_params=True,
+                    recursive=False,
+                    sep="-",
+                    long_parameters_strategy="truncate",
+                )
+            ),
         ),
     )
     expected = {
@@ -37,7 +44,12 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
         "run": {"id": "123456789", "name": "my_run", "nested": True},
         "ui": {"port": "5151", "host": "localhost"},
         "hooks": {
-            "node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
+            "node": {
+                "flatten_dict_params": True,
+                "recursive": False,
+                "sep": "-",
+                "long_parameters_strategy": "truncate",
+            }
         },
     }
     context = load_context(tmp_path)
@@ -54,7 +66,14 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
             experiment=dict(name="fake_package", create=True),
             run=dict(id="123456789", name="my_run", nested=True),
             ui=dict(port="5151", host="localhost"),
-            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
+            hooks=dict(
+                node=dict(
+                    flatten_dict_params=True,
+                    recursive=False,
+                    sep="-",
+                    long_parameters_strategy="truncate",
+                )
+            ),
         ),
     )
 
@@ -70,7 +89,12 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
         "run": {"id": "123456789", "name": "my_run", "nested": True},
         "ui": {"port": "5151", "host": "localhost"},
         "hooks": {
-            "node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
+            "node": {
+                "flatten_dict_params": True,
+                "recursive": False,
+                "sep": "-",
+                "long_parameters_strategy": "truncate",
+            }
         },
     }