Skip to content

Commit

Permalink
FIX #69 - Handle parameters which exceed mlflow limit of 250 characters
Browse files Browse the repository at this point in the history
  • Loading branch information
Galileo-Galilei committed Dec 18, 2020
1 parent 7014215 commit a48c333
Show file tree
Hide file tree
Showing 8 changed files with 306 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

### Added

- A new key `long_parameters_strategy` is added in the `mlflow.yml` (under in the hook/node section). You can specify different strategies (`fail`, `truncate` or `tag`) to handle parameters over 250 characters which cause crashes for some mlflow backend. ([#69](https://github.com/Galileo-Galilei/kedro-mlflow/issues/69))

## [0.4.1] - 2020-12-03

### Added
Expand Down
7 changes: 7 additions & 0 deletions docs/source/04_experimentation_tracking/01_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,17 @@ hooks:
flatten_dict_params: False # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.
recursive: True # Should the dictionary flattening be applied recursively (i.e for nested dictionaries)? Not use if `flatten_dict_params` is False.
sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperaparam1: {p1:1, p2:2}}will be logged as hyperaparam1.p1 and hyperaparam1.p2 oin mlflow.
long_parameters_strategy: fail # One of ["fail", "tag", "truncate" ] If a parameter is above mlflow limit (currently 250), what should kedro-mlflow do? -> fail, set as a tag instead of a parameter, or truncate it to its 250 first letters?
```
If you set `flatten_dict_params` to `True`, each key of the dictionary will be logged as a mlflow parameters, instead of a single parameter for the whole dictionary. Note that it is recommended to facilitate run comparison.

The `long_parameters_strategy` key enable to define different way to handle parameters over the mlflow limit (currently 250 characters):

- `fail`: no special management of characters above the limit. They will be send to mlflow and as a result, in some backend they will be stored normally ([e.g. for FileStore backend](https://github.com/mlflow/mlflow/issues/2814#issuecomment-628284425)) and for some others logging will fail.
- `truncate`: All parameters above the limit will be automatically truncated to a 250-character length to make sure logging will pass for all mlflow backend.
- `tag`: Any parameter above the limit will be registered as a tag instead of a parameter as it seems to be the [recommended mlflow way to deal with long parameters](https://github.com/mlflow/mlflow/issues/1976).

### Configure the user interface

You can configure mlflow user interface default params inside the `mlflow.yml`:
Expand Down
24 changes: 22 additions & 2 deletions kedro_mlflow/framework/context/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ class KedroMlflowConfig:

UI_OPTS = {"port": None, "host": None}

NODE_HOOK_OPTS = {"flatten_dict_params": False, "recursive": True, "sep": "."}
NODE_HOOK_OPTS = {
"flatten_dict_params": False,
"recursive": True,
"sep": ".",
"long_parameters_strategy": "fail",
}

AVAILABLE_LONG_PARAMETERS_STRATEGY = ["fail", "truncate", "tag"]

def __init__(
self,
Expand Down Expand Up @@ -134,8 +141,21 @@ def from_dict(self, configuration: Dict[str, str]):
opts=node_hook_opts, default=self.NODE_HOOK_OPTS
)

# this parameter validation should likely be elsewhere
# when refactoring KedroMlflowConfig, we shoudl use an object
# to validate data with a @property
if (
self.node_hook_opts["long_parameters_strategy"]
not in self.AVAILABLE_LONG_PARAMETERS_STRATEGY
):
strategy_list = ", ".join(self.AVAILABLE_LONG_PARAMETERS_STRATEGY)
raise KedroMlflowConfigError(
f"'long_parameters_strategy' must be one of [{strategy_list}], "
f"got '{self.node_hook_opts['long_parameters_strategy']}'"
)

# instantiate mlflow objects to interact with the database
# the client must not be create dbefore carefully checking the uri,
# the client must not be created before carefully checking the uri,
# otherwise mlflow creates a mlruns folder to the current location
self.mlflow_client = mlflow.tracking.MlflowClient(
tracking_uri=self.mlflow_tracking_uri
Expand Down
45 changes: 42 additions & 3 deletions kedro_mlflow/framework/hooks/node_hook.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from typing import Any, Dict
import logging
from typing import Any, Dict, Union

import mlflow
from kedro.framework.context import load_context
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
from kedro.pipeline.node import Node
from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH

from kedro_mlflow.framework.context import get_mlflow_config

Expand All @@ -16,6 +18,11 @@ def __init__(self):
self.flatten = False
self.recursive = True
self.sep = "."
self.long_parameters_strategy = "fail"

@property
def _logger(self) -> logging.Logger:
return logging.getLogger(__name__)

@hook_impl
def before_pipeline_run(
Expand Down Expand Up @@ -50,9 +57,13 @@ def before_pipeline_run(
extra_params=run_params["extra_params"],
)
config = get_mlflow_config(self.context)

self.flatten = config.node_hook_opts["flatten_dict_params"]
self.recursive = config.node_hook_opts["recursive"]
self.sep = config.node_hook_opts["sep"]
self.long_parameters_strategy = config.node_hook_opts[
"long_parameters_strategy"
]

@hook_impl
def before_node_run(
Expand All @@ -76,6 +87,7 @@ def before_node_run(
# only parameters will be logged. Artifacts must be declared manually in the catalog
params_inputs = {}
for k, v in inputs.items():
# detect parameters automatically based on kedro reserved names
if k.startswith("params:"):
params_inputs[k[7:]] = v
elif k == "parameters":
Expand All @@ -87,9 +99,36 @@ def before_node_run(
d=params_inputs, recursive=self.recursive, sep=self.sep
)

mlflow.log_params(params_inputs)

# logging parameters based on defined strategy
for k, v in params_inputs.items():
self.log_param(k, v)

def log_param(self, name: str, value: Union[Dict, int, bool, str]) -> None:
str_value = str(value)
str_value_length = len(str_value)
if str_value_length <= MAX_PARAM_VAL_LENGTH:
return mlflow.log_param(name, value)
else:
if self.long_parameters_strategy == "fail":
raise ValueError(
f"Parameter '{name}' length is {str_value_length}, "
f"while mlflow forces it to be lower than '{MAX_PARAM_VAL_LENGTH}'. "
"If you want to bypass it, try to change 'long_parameters_strategy' to"
" 'tag' or 'truncate' in the 'mlflow.yml'configuration file."
)
elif self.long_parameters_strategy == "tag":
self._logger.warning(
f"Parameter '{name}' (value length {str_value_length}) is set as a tag."
)
mlflow.set_tag(name, value)
elif self.long_parameters_strategy == "truncate":
self._logger.warning(
f"Parameter '{name}' (value length {str_value_length}) is truncated to its {MAX_PARAM_VAL_LENGTH} first characters."
)
mlflow.log_param(name, str_value[0:MAX_PARAM_VAL_LENGTH])


# this hooks instaitation is necessary for auto-registration
mlflow_node_hook = MlflowNodeHook()


Expand Down
1 change: 1 addition & 0 deletions kedro_mlflow/template/project/mlflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ hooks:
flatten_dict_params: False # if True, parameter which are dictionary will be splitted in multiple parameters when logged in mlflow, one for each key.
recursive: True # Should the dictionary flattening be applied recursively (i.e for nested dictionaries)? Not use if `flatten_dict_params` is False.
sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperaparam1: {p1:1, p2:2}}will be logged as hyperaparam1.p1 and hyperaparam1.p2 oin mlflow.
long_parameters_strategy: fail # One of ["fail", "tag", "truncate" ] If a parameter is above mlflow limit (currently 250), what should kedro-mlflow do? -> fail, set as a tag instead of a parameter, or truncate it to its 250 first letters?


# UI-RELATED PARAMETERS -----------------
Expand Down
12 changes: 12 additions & 0 deletions tests/framework/context/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ def test_kedro_mlflow_config_init(tmp_path):
)


def test_kedro_mlflow_config_bad_long_parameters_strategy(tmp_path):
# create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project
(tmp_path / ".kedro.yml").write_text(yaml.dump(dict(context_path="fake/path")))
with pytest.raises(
KedroMlflowConfigError, match="'long_parameters_strategy' must be one of "
):
KedroMlflowConfig(
project_path=tmp_path,
node_hook_opts=dict(long_parameters_strategy="does_not_exist"),
)


def test_kedro_mlflow_config_new_experiment_does_not_exists(
mocker, tmp_path, config_dir
):
Expand Down
32 changes: 28 additions & 4 deletions tests/framework/context/test_mlflow_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
experiment=dict(name="fake_package", create=True),
run=dict(id="123456789", name="my_run", nested=True),
ui=dict(port="5151", host="localhost"),
hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
hooks=dict(
node=dict(
flatten_dict_params=True,
recursive=False,
sep="-",
long_parameters_strategy="truncate",
)
),
),
)
expected = {
Expand All @@ -37,7 +44,12 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
"run": {"id": "123456789", "name": "my_run", "nested": True},
"ui": {"port": "5151", "host": "localhost"},
"hooks": {
"node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
"node": {
"flatten_dict_params": True,
"recursive": False,
"sep": "-",
"long_parameters_strategy": "truncate",
}
},
}
context = load_context(tmp_path)
Expand All @@ -54,7 +66,14 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
experiment=dict(name="fake_package", create=True),
run=dict(id="123456789", name="my_run", nested=True),
ui=dict(port="5151", host="localhost"),
hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
hooks=dict(
node=dict(
flatten_dict_params=True,
recursive=False,
sep="-",
long_parameters_strategy="truncate",
)
),
),
)

Expand All @@ -70,7 +89,12 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
"run": {"id": "123456789", "name": "my_run", "nested": True},
"ui": {"port": "5151", "host": "localhost"},
"hooks": {
"node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
"node": {
"flatten_dict_params": True,
"recursive": False,
"sep": "-",
"long_parameters_strategy": "truncate",
}
},
}

Expand Down
Loading

0 comments on commit a48c333

Please sign in to comment.