Skip to content

Commit

Permalink
PARTIAL #92 - Add possibility to deactivate mlflow tracking for given…
Browse files Browse the repository at this point in the history
… pipelines in configuration
  • Loading branch information
Galileo-Galilei committed Apr 9, 2021
1 parent 1953eee commit b18f1e0
Show file tree
Hide file tree
Showing 22 changed files with 670 additions and 114 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

### Added

- It is now possible to deactivate tracking (for parameters and datasets) by specifying a key `disable_tracking: pipelines: [<pipeline-name>]` in the `mlflow.yml` configuration file. ([#92](https://github.com/Galileo-Galilei/kedro-mlflow/issues/92))

## [0.7.0] - 2021-03-17

### Added
Expand Down
14 changes: 14 additions & 0 deletions docs/source/04_experimentation_tracking/01_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,20 @@ credentials: my_mlflow_credentials

For safety reasons, the credentials will not be accessible within `KedroMlflowConfig` objects. They will be exported as environment variables *on the fly* when running the pipeline.

### Deactivate tracking under conditions

`kedro-mlflow` logs the parameters of every run in mlflow. You may want to avoid tracking some runs (for instance while debugging, to avoid polluting your mlflow database, or because some pipelines are not ML-related and it does not make sense to log their parameters).

You can specify the names of the pipelines for which tracking should be turned off:

```yaml
disable_tracking:
  pipelines:
    - <pipeline-name>
```

Note that this stops not only the autologging of parameters, but also the logging of any `Mlflow<Artifact/Metrics/ModelLogger>Dataset` you may have in these deactivated pipelines.

### Configure mlflow experiment

Mlflow enables the user to create "experiments" to organize their work. The different experiments will be visible on the left panel of the mlflow user interface. You can create an experiment through the `mlflow.yml` file with the `experiment` key:
Expand Down
2 changes: 1 addition & 1 deletion kedro_mlflow/framework/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def render_jinja_template(

template_loader = FileSystemLoader(searchpath=src.parent.as_posix())
# the keep_trailing_new_line option is mandatory to
# make sure that black formatting wil be preserved
# make sure that black formatting will be preserved
template_env = Environment(loader=template_loader, keep_trailing_newline=True)
template = template_env.get_template(src.name)
if is_cookiecutter:
Expand Down
23 changes: 19 additions & 4 deletions kedro_mlflow/framework/context/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

class KedroMlflowConfig:

DISABLE_TRACKING_OPTS = {"pipelines": []}

EXPERIMENT_OPTS = {"name": "Default", "create": True}

RUN_OPTS = {"id": None, "name": None, "nested": True}
Expand All @@ -32,6 +34,7 @@ def __init__(
project_path: Union[str, Path],
mlflow_tracking_uri: str = "mlruns",
credentials: Optional[Dict[str, str]] = None,
disable_tracking_opts: Optional[Dict[str, str]] = None,
experiment_opts: Union[Dict[str, Any], None] = None,
run_opts: Union[Dict[str, Any], None] = None,
ui_opts: Union[Dict[str, Any], None] = None,
Expand All @@ -51,6 +54,7 @@ def __init__(
self.credentials = (
credentials or {}
) # replace None by {} but o not default to empty dict which is mutable
self.disable_tracking_opts = None
self.experiment_opts = None
self.run_opts = None
self.ui_opts = None
Expand All @@ -66,6 +70,7 @@ def __init__(
configuration = dict(
mlflow_tracking_uri=mlflow_tracking_uri,
credentials=credentials,
disable_tracking=disable_tracking_opts,
experiment=experiment_opts,
run=run_opts,
ui=ui_opts,
Expand All @@ -76,8 +81,7 @@ def __init__(
def setup(self):
"""Setup all the mlflow configuration"""

session = get_current_session()
self._export_credentials(session.load_context())
self._export_credentials()

# we set the configuration now: it takes priority
# if it has already be set in export_credentials
Expand All @@ -96,6 +100,10 @@ def from_dict(self, configuration: Dict[str, str]):
{
mlflow_tracking_uri: a valid string for mlflow tracking storage,
credentials: a valid string which exists in credentials.yml,
disable_tracking:
{
pipelines {List[str]}: the names of the pipelines which do not trigger tracking
},
experiments:
{
name {str}: the name of the experiment
Expand Down Expand Up @@ -125,13 +133,17 @@ def from_dict(self, configuration: Dict[str, str]):

mlflow_tracking_uri = configuration.get("mlflow_tracking_uri")
credentials = configuration.get("credentials")
disable_tracking_opts = configuration.get("disable_tracking")
experiment_opts = configuration.get("experiment")
run_opts = configuration.get("run")
ui_opts = configuration.get("ui")
node_hook_opts = configuration.get("hooks", {}).get("node")

self.mlflow_tracking_uri = self._validate_uri(uri=mlflow_tracking_uri)
self.credentials = credentials # do not replace by value here for safety
self.disable_tracking_opts = _validate_opts(
opts=disable_tracking_opts, default=self.DISABLE_TRACKING_OPTS
)
self.experiment_opts = _validate_opts(
opts=experiment_opts, default=self.EXPERIMENT_OPTS
)
Expand Down Expand Up @@ -183,6 +195,7 @@ def to_dict(self):
info = {
"mlflow_tracking_uri": self.mlflow_tracking_uri,
"credentials": self.credentials,
"disable_tracking": self.disable_tracking_opts,
"experiments": self.experiment_opts,
"run": self.run_opts,
"ui": self.ui_opts,
Expand Down Expand Up @@ -240,7 +253,7 @@ def _validate_uri(self, uri: Union[str, None]) -> str:
if parsed.scheme == "":
# if it is a local relative path, make it absolute
# .resolve() does not work well on windows
# .absolute is undocumented and have knwon bugs
# .absolute is undocumented and have known bugs
# Path.cwd() / uri is the way recommended by core developers.
# See : https://discuss.python.org/t/pathlib-absolute-vs-resolve/2573/6
valid_uri = (self.project_path / uri).as_uri()
Expand All @@ -250,7 +263,9 @@ def _validate_uri(self, uri: Union[str, None]) -> str:

return valid_uri

def _export_credentials(self, context):
def _export_credentials(self):
session = get_current_session()
context = session.load_context()
conf_creds = context._get_config_credentials()
mlflow_creds = conf_creds.get(self.credentials, {})
for key, value in mlflow_creds.items():
Expand Down
53 changes: 29 additions & 24 deletions kedro_mlflow/framework/hooks/node_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH

from kedro_mlflow.framework.context import get_mlflow_config
from kedro_mlflow.framework.hooks.utils import _assert_mlflow_enabled


class MlflowNodeHook:
Expand All @@ -17,6 +18,7 @@ def __init__(self):
self.recursive = True
self.sep = "."
self.long_parameters_strategy = "fail"
self._is_mlflow_enabled = True

@property
def _logger(self) -> logging.Logger:
Expand Down Expand Up @@ -48,15 +50,17 @@ def before_pipeline_run(
pipeline: The ``Pipeline`` that will be run.
catalog: The ``DataCatalog`` to be used during the run.
"""
self._is_mlflow_enabled = _assert_mlflow_enabled(run_params["pipeline_name"])

config = get_mlflow_config()
if self._is_mlflow_enabled:
config = get_mlflow_config()

self.flatten = config.node_hook_opts["flatten_dict_params"]
self.recursive = config.node_hook_opts["recursive"]
self.sep = config.node_hook_opts["sep"]
self.long_parameters_strategy = config.node_hook_opts[
"long_parameters_strategy"
]
self.flatten = config.node_hook_opts["flatten_dict_params"]
self.recursive = config.node_hook_opts["recursive"]
self.sep = config.node_hook_opts["sep"]
self.long_parameters_strategy = config.node_hook_opts[
"long_parameters_strategy"
]

@hook_impl
def before_node_run(
Expand All @@ -78,23 +82,24 @@ def before_node_run(
"""

# only parameters will be logged. Artifacts must be declared manually in the catalog
params_inputs = {}
for k, v in inputs.items():
# detect parameters automatically based on kedro reserved names
if k.startswith("params:"):
params_inputs[k[7:]] = v
elif k == "parameters":
params_inputs[k] = v

# dictionary parameters may be flattened for readibility
if self.flatten:
params_inputs = flatten_dict(
d=params_inputs, recursive=self.recursive, sep=self.sep
)

# logging parameters based on defined strategy
for k, v in params_inputs.items():
self.log_param(k, v)
if self._is_mlflow_enabled:
params_inputs = {}
for k, v in inputs.items():
# detect parameters automatically based on kedro reserved names
if k.startswith("params:"):
params_inputs[k[7:]] = v
elif k == "parameters":
params_inputs[k] = v

# dictionary parameters may be flattened for readibility
if self.flatten:
params_inputs = flatten_dict(
d=params_inputs, recursive=self.recursive, sep=self.sep
)

# logging parameters based on defined strategy
for k, v in params_inputs.items():
self.log_param(k, v)

def log_param(self, name: str, value: Union[Dict, int, bool, str]) -> None:
str_value = str(value)
Expand Down
Loading

0 comments on commit b18f1e0

Please sign in to comment.