PARTIAL #92 - Add possibility to deactivate mlflow tracking for given…

… pipelines in configuration
Galileo-Galilei · Apr 9, 2021 · b18f1e0 · b18f1e0
1 parent 1953eee
commit b18f1e0
Show file tree

Hide file tree

Showing 22 changed files with 670 additions and 114 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Added
+
+-   It is now possible to deactivate tracking (for parameters and datasets) by specifying a key `disable_tracking: pipelines: [<pipeline-name>]` in the `mlflow.yml` configuration file. ([#92](https://github.com/Galileo-Galilei/kedro-mlflow/issues/92))
+
 ## [0.7.0] - 2021-03-17
 
 ### Added

diff --git a/docs/source/04_experimentation_tracking/01_configuration.md b/docs/source/04_experimentation_tracking/01_configuration.md
@@ -49,6 +49,20 @@ credentials: my_mlflow_credentials
 
 For safety reasons, the credentials will not be accessible within `KedroMlflowConfig` objects. They will be exported as environment variables *on the fly* when running the pipeline.
 
+### Deactivate tracking under conditions
+
+`kedro-mlflow` logs the parameters of every run in mlflow. You may want to avoid tracking some runs (for instance while debugging, to avoid polluting your mlflow database, or because some pipelines are not ML-related and it does not make sense to log their parameters).
+
+You can specify the names of the pipelines for which tracking should be turned off:
+
+```yaml
+disable_tracking:
+  pipelines:
+    - <pipeline-name>
+```
+
+Note that this stops not only the autologging of parameters but also the logging of any `Mlflow<Artifact/Metrics/ModelLogger>Dataset` you may have in these deactivated pipelines.
+
 ### Configure mlflow experiment
 
 Mlflow enables the user to create "experiments" to organize their work. The different experiments will be visible on the left panel of the mlflow user interface. You can create an experiment through the `mlflow.yml` file with the `experiment` key:

diff --git a/kedro_mlflow/framework/cli/cli_utils.py b/kedro_mlflow/framework/cli/cli_utils.py
@@ -20,7 +20,7 @@ def render_jinja_template(
 
     template_loader = FileSystemLoader(searchpath=src.parent.as_posix())
     # the keep_trailing_new_line option is mandatory to
-    # make sure that black formatting wil be preserved
+    # make sure that black formatting will be preserved
     template_env = Environment(loader=template_loader, keep_trailing_newline=True)
     template = template_env.get_template(src.name)
     if is_cookiecutter:

diff --git a/kedro_mlflow/framework/context/config.py b/kedro_mlflow/framework/context/config.py
@@ -12,6 +12,8 @@
 
 class KedroMlflowConfig:
 
+    DISABLE_TRACKING_OPTS = {"pipelines": []}
+
     EXPERIMENT_OPTS = {"name": "Default", "create": True}
 
     RUN_OPTS = {"id": None, "name": None, "nested": True}
@@ -32,6 +34,7 @@ def __init__(
         project_path: Union[str, Path],
         mlflow_tracking_uri: str = "mlruns",
         credentials: Optional[Dict[str, str]] = None,
+        disable_tracking_opts: Optional[Dict[str, str]] = None,
         experiment_opts: Union[Dict[str, Any], None] = None,
         run_opts: Union[Dict[str, Any], None] = None,
         ui_opts: Union[Dict[str, Any], None] = None,
@@ -51,6 +54,7 @@ def __init__(
         self.credentials = (
             credentials or {}
         )  # replace None by {} but do not default to empty dict which is mutable
+        self.disable_tracking_opts = None
         self.experiment_opts = None
         self.run_opts = None
         self.ui_opts = None
@@ -66,6 +70,7 @@ def __init__(
         configuration = dict(
             mlflow_tracking_uri=mlflow_tracking_uri,
             credentials=credentials,
+            disable_tracking=disable_tracking_opts,
             experiment=experiment_opts,
             run=run_opts,
             ui=ui_opts,
@@ -76,8 +81,7 @@ def __init__(
     def setup(self):
         """Setup all the mlflow configuration"""
 
-        session = get_current_session()
-        self._export_credentials(session.load_context())
+        self._export_credentials()
 
         # we set the configuration now: it takes priority
         # if it has already been set in export_credentials
@@ -96,6 +100,10 @@ def from_dict(self, configuration: Dict[str, str]):
             {
                 mlflow_tracking_uri: a valid string for mlflow tracking storage,
                 credentials: a valid string which exists in credentials.yml,
+                disable_tracking:
+                    {
+                        pipelines {List[str]}: the names of the pipelines which do not trigger tracking
+                    },
                 experiment:
                     {
                         name {str}: the name of the experiment
@@ -125,13 +133,17 @@ def from_dict(self, configuration: Dict[str, str]):
 
         mlflow_tracking_uri = configuration.get("mlflow_tracking_uri")
         credentials = configuration.get("credentials")
+        disable_tracking_opts = configuration.get("disable_tracking")
         experiment_opts = configuration.get("experiment")
         run_opts = configuration.get("run")
         ui_opts = configuration.get("ui")
         node_hook_opts = configuration.get("hooks", {}).get("node")
 
         self.mlflow_tracking_uri = self._validate_uri(uri=mlflow_tracking_uri)
         self.credentials = credentials  # do not replace by value here for safety
+        self.disable_tracking_opts = _validate_opts(
+            opts=disable_tracking_opts, default=self.DISABLE_TRACKING_OPTS
+        )
         self.experiment_opts = _validate_opts(
             opts=experiment_opts, default=self.EXPERIMENT_OPTS
         )
@@ -183,6 +195,7 @@ def to_dict(self):
         info = {
             "mlflow_tracking_uri": self.mlflow_tracking_uri,
             "credentials": self.credentials,
+            "disable_tracking": self.disable_tracking_opts,
             "experiments": self.experiment_opts,
             "run": self.run_opts,
             "ui": self.ui_opts,
@@ -240,7 +253,7 @@ def _validate_uri(self, uri: Union[str, None]) -> str:
             if parsed.scheme == "":
                 # if it is a local relative path, make it absolute
                 # .resolve() does not work well on windows
-                # .absolute is undocumented and have knwon bugs
+                # .absolute is undocumented and has known bugs
                 # Path.cwd() / uri is the way recommended by core developers.
                 # See : https://discuss.python.org/t/pathlib-absolute-vs-resolve/2573/6
                 valid_uri = (self.project_path / uri).as_uri()
@@ -250,7 +263,9 @@ def _validate_uri(self, uri: Union[str, None]) -> str:
 
         return valid_uri
 
-    def _export_credentials(self, context):
+    def _export_credentials(self):
+        session = get_current_session()
+        context = session.load_context()
         conf_creds = context._get_config_credentials()
         mlflow_creds = conf_creds.get(self.credentials, {})
         for key, value in mlflow_creds.items():

diff --git a/kedro_mlflow/framework/hooks/node_hook.py b/kedro_mlflow/framework/hooks/node_hook.py
@@ -9,6 +9,7 @@
 from mlflow.utils.validation import MAX_PARAM_VAL_LENGTH
 
 from kedro_mlflow.framework.context import get_mlflow_config
+from kedro_mlflow.framework.hooks.utils import _assert_mlflow_enabled
 
 
 class MlflowNodeHook:
@@ -17,6 +18,7 @@ def __init__(self):
         self.recursive = True
         self.sep = "."
         self.long_parameters_strategy = "fail"
+        self._is_mlflow_enabled = True
 
     @property
     def _logger(self) -> logging.Logger:
@@ -48,15 +50,17 @@ def before_pipeline_run(
             pipeline: The ``Pipeline`` that will be run.
             catalog: The ``DataCatalog`` to be used during the run.
         """
+        self._is_mlflow_enabled = _assert_mlflow_enabled(run_params["pipeline_name"])
 
-        config = get_mlflow_config()
+        if self._is_mlflow_enabled:
+            config = get_mlflow_config()
 
-        self.flatten = config.node_hook_opts["flatten_dict_params"]
-        self.recursive = config.node_hook_opts["recursive"]
-        self.sep = config.node_hook_opts["sep"]
-        self.long_parameters_strategy = config.node_hook_opts[
-            "long_parameters_strategy"
-        ]
+            self.flatten = config.node_hook_opts["flatten_dict_params"]
+            self.recursive = config.node_hook_opts["recursive"]
+            self.sep = config.node_hook_opts["sep"]
+            self.long_parameters_strategy = config.node_hook_opts[
+                "long_parameters_strategy"
+            ]
 
     @hook_impl
     def before_node_run(
@@ -78,23 +82,24 @@ def before_node_run(
         """
 
         # only parameters will be logged. Artifacts must be declared manually in the catalog
-        params_inputs = {}
-        for k, v in inputs.items():
-            # detect parameters automatically based on kedro reserved names
-            if k.startswith("params:"):
-                params_inputs[k[7:]] = v
-            elif k == "parameters":
-                params_inputs[k] = v
-
-        # dictionary parameters may be flattened for readibility
-        if self.flatten:
-            params_inputs = flatten_dict(
-                d=params_inputs, recursive=self.recursive, sep=self.sep
-            )
-
-        # logging parameters based on defined strategy
-        for k, v in params_inputs.items():
-            self.log_param(k, v)
+        if self._is_mlflow_enabled:
+            params_inputs = {}
+            for k, v in inputs.items():
+                # detect parameters automatically based on kedro reserved names
+                if k.startswith("params:"):
+                    params_inputs[k[7:]] = v
+                elif k == "parameters":
+                    params_inputs[k] = v
+
+            # dictionary parameters may be flattened for readability
+            if self.flatten:
+                params_inputs = flatten_dict(
+                    d=params_inputs, recursive=self.recursive, sep=self.sep
+                )
+
+            # logging parameters based on defined strategy
+            for k, v in params_inputs.items():
+                self.log_param(k, v)
 
     def log_param(self, name: str, value: Union[Dict, int, bool, str]) -> None:
         str_value = str(value)