From c695813c5124dfd7dbffb18347ad97112c5d8ddd Mon Sep 17 00:00:00 2001 From: zilto Date: Thu, 6 Jun 2024 14:05:25 -0400 Subject: [PATCH 01/12] added MLFlow model materialziers with tests --- hamilton/plugins/mlflow_extensions.py | 165 ++++++++++++++++++++++ tests/plugins/test_mlflow_extension.py | 186 +++++++++++++++++++++++++ 2 files changed, 351 insertions(+) create mode 100644 hamilton/plugins/mlflow_extensions.py create mode 100644 tests/plugins/test_mlflow_extension.py diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py new file mode 100644 index 000000000..340ed7a1d --- /dev/null +++ b/hamilton/plugins/mlflow_extensions.py @@ -0,0 +1,165 @@ +import dataclasses +import pathlib +from typing import Any, Callable, Collection, Dict, Literal, Optional, Tuple, Type, Union + +try: + import mlflow +except ImportError: + raise NotImplementedError("MLFlow is not installed.") + +from hamilton import registry +from hamilton.io.data_adapters import DataLoader, DataSaver + + +@dataclasses.dataclass +class MLFlowModelSaver(DataSaver): + def __init__( + self, + path: Union[str, pathlib.Path] = "model", + mode: Literal["save", "log"] = "save", + flavor: Optional[str] = None, + run_id: Optional[str] = None, + **kwargs, + ): + """ + :param path: Specify a filesystem path or model URI for MLFlow runs or registry + :param mode: `save` will store to local filesystem; `log` will add to MLFlow registry + :param flavor: sklearn, xgboost, etc. + :param run_id: Explicit run id used for `mode=log`. Otherwise, will use active run or create one. + :param kwargs: additional arguments to pass to `.save_model()` and `.log_model()`. + They can be flavor-specific. 
+ """ + self.path = path + self.mode = mode + self.flavor = flavor + self.run_id = run_id + self.kwargs = kwargs + + @classmethod + def name(cls) -> str: + return "mlflow" + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [Callable] + + def save_data(self, data) -> Dict[str, Any]: + if self.flavor: + flavor = self.flavor + else: + # infer the flavor from the base module of the data class + # for example, extract `sklearn` from `sklearn.linear_model._base` + flavor, _, _ = data.__module__.partition(".") + + # retrieve the `mlflow.FLAVOR` submodule to use `.save_model()` and `.log_model()` + try: + flavor_module = getattr(mlflow, flavor) + except ImportError: + raise ImportError(f"Flavor {flavor} is unsupported by MLFlow") + + if self.mode == "save": + # .save_model() doesn't return anything + flavor_module.save_model(data, self.path, **self.kwargs) + metadata = dict(path=self.path, mode="save", flavor=flavor, **self.kwargs) + + elif self.mode == "log": + # handle `run_id` and active run conflicts + if mlflow.active_run() and self.run_id: + if mlflow.active_run().info.run_id != self.run_id: + raise RuntimeError( + "The MLFlowModelSaver `run_id` doesn't match the active `run_id`\n", + "Leave the `run_id` to None to save to the active MLFlow run.", + ) + + # save to active run + if mlflow.active_run(): + model_info = flavor_module.log_model(data, self.path, **self.kwargs) + # create a run with `run_id` and save to it + else: + with mlflow.start_run(run_id=self.run_id): + model_info = flavor_module.log_model(data, self.path, **self.kwargs) + + metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} + return metadata + + +# TODO handle loading from file, run, or registry + + +@dataclasses.dataclass +class MLFlowModelLoader(DataLoader): + def __init__( + self, + flavor: str, + path: Union[str, pathlib.Path] = "model", + model_uri: Optional[str] = None, + mode: Literal["filesystem", "runs", "registry"] = "filesystem", + run_id: 
Optional[str] = None, + model_name: Optional[str] = None, + version: Union[str, int] = "latest", + **kwargs, + ): + """ """ + self.flavor = flavor + self.path = path + self.model_uri = model_uri + self.mode = mode + self.run_id = run_id + self.model_name = model_name + self.version = version + self.kwargs = kwargs + + # if self.model_uri: + # if "runs:/" in self.model_uri: + # self.mode = "runs" + # # extract info from run model_uri + # _, _, remainder = self.model_uri.partition("runs:/") + # run_id, _, inferred_path = remainder.partition("/") + # self.run_id = run_id + # self.path = inferred_path + + # elif "models:/" in self.model_uri: + # self.mode = "registry" + # # extract info from registry model_uri + # _, _, remainder = self.model_uri.partition("models:/") + # model_name, _, version = remainder.partition("/") + # self.model_name = model_name + # self.model_version = version + if not self.model_uri: + if self.mode == "filesystem": + self.model_uri = pathlib.Path(self.path).as_uri() + elif self.mode == "runs": + self.model_uri = f"runs:/{self.run_id}/{self.path}" + elif self.mode == "registry": + self.model_uri = f"models:/{self.model_name}/{self.version}" + + @classmethod + def name(cls) -> str: + return "mlflow" + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [Callable] + + def load_data(self, type_: Type) -> Tuple[Any, Dict[str, Any]]: + try: + flavor_module = getattr(mlflow, self.flavor) + except ImportError: + raise ImportError(f"Flavor {self.flavor} is unsupported by MLFlow") + + model = flavor_module.load_model(model_uri=self.model_uri) + model_info = mlflow.models.model.get_model_info(self.model_uri) + metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} + return model, metadata + + +def register_data_loaders(): + """Function to register the data loaders for this extension.""" + for loader in [ + MLFlowModelSaver, + MLFlowModelLoader, + ]: + registry.register_adapter(loader) + + +register_data_loaders() diff 
--git a/tests/plugins/test_mlflow_extension.py b/tests/plugins/test_mlflow_extension.py new file mode 100644 index 000000000..e784eba22 --- /dev/null +++ b/tests/plugins/test_mlflow_extension.py @@ -0,0 +1,186 @@ +from pathlib import Path + +import mlflow +import numpy as np +import pytest +from sklearn.base import BaseEstimator +from sklearn.linear_model import LinearRegression + +from hamilton.plugins.mlflow_extensions import MLFlowModelLoader, MLFlowModelSaver + + +@pytest.fixture +def fitted_sklearn_model() -> BaseEstimator: + model = LinearRegression() + model.fit([[0]], [[0]]) + return model + + +def coefficients_are_equal(model1, model2) -> bool: + """Check if two linear models have the same coefficients""" + return np.allclose(model1.coef_, model2.coef_) and np.allclose( + model1.intercept_, model2.intercept_ + ) + + +def test_mlflow_save_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + model_path = tmp_path / "sklearn_model" + saver = MLFlowModelSaver(path=model_path, mode="save", flavor="sklearn") + expected_files = ["model.pkl", "conda.yaml", "MLmodel", "requirements.txt", "python_env.yaml"] + + # using MLFlow saver + saver.save_data(fitted_sklearn_model) + created_files = [str(p.name) for p in model_path.iterdir()] + # loading the saved model + loaded_model = mlflow.sklearn.load_model(model_path) + + assert model_path.exists() + assert set(created_files) == set(expected_files) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + +def test_mlflow_log_model_to_active_run(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + model_path = tmp_path / "sklearn_model" + saver = MLFlowModelSaver(mode="log", flavor="sklearn") + + mlflow.set_tracking_uri(model_path.as_uri()) + with mlflow.start_run(): + # save model + metadata = saver.save_data(fitted_sklearn_model) + # reload model + loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) + + assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and 
np.allclose( + fitted_sklearn_model.intercept_, loaded_model.intercept_ + ) + + +def test_mlflow_log_model_to_specific_run(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + model_path = tmp_path / "sklearn_model" + # create a "previous run" + mlflow.set_tracking_uri(model_path.as_uri()) + mlflow.start_run() + run_id = mlflow.active_run().info.run_id + mlflow.end_run() + saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + + # save model + metadata = saver.save_data(fitted_sklearn_model) + # reload model + loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) + + assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and np.allclose( + fitted_sklearn_model.intercept_, loaded_model.intercept_ + ) + + +def test_mlflow_log_model_active_and_specific_run_ids_are_equal( + fitted_sklearn_model: BaseEstimator, tmp_path: Path +): + model_path = tmp_path / "sklearn_model" + + mlflow.set_tracking_uri(model_path.as_uri()) + with mlflow.start_run(): + run_id = mlflow.active_run().info.run_id + saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + # save model + metadata = saver.save_data(fitted_sklearn_model) + # reload model + loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) + + assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and np.allclose( + fitted_sklearn_model.intercept_, loaded_model.intercept_ + ) + + +def test_mlflow_log_model_active_and_specific_run_ids_are_unequal( + fitted_sklearn_model: BaseEstimator, tmp_path: Path +): + model_path = tmp_path / "sklearn_model" + mlflow.set_tracking_uri(model_path.as_uri()) + mlflow.start_run() + run_id = mlflow.active_run().info.run_id + mlflow.end_run() + saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + + with mlflow.start_run(): + # save model + with pytest.raises(RuntimeError): + saver.save_data(fitted_sklearn_model) + + +def test_mlflow_load_local_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + 
model_path = tmp_path / "sklearn_model" + mlflow.sklearn.save_model(fitted_sklearn_model, model_path) + loader = MLFlowModelLoader(path=model_path, flavor="sklearn") + + loaded_model, metadata = loader.load_data(LinearRegression) + + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + +def test_mlflow_load_runs_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + mlflow_path = tmp_path / "mlflow_path" + artifact_path = "model" + mlflow.set_tracking_uri(mlflow_path.as_uri()) + with mlflow.start_run(): + run_id = mlflow.active_run().info.run_id + mlflow.sklearn.log_model(fitted_sklearn_model, artifact_path=artifact_path) + + # specify run via model_uri + loader = MLFlowModelLoader(model_uri=f"runs:/{run_id}/{artifact_path}", flavor="sklearn") + loaded_model, metadata = loader.load_data(LinearRegression) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + # specify run via arguments + loader = MLFlowModelLoader(path=artifact_path, run_id=run_id, mode="runs", flavor="sklearn") + loaded_model, metadata = loader.load_data(LinearRegression) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + +def test_mlflow_load_registry_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + mlflow_path = tmp_path / "mlflow_path" + artifact_path = "model" + model_name = "my_registered_model" + version = 1 + # track a model + mlflow.set_tracking_uri(mlflow_path.as_uri()) + with mlflow.start_run(): + run_id = mlflow.active_run().info.run_id + mlflow.sklearn.log_model(fitted_sklearn_model, artifact_path=artifact_path) + # register the model + run_model_uri = f"runs:/{run_id}/{artifact_path}" + mlflow.register_model(run_model_uri, model_name) + + # specify via model_uri + loader = MLFlowModelLoader(model_uri=f"models:/{model_name}/{version}", flavor="sklearn") + loaded_model, metadata = loader.load_data(LinearRegression) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + # specify via arguments + 
loader = MLFlowModelLoader( + mode="registry", model_name=model_name, version=version, flavor="sklearn" + ) + loaded_model, metadata = loader.load_data(LinearRegression) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) + + +def test_mlflow_infer_flavor(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + model_path = tmp_path / "sklearn_model" + saver = MLFlowModelSaver(path=model_path) + + metadata = saver.save_data(fitted_sklearn_model) + + assert metadata["flavor"] == "sklearn" + + +def test_mlflow_handle_saver_kwargs(): + path = "tmp/path" + mode = "save" + flavor = "sklearn" + saver = MLFlowModelSaver(path=path, mode=mode, flavor=flavor, unknown_kwarg=True) + + assert saver.path == path + assert saver.mode == mode + assert saver.flavor == flavor + assert saver.kwargs.get("unknown_kwarg") is True From 3676d55e7147d9c654695db823cfd5d001d5513c Mon Sep 17 00:00:00 2001 From: zilto Date: Thu, 6 Jun 2024 15:53:41 -0400 Subject: [PATCH 02/12] added saving example and model registration --- examples/mlflow/train_dataflow.py | 55 ++++++++++ hamilton/function_modifiers/base.py | 1 + hamilton/plugins/mlflow_extensions.py | 141 +++++++++++++------------- 3 files changed, 127 insertions(+), 70 deletions(-) create mode 100644 examples/mlflow/train_dataflow.py diff --git a/examples/mlflow/train_dataflow.py b/examples/mlflow/train_dataflow.py new file mode 100644 index 000000000..8e4c06302 --- /dev/null +++ b/examples/mlflow/train_dataflow.py @@ -0,0 +1,55 @@ +from typing import Dict, Union + +import pandas as pd +from sklearn.base import BaseEstimator +from sklearn.datasets import fetch_openml +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +from hamilton.function_modifiers import extract_fields + + +@extract_fields( + {"X_train": pd.DataFrame, "X_test": pd.DataFrame, "y_train": pd.Series, "y_test": pd.Series} +) +def dataset_splits() -> Dict[str, Union[pd.DataFrame, pd.Series]]: + """Load the 
titanic dataset and partition it in X_train, y_train, X_test, y_test""" + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) + + feature_cols = ["fare", "age"] + X = X[feature_cols].fillna(0) + + X_train, X_test, y_train, y_test = train_test_split(X[feature_cols], y) + return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} + + +def trained_model( + X_train: pd.DataFrame, + y_train: pd.Series, +) -> BaseEstimator: + """Fit a binary classifier on the training data""" + model = LogisticRegression() + model.fit(X_train, y_train) + return model + + +if __name__ == "__main__": + import __main__ + + from hamilton import driver + from hamilton.io.materialization import to + + dr = ( + driver.Builder() + .with_modules(__main__) + .with_materializers( + to.mlflow( + id="trained_model__mlflow", + dependencies=["trained_model"], + ), + ) + .build() + ) + + results = dr.execute(["trained_model__mlflow"]) + print(results) diff --git a/hamilton/function_modifiers/base.py b/hamilton/function_modifiers/base.py index 2a876c21f..77c990143 100644 --- a/hamilton/function_modifiers/base.py +++ b/hamilton/function_modifiers/base.py @@ -40,6 +40,7 @@ "dlt", "kedro", "huggingface", + "mlflow", ] for plugin_module in plugins_modules: try: diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py index 340ed7a1d..5601b26f3 100644 --- a/hamilton/plugins/mlflow_extensions.py +++ b/hamilton/plugins/mlflow_extensions.py @@ -1,5 +1,6 @@ import dataclasses import pathlib +import shutil from typing import Any, Callable, Collection, Dict, Literal, Optional, Tuple, Type, Union try: @@ -13,27 +14,33 @@ @dataclasses.dataclass class MLFlowModelSaver(DataSaver): - def __init__( - self, - path: Union[str, pathlib.Path] = "model", - mode: Literal["save", "log"] = "save", - flavor: Optional[str] = None, - run_id: Optional[str] = None, - **kwargs, - ): - """ - :param path: Specify a filesystem path or model URI for MLFlow 
runs or registry - :param mode: `save` will store to local filesystem; `log` will add to MLFlow registry - :param flavor: sklearn, xgboost, etc. - :param run_id: Explicit run id used for `mode=log`. Otherwise, will use active run or create one. - :param kwargs: additional arguments to pass to `.save_model()` and `.log_model()`. - They can be flavor-specific. - """ - self.path = path - self.mode = mode - self.flavor = flavor - self.run_id = run_id - self.kwargs = kwargs + """ + :param path: Specify a filesystem path or model URI for MLFlow runs or registry + :param mode: `save` will store to local filesystem; `log` will add to MLFlow registry + :param flavor: sklearn, xgboost, etc. + :param run_id: Explicit run id used for `mode=log`. Otherwise, will use active run or create one. + :param kwargs: additional arguments to pass to `.save_model()` and `.log_model()`. + They can be flavor-specific. + """ + + path: Union[str, pathlib.Path] = "model" + mode: Literal["filesystem", "runs"] = "filesystem" + flavor: Optional[str] = None + run_id: Optional[str] = None + overwrite: bool = False + register: bool = False + model_name: Optional[str] = None + kwargs: Optional[Dict[str, Any]] = None + # kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + + # A lot of dancing around because dataclass doesn't accept kwargs + # and hamilton.function_modifiers.adapters throws `InvalidDecoratorException` for dataclasses.field() defaults + def __post_init__(self): + self.kwargs = self.kwargs if self.kwargs else {} + + # ensures that model_name is not None in case register=True + if self.model_name is None: + self.model_name = pathlib.Path(self.path).name @classmethod def name(cls) -> str: @@ -41,7 +48,7 @@ def name(cls) -> str: @classmethod def applicable_types(cls) -> Collection[Type]: - return [Callable] + return [Any] def save_data(self, data) -> Dict[str, Any]: if self.flavor: @@ -57,12 +64,16 @@ def save_data(self, data) -> Dict[str, Any]: except ImportError: raise 
ImportError(f"Flavor {flavor} is unsupported by MLFlow") - if self.mode == "save": + if self.mode == "filesystem": + # have to manually delete directory to avoid MLFlow exception + if self.overwrite is True: + shutil.rmtree(self.path) + # .save_model() doesn't return anything flavor_module.save_model(data, self.path, **self.kwargs) - metadata = dict(path=self.path, mode="save", flavor=flavor, **self.kwargs) + model_info = mlflow.models.get_model_info(self.path) - elif self.mode == "log": + elif self.mode == "runs": # handle `run_id` and active run conflicts if mlflow.active_run() and self.run_id: if mlflow.active_run().info.run_id != self.run_id: @@ -79,7 +90,15 @@ def save_data(self, data) -> Dict[str, Any]: with mlflow.start_run(run_id=self.run_id): model_info = flavor_module.log_model(data, self.path, **self.kwargs) - metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} + metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} + if self.register: + model_version = mlflow.register_model( + model_uri=metadata["model_uri"], name=self.model_name + ) + metadata["registered_model"] = { + k.strip("_"): v for k, v in model_version.__dict__.items() + } + return metadata @@ -88,50 +107,30 @@ def save_data(self, data) -> Dict[str, Any]: @dataclasses.dataclass class MLFlowModelLoader(DataLoader): - def __init__( - self, - flavor: str, - path: Union[str, pathlib.Path] = "model", - model_uri: Optional[str] = None, - mode: Literal["filesystem", "runs", "registry"] = "filesystem", - run_id: Optional[str] = None, - model_name: Optional[str] = None, - version: Union[str, int] = "latest", - **kwargs, - ): - """ """ - self.flavor = flavor - self.path = path - self.model_uri = model_uri - self.mode = mode - self.run_id = run_id - self.model_name = model_name - self.version = version - self.kwargs = kwargs - - # if self.model_uri: - # if "runs:/" in self.model_uri: - # self.mode = "runs" - # # extract info from run model_uri - # _, _, remainder = 
self.model_uri.partition("runs:/") - # run_id, _, inferred_path = remainder.partition("/") - # self.run_id = run_id - # self.path = inferred_path - - # elif "models:/" in self.model_uri: - # self.mode = "registry" - # # extract info from registry model_uri - # _, _, remainder = self.model_uri.partition("models:/") - # model_name, _, version = remainder.partition("/") - # self.model_name = model_name - # self.model_version = version - if not self.model_uri: - if self.mode == "filesystem": - self.model_uri = pathlib.Path(self.path).as_uri() - elif self.mode == "runs": - self.model_uri = f"runs:/{self.run_id}/{self.path}" - elif self.mode == "registry": - self.model_uri = f"models:/{self.model_name}/{self.version}" + flavor: str + path: Union[str, pathlib.Path] = "model" + model_uri: Optional[str] = None + mode: Literal["filesystem", "runs", "registry"] = "filesystem" + run_id: Optional[str] = None + model_name: Optional[str] = None + version: Union[str, int] = "latest" + kwargs: Optional[Dict[str, Any]] = None + # kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + + # A lot of dancing around because dataclass doesn't accept kwargs + # and hamilton.function_modifiers.adapters throws `InvalidDecoratorException` for dataclasses.field() defaults + def __post_init__(self): + self.kwargs = self.kwargs if self.kwargs else {} + + if self.model_uri: + return + + if self.mode == "filesystem": + self.model_uri = pathlib.Path(self.path).as_uri() + elif self.mode == "runs": + self.model_uri = f"runs:/{self.run_id}/{self.path}" + elif self.mode == "registry": + self.model_uri = f"models:/{self.model_name}/{self.version}" @classmethod def name(cls) -> str: @@ -163,3 +162,5 @@ def register_data_loaders(): register_data_loaders() + +COLUMN_FRIENDLY_DF_TYPE = False From e82c887b102add3d783587093ff9b3e0b7112962 Mon Sep 17 00:00:00 2001 From: zilto Date: Mon, 10 Jun 2024 16:20:19 -0400 Subject: [PATCH 03/12] added MLFlowTracker; updated tests; updated materializer API 
--- examples/mlflow/README.md | 1 + examples/mlflow/requirements.txt | 5 + examples/mlflow/train_dataflow.py | 55 -- examples/mlflow/tutorial.ipynb | 662 +++++++++++++++++++++++++ hamilton/plugins/h_mlflow.py | 289 +++++++++++ hamilton/plugins/mlflow_extensions.py | 158 +++--- tests/plugins/test_mlflow_extension.py | 65 +-- 7 files changed, 1062 insertions(+), 173 deletions(-) create mode 100644 examples/mlflow/README.md create mode 100644 examples/mlflow/requirements.txt delete mode 100644 examples/mlflow/train_dataflow.py create mode 100644 examples/mlflow/tutorial.ipynb create mode 100644 hamilton/plugins/h_mlflow.py diff --git a/examples/mlflow/README.md b/examples/mlflow/README.md new file mode 100644 index 000000000..34efe08bb --- /dev/null +++ b/examples/mlflow/README.md @@ -0,0 +1 @@ +# MLFLow plugin for Hamilton diff --git a/examples/mlflow/requirements.txt b/examples/mlflow/requirements.txt new file mode 100644 index 000000000..4d5aed0a1 --- /dev/null +++ b/examples/mlflow/requirements.txt @@ -0,0 +1,5 @@ +mlflow +numpy +pandas +scikit-learn +sf-hamilton[visualization] diff --git a/examples/mlflow/train_dataflow.py b/examples/mlflow/train_dataflow.py deleted file mode 100644 index 8e4c06302..000000000 --- a/examples/mlflow/train_dataflow.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Dict, Union - -import pandas as pd -from sklearn.base import BaseEstimator -from sklearn.datasets import fetch_openml -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split - -from hamilton.function_modifiers import extract_fields - - -@extract_fields( - {"X_train": pd.DataFrame, "X_test": pd.DataFrame, "y_train": pd.Series, "y_test": pd.Series} -) -def dataset_splits() -> Dict[str, Union[pd.DataFrame, pd.Series]]: - """Load the titanic dataset and partition it in X_train, y_train, X_test, y_test""" - X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) - - feature_cols = ["fare", "age"] - X = 
X[feature_cols].fillna(0) - - X_train, X_test, y_train, y_test = train_test_split(X[feature_cols], y) - return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} - - -def trained_model( - X_train: pd.DataFrame, - y_train: pd.Series, -) -> BaseEstimator: - """Fit a binary classifier on the training data""" - model = LogisticRegression() - model.fit(X_train, y_train) - return model - - -if __name__ == "__main__": - import __main__ - - from hamilton import driver - from hamilton.io.materialization import to - - dr = ( - driver.Builder() - .with_modules(__main__) - .with_materializers( - to.mlflow( - id="trained_model__mlflow", - dependencies=["trained_model"], - ), - ) - .build() - ) - - results = dr.execute(["trained_model__mlflow"]) - print(results) diff --git a/examples/mlflow/tutorial.ipynb b/examples/mlflow/tutorial.ipynb new file mode 100644 index 000000000..210a62395 --- /dev/null +++ b/examples/mlflow/tutorial.ipynb @@ -0,0 +1,662 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext hamilton.plugins.jupyter_magic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Training Dataflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Define" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "algo\n", + "\n", + "\n", + "\n", + "algo\n", + "logistic_regression\n", + "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_train\n", + "\n", + "y_train\n", + "Series\n", + "\n", + "\n", + "\n", + "trained_model\n", + "\n", + "trained_model: algo\n", + "BaseEstimator\n", + "\n", + "\n", + "\n", + "y_train->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_train\n", + "\n", + "X_train\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "X_train->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "split_dataset\n", + "\n", + "split_dataset\n", + "dict\n", + "\n", + "\n", + "\n", + "y->split_dataset\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_preprocessed\n", + "\n", + "X_preprocessed\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "X_preprocessed->split_dataset\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "split_dataset->y_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "split_dataset->X_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test\n", + "\n", + "X_test\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "split_dataset->X_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test\n", + "\n", + "y_test\n", + "Series\n", + "\n", + "\n", + "\n", + "split_dataset->y_test\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "test_scatter_plot\n", + "\n", + "test_scatter_plot\n", + "Figure\n", + "\n", + "\n", + "\n", + "y_test_predictions\n", + "\n", + 
"y_test_predictions\n", + "Series\n", + "\n", + "\n", + "\n", + "y_test_predictions->test_scatter_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "test_performance\n", + "\n", + "test_performance: algo\n", + "float\n", + "\n", + "\n", + "\n", + "y_test_predictions->test_performance\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trained_model->y_test_predictions\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test->test_scatter_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X_test->y_test_predictions\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X->X_preprocessed\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test->test_scatter_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_test->test_performance\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_split_dataset_inputs\n", + "\n", + "test_size_fraction\n", + "float\n", + "\n", + "\n", + "\n", + "_split_dataset_inputs->split_dataset\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "config\n", + "\n", + "\n", + "\n", + "input\n", + "\n", + "input\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%cell_to_module model_training --display --config algo=logistic_regression\n", + "from typing import Dict, Union\n", + "\n", + "import pandas as pd\n", + "import matplotlib.figure\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.linear_model import LogisticRegression, LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import balanced_accuracy_score, mean_absolute_error\n", + "from hamilton.function_modifiers import extract_fields, tag, config\n", + "\n", + "\n", + "@extract_fields(dict(X=pd.DataFrame, y=pd.Series))\n", + "def load_data() -> 
dict:\n", + " X, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\n", + " return dict(X=X, y=y)\n", + "\n", + "\n", + "def X_preprocessed(X: pd.DataFrame) -> pd.DataFrame:\n", + " column_selection = [\"fare\", \"age\"]\n", + " X = X[column_selection]\n", + " X = X.fillna(0)\n", + " return X \n", + "\n", + "\n", + "@extract_fields(dict(\n", + " X_train=pd.DataFrame,\n", + " y_train=pd.Series,\n", + " X_test=pd.DataFrame,\n", + " y_test=pd.Series,\n", + "))\n", + "def split_dataset(\n", + " X_preprocessed: pd.DataFrame,\n", + " y: pd.Series,\n", + " test_size_fraction: float = 0.3\n", + ") -> dict:\n", + " \"\"\"Load the titanic dataset and partition it in X_train, y_train, X_test, y_test\"\"\"\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X_preprocessed, y, test_size=test_size_fraction,\n", + " )\n", + " return dict(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " X_test=X_test,\n", + " y_test=y_test,\n", + " )\n", + "\n", + "@tag(team=\"forecast\")\n", + "@config.when(algo=\"logistic_regression\")\n", + "def trained_model__loistic(X_train: pd.DataFrame, y_train: pd.Series, hparams: dict) -> BaseEstimator:\n", + " \"\"\"Fit a binary classifier on the training data\"\"\"\n", + " model = LogisticRegression()\n", + " model.fit(X_train, y_train)\n", + " return model\n", + "\n", + "\n", + "@tag(team=\"forecast\")\n", + "@config.when(algo=\"linear_regression\")\n", + "def trained_model__linear(X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator:\n", + " \"\"\"Fit a binary classifier on the training data\"\"\"\n", + " model = LinearRegression()\n", + " model.fit(X_train, y_train)\n", + " return model\n", + "\n", + "def y_test_predictions(trained_model: BaseEstimator, X_test: pd.DataFrame) -> pd.Series:\n", + " return trained_model.predict(X_test)\n", + "\n", + "@config.when(algo=\"logistic_regression\")\n", + "def test_performance__logistic(y_test: pd.Series, y_test_predictions: pd.Series) -> float:\n", + " 
return balanced_accuracy_score(y_test, y_test_predictions)\n", + "\n", + "@config.when(algo=\"linear_regression\")\n", + "def test_performance__linear(y_test: pd.Series, y_test_predictions: pd.Series) -> float:\n", + " return mean_absolute_error(y_test, y_test_predictions)\n", + "\n", + "def test_scatter_plot(\n", + " X_test: pd.DataFrame,\n", + " y_test: pd.Series,\n", + " y_test_predictions: pd.Series,\n", + ") -> matplotlib.figure.Figure:\n", + " correctly_predicted = y_test == y_test_predictions\n", + " feature_1 = X_test.iloc[:, 0]\n", + " feature_2 = X_test.iloc[:, 1]\n", + "\n", + " fig = plt.figure()\n", + " plt.scatter(feature_1, feature_2, c=correctly_predicted)\n", + " return fig" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Assemble" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hamilton import driver\n", + "from hamilton.io.materialization import to\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_modules(model_training)\n", + " .with_config(dict(algo=\"logistic_regression\"))\n", + " .with_materializers(\n", + " to.mlflow(\n", + " id=\"trained_model__mlflow\",\n", + " dependencies=[\"trained_model\"],\n", + " mode=\"runs\",\n", + " register=True,\n", + " model_name=\"my_classifier\",\n", + " ),\n", + " )\n", + " .build()\n", + ")\n", + "dr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Execute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_vars = [\"trained_model__mlflow\", \"y_test_predictions\", \"test_performance\"]\n", + "results = dr.execute(final_vars)\n", + "dr.visualize_execution(final_vars)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# balanced accuracy on test set\n", + "print(results[\"test_performance\"])\n", + "print()\n", + "# metadata of stored model\n", + "results[\"trained_model__mlflow\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Inference Dataflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Define" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%cell_to_module model_inference --display\n", + "from typing import Dict, Union\n", + "\n", + "import pandas as pd\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import balanced_accuracy_score\n", + "from hamilton.function_modifiers import extract_fields\n", + "\n", + "# import the preprocessing function from the module above\n", + "from model_training import X_preprocessed\n", + "\n", + "def preprocessed_inputs(user_input: dict) -> pd.DataFrame:\n", + " df = pd.DataFrame(user_input, index=[0])\n", + " df = X_preprocessed(df)\n", + " return df\n", + "\n", + "def prediction(preprocessed_inputs: pd.DataFrame, model: BaseEstimator) -> int:\n", + " return model.predict(preprocessed_inputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Assemble" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hamilton import driver\n", + "from hamilton.io.materialization import from_\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_modules(model_inference)\n", + " .with_materializers(\n", + " from_.mlflow(\n", + " target=\"model\",\n", + " mode=\"registry\",\n", + " model_name=\"my_classifier\",\n", + " ),\n", + " )\n", + " .build()\n", + ")\n", + "dr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Execute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inputs = dict(user_input={\"fare\": 10.72, \"age\": 48})\n", + "\n", + "final_vars = [\"prediction\", \"load_data.model\"]\n", + "results = dr.execute(final_vars, inputs=inputs)\n", + "dr.visualize_execution(final_vars, inputs=inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(results[\"prediction\"])\n", + "results[\"load_data.model\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MLFlowTracker" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import importlib\n", + "from hamilton import driver\n", + "import hamilton.plugins.h_mlflow\n", + "from hamilton.io.materialization import to\n", + "from hamilton.plugins.h_mlflow import MLFlowTracker\n", + "importlib.reload(hamilton.plugins.h_mlflow)\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_modules(model_training)\n", + " .with_config(dict(algo=\"logistic_regression\"))\n", + " .with_adapters(\n", + " hamilton.plugins.h_mlflow.MLFlowTracker()\n", + " )\n", + " .with_materializers(\n", + " to.mlflow(\n", + " id=\"trained_model__mlflow\",\n", + " dependencies=[\"trained_model\"],\n", + " mode=\"runs\",\n", + " register=True,\n", + " 
model_name=\"my_loom_video\",\n", + " ),\n", + " )\n", + " .build()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tjean/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/_distutils_hack/__init__.py:11: UserWarning: Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.\n", + " warnings.warn(\n", + "/home/tjean/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.\n", + " warnings.warn(\"Setuptools is replacing distutils.\")\n", + "Registered model 'my_loom_video' already exists. Creating a new version of this model...\n", + "Created version '2' of model 'my_loom_video'.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "inputs = dict(test_size_fraction=0.3)\n", + "results = dr.execute(\n", + " [\"trained_model__mlflow\", \"test_performance\", \"test_scatter_plot\"],\n", + " inputs=inputs\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hamilton/plugins/h_mlflow.py b/hamilton/plugins/h_mlflow.py new file mode 100644 index 000000000..193bc68f9 --- /dev/null +++ b/hamilton/plugins/h_mlflow.py @@ -0,0 +1,289 @@ +import logging +import pickle +from typing import Any, Dict, List, Optional, Type, Union + +import mlflow +import mlflow.data + +from hamilton import graph_types +from hamilton.lifecycle import GraphConstructionHook, GraphExecutionHook, NodeExecutionHook + +FIGURE_TYPES = [] +try: + import matplotlib.figure + + FIGURE_TYPES.append(matplotlib.figure.Figure) +except ImportError: + pass + +try: + import plotly.graph_objects + + FIGURE_TYPES.append(plotly.graph_objects.Figure) +except ImportError: + pass + + +logger = logging.getLogger(__name__) + + +def get_path_from_metadata(metadata: dict) -> Union[str, None]: + """Retrieve the `path` attribute from DataSaver output metadata""" + path = None + if "path" in metadata: + path = metadata["path"] + elif "file_metadata" in metadata: + path = metadata["file_metadata"]["path"] + + return path + + +# NOTE `mlflow.client.MLFlowClient` is preferred to top-level `mlflow.` methods in MLFlowTracker +# because the latter relies on hard-to-debug global variables. 
Yet, we set an `active_run` by using +# `mlflow.start_run()` in pre_graph_execution to ensure the user-specified MLFlow code +# and MLFlow materializers log metrics and models to the same run as the MLFlowTracker +class MLFlowTracker( + NodeExecutionHook, + GraphExecutionHook, + GraphConstructionHook, +): + """Driver adapter logging Hamilton execution results to an MLFlow server.""" + + def __init__( + self, + tracking_uri: Optional[str] = None, + registry_uri: Optional[str] = None, + artifact_location: Optional[str] = None, + experiment_name: str = "Hamilton", + experiment_tags: Optional[dict] = None, + experiment_description: Optional[str] = None, + run_id: Optional[str] = None, + run_name: Optional[str] = None, + run_tags: Optional[dict] = None, + run_description: Optional[str] = None, + log_system_metrics: bool = False, + ): + """Configure the MLFlow client and experiment for the lifetime of the tracker + + :param tracking_uri: Destination of the logged artifacts and metadata. It can be a filesystem, database, or server. [reference](https://mlflow.org/docs/latest/getting-started/tracking-server-overview/index.html) + :param registry_uri: Destination of the registered models. By default it's the same as the tracking destination, but they can be different. [reference](https://mlflow.org/docs/latest/getting-started/registering-first-model/index.html) + :param artifact_location: Root path on tracking server where experiment is stored + :param experiment_name: MLFlow experiment name used to group runs. + :param experiment_tags: Tags to query experiments programmatically (not displayed). + :param experiment_description: Description of the experiment displayed + :param run_id: Run id to log to an existing run (every execution logs to the same run) + :param run_name: Run name displayed and used to query runs. You can have multiple runs with the same name but different run ids. 
+ :param run_tags: Tags to query runs and appears as columns in the UI for filtering and grouping. It automatically includes serializable inputs and Driver config. + :param run_description: Description of the run displayed + :param log_system_metrics: Log system metrics to display (requires additonal dependencies) + """ + self.client = mlflow.client.MlflowClient(tracking_uri, registry_uri) + + # experiment setup + experiment_tags = experiment_tags if experiment_tags else {} + if experiment_description: + # mlflow.note.content is the description field + experiment_tags["mlflow.note.content"] = experiment_description + + # TODO link HamiltonTracker project and MLFlowTracker experiment + experiment = self.client.get_experiment_by_name(experiment_name) + if experiment: + experiment_id = experiment.experiment_id + # update tags and description of an existing experiment + if experiment_tags: + for k, v in experiment_tags.items(): + self.client.set_experiment_tag(experiment_id, key=k, value=v) + # create an experiment + else: + experiment_id = self.client.create_experiment( + name=experiment_name, + artifact_location=artifact_location, + tags=experiment_tags, + ) + self.experiment_id = experiment_id + + # run setup + # TODO link HamiltonTracker and MLFlowTracker run ids + self.mlflow_run_id = run_id + self.run_name = run_name + self.run_tags = run_tags if run_tags else {} + if run_description: + # mlflow.note.content is the description field + self.run_tags["mlflow.note.content"] = run_description + + self.log_system_metrics = log_system_metrics + + def run_after_graph_construction(self, *, config: dict[str, Any], **kwargs): + """Store the Driver config before creating the graph""" + self.config = config + + def run_before_graph_execution( + self, + *, + run_id: str, + final_vars: List[str], + inputs: Dict[str, Any], + graph: graph_types.HamiltonGraph, + **kwargs, + ): + """Create and start MLFlow run. 
Log graph version, run_id, inputs, overrides""" + # add Hamilton metadata to run tags + run_tags = self.run_tags + run_tags["hamilton_run_id"] = run_id # the Hamilton run_id + run_tags["code_version"] = graph.version + + # create Hamilton run + self.run = self.client.create_run( + experiment_id=self.experiment_id, + tags=run_tags, + run_name=self.run_name, + ) + self.run_id = self.run.info.run_id + # start run to set `active_run` and allow user-defined callbacks and materializers + # to log to the same run as the HamiltonTracker + mlflow.start_run( + run_id=self.run_id, + experiment_id=self.experiment_id, + tags=run_tags, + log_system_metrics=self.log_system_metrics, + ) + + # log config to artifacts + self.client.log_dict(self.run_id, self.config, "config.json") + + # log HamiltonGraph to reproduce the run + self.graph = graph + graph_as_json = {n.name: n.as_dict() for n in graph.nodes} + self.client.log_dict(self.run_id, graph_as_json, "hamilton_graph.json") + + # log config and inputs as `param` which creates columns in the UI to filter runs + # `log_param()` accepts `value: Any` and will stringify complex objects + for value_sets in [self.config, inputs]: + for node_name, value in value_sets.items(): + self.client.log_param(self.run_id, key=node_name, value=value) + + self.final_vars = final_vars + + # TODO log DataLoaders as MLFlow datasets + def run_after_node_execution( + self, + *, + node_name: str, + node_return_type: Type, + node_tags: dict, + node_kwargs: dict, + result: Any, + **kwargs, + ): + """Log materializers and final vars as artifacts""" + # log DataSavers as artifacts + if node_tags.get("hamilton.data_saver") is True: + # don't log mlflow materializers as artifact since they already create models + # instead, use the Materializer metadata to add metadata to registered models + if node_tags["hamilton.data_saver.sink"] == "mlflow": + # skip if not registered model + if "registered_model" not in result.keys(): + return + + # get the registered 
model name (param of MLFlowModelSaver) + model_name = result["registered_model"]["name"] + version = result["registered_model"]["version"] + materializer_node = self.graph[node_name] + # get the "materialized node" defining the model + materialized_node = self.graph[materializer_node.required_dependencies.pop()] + # add the materialized node docstring as description + # registered models have multiple versions + self.client.update_registered_model(model_name, materialized_node.documentation) + self.client.update_model_version( + model_name, version, materialized_node.documentation + ) + + # add the materialized node @tag values as tags + for k, v in materialized_node.tags.items(): + # skip internal Hamilton tags + if "hamilton." in k: + continue + self.client.set_registered_model_tag(model_name, key=k, value=v) + self.client.set_model_version_tag(model_name, version, key=k, value=v) + # TODO automatically collect model input signature; maybe simpler from user code + + # special case for matplotlib and plotly + # log materialized figure. Allows great degree of control over rendering format + # and also save interactive plotly visualization as HTML + elif node_tags["hamilton.data_saver.sink"] in ["plt", "plotly"]: + materializer_node = self.graph[node_name] + materialized_node = self.graph[materializer_node.required_dependencies.pop()] + figure = node_kwargs[materialized_node.name] + + path = get_path_from_metadata(result) + if path: + self.client.log_figure(self.run_id, figure, path) + else: + logger.warning( + f"Materialization result from node={node_name} has no recordable path: {result}. Materializer must have either " + f"'path' or 'file_metadata' keys." + ) + + else: + # log the materializer path as an artifact + path = get_path_from_metadata(result) + if path: + self.client.log_artifact(self.run_id, path, node_name) + else: + logger.warning( + f"Materialization result from node={node_name} has no recordable path: {result}. 
Materializer must have either " + f"'path' or 'file_metadata' keys." + ) + return + + # log final_vars as artifacts + if node_name not in self.final_vars: + return + + # log float and int as metrics + if node_return_type in [float, int]: + self.client.log_metric(self.run_id, key=node_name, value=float(result)) + + # log str as text in .txt format + elif isinstance(node_return_type, str): + file_path = f"{node_name}.txt" + with open(file_path, "w") as f: + f.write(result) + self.client.log_text(self.run_id, result, file_path) + + # log_dict (JSON) dictionary types; pickle if not json-serializable + elif isinstance(node_return_type, dict): + try: + file_path = f"{node_name}.json" + self.client.log_dict(self.run_id, result, file_path) + # not json-serializable + except TypeError: + file_path = f"{node_name}.pickle" + with open(file_path, "wb") as f: + pickle.dump(result, file=f) + self.client.log_dict(self.run_id, result, file_path) + + # this puts less burden on users by not having to define materializers + # for viz, but less control over rendering format + elif node_return_type in FIGURE_TYPES: + file_path = f"{node_name}.png" + self.client.log_figure(self.run_id, result, file_path) + + # default to log_artifact in .pickle format + else: + file_path = f"{node_name}.pickle" + with open(file_path, "wb") as f: + pickle.dump(result, f) + self.client.log_dict(self.run_id, result, file_path) + + def run_after_graph_execution(self, success: bool, *args, **kwargs): + """End the MLFlow run""" + # `status` is an enum value of mlflow.entities.RunStatus + if success: + self.client.set_terminated(self.run_id, status="FINISHED") + else: + self.client.set_terminated(self.run_id, status="FAILED") + mlflow.end_run() + + def run_before_node_execution(self, *args, **kwargs): + """Placeholder required to subclass NodeExecutionHook""" diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py index 5601b26f3..c09789ebd 100644 --- 
a/hamilton/plugins/mlflow_extensions.py +++ b/hamilton/plugins/mlflow_extensions.py @@ -1,7 +1,6 @@ import dataclasses import pathlib -import shutil -from typing import Any, Callable, Collection, Dict, Literal, Optional, Tuple, Type, Union +from typing import Any, Collection, Dict, Literal, Optional, Tuple, Type, Union try: import mlflow @@ -14,34 +13,25 @@ @dataclasses.dataclass class MLFlowModelSaver(DataSaver): - """ - :param path: Specify a filesystem path or model URI for MLFlow runs or registry - :param mode: `save` will store to local filesystem; `log` will add to MLFlow registry - :param flavor: sklearn, xgboost, etc. - :param run_id: Explicit run id used for `mode=log`. Otherwise, will use active run or create one. - :param kwargs: additional arguments to pass to `.save_model()` and `.log_model()`. - They can be flavor-specific. + """Save model to the MLFlow tracking server using `.log_model()` + + :param path: Run relative path to store model. Will constitute the model URI. + :param register_as: If not None, register the model under the specified name. + :param flavor: Library format to save the model (sklearn, xgboost, etc.). Automatically inferred if None. + :param run_id: Log model to a specific run. Leave to `None` if using the `MLFlowTracker` + :param kwargs: Arguments for `.log_model()`. Can be flavor-specific. 
""" path: Union[str, pathlib.Path] = "model" - mode: Literal["filesystem", "runs"] = "filesystem" + register_as: Optional[str] = None + alias: Optional[str] = None flavor: Optional[str] = None run_id: Optional[str] = None - overwrite: bool = False - register: bool = False - model_name: Optional[str] = None - kwargs: Optional[Dict[str, Any]] = None - # kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + kwargs: Dict[str, Any] = None - # A lot of dancing around because dataclass doesn't accept kwargs - # and hamilton.function_modifiers.adapters throws `InvalidDecoratorException` for dataclasses.field() defaults def __post_init__(self): self.kwargs = self.kwargs if self.kwargs else {} - # ensures that model_name is not None in case register=True - if self.model_name is None: - self.model_name = pathlib.Path(self.path).name - @classmethod def name(cls) -> str: return "mlflow" @@ -58,43 +48,38 @@ def save_data(self, data) -> Dict[str, Any]: # for example, extract `sklearn` from `sklearn.linear_model._base` flavor, _, _ = data.__module__.partition(".") - # retrieve the `mlflow.FLAVOR` submodule to use `.save_model()` and `.log_model()` + # retrieve the `mlflow.FLAVOR` submodule to `.log_model()` try: flavor_module = getattr(mlflow, flavor) except ImportError: raise ImportError(f"Flavor {flavor} is unsupported by MLFlow") - if self.mode == "filesystem": - # have to manually delete directory to avoid MLFlow exception - if self.overwrite is True: - shutil.rmtree(self.path) - - # .save_model() doesn't return anything - flavor_module.save_model(data, self.path, **self.kwargs) - model_info = mlflow.models.get_model_info(self.path) - - elif self.mode == "runs": - # handle `run_id` and active run conflicts - if mlflow.active_run() and self.run_id: - if mlflow.active_run().info.run_id != self.run_id: - raise RuntimeError( - "The MLFlowModelSaver `run_id` doesn't match the active `run_id`\n", - "Leave the `run_id` to None to save to the active MLFlow run.", - ) - - 
# save to active run - if mlflow.active_run(): + # handle `run_id` and active run conflicts + if mlflow.active_run() and self.run_id: + if mlflow.active_run().info.run_id != self.run_id: + raise RuntimeError( + "The MLFlowModelSaver `run_id` doesn't match the active `run_id`\n", + "Set `run_id=None` to save to the active MLFlow run.", + ) + + # save to active run + if mlflow.active_run(): + model_info = flavor_module.log_model(data, self.path, **self.kwargs) + # create a run with `run_id` and save to it + else: + with mlflow.start_run(run_id=self.run_id): model_info = flavor_module.log_model(data, self.path, **self.kwargs) - # create a run with `run_id` and save to it - else: - with mlflow.start_run(run_id=self.run_id): - model_info = flavor_module.log_model(data, self.path, **self.kwargs) + # create metadata from ModelInfo object metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} - if self.register: + + if self.register_as: model_version = mlflow.register_model( - model_uri=metadata["model_uri"], name=self.model_name + model_uri=metadata["model_uri"], name=self.register_as ) + # update metadata with the registered ModelVersion + # there's a contract between this key and the MLFlowTracker's + # post_node_execute() reads this metadata metadata["registered_model"] = { k.strip("_"): v for k, v in model_version.__dict__.items() } @@ -102,35 +87,61 @@ def save_data(self, data) -> Dict[str, Any]: return metadata -# TODO handle loading from file, run, or registry - - @dataclasses.dataclass class MLFlowModelLoader(DataLoader): - flavor: str - path: Union[str, pathlib.Path] = "model" + """Load model from the MLFlow tracking server or model registry using .load_model() + You can pass a model URI or the necessary metadata to retrieve the model + + :param model_uri: Model location starting as `runs:/` for tracking or `models:/` for registry + :param mode: `tracking` or registry`. tracking needs `run_id` and `path`. 
registry needs `model_name` and `version` or `version_alias`. + :param run_id: Run id of the model on the tracking server + :param path: Run relative path where the model is stored + :param model_name: Name of the registered model (equivalent to `register_as` in model saver) + :param version: Version of the registered model. Can pass as string `v1` or integer `1` + :param version_alias: Version alias of the registered model. Specify either this or `version` + :param flavor: Library format to load the model (sklearn, xgboost, etc.). Automatically inferred if None. + :param kwargs: Arguments for `.load_model()`. Can be flavor-specific. + """ + model_uri: Optional[str] = None - mode: Literal["filesystem", "runs", "registry"] = "filesystem" + mode: Literal["tracking", "registry"] = "tracking" run_id: Optional[str] = None + path: Union[str, pathlib.Path] = "model" model_name: Optional[str] = None - version: Union[str, int] = "latest" - kwargs: Optional[Dict[str, Any]] = None - # kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + version: Optional[Union[str, int]] = None + version_alias: Optional[str] = None + flavor: Optional[str] = None + kwargs: Dict[str, Any] = None - # A lot of dancing around because dataclass doesn't accept kwargs - # and hamilton.function_modifiers.adapters throws `InvalidDecoratorException` for dataclasses.field() defaults + # __post_init__ is required to set kwargs as empty dict because + # can't set: kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + # otherwise raises `InvalidDecoratorException` because materializer factory check + # for all params being set and `kwargs` would be unset until instantiation. 
def __post_init__(self): self.kwargs = self.kwargs if self.kwargs else {} if self.model_uri: return - if self.mode == "filesystem": - self.model_uri = pathlib.Path(self.path).as_uri() - elif self.mode == "runs": + if self.mode == "tracking": + if (not self.run_id) or (not self.path): + raise ValueError("Using `mode='tracking'` requires passing `run_id` and `path`") + self.model_uri = f"runs:/{self.run_id}/{self.path}" + elif self.mode == "registry": - self.model_uri = f"models:/{self.model_name}/{self.version}" + if not self.model_name: + raise ValueError("Using `mode='registry` requires passing `model_name`") + + if bool(self.version) and bool(self.version_alias): + raise ValueError( + "If using `mode='registry'` requires passing `version` OR `version_alias" + ) + + if self.version: + self.model_uri = f"models:/{self.model_name}/{self.version}" + elif self.version: + self.model_uri = f"models:/{self.model_name}@{self.version_alias}" @classmethod def name(cls) -> str: @@ -138,17 +149,28 @@ def name(cls) -> str: @classmethod def applicable_types(cls) -> Collection[Type]: - return [Callable] + return [Any] def load_data(self, type_: Type) -> Tuple[Any, Dict[str, Any]]: + model_info = mlflow.models.model.get_model_info(self.model_uri) + metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} + + flavor = self.flavor + # if flavor not explicitly passed, retrieve flavor from the ModelInfo + if flavor is None: + # prioritize the library specific flavor. Default to `pyfunc` if none available. 
+ try: + flavor = next(f for f in metadata["flavors"].keys() if f != "python_function") + except StopIteration: + flavor = "pyfunc" + + # retrieve the `mlflow.FLAVOR` submodule to `.log_model()` try: - flavor_module = getattr(mlflow, self.flavor) + flavor_module = getattr(mlflow, flavor) except ImportError: - raise ImportError(f"Flavor {self.flavor} is unsupported by MLFlow") + raise ImportError(f"Flavor {flavor} is unsupported by MLFlow") model = flavor_module.load_model(model_uri=self.model_uri) - model_info = mlflow.models.model.get_model_info(self.model_uri) - metadata = {k.strip("_"): v for k, v in model_info.__dict__.items()} return model, metadata diff --git a/tests/plugins/test_mlflow_extension.py b/tests/plugins/test_mlflow_extension.py index e784eba22..860e108ab 100644 --- a/tests/plugins/test_mlflow_extension.py +++ b/tests/plugins/test_mlflow_extension.py @@ -23,25 +23,9 @@ def coefficients_are_equal(model1, model2) -> bool: ) -def test_mlflow_save_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): - model_path = tmp_path / "sklearn_model" - saver = MLFlowModelSaver(path=model_path, mode="save", flavor="sklearn") - expected_files = ["model.pkl", "conda.yaml", "MLmodel", "requirements.txt", "python_env.yaml"] - - # using MLFlow saver - saver.save_data(fitted_sklearn_model) - created_files = [str(p.name) for p in model_path.iterdir()] - # loading the saved model - loaded_model = mlflow.sklearn.load_model(model_path) - - assert model_path.exists() - assert set(created_files) == set(expected_files) - assert coefficients_are_equal(fitted_sklearn_model, loaded_model) - - def test_mlflow_log_model_to_active_run(fitted_sklearn_model: BaseEstimator, tmp_path: Path): model_path = tmp_path / "sklearn_model" - saver = MLFlowModelSaver(mode="log", flavor="sklearn") + saver = MLFlowModelSaver(flavor="sklearn") mlflow.set_tracking_uri(model_path.as_uri()) with mlflow.start_run(): @@ -50,9 +34,7 @@ def test_mlflow_log_model_to_active_run(fitted_sklearn_model: 
BaseEstimator, tmp # reload model loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) - assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and np.allclose( - fitted_sklearn_model.intercept_, loaded_model.intercept_ - ) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) def test_mlflow_log_model_to_specific_run(fitted_sklearn_model: BaseEstimator, tmp_path: Path): @@ -62,16 +44,14 @@ def test_mlflow_log_model_to_specific_run(fitted_sklearn_model: BaseEstimator, t mlflow.start_run() run_id = mlflow.active_run().info.run_id mlflow.end_run() - saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + saver = MLFlowModelSaver(flavor="sklearn", run_id=run_id) # save model metadata = saver.save_data(fitted_sklearn_model) # reload model loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) - assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and np.allclose( - fitted_sklearn_model.intercept_, loaded_model.intercept_ - ) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) def test_mlflow_log_model_active_and_specific_run_ids_are_equal( @@ -82,15 +62,13 @@ def test_mlflow_log_model_active_and_specific_run_ids_are_equal( mlflow.set_tracking_uri(model_path.as_uri()) with mlflow.start_run(): run_id = mlflow.active_run().info.run_id - saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + saver = MLFlowModelSaver(flavor="sklearn", run_id=run_id) # save model metadata = saver.save_data(fitted_sklearn_model) # reload model loaded_model = mlflow.sklearn.load_model(metadata["model_uri"]) - assert np.allclose(fitted_sklearn_model.coef_, loaded_model.coef_) and np.allclose( - fitted_sklearn_model.intercept_, loaded_model.intercept_ - ) + assert coefficients_are_equal(fitted_sklearn_model, loaded_model) def test_mlflow_log_model_active_and_specific_run_ids_are_unequal( @@ -101,7 +79,7 @@ def test_mlflow_log_model_active_and_specific_run_ids_are_unequal( 
mlflow.start_run() run_id = mlflow.active_run().info.run_id mlflow.end_run() - saver = MLFlowModelSaver(mode="log", flavor="sklearn", run_id=run_id) + saver = MLFlowModelSaver(flavor="sklearn", run_id=run_id) with mlflow.start_run(): # save model @@ -109,16 +87,6 @@ def test_mlflow_log_model_active_and_specific_run_ids_are_unequal( saver.save_data(fitted_sklearn_model) -def test_mlflow_load_local_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): - model_path = tmp_path / "sklearn_model" - mlflow.sklearn.save_model(fitted_sklearn_model, model_path) - loader = MLFlowModelLoader(path=model_path, flavor="sklearn") - - loaded_model, metadata = loader.load_data(LinearRegression) - - assert coefficients_are_equal(fitted_sklearn_model, loaded_model) - - def test_mlflow_load_runs_model(fitted_sklearn_model: BaseEstimator, tmp_path: Path): mlflow_path = tmp_path / "mlflow_path" artifact_path = "model" @@ -129,12 +97,12 @@ def test_mlflow_load_runs_model(fitted_sklearn_model: BaseEstimator, tmp_path: P # specify run via model_uri loader = MLFlowModelLoader(model_uri=f"runs:/{run_id}/{artifact_path}", flavor="sklearn") - loaded_model, metadata = loader.load_data(LinearRegression) + loaded_model, _ = loader.load_data(LinearRegression) assert coefficients_are_equal(fitted_sklearn_model, loaded_model) # specify run via arguments - loader = MLFlowModelLoader(path=artifact_path, run_id=run_id, mode="runs", flavor="sklearn") - loaded_model, metadata = loader.load_data(LinearRegression) + loader = MLFlowModelLoader(mode="tracking", path=artifact_path, run_id=run_id, flavor="sklearn") + loaded_model, _ = loader.load_data(LinearRegression) assert coefficients_are_equal(fitted_sklearn_model, loaded_model) @@ -154,33 +122,30 @@ def test_mlflow_load_registry_model(fitted_sklearn_model: BaseEstimator, tmp_pat # specify via model_uri loader = MLFlowModelLoader(model_uri=f"models:/{model_name}/{version}", flavor="sklearn") - loaded_model, metadata = 
loader.load_data(LinearRegression) + loaded_model, _ = loader.load_data(LinearRegression) assert coefficients_are_equal(fitted_sklearn_model, loaded_model) # specify via arguments loader = MLFlowModelLoader( mode="registry", model_name=model_name, version=version, flavor="sklearn" ) - loaded_model, metadata = loader.load_data(LinearRegression) + loaded_model, _ = loader.load_data(LinearRegression) assert coefficients_are_equal(fitted_sklearn_model, loaded_model) def test_mlflow_infer_flavor(fitted_sklearn_model: BaseEstimator, tmp_path: Path): - model_path = tmp_path / "sklearn_model" - saver = MLFlowModelSaver(path=model_path) + saver = MLFlowModelSaver(path="model") metadata = saver.save_data(fitted_sklearn_model) - assert metadata["flavor"] == "sklearn" + assert "sklearn" in metadata["flavors"].keys() def test_mlflow_handle_saver_kwargs(): path = "tmp/path" - mode = "save" flavor = "sklearn" - saver = MLFlowModelSaver(path=path, mode=mode, flavor=flavor, unknown_kwarg=True) + saver = MLFlowModelSaver(path=path, flavor=flavor, kwargs=dict(unknown_kwarg=True)) assert saver.path == path - assert saver.mode == mode assert saver.flavor == flavor assert saver.kwargs.get("unknown_kwarg") is True From be20b05310f3aed5d26cb3831b509f716740e15c Mon Sep 17 00:00:00 2001 From: zilto Date: Mon, 10 Jun 2024 18:40:45 -0400 Subject: [PATCH 04/12] removed alias from Saver --- hamilton/plugins/mlflow_extensions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py index c09789ebd..73f966751 100644 --- a/hamilton/plugins/mlflow_extensions.py +++ b/hamilton/plugins/mlflow_extensions.py @@ -24,7 +24,6 @@ class MLFlowModelSaver(DataSaver): path: Union[str, pathlib.Path] = "model" register_as: Optional[str] = None - alias: Optional[str] = None flavor: Optional[str] = None run_id: Optional[str] = None kwargs: Dict[str, Any] = None From 70e620bb7f31e18be0cde9fc1b26aaf872ab65e6 Mon Sep 17 00:00:00 2001 From: zilto 
Date: Mon, 10 Jun 2024 19:42:43 -0400 Subject: [PATCH 05/12] added README & tutorial; updated test requirements --- examples/mlflow/README.md | 34 + examples/mlflow/tutorial.ipynb | 1739 +++++++++++++++++++------ hamilton/plugins/h_mlflow.py | 6 + hamilton/plugins/mlflow_extensions.py | 2 +- requirements-test.txt | 1 + 5 files changed, 1406 insertions(+), 376 deletions(-) diff --git a/examples/mlflow/README.md b/examples/mlflow/README.md index 34efe08bb..ba5d49592 100644 --- a/examples/mlflow/README.md +++ b/examples/mlflow/README.md @@ -1 +1,35 @@ # MLFLow plugin for Hamilton + +[MLFlow](https://mlflow.org/) is an open-source Python framework for experiment tracking. It allows data science teams to store results, artifacts (machine learning models, figures, tables), and metadata in a principled way when executing data pipelines. + +The MLFlow plugin for Hamilton includes two sets of features: +- Save and load machine learning models with the `MLFlowModelSaver` and `MLFlowModelLoader` materializers +- Automatically track data pipeline results in MLFlow with the `MLFlowTracker`. + +This pairs nicely with the `HamiltonTracker` and the [Hamilton UI](https://hamilton.dagworks.io/en/latest/hamilton-ui/ui/) which gives you execution observability. + +We're looking forward to better linking Hamilton "projects" with MLFlow "experiments" and runs from both projects. + +## Instructions +1. Create a virtual environment and activate it + ```console + python -m venv venv && . venv/bin/activate + ``` + +2. Install requirements for the Hamilton code + ```console + pip install -r requirements.txt + ``` + +3. Explore the notebook `tutorial.ipynb` + + +4.
Launch the MLFlow user interface to explore results + ```console + mlflow ui + ``` + +## Going further +- Learn the basics of Hamilton via the `Concepts/` [documentation section](https://hamilton.dagworks.io/en/latest/concepts/node/) +- Visit [tryhamilton.dev](https://www.tryhamilton.dev/) for an interactive tutorial in your browser +- Visit the [DAGWorks blog](https://blog.dagworks.io/) for more detailed guides diff --git a/examples/mlflow/tutorial.ipynb b/examples/mlflow/tutorial.ipynb index 210a62395..e0b7ac547 100644 --- a/examples/mlflow/tutorial.ipynb +++ b/examples/mlflow/tutorial.ipynb @@ -1,5 +1,25 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLFlow plugin tutorial\n", + "This notebook shows how to use the MLFlow plugin for Hamilton. The first three sections present minimal examples to introduce the core functionalities:\n", + "1. Training and saving a model with `MLFlowModelSaver`\n", + "2. Loading a model for inference with `MLFlowModelLoader`\n", + "3. Automatically tracking execution results with `MLFlowTracker`\n", + "\n", + "The following sections give details about individual features. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the [notebook extension](https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/jupyter_notebook_magic) for Hamilton. It allows us to define a dataflow in a code cell." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -13,14 +33,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Model Training Dataflow" + "## 1. Training and saving a model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1. Define" + "### 1.1 Define\n", + "We define a simple dataflow that loads the titanic dataset and trains a logistic regression to predict survival. 
The function parameters specify the dependencies between nodes of the dataflow.\n", + "\n", + "The first line of the cell `%%cell_to_module model_training --display` is related to the notebook extension and means this cell will define a self-contained Python module named `model_training`" ] }, { @@ -37,262 +60,1266 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trained_model\n", + "\n", + "trained_model\n", + "LogisticRegression\n", + "\n", + "\n", + "\n", + "y->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%cell_to_module model_training --display\n", + "import pandas as pd\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.linear_model import LogisticRegression\n", + "from hamilton.function_modifiers import extract_fields\n", + "\n", + "# split the returned dictionary into 2 nodes: `X` and `y`\n", + "@extract_fields(dict(X=pd.DataFrame, y=pd.Series))\n", + "def load_data() -> dict:\n", + " \"\"\"Load the titanic dataset and split it in X and y. 
\n", + " Only keep the columns `fare` and `age` and fill null values.\n", + " \"\"\"\n", + " X, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\n", + " X = X[[\"fare\", \"age\"]].fillna(0)\n", + " return dict(X=X, y=y)\n", + "\n", + "def trained_model(X: pd.DataFrame, y: pd.Series) -> LogisticRegression:\n", + " \"\"\"Fit a binary classifier on the data\"\"\"\n", + " model = LogisticRegression()\n", + " model.fit(X, y)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Assemble\n", + "To execute code, we build the `Driver` with the module `model_training` defined in the previous cell. \n", + "\n", + "The statement `to.mlflow()` creates a `MLFlowModelSaver` that registers the model returned by `trained_model()` as `my_predictor` in the MLFlow model registry. We add this to the `Driver` using\n", + "`.with_materializers()`. A new node will be displayed in the visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "%3\n", - "\n", + "\n", "\n", "cluster__legend\n", - "\n", - "Legend\n", + "\n", + "Legend\n", "\n", - "\n", + "\n", "\n", - "algo\n", - "\n", - "\n", - "\n", - "algo\n", - "logistic_regression\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "trained_model\n", + "\n", + "trained_model\n", + "LogisticRegression\n", + "\n", + "\n", + "\n", + "X->trained_model\n", + "\n", + "\n", "\n", "\n", "\n", "load_data\n", - "\n", - "load_data\n", - "dict\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", "\n", "\n", "\n", "y\n", - "\n", - "y\n", - "Series\n", + "\n", + "y\n", + "Series\n", "\n", "\n", - "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "trained_model__mlflow\n", + "\n", + "\n", + "trained_model__mlflow\n", + "MLFlowModelSaver\n", + "\n", + 
"\n", + "\n", + "trained_model->trained_model__mlflow\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n", + "materializer\n", + "\n", + "\n", + "materializer\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hamilton import driver\n", + "from hamilton.io.materialization import to\n", + "\n", + "model_saver = to.mlflow(\n", + " id=\"trained_model__mlflow\", # name given to the saver\n", + " dependencies=[\"trained_model\"], # node returning the model\n", + " register_as=\"my_predictor\", # name of the model in the MLFlow registry\n", + ")\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_modules(model_training)\n", + " .with_materializers(model_saver)\n", + " .build()\n", + ")\n", + "dr" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mInit signature:\u001b[0m\n", + "\u001b[0mMLFlowModelSaver\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpathlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'model'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mregister_as\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mDict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "Save model to the MLFlow tracking server using `.log_model()`\n", + "\n", + ":param path: Run relative path to store model. Will constitute the model URI.\n", + ":param register_as: If not None, register the model under the specified name.\n", + ":param flavor: Library format to save the model (sklearn, xgboost, etc.). Automatically inferred if None.\n", + ":param run_id: Log model to a specific run. Leave to `None` if using the `MLFlowTracker`\n", + ":param kwargs: Arguments for `.log_model()`. Can be flavor-specific.\n", + "\u001b[0;31mFile:\u001b[0m ~/projects/dagworks/hamilton/hamilton/plugins/mlflow_extensions.py\n", + "\u001b[0;31mType:\u001b[0m ABCMeta\n", + "\u001b[0;31mSubclasses:\u001b[0m " + ] + } + ], + "source": [ + "# see the full API\n", + "from hamilton.plugins.mlflow_extensions import MLFlowModelSaver\n", + "MLFlowModelSaver?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.3 Execute\n", + "We execute our dataflow by calling `Driver.execute()` and requesting node names. 
Requesting `trained_model` will train the model and return it. Requesting `trained_model__mlflow` will train the model, save it, and return metadata. We then visualize the execution path " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'my_predictor' already exists. Creating a new version of this model...\n", + "Created version '7' of model 'my_predictor'.\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", "load_data->y\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "X\n", - "\n", - "X\n", - "DataFrame\n", + "\n", + "X\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "load_data->X\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_train\n", - "\n", - "y_train\n", - "Series\n", + "\n", + "\n", + "trained_model__mlflow\n", + "\n", + "\n", + "trained_model__mlflow\n", + "MLFlowModelSaver\n", "\n", "\n", - "\n", + "\n", + "trained_model\n", + "\n", + "trained_model\n", + "LogisticRegression\n", + "\n", + "\n", + "\n", + "trained_model->trained_model__mlflow\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "X->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n", + "output\n", + "\n", + "output\n", + "\n", + "\n", + "\n", + "materializer\n", + "\n", + "\n", + "materializer\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = dr.execute([\"trained_model\", \"trained_model__mlflow\"])\n", + 
"dr.visualize_execution([\"trained_model\", \"trained_model__mlflow\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'artifact_path': 'model',\n", + " 'flavors': {'python_function': {'model_path': 'model.pkl',\n", + " 'predict_fn': 'predict',\n", + " 'loader_module': 'mlflow.sklearn',\n", + " 'python_version': '3.11.1',\n", + " 'env': {'conda': 'conda.yaml', 'virtualenv': 'python_env.yaml'}},\n", + " 'sklearn': {'pickled_model': 'model.pkl',\n", + " 'sklearn_version': '1.5.0',\n", + " 'serialization_format': 'cloudpickle',\n", + " 'code': None}},\n", + " 'model_uri': 'runs:/67ebe9a6acdd402785428defe00b8c03/model',\n", + " 'model_uuid': '6641c1731cf549a08655dc0836ea51b1',\n", + " 'run_id': '67ebe9a6acdd402785428defe00b8c03',\n", + " 'saved_input_example_info': None,\n", + " 'signature_dict': None,\n", + " 'signature': None,\n", + " 'utc_time_created': '2024-06-10 23:38:25.769987',\n", + " 'mlflow_version': '2.13.2',\n", + " 'metadata': None,\n", + " 'registered_model': {'name': 'my_predictor',\n", + " 'version': 7,\n", + " 'creation_time': 1718062707186,\n", + " 'last_updated_timestamp': 1718062707186,\n", + " 'description': None,\n", + " 'user_id': None,\n", + " 'current_stage': 'None',\n", + " 'source': 'file:///home/tjean/projects/dagworks/hamilton/examples/mlflow/mlruns/0/67ebe9a6acdd402785428defe00b8c03/artifacts/model',\n", + " 'run_id': '67ebe9a6acdd402785428defe00b8c03',\n", + " 'run_link': None,\n", + " 'status': 'READY',\n", + " 'status_message': None,\n", + " 'tags': {},\n", + " 'aliases': []}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can inspect the model metadata\n", + "results[\"trained_model__mlflow\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Model Inference Dataflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Define\n", + "We define a simple dataflow that uses a trained model to make predictions on user inputs. Parameters that point to no other functions (e.g, `user_input`) are called \"inputs\" as you see on the visualization.\n", + "\n", + "We annotate `model: BaseEstimator` to allow any scikit-learn model to be passed." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "preprocessed_inputs\n", + "\n", + "preprocessed_inputs\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "prediction\n", + "\n", + "prediction\n", + "int\n", + "\n", + "\n", + "\n", + "preprocessed_inputs->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs\n", + "\n", + "user_input\n", + "dict\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs->preprocessed_inputs\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_prediction_inputs\n", + "\n", + "model\n", + "BaseEstimator\n", + "\n", + "\n", + "\n", + "_prediction_inputs->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "input\n", + "\n", + "input\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%cell_to_module model_inference --display\n", + "import pandas as pd\n", + "from sklearn.base import BaseEstimator\n", + "\n", + "def preprocessed_inputs(user_input: dict) -> pd.DataFrame:\n", + " df = pd.DataFrame(user_input, index=[0])\n", + " return df\n", + "\n", + "def prediction(preprocessed_inputs: pd.DataFrame, model: BaseEstimator) -> int:\n", + " return model.predict(preprocessed_inputs)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### 2.2 Assemble\n", + "Again, we create a `Driver`, but this time we use `from_.mlflow()` to create a `MLFlowModelLoader` that looks for model `my_predictor` in the MLFlow registry. We pass this object through `.with_materializers()`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "model\n", + "\n", + "model\n", + "BaseEstimator\n", + "\n", + "\n", + "\n", + "prediction\n", + "\n", + "prediction\n", + "int\n", + "\n", + "\n", + "\n", + "model->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "preprocessed_inputs\n", + "\n", + "preprocessed_inputs\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "preprocessed_inputs->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "load_data.model\n", + "\n", + "load_data.model\n", + "Tuple\n", + "\n", + "\n", + "\n", + "load_data.model->model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs\n", + "\n", + "user_input\n", + "dict\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs->preprocessed_inputs\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "input\n", + "\n", + "input\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hamilton import driver\n", + "from hamilton.io.materialization import from_\n", + "\n", + "model_loader = from_.mlflow(\n", + " target=\"model\",\n", + " mode=\"registry\",\n", + " model_name=\"my_predictor\",\n", + " version=1,\n", + ")\n", + "\n", + "dr = (\n", + " driver.Builder()\n", + " .with_modules(model_inference)\n", + " .with_materializers(model_loader)\n", + " .build()\n", + ")\n", + "dr" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mInit signature:\u001b[0m\n", + "\u001b[0mMLFlowModelLoader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_uri\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mLiteral\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'tracking'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'registry'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'tracking'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpathlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'model'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mversion\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNoneType\u001b[0m\u001b[0;34m]\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mversion_alias\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mDict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "Load model from the MLFlow tracking server or model registry using .load_model()\n", + "You can pass a model URI or the necessary metadata to retrieve the model\n", + "\n", + ":param model_uri: Model location starting as `runs:/` for tracking or `models:/` for registry\n", + ":param mode: `tracking` or registry`. tracking needs `run_id` and `path`. registry needs `model_name` and `version` or `version_alias`.\n", + ":param run_id: Run id of the model on the tracking server\n", + ":param path: Run relative path where the model is stored\n", + ":param model_name: Name of the registered model (equivalent to `register_as` in model saver)\n", + ":param version: Version of the registered model. Can pass as string `v1` or integer `1`\n", + ":param version_alias: Version alias of the registered model. 
Specify either this or `version`\n", + ":param flavor: Library format to load the model (sklearn, xgboost, etc.). Automatically inferred if None.\n", + ":param kwargs: Arguments for `.load_model()`. Can be flavor-specific.\n", + "\u001b[0;31mFile:\u001b[0m ~/projects/dagworks/hamilton/hamilton/plugins/mlflow_extensions.py\n", + "\u001b[0;31mType:\u001b[0m ABCMeta\n", + "\u001b[0;31mSubclasses:\u001b[0m " + ] + } + ], + "source": [ + "# see the full API\n", + "from hamilton.plugins.mlflow_extensions import MLFlowModelLoader\n", + "MLFlowModelLoader?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Execute\n", + "We simulate user inputs that match the `fare` and `age` columns of the training data. Then, we request `prediction` and `load_data.model` to return the loaded model." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "model\n", + "\n", + "model\n", + "BaseEstimator\n", + "\n", + "\n", + "\n", + "prediction\n", + "\n", + "prediction\n", + "int\n", + "\n", + "\n", + "\n", + "model->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "preprocessed_inputs\n", + "\n", + "preprocessed_inputs\n", + "DataFrame\n", + "\n", + "\n", + "\n", + "preprocessed_inputs->prediction\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "load_data.model\n", + "\n", + "load_data.model\n", + "Tuple\n", + "\n", + "\n", + "\n", + "load_data.model->model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs\n", + "\n", + "user_input\n", + "dict\n", + "\n", + "\n", + "\n", + "_preprocessed_inputs_inputs->preprocessed_inputs\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "input\n", + "\n", + "input\n", + "\n", + "\n", + "\n", + "function\n", + "\n", + "function\n", + "\n", + "\n", + "\n", + "output\n", + "\n", + 
"output\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = dict(user_input={\"fare\": 10.72, \"age\": 48})\n", + "\n", + "results = dr.execute([\"prediction\", \"load_data.model\"], inputs=inputs)\n", + "dr.visualize_execution([\"prediction\", \"load_data.model\"], inputs=inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['0']\n" + ] + }, + { + "data": { + "text/plain": [ + "(LogisticRegression(),\n", + " {'artifact_path': 'model',\n", + " 'flavors': {'python_function': {'env': {'conda': 'conda.yaml',\n", + " 'virtualenv': 'python_env.yaml'},\n", + " 'loader_module': 'mlflow.sklearn',\n", + " 'model_path': 'model.pkl',\n", + " 'predict_fn': 'predict',\n", + " 'python_version': '3.11.1'},\n", + " 'sklearn': {'code': None,\n", + " 'pickled_model': 'model.pkl',\n", + " 'serialization_format': 'cloudpickle',\n", + " 'sklearn_version': '1.5.0'}},\n", + " 'model_uri': 'models:/my_predictor/1',\n", + " 'model_uuid': '5cfda7f11ed440e6823c13a99dc47471',\n", + " 'run_id': '8099b8e575d04476b47960431d17f9f5',\n", + " 'saved_input_example_info': None,\n", + " 'signature_dict': None,\n", + " 'signature': None,\n", + " 'utc_time_created': '2024-06-10 22:43:07.254057',\n", + " 'mlflow_version': '2.13.2',\n", + " 'metadata': None})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can inspect the prediction\n", + "# load_data.model returns a tuple (model, model metadata)\n", + "print(results[\"prediction\"])\n", + "results[\"load_data.model\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. MLFlowTracker\n", + "So far, we saved and loaded models, but the MLFlow metadata is almost empty. 
By adding the `MLFlowTracker()`, we can automatically track run configurations, metrics, figures, and other artifacts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Define\n", + "We define a slightly more complex pipeline that splits the dataset into training and test sets. Then, we compute the model performance on each set and produce a scatter plot of features and correct/incorrect predictions.\n", + "\n", + "Notice that no `mlflow` statement is needed in our dataflow definition." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "cluster__legend\n", + "\n", + "Legend\n", + "\n", + "\n", + "\n", + "test_scatter_plot\n", + "\n", + "test_scatter_plot\n", + "Figure\n", + "\n", + "\n", + "\n", + "train_performance\n", + "\n", + "train_performance\n", + "float\n", + "\n", + "\n", + "\n", + "y_test\n", + "\n", + "y_test\n", + "Series\n", + "\n", + "\n", + "\n", + "y_test->test_scatter_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "test_performance\n", + "\n", + "test_performance\n", + "float\n", + "\n", + "\n", + "\n", + "y_test->test_performance\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y_train\n", + "\n", + "y_train\n", + "Series\n", + "\n", + "\n", + "\n", + "y_train->train_performance\n", + "\n", + "\n", + "\n", + "\n", + "\n", "trained_model\n", - "\n", - "trained_model: algo\n", - "BaseEstimator\n", + "\n", + "trained_model\n", + "BaseEstimator\n", "\n", "\n", - "\n", + "\n", "y_train->trained_model\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_train\n", - "\n", - "X_train\n", - "DataFrame\n", + "\n", + "\n", + "test_predictions\n", + "\n", + "test_predictions\n", + "Series\n", "\n", - "\n", - "\n", - "X_train->trained_model\n", - "\n", - "\n", + "\n", + "\n", + "test_predictions->test_scatter_plot\n", + "\n", + "\n", "\n", - "\n", + "\n", +
"\n", + "test_predictions->test_performance\n", + "\n", + "\n", + "\n", + "\n", "\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", + "\n", + "\n", + "\n", "split_dataset\n", - "\n", - "split_dataset\n", - "dict\n", + "\n", + "split_dataset\n", + "dict\n", "\n", - "\n", - "\n", - "y->split_dataset\n", - "\n", - "\n", + "\n", + "\n", + "X->split_dataset\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_preprocessed\n", - "\n", - "X_preprocessed\n", - "DataFrame\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", "\n", - "\n", - "\n", - "X_preprocessed->split_dataset\n", - "\n", - "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "split_dataset->y_test\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "split_dataset->y_train\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "split_dataset->X_train\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "X_test\n", - "\n", - "X_test\n", - "DataFrame\n", + "\n", + "X_test\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "split_dataset->X_test\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_test\n", - "\n", - "y_test\n", - "Series\n", + "\n", + "\n", + "X_train\n", + "\n", + "X_train\n", + "DataFrame\n", "\n", - "\n", - "\n", - "split_dataset->y_test\n", - "\n", - "\n", + "\n", + "\n", + "split_dataset->X_train\n", + "\n", + "\n", "\n", - "\n", - "\n", - "test_scatter_plot\n", - "\n", - "test_scatter_plot\n", - "Figure\n", + "\n", + "\n", + "train_predictions\n", + "\n", + "train_predictions\n", + "Series\n", "\n", - "\n", - "\n", - "y_test_predictions\n", - "\n", - "y_test_predictions\n", - "Series\n", + "\n", + "\n", + "train_predictions->train_performance\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "y_test_predictions->test_scatter_plot\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "test_performance\n", - "\n", - 
"test_performance: algo\n", - "float\n", + "trained_model->test_predictions\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "y_test_predictions->test_performance\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "trained_model->y_test_predictions\n", - "\n", - "\n", + "trained_model->train_predictions\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "X_test->test_scatter_plot\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_test->y_test_predictions\n", - "\n", - "\n", + "\n", + "\n", + "X_test->test_predictions\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X->X_preprocessed\n", - "\n", - "\n", + "\n", + "\n", + "X_train->train_predictions\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_test->test_scatter_plot\n", - "\n", - "\n", + "\n", + "\n", + "X_train->trained_model\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_test->test_performance\n", - "\n", - "\n", + "\n", + "\n", + "y->split_dataset\n", + "\n", + "\n", "\n", "\n", "\n", "_split_dataset_inputs\n", - "\n", - "test_size_fraction\n", - "float\n", + "\n", + "test_size_fraction\n", + "float\n", "\n", "\n", - "\n", + "\n", "_split_dataset_inputs->split_dataset\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "config\n", - "\n", - "\n", - "\n", - "config\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "input\n", - "\n", - "input\n", + "\n", + "input\n", "\n", "\n", - "\n", + "\n", "function\n", - "\n", - "function\n", + "\n", + "function\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -300,90 +1327,70 @@ } ], "source": [ - "%%cell_to_module model_training --display --config algo=logistic_regression\n", - "from typing import Dict, Union\n", - "\n", + "%%cell_to_module model_training_2 --display\n", "import pandas as pd\n", "import matplotlib.figure\n", "import matplotlib.pyplot as plt\n", - "import numpy as np\n", "from sklearn.base import BaseEstimator\n", "from sklearn.datasets import fetch_openml\n", "from sklearn.linear_model import LogisticRegression, LinearRegression\n", 
"from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import balanced_accuracy_score, mean_absolute_error\n", - "from hamilton.function_modifiers import extract_fields, tag, config\n", + "from sklearn.metrics import balanced_accuracy_score\n", + "from hamilton.function_modifiers import extract_fields\n", "\n", "\n", "@extract_fields(dict(X=pd.DataFrame, y=pd.Series))\n", "def load_data() -> dict:\n", + " \"\"\"Load the titanic dataset and split it in X and y. \n", + " Only keep the columns `fare` and `age` and fill null values.\n", + " \"\"\"\n", " X, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\n", + " X = X[[\"fare\", \"age\"]].fillna(0)\n", " return dict(X=X, y=y)\n", "\n", "\n", - "def X_preprocessed(X: pd.DataFrame) -> pd.DataFrame:\n", - " column_selection = [\"fare\", \"age\"]\n", - " X = X[column_selection]\n", - " X = X.fillna(0)\n", - " return X \n", - "\n", - "\n", "@extract_fields(dict(\n", - " X_train=pd.DataFrame,\n", - " y_train=pd.Series,\n", - " X_test=pd.DataFrame,\n", - " y_test=pd.Series,\n", + " X_train=pd.DataFrame, y_train=pd.Series,\n", + " X_test=pd.DataFrame, y_test=pd.Series,\n", "))\n", "def split_dataset(\n", - " X_preprocessed: pd.DataFrame,\n", + " X: pd.DataFrame,\n", " y: pd.Series,\n", " test_size_fraction: float = 0.3\n", ") -> dict:\n", - " \"\"\"Load the titanic dataset and partition it in X_train, y_train, X_test, y_test\"\"\"\n", + " \"\"\"Partition the dataset into training and testing sets.\"\"\"\n", " X_train, X_test, y_train, y_test = train_test_split(\n", - " X_preprocessed, y, test_size=test_size_fraction,\n", - " )\n", - " return dict(\n", - " X_train=X_train,\n", - " y_train=y_train,\n", - " X_test=X_test,\n", - " y_test=y_test,\n", + " X, y, test_size=test_size_fraction,\n", " )\n", + " return dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)\n", "\n", - "@tag(team=\"forecast\")\n", - "@config.when(algo=\"logistic_regression\")\n", - "def 
trained_model__loistic(X_train: pd.DataFrame, y_train: pd.Series, hparams: dict) -> BaseEstimator:\n", - " \"\"\"Fit a binary classifier on the training data\"\"\"\n", + "def trained_model(X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator:\n", + " \"\"\"Binary classifier fitted on the training data\"\"\"\n", " model = LogisticRegression()\n", " model.fit(X_train, y_train)\n", " return model\n", "\n", + "def train_predictions(trained_model: BaseEstimator, X_train: pd.DataFrame) -> pd.Series:\n", + " return trained_model.predict(X_train)\n", "\n", - "@tag(team=\"forecast\")\n", - "@config.when(algo=\"linear_regression\")\n", - "def trained_model__linear(X_train: pd.DataFrame, y_train: pd.Series) -> BaseEstimator:\n", - " \"\"\"Fit a binary classifier on the training data\"\"\"\n", - " model = LinearRegression()\n", - " model.fit(X_train, y_train)\n", - " return model\n", + "def train_performance(y_train: pd.Series, train_predictions: pd.Series) -> float:\n", + " \"\"\"Balanced accuracy on the training set\"\"\"\n", + " return balanced_accuracy_score(y_train, train_predictions)\n", "\n", - "def y_test_predictions(trained_model: BaseEstimator, X_test: pd.DataFrame) -> pd.Series:\n", + "def test_predictions(trained_model: BaseEstimator, X_test: pd.DataFrame) -> pd.Series:\n", " return trained_model.predict(X_test)\n", "\n", - "@config.when(algo=\"logistic_regression\")\n", - "def test_performance__logistic(y_test: pd.Series, y_test_predictions: pd.Series) -> float:\n", - " return balanced_accuracy_score(y_test, y_test_predictions)\n", - "\n", - "@config.when(algo=\"linear_regression\")\n", - "def test_performance__linear(y_test: pd.Series, y_test_predictions: pd.Series) -> float:\n", - " return mean_absolute_error(y_test, y_test_predictions)\n", + "def test_performance(y_test: pd.Series, test_predictions: pd.Series) -> float:\n", + " \"\"\"Balanced accuracy on the test set\"\"\"\n", + " return balanced_accuracy_score(y_test, test_predictions)\n", "\n",
"def test_scatter_plot(\n", " X_test: pd.DataFrame,\n", " y_test: pd.Series,\n", - " y_test_predictions: pd.Series,\n", + " test_predictions: pd.Series,\n", ") -> matplotlib.figure.Figure:\n", - " correctly_predicted = y_test == y_test_predictions\n", + " \"\"\"Scatter plot of fare and age with colors for correct/incorrect predictions\"\"\"\n", + " correctly_predicted = y_test == test_predictions\n", " feature_1 = X_test.iloc[:, 0]\n", " feature_2 = X_test.iloc[:, 1]\n", "\n", @@ -396,245 +1403,227 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. Assemble" + "### 3.2 Assemble\n", + "This code is just like Section #1.2, but we add a `MLFlowTracker()` to the `Driver` by passing it to `.with_materializers()`. This objects accepts many arguments to set the right tracking and registry server, specify the experiment names, and set other metadata. Generally, the defaults are sufficient if you're developing locally. " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from hamilton import driver\n", "from hamilton.io.materialization import to\n", + "from hamilton.plugins.h_mlflow import MLFlowTracker\n", "\n", "dr = (\n", " driver.Builder()\n", - " .with_modules(model_training)\n", - " .with_config(dict(algo=\"logistic_regression\"))\n", + " .with_modules(model_training_2)\n", + " .with_adapters(MLFlowTracker())\n", " .with_materializers(\n", " to.mlflow(\n", " id=\"trained_model__mlflow\",\n", " dependencies=[\"trained_model\"],\n", - " mode=\"runs\",\n", - " register=True,\n", - " model_name=\"my_classifier\",\n", + " register_as=\"my_new_model\",\n", " ),\n", " )\n", " .build()\n", - ")\n", - "dr" + ")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 14, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mInit signature:\u001b[0m\n", + 
"\u001b[0mMLFlowTracker\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtracking_uri\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mregistry_uri\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0martifact_location\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mexperiment_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Hamilton'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mexperiment_tags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mexperiment_description\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_name\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_tags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_description\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlog_system_metrics\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m Driver adapter logging Hamilton execution results to an MLFlow server.\n", + "\u001b[0;31mInit docstring:\u001b[0m\n", + "Configure the MLFlow client and experiment for the lifetime of the tracker\n", + "\n", + ":param tracking_uri: Destination of the logged artifacts and metadata. It can be a filesystem, database, or server. [reference](https://mlflow.org/docs/latest/getting-started/tracking-server-overview/index.html)\n", + ":param registry_uri: Destination of the registered models. By default it's the same as the tracking destination, but they can be different. 
[reference](https://mlflow.org/docs/latest/getting-started/registering-first-model/index.html)\n", + ":param artifact_location: Root path on tracking server where experiment is stored\n", + ":param experiment_name: MLFlow experiment name used to group runs.\n", + ":param experiment_tags: Tags to query experiments programmatically (not displayed).\n", + ":param experiment_description: Description of the experiment displayed\n", + ":param run_id: Run id to log to an existing run (every execution logs to the same run)\n", + ":param run_name: Run name displayed and used to query runs. You can have multiple runs with the same name but different run ids.\n", + ":param run_tags: Tags to query runs and appears as columns in the UI for filtering and grouping. It automatically includes serializable inputs and Driver config.\n", + ":param run_description: Description of the run displayed\n", + ":param log_system_metrics: Log system metrics to display (requires additonal dependencies)\n", + "\u001b[0;31mFile:\u001b[0m ~/projects/dagworks/hamilton/hamilton/plugins/h_mlflow.py\n", + "\u001b[0;31mType:\u001b[0m ABCMeta\n", + "\u001b[0;31mSubclasses:\u001b[0m " + ] + } + ], "source": [ - "### 3. Execute" + "MLFlowTracker?" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "final_vars = [\"trained_model__mlflow\", \"y_test_predictions\", \"test_performance\"]\n", - "results = dr.execute(final_vars)\n", - "dr.visualize_execution(final_vars)" + "### 3.3 Execute\n", + "Like before, we request nodes for execution. But this time, all requested nodes will be logged in MLFlow, not just model savers! 
Inputs and other metadata will also be automatically available (see next section for details)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'my_new_model' already exists. Creating a new version of this model...\n", + "Created version '5' of model 'my_new_model'.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# balanced accuracy on test set\n", - "print(results[\"test_performance\"])\n", - "print()\n", - "# metadata of stored model\n", - "results[\"trained_model__mlflow\"]" + "results = dr.execute(\n", + " [\"trained_model__mlflow\", \"train_performance\", \"test_performance\", \"test_scatter_plot\"],\n", + " inputs=dict(test_size_fraction=0.3)\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Model Inference Dataflow" + "## 4. Feature list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1. Define" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%cell_to_module model_inference --display\n", - "from typing import Dict, Union\n", - "\n", - "import pandas as pd\n", - "from sklearn.base import BaseEstimator\n", - "from sklearn.datasets import fetch_openml\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import balanced_accuracy_score\n", - "from hamilton.function_modifiers import extract_fields\n", - "\n", - "# import the preprocessing function from the module above\n", - "from model_training import X_preprocessed\n", - "\n", - "def preprocessed_inputs(user_input: dict) -> pd.DataFrame:\n", - " df = pd.DataFrame(user_input, index=[0])\n", - " df = X_preprocessed(df)\n", - " return df\n", - "\n", - "def prediction(preprocessed_inputs: pd.DataFrame, model: BaseEstimator) -> int:\n", - " return model.predict(preprocessed_inputs)" + "Automatically tracks `.execute(inputs=...)` and `Builder().with_config()` as MLFlow params. This creates columns that you can use to filter runs in the UI." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2. Assemble" + "The run tag `code_version` is automatically added by the `MLFlowTracker`. 
This allows you to know exactly what code was executed and group runs that use the same code, but vary in terms of inputs. " ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from hamilton import driver\n", - "from hamilton.io.materialization import from_\n", - "\n", - "dr = (\n", - " driver.Builder()\n", - " .with_modules(model_inference)\n", - " .with_materializers(\n", - " from_.mlflow(\n", - " target=\"model\",\n", - " mode=\"registry\",\n", - " model_name=\"my_classifier\",\n", - " ),\n", - " )\n", - " .build()\n", - ")\n", - "dr" + "Store the entire `HamiltonGraph` as an artifact `hamilton_graph.json`. This contains the source code of the executed dataflow." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Execute" + "Automatically log `plotly` and `matplotlib` figures as `.png` artifacts. For more control, you can use the `to.plotly()` and `to.plt()` savers. Notably, this allows you to save interactive plotly visualizations as HTML. " ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "inputs = dict(user_input={\"fare\": 10.72, \"age\": 48})\n", - "\n", - "final_vars = [\"prediction\", \"load_data.model\"]\n", - "results = dr.execute(final_vars, inputs=inputs)\n", - "dr.visualize_execution(final_vars, inputs=inputs)" + "Use the `MLFlowTracker` to specify experiment metadata and run metadata that will help you browse the MLFlow UI and run programmatic searches. `experiment_description` and `run_description` accept markdown strings. " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "MlflowException", + "evalue": "Invalid experiment name: Ellipsis.
Expects a string.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mMlflowException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mMLFlowTracker\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mexperiment_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mexperiment_description\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_tags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_description\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/projects/dagworks/hamilton/hamilton/plugins/h_mlflow.py:98\u001b[0m, in \u001b[0;36mMLFlowTracker.__init__\u001b[0;34m(self, tracking_uri, registry_uri, artifact_location, experiment_name, experiment_tags, experiment_description, run_id, run_name, run_tags, run_description, log_system_metrics)\u001b[0m\n\u001b[1;32m 95\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient\u001b[38;5;241m.\u001b[39mset_experiment_tag(experiment_id, key\u001b[38;5;241m=\u001b[39mk, value\u001b[38;5;241m=\u001b[39mv)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# create an experiment\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 98\u001b[0m experiment_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexperiment_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 101\u001b[0m \u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexperiment_tags\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexperiment_id \u001b[38;5;241m=\u001b[39m experiment_id\n\u001b[1;32m 105\u001b[0m \u001b[38;5;66;03m# run setup\u001b[39;00m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;66;03m# TODO link HamiltonTracker and MLFlowTracker run ids\u001b[39;00m\n", + "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/tracking/client.py:1284\u001b[0m, in \u001b[0;36mMlflowClient.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_experiment\u001b[39m(\n\u001b[1;32m 1233\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1234\u001b[0m name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 1235\u001b[0m artifact_location: 
Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1236\u001b[0m tags: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1237\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 1238\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create an experiment.\u001b[39;00m\n\u001b[1;32m 1239\u001b[0m \n\u001b[1;32m 1240\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1282\u001b[0m \n\u001b[1;32m 1283\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tracking_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/tracking/_tracking_service/client.py:498\u001b[0m, in \u001b[0;36mTrackingServiceClient.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Create an experiment.\u001b[39;00m\n\u001b[1;32m 485\u001b[0m \n\u001b[1;32m 486\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 495\u001b[0m \n\u001b[1;32m 496\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 497\u001b[0m _validate_experiment_artifact_location(artifact_location)\n\u001b[0;32m--> 498\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExperimentTag\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py:391\u001b[0m, in \u001b[0;36mFileStore.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_experiment\u001b[39m(\u001b[38;5;28mself\u001b[39m, name, 
artifact_location\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, tags\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 390\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_root_dir()\n\u001b[0;32m--> 391\u001b[0m \u001b[43m_validate_experiment_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_experiment_does_not_exist(name)\n\u001b[1;32m 393\u001b[0m experiment_id \u001b[38;5;241m=\u001b[39m _generate_unique_integer_id()\n", + "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/utils/validation.py:366\u001b[0m, in \u001b[0;36m_validate_experiment_name\u001b[0;34m(experiment_name)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\n\u001b[1;32m 362\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid experiment name: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexperiment_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m, error_code\u001b[38;5;241m=\u001b[39mINVALID_PARAMETER_VALUE\n\u001b[1;32m 363\u001b[0m )\n\u001b[1;32m 365\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_string_type(experiment_name):\n\u001b[0;32m--> 366\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\n\u001b[1;32m 367\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid experiment name: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexperiment_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Expects a string.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 368\u001b[0m error_code\u001b[38;5;241m=\u001b[39mINVALID_PARAMETER_VALUE,\n\u001b[1;32m 369\u001b[0m )\n", + "\u001b[0;31mMlflowException\u001b[0m: Invalid experiment name: Ellipsis. 
Expects a string." + ] + } + ], "source": [ - "print(results[\"prediction\"])\n", - "results[\"load_data.model\"]" + "MLFlowTracker(\n", + " experiment_name=...,\n", + " experiment_description=...,\n", + " run_name=...,\n", + " run_tags=...,\n", + " run_description=...,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## MLFlowTracker" + "The tags specified using the Hamilton decorator `@tag` on the model-producing function are stored in the MLFlow model registry " ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import importlib\n", - "from hamilton import driver\n", - "import hamilton.plugins.h_mlflow\n", - "from hamilton.io.materialization import to\n", - "from hamilton.plugins.h_mlflow import MLFlowTracker\n", - "importlib.reload(hamilton.plugins.h_mlflow)\n", + "import pandas as pd\n", + "from hamilton.function_modifiers import tag\n", + "from sklearn.linear_model import LogisticRegression\n", "\n", - "dr = (\n", - " driver.Builder()\n", - " .with_modules(model_training)\n", - " .with_config(dict(algo=\"logistic_regression\"))\n", - " .with_adapters(\n", - " hamilton.plugins.h_mlflow.MLFlowTracker()\n", - " )\n", - " .with_materializers(\n", - " to.mlflow(\n", - " id=\"trained_model__mlflow\",\n", - " dependencies=[\"trained_model\"],\n", - " mode=\"runs\",\n", - " register=True,\n", - " model_name=\"my_loom_video\",\n", - " ),\n", - " )\n", - " .build()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/tjean/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/_distutils_hack/__init__.py:11: UserWarning: Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. 
To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.\n", - " warnings.warn(\n", - "/home/tjean/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.\n", - " warnings.warn(\"Setuptools is replacing distutils.\")\n", - "Registered model 'my_loom_video' already exists. Creating a new version of this model...\n", - "Created version '2' of model 'my_loom_video'.\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "inputs = dict(test_size_fraction=0.3)\n", - "results = dr.execute(\n", - " [\"trained_model__mlflow\", \"test_performance\", \"test_scatter_plot\"],\n", - " inputs=inputs\n", - ")" + "@tag(team=\"forecast\", feature_set=\"v3\")\n", + "def trained_model(X_train: pd.DataFrame, y_train: pd.Series) -> LogisticRegression:\n", + " \"\"\"Fit a binary classifier on the training data\"\"\"\n", + " model = LogisticRegression()\n", + " model.fit(X_train, y_train)\n", + " return model\n", + "\n", + "# ...\n", + "\n", + "to.mlflow(\n", + " id=\"trained_model__mlflow\",\n", + " dependencies=[\"trained_model\"],\n", + " register_as=\"new_algo\",\n", + ")," ] } ], diff --git a/hamilton/plugins/h_mlflow.py b/hamilton/plugins/h_mlflow.py index 193bc68f9..75ccee81a 100644 --- a/hamilton/plugins/h_mlflow.py +++ b/hamilton/plugins/h_mlflow.py @@ -1,5 +1,6 @@ import logging import pickle +import warnings from typing import Any, Dict, List, Optional, Type, Union import mlflow @@ -8,6 +9,11 @@ from hamilton import graph_types from hamilton.lifecycle import GraphConstructionHook, GraphExecutionHook, NodeExecutionHook +# silence odd ongoing MLFlow issue that spams warnings +# GitHub Issue https://github.com/mlflow/mlflow/issues/8605 +warnings.filterwarnings("ignore", category=UserWarning) + + FIGURE_TYPES = [] try: import matplotlib.figure diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py index 73f966751..396978422 100644 --- a/hamilton/plugins/mlflow_extensions.py +++ b/hamilton/plugins/mlflow_extensions.py @@ -132,7 +132,7 @@ def __post_init__(self): if not self.model_name: raise ValueError("Using `mode='registry` requires passing `model_name`") - if bool(self.version) and bool(self.version_alias): + if not (bool(self.version) ^ bool(self.version_alias)): raise ValueError( "If using `mode='registry'` requires passing `version` OR `version_alias" ) diff --git 
a/requirements-test.txt b/requirements-test.txt index a819c0455..9d79f161d 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -13,6 +13,7 @@ lightgbm lxml lz4 matplotlib +mlflow networkx openpyxl # for excel data loader pandera From 1a93b5790c228d984479aa1ed96b920fa7691aeb Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 10:29:45 -0400 Subject: [PATCH 06/12] fix if condition --- hamilton/plugins/mlflow_extensions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hamilton/plugins/mlflow_extensions.py b/hamilton/plugins/mlflow_extensions.py index 396978422..2d78939b5 100644 --- a/hamilton/plugins/mlflow_extensions.py +++ b/hamilton/plugins/mlflow_extensions.py @@ -139,7 +139,7 @@ def __post_init__(self): if self.version: self.model_uri = f"models:/{self.model_name}/{self.version}" - elif self.version: + elif self.version_alias: self.model_uri = f"models:/{self.model_name}@{self.version_alias}" @classmethod From 047cc4e1c5193f3a37f4db15de5de0ea7d0bea5e Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 10:31:46 -0400 Subject: [PATCH 07/12] fixed README typo --- examples/mlflow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mlflow/README.md b/examples/mlflow/README.md index ba5d49592..327cf7bfb 100644 --- a/examples/mlflow/README.md +++ b/examples/mlflow/README.md @@ -8,7 +8,7 @@ The MLFlow plugin for Hamilton includes two sets of features: This pairs nicely with the `HamiltonTracker` and the [Hamilton UI](https://hamilton.dagworks.io/en/latest/hamilton-ui/ui/) which gives you execution observability. -We're looking forward to better link Hamilton "projects" with MLFlow "experiments" and runs from both projects. +We're working on better linking Hamilton "projects" with MLFlow "experiments" and runs from both projects. ## Instructions 1. 
Create a virtual environment and activate it From 94308360a11e3cad5b16c3f959bf1dd779817091 Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 10:33:24 -0400 Subject: [PATCH 08/12] updated README --- examples/mlflow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mlflow/README.md b/examples/mlflow/README.md index 327cf7bfb..4891302b5 100644 --- a/examples/mlflow/README.md +++ b/examples/mlflow/README.md @@ -6,7 +6,7 @@ The MLFlow plugin for Hamilton includes two sets of features: - Save and load machine learning models with the `MLFlowModelSaver` and `MLFlowModelLoader` materializers - Automatically track data pipeline results in MLFlow with the `MLFlowTracker`. -This pairs nicely with the `HamiltonTracker` and the [Hamilton UI](https://hamilton.dagworks.io/en/latest/hamilton-ui/ui/) which gives you execution observability. +This pairs nicely with the `HamiltonTracker` and the [Hamilton UI](https://hamilton.dagworks.io/en/latest/hamilton-ui/ui/) which gives you a way to explore your pipeline code, attributes of the artifacts produced, and execution observability. We're working on better linking Hamilton "projects" with MLFlow "experiments" and runs from both projects. From f3a2e0ec33ddc1e707453e74b6787e3d5f8e19f9 Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 10:39:05 -0400 Subject: [PATCH 09/12] added a test for HamiltonTracker and DataSaver contract --- tests/plugins/test_mlflow_extension.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/plugins/test_mlflow_extension.py b/tests/plugins/test_mlflow_extension.py index 860e108ab..f9f2dadfc 100644 --- a/tests/plugins/test_mlflow_extension.py +++ b/tests/plugins/test_mlflow_extension.py @@ -8,6 +8,9 @@ from hamilton.plugins.mlflow_extensions import MLFlowModelLoader, MLFlowModelSaver +# TODO move these tests to `plugin_tests` because the required read-writes can get +# complicated and tests are time consuming. 
+ @pytest.fixture def fitted_sklearn_model() -> BaseEstimator: @@ -149,3 +152,17 @@ def test_mlflow_handle_saver_kwargs(): assert saver.path == path assert saver.flavor == flavor assert saver.kwargs.get("unknown_kwarg") is True + + +def test_mlflow_registered_model_metadata(fitted_sklearn_model: BaseEstimator, tmp_path: Path): + """When registering a model through materializers, the metadata must contain the + key `registered_model` because the `hamilton.plugins.h_mlflow.MLFlowTracker` is expecting it. + """ + model_path = tmp_path / "sklearn_model" + saver = MLFlowModelSaver(flavor="sklearn", register_as="my_model") + + mlflow.set_tracking_uri(model_path.as_uri()) + with mlflow.start_run(): + metadata = saver.save_data(fitted_sklearn_model) + + assert metadata.get("registered_model") From f73d81eab7f4d04ac057224c37850ccd31dc4a77 Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 11:01:55 -0400 Subject: [PATCH 10/12] updated tutorial notebook --- examples/mlflow/tutorial.ipynb | 986 +++++++++++++++++---------------- 1 file changed, 515 insertions(+), 471 deletions(-) diff --git a/examples/mlflow/tutorial.ipynb b/examples/mlflow/tutorial.ipynb index e0b7ac547..1e6dd950d 100644 --- a/examples/mlflow/tutorial.ipynb +++ b/examples/mlflow/tutorial.ipynb @@ -70,38 +70,12 @@ "\n", "Legend\n", "
\n", - "\n", - "\n", - "load_data\n", - "\n", - "load_data\n", - "dict\n", - "\n", - "\n", - "\n", - "y\n", - "\n", - "y\n", - "Series\n", - "\n", - "\n", - "\n", - "load_data->y\n", - "\n", - "\n", - "\n", "\n", - "\n", + "\n", "X\n", - "\n", - "X\n", - "DataFrame\n", - "\n", - "\n", - "\n", - "load_data->X\n", - "\n", - "\n", + "\n", + "X\n", + "DataFrame\n", "\n", "\n", "\n", @@ -110,17 +84,43 @@ "trained_model\n", "LogisticRegression\n", "\n", - "\n", + "\n", "\n", + "X->trained_model\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", "y->trained_model\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", "\n", - "X->trained_model\n", - "\n", - "\n", + "load_data->X\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", "\n", "\n", "\n", @@ -132,7 +132,7 @@ "
\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -199,12 +199,12 @@ "\n", "Legend\n", "\n", - "\n", + "\n", "\n", - "X\n", - "\n", - "X\n", - "DataFrame\n", + "y\n", + "\n", + "y\n", + "Series\n", "\n", "\n", "\n", @@ -213,37 +213,24 @@ "trained_model\n", "LogisticRegression\n", "\n", - "\n", - "\n", - "X->trained_model\n", - "\n", - "\n", + "\n", + "\n", + "y->trained_model\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "load_data\n", - "\n", - "load_data\n", - "dict\n", - "\n", - "\n", - "\n", - "load_data->X\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "y\n", - "\n", - "y\n", - "Series\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", "\n", - "\n", - "\n", - "load_data->y\n", - "\n", - "\n", + "\n", + "\n", + "X->trained_model\n", + "\n", + "\n", "\n", "\n", "\n", @@ -254,16 +241,29 @@ "MLFlowModelSaver\n", "\n", "\n", - "\n", + "\n", "trained_model->trained_model__mlflow\n", "\n", "\n", "\n", - "\n", - "\n", - "y->trained_model\n", - "\n", - "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", "\n", "\n", "\n", @@ -282,7 +282,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -364,7 +364,7 @@ "output_type": "stream", "text": [ "Registered model 'my_predictor' already exists. 
Creating a new version of this model...\n", - "Created version '7' of model 'my_predictor'.\n" + "Created version '8' of model 'my_predictor'.\n" ] }, { @@ -386,38 +386,25 @@ "\n", "Legend\n", "\n", - "\n", - "\n", - "load_data\n", - "\n", - "load_data\n", - "dict\n", - "\n", - "\n", - "\n", - "y\n", - "\n", - "y\n", - "Series\n", - "\n", - "\n", - "\n", - "load_data->y\n", - "\n", - "\n", - "\n", "\n", - "\n", + "\n", "X\n", - "\n", - "X\n", - "DataFrame\n", + "\n", + "X\n", + "DataFrame\n", "\n", - "\n", - "\n", - "load_data->X\n", - "\n", - "\n", + "\n", + "\n", + "trained_model\n", + "\n", + "trained_model\n", + "LogisticRegression\n", + "\n", + "\n", + "\n", + "X->trained_model\n", + "\n", + "\n", "\n", "\n", "\n", @@ -427,30 +414,43 @@ "trained_model__mlflow\n", "MLFlowModelSaver\n", "\n", - "\n", - "\n", - "trained_model\n", - "\n", - "trained_model\n", - "LogisticRegression\n", - "\n", "\n", - "\n", + "\n", "trained_model->trained_model__mlflow\n", "\n", "\n", "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", + "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", + "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", + "\n", "\n", - "\n", + "\n", "y->trained_model\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "X->trained_model\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", @@ -475,7 +475,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -506,24 +506,24 @@ " 'sklearn_version': '1.5.0',\n", " 'serialization_format': 'cloudpickle',\n", " 'code': None}},\n", - " 'model_uri': 'runs:/67ebe9a6acdd402785428defe00b8c03/model',\n", - " 'model_uuid': '6641c1731cf549a08655dc0836ea51b1',\n", - " 'run_id': '67ebe9a6acdd402785428defe00b8c03',\n", + " 'model_uri': 'runs:/30c106f0915d43fda9f5974fb36cdc39/model',\n", + " 'model_uuid': '860cc1ea405d461d8c58db4b68d25158',\n", + " 'run_id': '30c106f0915d43fda9f5974fb36cdc39',\n", " 
'saved_input_example_info': None,\n", " 'signature_dict': None,\n", " 'signature': None,\n", - " 'utc_time_created': '2024-06-10 23:38:25.769987',\n", + " 'utc_time_created': '2024-06-11 15:01:30.896479',\n", " 'mlflow_version': '2.13.2',\n", " 'metadata': None,\n", " 'registered_model': {'name': 'my_predictor',\n", - " 'version': 7,\n", - " 'creation_time': 1718062707186,\n", - " 'last_updated_timestamp': 1718062707186,\n", + " 'version': 8,\n", + " 'creation_time': 1718118092290,\n", + " 'last_updated_timestamp': 1718118092290,\n", " 'description': None,\n", " 'user_id': None,\n", " 'current_stage': 'None',\n", - " 'source': 'file:///home/tjean/projects/dagworks/hamilton/examples/mlflow/mlruns/0/67ebe9a6acdd402785428defe00b8c03/artifacts/model',\n", - " 'run_id': '67ebe9a6acdd402785428defe00b8c03',\n", + " 'source': 'file:///home/tjean/projects/dagworks/hamilton/examples/mlflow/mlruns/0/30c106f0915d43fda9f5974fb36cdc39/artifacts/model',\n", + " 'run_id': '30c106f0915d43fda9f5974fb36cdc39',\n", " 'run_link': None,\n", " 'status': 'READY',\n", " 'status_message': None,\n", @@ -572,79 +572,79 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "%3\n", - "\n", + "\n", "\n", "cluster__legend\n", - "\n", - "Legend\n", + "\n", + "Legend\n", "\n", - "\n", + "\n", "\n", - "preprocessed_inputs\n", - "\n", - "preprocessed_inputs\n", - "DataFrame\n", + "prediction\n", + "\n", + "prediction\n", + "int\n", "\n", - "\n", + "\n", "\n", - "prediction\n", - "\n", - "prediction\n", - "int\n", + "preprocessed_inputs\n", + "\n", + "preprocessed_inputs\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "preprocessed_inputs->prediction\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "_preprocessed_inputs_inputs\n", - "\n", - "user_input\n", - "dict\n", + "_prediction_inputs\n", + "\n", + "model\n", + "BaseEstimator\n", "\n", - "\n", - "\n", - "_preprocessed_inputs_inputs->preprocessed_inputs\n", - "\n", - "\n", + "\n", + "\n", + "_prediction_inputs->prediction\n", + "\n", + 
"\n", "\n", - "\n", + "\n", "\n", - "_prediction_inputs\n", - "\n", - "model\n", - "BaseEstimator\n", + "_preprocessed_inputs_inputs\n", + "\n", + "user_input\n", + "dict\n", "\n", - "\n", + "\n", "\n", - "_prediction_inputs->prediction\n", - "\n", - "\n", + "_preprocessed_inputs_inputs->preprocessed_inputs\n", + "\n", + "\n", "\n", "\n", "\n", "input\n", - "\n", - "input\n", + "\n", + "input\n", "\n", "\n", "\n", "function\n", - "\n", - "function\n", + "\n", + "function\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -704,27 +704,27 @@ "BaseEstimator\n", "\n", "\n", - "\n", + "\n", "prediction\n", "\n", "prediction\n", "int\n", "\n", "\n", - "\n", + "\n", "model->prediction\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "preprocessed_inputs\n", "\n", "preprocessed_inputs\n", "DataFrame\n", "\n", "\n", - "\n", + "\n", "preprocessed_inputs->prediction\n", "\n", "\n", @@ -750,7 +750,7 @@ "dict\n", "\n", "\n", - "\n", + "\n", "_preprocessed_inputs_inputs->preprocessed_inputs\n", "\n", "\n", @@ -771,7 +771,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -867,98 +867,98 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "%3\n", - "\n", + "\n", "\n", "cluster__legend\n", - "\n", - "Legend\n", - "\n", - "\n", - "\n", - "model\n", - "\n", - "model\n", - "BaseEstimator\n", + "\n", + "Legend\n", "\n", "\n", - "\n", + "\n", "prediction\n", "\n", "prediction\n", "int\n", "\n", + "\n", + "\n", + "model\n", + "\n", + "model\n", + "BaseEstimator\n", + "\n", "\n", - "\n", + "\n", "model->prediction\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "preprocessed_inputs\n", - "\n", - "preprocessed_inputs\n", - "DataFrame\n", + "\n", + "preprocessed_inputs\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "preprocessed_inputs->prediction\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "load_data.model\n", - "\n", - "load_data.model\n", - "Tuple\n", + "\n", + "load_data.model\n", + "Tuple\n", "\n", "\n", - "\n", + "\n", 
"load_data.model->model\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "_preprocessed_inputs_inputs\n", - "\n", - "user_input\n", - "dict\n", + "\n", + "user_input\n", + "dict\n", "\n", "\n", - "\n", + "\n", "_preprocessed_inputs_inputs->preprocessed_inputs\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "input\n", - "\n", - "input\n", + "\n", + "input\n", "\n", "\n", "\n", "function\n", - "\n", - "function\n", + "\n", + "function\n", "\n", "\n", "\n", "output\n", - "\n", - "output\n", + "\n", + "output\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -1055,271 +1055,271 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "%3\n", - "\n", + "\n", "\n", "cluster__legend\n", - "\n", - "Legend\n", + "\n", + "Legend\n", "\n", - "\n", + "\n", "\n", - "test_scatter_plot\n", - "\n", - "test_scatter_plot\n", - "Figure\n", + "y_train\n", + "\n", + "y_train\n", + "Series\n", + "\n", + "\n", + "\n", + "trained_model\n", + "\n", + "trained_model\n", + "BaseEstimator\n", + "\n", + "\n", + "\n", + "y_train->trained_model\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "train_performance\n", - "\n", - "train_performance\n", - "float\n", + "\n", + "train_performance\n", + "float\n", + "\n", + "\n", + "\n", + "y_train->train_performance\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "y_test\n", - "\n", - "y_test\n", - "Series\n", - "\n", - "\n", - "\n", - "y_test->test_scatter_plot\n", - "\n", - "\n", + "\n", + "y_test\n", + "Series\n", "\n", "\n", "\n", "test_performance\n", - "\n", - "test_performance\n", - "float\n", + "\n", + "test_performance\n", + "float\n", "\n", "\n", - "\n", + "\n", "y_test->test_performance\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_train\n", - "\n", - "y_train\n", - "Series\n", - "\n", - "\n", - "\n", - "y_train->train_performance\n", - "\n", - "\n", + "\n", + "\n", + "test_scatter_plot\n", + "\n", + "test_scatter_plot\n", + "Figure\n", "\n", - "\n", - "\n", - "trained_model\n", - "\n", - 
"trained_model\n", - "BaseEstimator\n", + "\n", + "\n", + "y_test->test_scatter_plot\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y_train->trained_model\n", - "\n", - "\n", + "\n", + "\n", + "X_train\n", + "\n", + "X_train\n", + "DataFrame\n", "\n", - "\n", - "\n", - "test_predictions\n", - "\n", - "test_predictions\n", - "Series\n", + "\n", + "\n", + "X_train->trained_model\n", + "\n", + "\n", "\n", - "\n", - "\n", - "test_predictions->test_scatter_plot\n", - "\n", - "\n", + "\n", + "\n", + "train_predictions\n", + "\n", + "train_predictions\n", + "Series\n", "\n", - "\n", - "\n", - "test_predictions->test_performance\n", - "\n", - "\n", + "\n", + "\n", + "X_train->train_predictions\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X\n", - "\n", - "X\n", - "DataFrame\n", + "\n", + "\n", + "y\n", + "\n", + "y\n", + "Series\n", "\n", "\n", "\n", "split_dataset\n", - "\n", - "split_dataset\n", - "dict\n", + "\n", + "split_dataset\n", + "dict\n", "\n", - "\n", + "\n", "\n", - "X->split_dataset\n", - "\n", - "\n", + "y->split_dataset\n", + "\n", + "\n", "\n", - "\n", - "\n", - "load_data\n", - "\n", - "load_data\n", - "dict\n", + "\n", + "\n", + "X\n", + "\n", + "X\n", + "DataFrame\n", "\n", - "\n", + "\n", "\n", - "load_data->X\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "y\n", - "\n", - "y\n", - "Series\n", - "\n", - "\n", - "\n", - "load_data->y\n", - "\n", - "\n", + "X->split_dataset\n", + "\n", + "\n", "\n", - "\n", - "\n", - "split_dataset->y_test\n", - "\n", - "\n", + "\n", + "\n", + "X_test\n", + "\n", + "X_test\n", + "DataFrame\n", "\n", - "\n", + "\n", "\n", - "split_dataset->y_train\n", - "\n", - "\n", + "X_test->test_scatter_plot\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "X_test\n", - "\n", - "X_test\n", - "DataFrame\n", + "test_predictions\n", + "\n", + "test_predictions\n", + "Series\n", "\n", - "\n", - "\n", - "split_dataset->X_test\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "X_test->test_predictions\n", + "\n", + "\n", "\n", - "\n", - "\n", - 
"X_train\n", - "\n", - "X_train\n", - "DataFrame\n", + "\n", + "\n", + "split_dataset->y_train\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "split_dataset->y_test\n", + "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "split_dataset->X_train\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "train_predictions\n", - "\n", - "train_predictions\n", - "Series\n", - "\n", - "\n", - "\n", - "train_predictions->train_performance\n", - "\n", - "\n", + "\n", + "\n", + "split_dataset->X_test\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "trained_model->test_predictions\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "trained_model->train_predictions\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_test->test_scatter_plot\n", - "\n", - "\n", + "\n", + "\n", + "load_data\n", + "\n", + "load_data\n", + "dict\n", "\n", - "\n", - "\n", - "X_test->test_predictions\n", - "\n", - "\n", + "\n", + "\n", + "load_data->y\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_train->train_predictions\n", - "\n", - "\n", + "\n", + "\n", + "load_data->X\n", + "\n", + "\n", "\n", - "\n", - "\n", - "X_train->trained_model\n", - "\n", - "\n", + "\n", + "\n", + "test_predictions->test_performance\n", + "\n", + "\n", "\n", - "\n", - "\n", - "y->split_dataset\n", - "\n", - "\n", + "\n", + "\n", + "test_predictions->test_scatter_plot\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "train_predictions->train_performance\n", + "\n", + "\n", "\n", "\n", "\n", "_split_dataset_inputs\n", - "\n", - "test_size_fraction\n", - "float\n", + "\n", + "test_size_fraction\n", + "float\n", "\n", "\n", - "\n", + "\n", "_split_dataset_inputs->split_dataset\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", "\n", "input\n", - "\n", - "input\n", + "\n", + "input\n", "\n", "\n", "\n", "function\n", - "\n", - "function\n", + "\n", + "function\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1490,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 19, + 
"execution_count": 15, "metadata": {}, "outputs": [ { @@ -1498,12 +1498,12 @@ "output_type": "stream", "text": [ "Registered model 'my_new_model' already exists. Creating a new version of this model...\n", - "Created version '5' of model 'my_new_model'.\n" + "Created version '6' of model 'my_new_model'.\n" ] }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1523,107 +1523,151 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Feature list" + "## 4. Feature list\n", + "A list of included features and cool things possible with the MLFlow plugin (in no particular order)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Automatically tracks `.execute(inputs=...)` and `Builder().with_config()` as MLFlow params. This creates columns that you can use to filter runs in the UI." + "1. Automatically tracks `.execute(inputs=...)` and `Builder().with_config()` as MLFlow params. This creates columns that you can use to filter runs in the UI." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The run tag `code_version` is automatically added by the `MLFlowTracker`. This allows you to know exactly what code was executed and group runs that use the same code, but vary in terms of inputs. " + "2. The run tag `code_version` is automatically added by the `MLFlowTracker`. This allows you to know exactly what code was executed and group runs that use the same code, but vary in terms of inputs. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Store the entire `HamiltonGraph` as an artifact `hamilton_graph.json`. This contains the source code of the executed dataflow." + "3. Store the entire `HamiltonGraph` as an artifact `hamilton_graph.json`. This contains the source code of the executed dataflow." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Automatically log `plotly` and `matplotlib` figures as `.png` artifacts. For more control, you can use the `to.plotly()` and `to.plt()` savers. Notably, this allows you to save interactive plotly visualizations as HTML. " + "4. Automatically log `plotly` and `matplotlib` figures as `.png` artifacts. For more control, you can use the `to.plotly()` and `to.plt()` savers. Notably, this allows you to save interactive plotly visualizations as HTML. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Use the `MLFlowTracker` to specify experiment metadata and run metadata that will help you browse the MLFlow UI and programatic search. `experiment_description` and `run_description` accept markdown strings. " + "5. Use the `MLFlowTracker` to specify experiment metadata and run metadata that will help you browse the MLFlow UI and programatic search. `experiment_description` and `run_description` accept markdown strings.\n", + "\n", + " ```python\n", + " MLFlowTracker(\n", + " experiment_name=...,\n", + " experiment_description=...,\n", + " run_name=...,\n", + " run_tags=...,\n", + " run_description=...,\n", + " )\n", + " ```" ] }, { - "cell_type": "code", - "execution_count": 16, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "ename": "MlflowException", - "evalue": "Invalid experiment name: Ellipsis. Expects a string.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mMlflowException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mMLFlowTracker\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mexperiment_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mexperiment_description\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mrun_tags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_description\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/projects/dagworks/hamilton/hamilton/plugins/h_mlflow.py:98\u001b[0m, in \u001b[0;36mMLFlowTracker.__init__\u001b[0;34m(self, tracking_uri, registry_uri, artifact_location, experiment_name, experiment_tags, experiment_description, run_id, run_name, run_tags, run_description, log_system_metrics)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient\u001b[38;5;241m.\u001b[39mset_experiment_tag(experiment_id, key\u001b[38;5;241m=\u001b[39mk, value\u001b[38;5;241m=\u001b[39mv)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# create an experiment\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 98\u001b[0m experiment_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexperiment_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 101\u001b[0m \u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexperiment_tags\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexperiment_id \u001b[38;5;241m=\u001b[39m experiment_id\n\u001b[1;32m 105\u001b[0m \u001b[38;5;66;03m# run setup\u001b[39;00m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;66;03m# TODO link HamiltonTracker and MLFlowTracker run ids\u001b[39;00m\n", - "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/tracking/client.py:1284\u001b[0m, in \u001b[0;36mMlflowClient.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_experiment\u001b[39m(\n\u001b[1;32m 1233\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1234\u001b[0m name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 1235\u001b[0m artifact_location: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1236\u001b[0m tags: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1237\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 1238\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create an experiment.\u001b[39;00m\n\u001b[1;32m 1239\u001b[0m \n\u001b[1;32m 1240\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1282\u001b[0m \n\u001b[1;32m 1283\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tracking_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mtags\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/tracking/_tracking_service/client.py:498\u001b[0m, in \u001b[0;36mTrackingServiceClient.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Create an experiment.\u001b[39;00m\n\u001b[1;32m 485\u001b[0m \n\u001b[1;32m 486\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 495\u001b[0m \n\u001b[1;32m 496\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 497\u001b[0m _validate_experiment_artifact_location(artifact_location)\n\u001b[0;32m--> 498\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_experiment\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43martifact_location\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43martifact_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExperimentTag\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/store/tracking/file_store.py:391\u001b[0m, in \u001b[0;36mFileStore.create_experiment\u001b[0;34m(self, name, artifact_location, tags)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_experiment\u001b[39m(\u001b[38;5;28mself\u001b[39m, name, artifact_location\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, tags\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 390\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_root_dir()\n\u001b[0;32m--> 391\u001b[0m \u001b[43m_validate_experiment_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_experiment_does_not_exist(name)\n\u001b[1;32m 393\u001b[0m experiment_id \u001b[38;5;241m=\u001b[39m _generate_unique_integer_id()\n", - "File \u001b[0;32m~/projects/dagworks/hamilton/venv/lib/python3.11/site-packages/mlflow/utils/validation.py:366\u001b[0m, in \u001b[0;36m_validate_experiment_name\u001b[0;34m(experiment_name)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\n\u001b[1;32m 362\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid experiment name: 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexperiment_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m, error_code\u001b[38;5;241m=\u001b[39mINVALID_PARAMETER_VALUE\n\u001b[1;32m 363\u001b[0m )\n\u001b[1;32m 365\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_string_type(experiment_name):\n\u001b[0;32m--> 366\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\n\u001b[1;32m 367\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid experiment name: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexperiment_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Expects a string.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 368\u001b[0m error_code\u001b[38;5;241m=\u001b[39mINVALID_PARAMETER_VALUE,\n\u001b[1;32m 369\u001b[0m )\n", - "\u001b[0;31mMlflowException\u001b[0m: Invalid experiment name: Ellipsis. Expects a string." - ] - } - ], "source": [ - "MLFlowTracker(\n", - " experiment_name=...,\n", - " experiment_description=...,\n", - " run_name=...,\n", - " run_tags=...,\n", - " run_description=...,\n", - ")" + "6. 
The tags specified using the Hamilton decorator `@tag` on the model-producing function are stored in the MLFlow model registry\n", + " ```python\n", + " import pandas as pd\n", + " from hamilton.function_modifiers import tag\n", + " from sklearn.linear_model import LogisticRegression\n", + "\n", + " @tag(team=\"forecast\", feature_set=\"v3\")\n", + " def trained_model(X_train: pd.DataFrame, y_train: pd.Series) -> LogisticRegression:\n", + " \"\"\"Fit a binary classifier on the training data\"\"\"\n", + " model = LogisticRegression()\n", + " model.fit(X_train, y_train)\n", + " return model\n", + "\n", + " # ...\n", + "\n", + " to.mlflow(\n", + " id=\"trained_model__mlflow\",\n", + " dependencies=[\"trained_model\"],\n", + " register_as=\"new_algo\",\n", + " ),\n", + " ```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The tags specified using the Hamilton decorator `@tag` on the model-producing function are stored in the MLFlow model registry " + "7. Use the `MLFlowTracker` with the `HamiltonTracker`. 
You can link the two by matching:\n", + "\n", + " - `experiment_name` == `project_id`; You can't manually create an `experiment_id`, but you can set its name.\n", + " - `run_name` == `dag_name`; You can have multiple MLFlow runs with the same name\n", + "\n", + " ```python\n", + " from hamilton import driver\n", + " from hamilton.io.materialization import to\n", + " from hamilton.plugins.h_mlflow import MLFlowTracker\n", + " from hamilton_sdk.adapters import HamiltonTracker\n", + "\n", + " project_id = 3\n", + " dag_name = \"titanic_classifier_training\"\n", + "\n", + " dr = (\n", + " driver.Builder()\n", + " .with_modules(model_training_2)\n", + " .with_adapters(\n", + " MLFlowTracker(\n", + " experiment_name=f\"hamilton-project-{project_id}\",\n", + " run_name=dag_name,\n", + " ),\n", + " HamiltonTracker(\n", + " username=\"my_username\",\n", + " project_id=project_id,\n", + " dag_name=dag_name,\n", + " )\n", + " )\n", + " .with_materializers(\n", + " to.mlflow(\n", + " id=\"trained_model__mlflow\",\n", + " dependencies=[\"trained_model\"],\n", + " register_as=\"my_new_model\",\n", + " ),\n", + " )\n", + " .build()\n", + " )\n", + " ```" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd\n", - "from hamilton.function_modifiers import tag\n", - "from sklearn.linear_model import LogisticRegression\n", + "8. Log model performance with nested runs (e.g., cross-validation, hyperparameter tuning). 
This will require adding `mlflow` code in your dataflow definition though.\n", + " ```python\n", + " import mlflow\n", + " from sklearn.model_selection import KFold\n", "\n", - "@tag(team=\"forecast\", feature_set=\"v3\")\n", - "def trained_model(X_train: pd.DataFrame, y_train: pd.Series) -> LogisticRegression:\n", - " \"\"\"Fit a binary classifier on the training data\"\"\"\n", - " model = LogisticRegression()\n", - " model.fit(X_train, y_train)\n", - " return model\n", + " def model_cross_validation(X: pd.DataFrame, y: pd.Series):\n", + " kfold = KFold(n_splits=3)\n", + "\n", + " for train, test in kfold.split(X):\n", + " X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]\n", + "\n", + " model = LogisticRegression()\n", + " model.fit(X_train, y_train)\n", "\n", - "# ...\n", + " test_pred = model.predict(X_test)\n", + " score = balanced_accuracy(y_test, test_pred)\n", "\n", - "to.mlflow(\n", - " id=\"trained_model__mlflow\",\n", - " dependencies=[\"trained_model\"],\n", - " register_as=\"new_algo\",\n", - ")," + " with mlflow.start_run(nested=True):\n", + " mlflow.log_metric(\"balanced_accuracy\", score)\n", + " # ... could log plots, hyperparams, etc.\n", + " ```\n" ] } ], From 6caf06ed577c334346af2436767360177b8113b4 Mon Sep 17 00:00:00 2001 From: zilto Date: Tue, 11 Jun 2024 11:28:51 -0400 Subject: [PATCH 11/12] added docs, added TODOs --- docs/integrations/index.rst | 1 + hamilton/plugins/h_mlflow.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/integrations/index.rst b/docs/integrations/index.rst index 7fb9aaafb..3f5c68d6e 100644 --- a/docs/integrations/index.rst +++ b/docs/integrations/index.rst @@ -10,6 +10,7 @@ This section showcases how Hamilton integrates with popular frameworks. 
ibis/index streamlit dbt + MLFlow Airflow Amazon Web Services Burr diff --git a/hamilton/plugins/h_mlflow.py b/hamilton/plugins/h_mlflow.py index 75ccee81a..1119ced12 100644 --- a/hamilton/plugins/h_mlflow.py +++ b/hamilton/plugins/h_mlflow.py @@ -183,6 +183,7 @@ def run_after_node_execution( ): """Log materializers and final vars as artifacts""" # log DataSavers as artifacts + # TODO refactor if/else as `handle_materializers()` if node_tags.get("hamilton.data_saver") is True: # don't log mlflow materializers as artifact since they already create models # instead, use the Materializer metadata to add metadata to registered models @@ -246,6 +247,7 @@ def run_after_node_execution( if node_name not in self.final_vars: return + # TODO refactor if/else as `handle_final_vars()` # log float and int as metrics if node_return_type in [float, int]: self.client.log_metric(self.run_id, key=node_name, value=float(result)) From 98da37aaae326afc12bdc7445883878c41272e20 Mon Sep 17 00:00:00 2001 From: zilto Date: Wed, 12 Jun 2024 12:44:48 -0400 Subject: [PATCH 12/12] added metadata to registered models --- hamilton/plugins/h_mlflow.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/hamilton/plugins/h_mlflow.py b/hamilton/plugins/h_mlflow.py index 1119ced12..2d77815a0 100644 --- a/hamilton/plugins/h_mlflow.py +++ b/hamilton/plugins/h_mlflow.py @@ -205,6 +205,27 @@ def run_after_node_execution( model_name, version, materialized_node.documentation ) + # add node name as tag + self.client.set_registered_model_tag( + model_name, key="node_name", value=materialized_node.name + ) + self.client.set_model_version_tag( + model_name, version, key="node_name", value=materialized_node.name + ) + + # add origin function name as tag + self.client.set_registered_model_tag( + model_name, + key="function_name", + value=materialized_node.originating_functions[0].__name__, + ) + self.client.set_model_version_tag( + model_name, + version, + key="function_name", + 
value=materialized_node.originating_functions[0].__name__, + ) + # add the materialized node @tag values as tags for k, v in materialized_node.tags.items(): # skip internal Hamilton tags