-
Notifications
You must be signed in to change notification settings - Fork 133
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updates docs and references for it.
- Loading branch information
Showing
11 changed files
with
442 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# machine\_learning | ||
|
||
This template shows an ML pipeline. | ||
|
||
It shows a few things: | ||
|
||
1. It shows how one could split up functions into modules. E.g. loading, vs features, vs fitting. | ||
2. It also shows how to use `@subdag` to fit different models in the same DAG run and reuse the same fitting code. | ||
3. It shows how to use data loaders and data savers to load and save data; these also emit extra metadata | ||
that can be used to track lineage in the UI. | ||
4. It shows how to use the HamiltonTracker to integrate with the Hamilton UI. | ||
|
||
## Getting started | ||
|
||
To get started, you need to have the Hamilton UI running. | ||
|
||
1. See https://hamilton.dagworks.io/en/latest/concepts/ui/ for details, here are the cliff notes: | ||
|
||
```bash | ||
git clone https://github.com/dagworks-inc/hamilton | ||
cd hamilton/ui/deployment | ||
./run.sh | ||
``` | ||
Then go to http://localhost:8242 and create (1) an email, and (2) a project. | ||
See [this video](https://youtu.be/DPfxlTwaNsM) for a walkthrough. | ||
|
||
2. Ensure you have the right python dependencies installed. | ||
```bash | ||
cd hamilton/examples/hamilton_ui | ||
pip install -r requirements.txt | ||
``` | ||
|
||
3. Run the `run.py` script, providing the email and project ID so that it can log to the Hamilton UI. | ||
```bash | ||
python run.py --email <email> --project_id <project_id> | ||
``` | ||
Once you've run that, run this: | ||
```bash | ||
python run.py --email <email> --project_id <project_id> --load-from-parquet True | ||
``` | ||
Then you can go see the difference in the Hamilton UI. Find your project under http://localhost:8242/dashboard/projects. | ||
## Things to try: | ||
1. Place an error in the code and see how it shows up in the Hamilton UI. e.g. `raise ValueError("I'm an error")`. | ||
2. In `models.py` change `"data_set": source("data_set_v1"),` to `"data_set": source("data_set_v2"),`, along with | ||
what is requested in `run.py` (i.e. change/add saving `data_set_v2`) and see how the lineage changes in the Hamilton UI. | ||
3. Add a new feature and propagate it through the pipeline. E.g. add a new feature to `features.py` and then to a dataset. | ||
Execute it and then compare the data observed in the Hamilton UI against a prior run. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
""" | ||
Module to transform iris data into features. | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from hamilton.function_modifiers import parameterize_sources | ||
|
||
# Names of the raw input columns; the parameterized feature functions in this module
# (mean / std / normalized) are generated once per entry of this list.
RAW_FEATURES = ["sepal_length_cm", "sepal_width_cm", "petal_length_cm", "petal_width_cm"]
|
||
# Here is more terse code that does the same thing as the below *_log functions. | ||
# Any `@parameterize*` decorator is just a less verbose way of defining functions that differ | ||
# slightly. We don't see anything wrong with verbose code - so we recommend err'ing on the side of | ||
# verbosity, but otherwise for this example show the terser code. | ||
# @parameterize_sources(**{f"{col}_log": {"col": col} for col in RAW_FEATURES}) | ||
# def log_value(col: pd.Series) -> pd.Series: | ||
# """Log value of {col}.""" | ||
# return np.log(col) | ||
|
||
|
||
def sepal_length_cm_log(sepal_length_cm: pd.Series) -> pd.Series:
    """Element-wise natural logarithm of the `sepal_length_cm` column.

    :param sepal_length_cm: the raw `sepal_length_cm` series.
    :return: series holding the natural log of every input value.
    """
    logged = np.log(sepal_length_cm)
    return logged
|
||
|
||
def sepal_width_cm_log(sepal_width_cm: pd.Series) -> pd.Series:
    """Element-wise natural logarithm of the `sepal_width_cm` column.

    :param sepal_width_cm: the raw `sepal_width_cm` series.
    :return: series holding the natural log of every input value.
    """
    logged = np.log(sepal_width_cm)
    return logged
|
||
|
||
def petal_length_cm_log(petal_length_cm: pd.Series) -> pd.Series:
    """Element-wise natural logarithm of the `petal_length_cm` column.

    :param petal_length_cm: the raw `petal_length_cm` series.
    :return: series holding the natural log of every input value.
    """
    logged = np.log(petal_length_cm)
    return logged
|
||
|
||
def petal_width_cm_log(petal_width_cm: pd.Series) -> pd.Series:
    """Element-wise natural logarithm of the `petal_width_cm` column.

    :param petal_width_cm: the raw `petal_width_cm` series.
    :return: series holding the natural log of every input value.
    """
    logged = np.log(petal_width_cm)
    return logged
|
||
|
||
# Generates one "<feature>_mean" function per raw feature, each fed by the column of
# the same name. The docstring is left as a `{col}` template on purpose (presumably
# filled in per generated function by Hamilton -- confirm against its docs).
@parameterize_sources(
    **{
        f"{feature}_mean": {"col": feature}
        for feature in RAW_FEATURES
    }
)
def mean_value(col: pd.Series) -> float:
    """Mean of {col}."""
    average = col.mean()
    return average
|
||
|
||
# Generates one "<feature>_std" function per raw feature, each fed by the column of
# the same name. The docstring is left as a `{col}` template on purpose (presumably
# filled in per generated function by Hamilton -- confirm against its docs).
@parameterize_sources(
    **{
        f"{feature}_std": {"col": feature}
        for feature in RAW_FEATURES
    }
)
def std_value(col: pd.Series) -> float:
    """Standard deviation of {col}."""
    spread = col.std()
    return spread
|
||
|
||
# Generates one "<feature>_normalized" function per raw feature. Each one is wired to
# the raw column plus the matching "<feature>_mean" and "<feature>_std" values.
# The docstring is left as a `{col}` template on purpose.
@parameterize_sources(
    **{
        f"{feature}_normalized": {
            "col": feature,
            "col_mean": f"{feature}_mean",
            "col_std": f"{feature}_std",
        }
        for feature in RAW_FEATURES
    }
)
def normalized_value(col: pd.Series, col_mean: float, col_std: float) -> pd.Series:
    """Normalized column of {col}."""
    centered = col - col_mean
    return centered / col_std
|
||
|
||
def data_set_v1(
    sepal_length_cm_normalized: pd.Series,
    sepal_width_cm_normalized: pd.Series,
    petal_length_cm_normalized: pd.Series,
    petal_width_cm_normalized: pd.Series,
    target_class: pd.Series,
) -> pd.DataFrame:
    """Assemble feature set v1: the four normalized features plus the target.

    Every parameter becomes a column of the same name, in the order of the signature.

    :return: data frame with the four normalized columns followed by `target_class`.
    """
    columns = dict(
        sepal_length_cm_normalized=sepal_length_cm_normalized,
        sepal_width_cm_normalized=sepal_width_cm_normalized,
        petal_length_cm_normalized=petal_length_cm_normalized,
        petal_width_cm_normalized=petal_width_cm_normalized,
        target_class=target_class,
    )
    return pd.DataFrame(columns)
|
||
|
||
def data_set_v2(
    sepal_length_cm_normalized: pd.Series,
    sepal_width_cm_normalized: pd.Series,
    petal_length_cm_normalized: pd.Series,
    petal_width_cm_normalized: pd.Series,
    sepal_length_cm_log: pd.Series,
    sepal_width_cm_log: pd.Series,
    petal_length_cm_log: pd.Series,
    petal_width_cm_log: pd.Series,
    target_class: pd.Series,
) -> pd.DataFrame:
    """Assemble feature set v2: normalized features, `log` features and the target.

    Every parameter becomes a column of the same name, in the order of the signature.

    :return: data frame with the normalized columns, then the log columns, then
        `target_class`.
    """
    columns = dict(
        sepal_length_cm_normalized=sepal_length_cm_normalized,
        sepal_width_cm_normalized=sepal_width_cm_normalized,
        petal_length_cm_normalized=petal_length_cm_normalized,
        petal_width_cm_normalized=petal_width_cm_normalized,
        sepal_length_cm_log=sepal_length_cm_log,
        sepal_width_cm_log=sepal_width_cm_log,
        petal_length_cm_log=petal_length_cm_log,
        petal_width_cm_log=petal_width_cm_log,
        target_class=target_class,
    )
    return pd.DataFrame(columns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
""" | ||
Module to load iris data. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn import datasets, utils | ||
|
||
from hamilton.function_modifiers import config, extract_columns, load_from | ||
|
||
# Column names given to the raw iris measurements; also the columns (together with
# "target_class") that the loader functions below expose via `extract_columns`.
RAW_COLUMN_NAMES = [
    "sepal_length_cm",
    "sepal_width_cm",
    "petal_length_cm",
    "petal_width_cm",
]
|
||
|
||
@config.when(case="api")
def iris_data_raw__api() -> utils.Bunch:
    """Fetch the raw iris data through scikit-learn's `load_iris`.

    Only active when the configuration sets `case` to "api".

    :return: the `Bunch` object returned by `datasets.load_iris()`.
    """
    bunch = datasets.load_iris()
    return bunch
|
||
|
||
@extract_columns(*RAW_COLUMN_NAMES, "target_class")
@config.when(case="api")
def iris_df__api(iris_data_raw: utils.Bunch) -> pd.DataFrame:
    """Turn the raw iris `Bunch` into a data frame with a readable target column.

    Only active when the configuration sets `case` to "api".

    :param iris_data_raw: object exposing `data`, `target` and `target_names`.
    :return: frame with the `RAW_COLUMN_NAMES` columns plus `target_class`, where
        each target entry has been looked up in `target_names`.
    """
    class_names = iris_data_raw.target_names
    labels = [class_names[idx] for idx in iris_data_raw.target]
    frame = pd.DataFrame(iris_data_raw.data, columns=RAW_COLUMN_NAMES)
    frame["target_class"] = labels
    return frame
|
||
|
||
# Parquet variant of the iris loader, active when the configuration sets `case` to
# "parquet". The `load_from.parquet` decorator points at "iris.parquet"; presumably it
# supplies the loaded frame as `iris_data_raw` -- confirm against the Hamilton docs.
@extract_columns(*(RAW_COLUMN_NAMES + ["target_class"]))
@load_from.parquet(path="iris.parquet")
@config.when(case="parquet")
def iris_df__parquet(iris_data_raw: pd.DataFrame) -> pd.DataFrame:
    """Return the loaded iris data frame unchanged.

    :param iris_data_raw: the iris data as a data frame.
    :return: the same data frame, untouched.
    """
    return iris_data_raw
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
"""This module contains basic code for model fitting.""" | ||
|
||
from typing import Dict | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn import base, linear_model, metrics, svm | ||
from sklearn.model_selection import train_test_split | ||
|
||
from hamilton import function_modifiers | ||
|
||
|
||
@function_modifiers.config.when(clf="svm")
def prefit_clf__svm(gamma: float = 0.001) -> base.ClassifierMixin:
    """Build an SVM classifier that has not been fitted yet.

    Only active when the configuration sets `clf` to "svm".

    :param gamma: value forwarded to `svm.SVC` as its `gamma` argument.
    :return: the unfitted `svm.SVC` instance.
    """
    classifier = svm.SVC(gamma=gamma)
    return classifier
|
||
|
||
@function_modifiers.config.when(clf="logistic")
def prefit_clf__logreg(penalty: str) -> base.ClassifierMixin:
    """Build a logistic-regression classifier that has not been fitted yet.

    Only active when the configuration sets `clf` to "logistic".

    :param penalty: One of {'l1', 'l2', 'elasticnet', None}.
    :return: the unfitted `linear_model.LogisticRegression` instance.
    """
    # `penalty` is the first constructor parameter, so naming it is equivalent to
    # passing it positionally and reads more clearly.
    classifier = linear_model.LogisticRegression(penalty=penalty)
    return classifier
|
||
|
||
@function_modifiers.extract_fields(
    {"X_train": pd.DataFrame, "X_test": pd.DataFrame, "y_train": pd.Series, "y_test": pd.Series}
)
def train_test_split_func(
    data_set: pd.DataFrame,
    test_size_fraction: float,
    shuffle_train_test_split: bool,
) -> Dict[str, np.ndarray]:
    """Create the training and test splits.

    The returned dict is then extracted into its constituent fields (see the
    `extract_fields` decorator) so each piece can be used downstream.

    NOTE(review): the return annotation declares `np.ndarray` values while
    `extract_fields` declares DataFrame/Series fields -- confirm which is intended.

    :param data_set: frame holding the feature columns and a `target_class` column.
    :param test_size_fraction: passed to `train_test_split` as `test_size`.
    :param shuffle_train_test_split: passed to `train_test_split` as `shuffle`.
    :return: dict with the keys `X_train`, `X_test`, `y_train` and `y_test`.
    """
    assert "target_class" in data_set.columns, "target_class column must be present in the data set"
    # Everything except the target column is treated as a feature.
    feature_columns = [name for name in data_set.columns if name != "target_class"]
    features = data_set[feature_columns]
    labels = data_set["target_class"]
    pieces = train_test_split(
        features,
        labels,
        test_size=test_size_fraction,
        shuffle=shuffle_train_test_split,
    )
    # train_test_split yields (X_train, X_test, y_train, y_test) in that order.
    return dict(zip(("X_train", "X_test", "y_train", "y_test"), pieces))
|
||
|
||
def fit_clf(
    prefit_clf: base.ClassifierMixin, X_train: pd.DataFrame, y_train: pd.Series
) -> base.ClassifierMixin:
    """Fit the classifier on the training data.

    The classifier is mutated in place and that same object is returned.

    :param prefit_clf: classifier object to fit.
    :param X_train: training features.
    :param y_train: training targets.
    :return: the same classifier object, after `fit` has been called on it.
    """
    classifier = prefit_clf
    classifier.fit(X_train, y_train)
    return classifier
|
||
|
||
def training_accuracy(
    fit_clf: base.ClassifierMixin, X_train: pd.DataFrame, y_train: pd.Series
) -> float:
    """Accuracy of the fitted classifier on the training set.

    :param fit_clf: fitted classifier.
    :param X_train: training features to predict on.
    :param y_train: training targets to compare the predictions against.
    :return: the value of `metrics.accuracy_score` for predictions vs. targets.
    """
    predictions = fit_clf.predict(X_train)
    return metrics.accuracy_score(predictions, y_train)
|
||
|
||
def testing_accuracy(
    fit_clf: base.ClassifierMixin, X_test: pd.DataFrame, y_test: pd.Series
) -> float:
    """Accuracy of the fitted classifier on the test set.

    :param fit_clf: fitted classifier.
    :param X_test: test features to predict on.
    :param y_test: test targets to compare the predictions against.
    :return: the value of `metrics.accuracy_score` for predictions vs. targets.
    """
    predictions = fit_clf.predict(X_test)
    return metrics.accuracy_score(predictions, y_test)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
"""This module contains specific incarnations of models.""" | ||
|
||
from sklearn import base | ||
|
||
from hamilton.function_modifiers import source, subdag | ||
|
||
try: | ||
import model_fitting | ||
except ImportError: | ||
from . import model_fitting | ||
|
||
|
||
@subdag(
    model_fitting,
    inputs={
        "data_set": source("data_set_v1"),
    },
    config={"clf": "svm", "shuffle_train_test_split": True, "test_size_fraction": 0.2},
)
def svm_model(
    fit_clf: base.ClassifierMixin, training_accuracy: float, testing_accuracy: float
) -> dict:
    """Bundle the SVM produced by the `model_fitting` sub-DAG with its accuracies.

    :param fit_clf: the fitted classifier.
    :param training_accuracy: accuracy on the training set.
    :param testing_accuracy: accuracy on the test set.
    :return: dict with the keys `svm`, `training_accuracy` and `testing_accuracy`.
    """
    summary = {"svm": fit_clf}
    summary["training_accuracy"] = training_accuracy
    summary["testing_accuracy"] = testing_accuracy
    return summary
|
||
|
||
@subdag(
    model_fitting,
    inputs={
        "data_set": source("data_set_v1"),
    },
    config={
        "clf": "logistic",
        "shuffle_train_test_split": True,
        "test_size_fraction": 0.2,
        "penalty": "l2",
    },
)
def lr_model(
    fit_clf: base.ClassifierMixin, training_accuracy: float, testing_accuracy: float
) -> dict:
    """Bundle the logistic regression from the `model_fitting` sub-DAG with its accuracies.

    :param fit_clf: the fitted classifier.
    :param training_accuracy: accuracy on the training set.
    :param testing_accuracy: accuracy on the test set.
    :return: dict with the keys `logistic`, `training_accuracy` and `testing_accuracy`.
    """
    summary = {"logistic": fit_clf}
    summary["training_accuracy"] = training_accuracy
    summary["testing_accuracy"] = testing_accuracy
    return summary
|
||
|
||
def best_model(svm_model: dict, lr_model: dict) -> dict:
    """Pick whichever model has the higher testing accuracy.

    :param svm_model: dict with at least a `testing_accuracy` entry.
    :param lr_model: dict with at least a `testing_accuracy` entry.
    :return: `svm_model` if its testing accuracy is strictly higher, otherwise
        `lr_model` (so ties go to `lr_model`).
    """
    svm_wins = svm_model["testing_accuracy"] > lr_model["testing_accuracy"]
    return svm_model if svm_wins else lr_model
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
click | ||
pandas | ||
scikit-learn | ||
sf-hamilton[sdk] |
Oops, something went wrong.