diff --git a/.github/workflows/update-templates-to-examples.yml b/.github/workflows/update-templates-to-examples.yml
index 2fd1f858f30..4eb0fa90043 100644
--- a/.github/workflows/update-templates-to-examples.yml
+++ b/.github/workflows/update-templates-to-examples.yml
@@ -136,7 +136,7 @@ jobs:
with:
python-version: ${{ inputs.python-version }}
stack-name: local
- ref-zenml: 'main'
+ ref-zenml: ${{ github.ref }}
ref-template: '0.45.0' # Make sure it is aligned with ZENML_PROJECT_TEMPLATES from src/zenml/cli/base.py
- name: Clean-up
@@ -192,4 +192,85 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
body: 'NLP template updates in `examples/nlp-case` have been pushed.'
+ })
+
+
+ update-starter-template-to-examples:
+ name: update-starter-template-to-examples
+ runs-on: ${{ inputs.os }}
+ env:
+ ZENML_DEBUG: 1
+ ZENML_ANALYTICS_OPT_IN: false
+ PYTHONIOENCODING: "utf-8"
+ OBJC_DISABLE_INITIALIZE_FORK_SAFETY: "YES"
+ if: github.event_name == 'pull_request' && ! startsWith(github.event.head_commit.message, 'GitBook:')
+
+ defaults:
+ run:
+ shell: bash
+
+ steps:
+ - name: Run template tests for zenml-io/template-starter
+ uses: zenml-io/template-starter/.github/actions/starter_template_test@main
+ with:
+ python-version: ${{ inputs.python-version }}
+ stack-name: local
+ ref-zenml: ${{ github.ref }}
+ ref-template: '2023.12.18' # Make sure it is aligned with ZENML_PROJECT_TEMPLATES from src/zenml/cli/base.py
+
+ - name: Clean-up
+ run: |
+ rm -rf ./local_checkout
+
+ - name: message-on-error
+ if: failure()
+ run: |
+ echo "::error title=zenml-io/template-starter project template testing failed with new version of
+ ZenML core!::\
+ Breaking changes affecting templates have been introduced. To mitigate this issue,\
+ please make the code in zenml-io/template-starter compatible with new version of\
+ ZenML core, release it and update release tag in zenml.cli.base.ZENML_PROJECT_TEMPLATES"
+
+ - uses: actions/checkout@v3
+ with:
+ ref: ${{ github.event.pull_request.head.ref }}
+
+ - name: Check-out fresh Starter template
+ run: |
+ rm -rf examples/quickstart
+ mkdir -p examples/quickstart
+ printf 'info@zenml.io' | zenml init --path examples/quickstart --template starter --template-with-defaults
+ bash scripts/format.sh
+
+ - name: Check for changes
+ id: check_changes
+ run: |
+ if git diff --quiet "origin/${{ github.event.pull_request.head.ref }}"; then
+ echo "No active Git changes found."
+ echo "changes=false" >> $GITHUB_OUTPUT
+ else
+ echo "Active Git changes found."
+ echo "changes=true" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Commit and push template
+ if: steps.check_changes.outputs.changes == 'true'
+ run: |
+ git config --global user.name "GitHub Actions"
+ git config --global user.email "actions@github.com"
+ git add .
+ git commit -am "Auto-update of Starter template"
+ git push origin HEAD:${{ github.event.pull_request.head.ref }}
+
+ - name: Create PR comment
+ if: steps.check_changes.outputs.changes == 'true'
+ uses: actions/github-script@v4
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ github.issues.createComment({
+ issue_number: ${{ github.event.pull_request.number }},
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: 'Quickstart template updates in `examples/quickstart` have been pushed.'
})
\ No newline at end of file
diff --git a/examples/e2e/.copier-answers.yml b/examples/e2e/.copier-answers.yml
index 7a288d71456..04637068e52 100644
--- a/examples/e2e/.copier-answers.yml
+++ b/examples/e2e/.copier-answers.yml
@@ -1,5 +1,5 @@
# Changes here will be overwritten by Copier
-_commit: 2023.12.06-4-g1e3edc6
+_commit: 2023.12.12
_src_path: gh:zenml-io/template-e2e-batch
data_quality_checks: true
email: ''
diff --git a/examples/quickstart/.assets/cloud_mcp.png b/examples/quickstart/.assets/cloud_mcp.png
new file mode 100644
index 00000000000..81197e9574f
Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp.png differ
diff --git a/examples/quickstart/.assets/cloud_mcp_predictions.png b/examples/quickstart/.assets/cloud_mcp_predictions.png
new file mode 100644
index 00000000000..a6bf7c902d6
Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp_predictions.png differ
diff --git a/examples/quickstart/.assets/cloud_mcp_screenshot.png b/examples/quickstart/.assets/cloud_mcp_screenshot.png
new file mode 100644
index 00000000000..8f56defad50
Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp_screenshot.png differ
diff --git a/examples/quickstart/.assets/feature_engineering_pipeline.png b/examples/quickstart/.assets/feature_engineering_pipeline.png
new file mode 100644
index 00000000000..db3019132ee
Binary files /dev/null and b/examples/quickstart/.assets/feature_engineering_pipeline.png differ
diff --git a/examples/quickstart/.assets/inference_pipeline.png b/examples/quickstart/.assets/inference_pipeline.png
new file mode 100644
index 00000000000..358d5537c8c
Binary files /dev/null and b/examples/quickstart/.assets/inference_pipeline.png differ
diff --git a/examples/quickstart/.assets/pipeline_overview.png b/examples/quickstart/.assets/pipeline_overview.png
new file mode 100644
index 00000000000..609e97d2f6f
Binary files /dev/null and b/examples/quickstart/.assets/pipeline_overview.png differ
diff --git a/examples/quickstart/.assets/training_pipeline.png b/examples/quickstart/.assets/training_pipeline.png
new file mode 100644
index 00000000000..a2e6a7d0499
Binary files /dev/null and b/examples/quickstart/.assets/training_pipeline.png differ
diff --git a/examples/quickstart/.copier-answers.yml b/examples/quickstart/.copier-answers.yml
new file mode 100644
index 00000000000..8662fc2da3f
--- /dev/null
+++ b/examples/quickstart/.copier-answers.yml
@@ -0,0 +1,8 @@
+# Changes here will be overwritten by Copier
+_commit: 0.43.0-48-g652a614
+_src_path: gh:zenml-io/template-starter
+email: ''
+full_name: ZenML GmbH
+open_source_license: apache
+project_name: ZenML Starter
+version: 0.1.0
diff --git a/examples/quickstart/.dockerignore b/examples/quickstart/.dockerignore
new file mode 100644
index 00000000000..455f4d7aed4
--- /dev/null
+++ b/examples/quickstart/.dockerignore
@@ -0,0 +1,2 @@
+.venv*
+.requirements*
\ No newline at end of file
diff --git a/examples/quickstart/LICENSE b/examples/quickstart/LICENSE
new file mode 100644
index 00000000000..0fcb970537e
--- /dev/null
+++ b/examples/quickstart/LICENSE
@@ -0,0 +1,15 @@
+Apache Software License 2.0
+
+Copyright (c) ZenML GmbH 2023. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
deleted file mode 100644
index 4be902633f2..00000000000
--- a/examples/quickstart/README.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# :running: Get up and running quickly
-
-Build your first ML pipelines with ZenML.
-
-## :question: Coming from `zenml go` ?
-Then open [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb) to get
-started.
-
-## :earth_americas: Overview
-
-This quickstart demonstrates some of ZenML's features. We will:
-
-- Import some data from a public dataset (Adult Census Income), then train two models (SGD and Random Forest)
-- Compare and evaluate which model performs better, and deploy the best one.
-- Run a prediction on the deployed model.
-
-Along the way we will also show you how to:
-
-- Automatically version, track, and cache data, models, and other artifacts,
-- Track model hyperparameters and metrics in an experiment tracking tool
-
-## :cloud: Run on Colab
-You can use Google Colab to see ZenML in action, no signup / installation required!
-
-
-
-## :computer: Run Locally
-
-To run locally, install ZenML and pull this quickstart:
-
-```shell
-# Install ZenML
-pip install "zenml[server]"
-
-# clone the ZenML repository
-git clone https://github.com/zenml-io/zenml.git
-cd zenml/examples/quickstart
-```
-
-### :arrow_forward: Run Locally
-Now we're ready to start. You have two options for running the quickstart locally:
-
-#### Option 1 (*Recommended*) - Interactively explore the quickstart using Jupyter Notebook:
-```bash
-pip install notebook
-jupyter notebook
-# open notebooks/quickstart.ipynb
-```
-
-#### Option 2 - Execute the whole ML pipeline from a Python script:
-```bash
-# Install required zenml integrations
-zenml integration install sklearn mlflow -y
-
-# Initialize ZenML
-zenml init
-
-# Start the ZenServer to enable dashboard access
-zenml up
-
-# Register required ZenML stack
-zenml experiment-tracker register mlflow_tracker --flavor=mlflow
-zenml model-deployer register mlflow_deployer --flavor=mlflow
-zenml model-registry register mlflow_registry --flavor=mlflow
-
-# Register a new stack with the new stack components
-zenml stack register quickstart_stack -a default\
- -o default\
- -d mlflow_deployer\
- -e mlflow_tracker\
- -r mlflow_registry\
- --set
-
-# Run the quickstart script
-python run.py
-```
-
-## :dart: Dashboard
-
-You can also take a look at our **dashboard** where you can inspect the quickstart
-pipeline run and much more. Simply execute:
-
-```shell
-# only once you've already run `zenml up`
-zenml show
-```
-
-## :sponge: Clean up
-
-To clean up, simply spin down the ZenML server.
-
-```shell
-zenml down
-```
-
-## :bulb: Learn More
-
-If you want to learn more about ZenML as a tool, then the
-[:page_facing_up: **ZenML Docs**](https://docs.zenml.io/) are the perfect place
-to get started.
-
-Already have an MLOps stack in mind? ZenML most likely has
-[**:link: Integrations**](https://docs.zenml.io/stacks-and-components/component-guide)
-for whatever tools you plan to use.
-
-Also, make sure to join our
-
- Slack Community
- to become part of the ZenML family!
-
-
diff --git a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png b/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png
deleted file mode 100644
index d0ef5780e78..00000000000
Binary files a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png and /dev/null differ
diff --git a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png b/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png
deleted file mode 100644
index 360b49a504b..00000000000
Binary files a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png and /dev/null differ
diff --git a/examples/quickstart/configs/feature_engineering.yaml b/examples/quickstart/configs/feature_engineering.yaml
new file mode 100644
index 00000000000..d5ab212951e
--- /dev/null
+++ b/examples/quickstart/configs/feature_engineering.yaml
@@ -0,0 +1,10 @@
+# environment configuration
+settings:
+ docker:
+ required_integrations:
+ - sklearn
+ requirements:
+ - pyarrow
+
+# pipeline configuration
+test_size: 0.35
\ No newline at end of file
diff --git a/examples/quickstart/configs/inference.yaml b/examples/quickstart/configs/inference.yaml
new file mode 100644
index 00000000000..0fd82f550c8
--- /dev/null
+++ b/examples/quickstart/configs/inference.yaml
@@ -0,0 +1,15 @@
+# environment configuration
+settings:
+ docker:
+ required_integrations:
+ - sklearn
+ requirements:
+ - pyarrow
+
+# configuration of the Model Control Plane
+model_version:
+ name: "breast_cancer_classifier"
+ version: "production"
+ license: Apache 2.0
+ description: A breast cancer classifier
+ tags: ["breast_cancer", "classifier"]
\ No newline at end of file
diff --git a/examples/quickstart/configs/training_rf.yaml b/examples/quickstart/configs/training_rf.yaml
new file mode 100644
index 00000000000..c1418afdef3
--- /dev/null
+++ b/examples/quickstart/configs/training_rf.yaml
@@ -0,0 +1,19 @@
+# environment configuration
+settings:
+ docker:
+ required_integrations:
+ - sklearn
+ requirements:
+ - pyarrow
+
+# configuration of the Model Control Plane
+model_version:
+ name: breast_cancer_classifier
+ version: rf
+ license: Apache 2.0
+ description: A breast cancer classifier
+ tags: ["breast_cancer", "classifier"]
+
+# Configure the pipeline
+parameters:
+ model_type: "rf" # Choose between rf/sgd
diff --git a/examples/quickstart/configs/training_sgd.yaml b/examples/quickstart/configs/training_sgd.yaml
new file mode 100644
index 00000000000..6ca7c0dd02f
--- /dev/null
+++ b/examples/quickstart/configs/training_sgd.yaml
@@ -0,0 +1,19 @@
+# environment configuration
+settings:
+ docker:
+ required_integrations:
+ - sklearn
+ requirements:
+ - pyarrow
+
+# configuration of the Model Control Plane
+model_version:
+ name: breast_cancer_classifier
+ version: sgd
+ license: Apache 2.0
+ description: A breast cancer classifier
+ tags: ["breast_cancer", "classifier"]
+
+# Configure the pipeline
+parameters:
+ model_type: "sgd" # Choose between rf/sgd
\ No newline at end of file
diff --git a/examples/quickstart/license b/examples/quickstart/license
new file mode 100644
index 00000000000..0fcb970537e
--- /dev/null
+++ b/examples/quickstart/license
@@ -0,0 +1,15 @@
+Apache Software License 2.0
+
+Copyright (c) ZenML GmbH 2023. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/examples/quickstart/license_header b/examples/quickstart/license_header
new file mode 100644
index 00000000000..264b1f357fe
--- /dev/null
+++ b/examples/quickstart/license_header
@@ -0,0 +1,16 @@
+Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
\ No newline at end of file
diff --git a/examples/quickstart/notebooks/quickstart.ipynb b/examples/quickstart/notebooks/quickstart.ipynb
deleted file mode 100644
index b489ab131a5..00000000000
--- a/examples/quickstart/notebooks/quickstart.ipynb
+++ /dev/null
@@ -1,975 +0,0 @@
-{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Get Up and Running Quickly\n",
- "\n",
- "## 🌍 Overview\n",
- "\n",
- "This quickstart demonstrates some of ZenML's features. We will:\n",
- "\n",
- "- Import some data from a public dataset (Adult Census Income), then train two models (SGD and Random Forest)\n",
- "- Compare and evaluate which model performs better, and deploy the best one.\n",
- "- Run a prediction on the deployed model.\n",
- "\n",
- "Along the way we will also show you how to:\n",
- "\n",
- "- Automatically version, track, and cache data, models, and other artifacts,\n",
- "- Track model hyperparameters and metrics in an experiment tracking tool\n",
- "\n",
- "This will give you enough to get started building your own ZenML Pipelines.\n",
- "Let's dive in!\n"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Run on Colab\n",
- "\n",
- "You can use Google Colab to see ZenML in action, no signup / installation\n",
- "required!\n",
- "\n",
- "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n",
- "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/notebooks/quickstart.ipynb)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 1. Install Requirements\n",
- "\n",
- "Let's install ZenML to get started. First we'll install the latest version of\n",
- "ZenML as well as the two integrations we'll need for this quickstart: `sklearn`\n",
- "and `mlflow`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install \"zenml[server]\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.environment import Environment\n",
- "\n",
- "if Environment.in_google_colab():\n",
- " # Install Cloudflare Tunnel binary\n",
- " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!zenml integration install sklearn mlflow -y\n",
- "\n",
- "import IPython\n",
- "IPython.Application.instance().kernel.do_shutdown(restart=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Please wait for the installation to complete before running subsequent cells. At\n",
- "the end of the installation, the notebook kernel will automatically restart."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Optional: If you are using ZenML Cloud, execute the following cell with your tenant URL. Otherwise ignore."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n",
- "\n",
- "!zenml connect --url $zenml_server_url"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!zenml init"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Please wait for the installation to complete before running subsequent cells. At the end of the installation, the notebook kernel will automatically restart."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 2. Import Data\n",
- "\n",
- "We'll start off by importing our data. In this quickstart we'll be working with\n",
- "[the Adult Census Income](https://archive.ics.uci.edu/dataset/2/adult) dataset\n",
- "which is publicly available on the UCI Machine Learning Repository. The task is\n",
- "to predict whether a person makes over $50k a year based on a number of\n",
- "features. These features are things like age, work class, education level,\n",
- "marital status, occupation, relationship, race, sex, capital gain, capital loss,\n",
- "hours per week, and native country.\n",
- "\n",
- "When you're getting started with a machine learning problem you'll want to do\n",
- "something similar to this: import your data and get it in the right shape for\n",
- "your training. ZenML mostly gets out of your way when you're writing your Python\n",
- "code, as you'll see from the following cell."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from typing import Tuple\n",
- "\n",
- "import pandas as pd\n",
- "from sklearn.model_selection import train_test_split\n",
- "\n",
- "from zenml import step\n",
- "\n",
- "\n",
- "@step\n",
- "def training_data_loader() -> (\n",
- " Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]\n",
- "):\n",
- " \"\"\"Load the Census Income dataset as tuple of Pandas DataFrame / Series.\"\"\"\n",
- " # Load the dataset\n",
- " url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
- " column_names = [\n",
- " \"age\",\n",
- " \"workclass\",\n",
- " \"fnlwgt\",\n",
- " \"education\",\n",
- " \"education-num\",\n",
- " \"marital-status\",\n",
- " \"occupation\",\n",
- " \"relationship\",\n",
- " \"race\",\n",
- " \"sex\",\n",
- " \"capital-gain\",\n",
- " \"capital-loss\",\n",
- " \"hours-per-week\",\n",
- " \"native-country\",\n",
- " \"income\",\n",
- " ]\n",
- " data = pd.read_csv(\n",
- " url, names=column_names, na_values=\"?\", skipinitialspace=True\n",
- " )\n",
- "\n",
- " # Drop rows with missing values\n",
- " data = data.dropna()\n",
- "\n",
- " # Encode categorical features and drop original columns\n",
- " categorical_cols = [\n",
- " \"workclass\",\n",
- " \"education\",\n",
- " \"marital-status\",\n",
- " \"occupation\",\n",
- " \"relationship\",\n",
- " \"race\",\n",
- " \"sex\",\n",
- " \"native-country\",\n",
- " ]\n",
- " data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)\n",
- "\n",
- " # Encode target feature\n",
- " data[\"income\"] = data[\"income\"].apply(\n",
- " lambda x: 1 if x.strip() == \">50K\" else 0\n",
- " )\n",
- "\n",
- " # Separate features and target\n",
- " X = data.drop(\"income\", axis=1)\n",
- " y = data[\"income\"]\n",
- "\n",
- " # Split the dataset into train and test sets\n",
- " X_train, X_test, y_train, y_test = train_test_split(\n",
- " X, y, test_size=0.2, random_state=42\n",
- " )\n",
- " return (X_train, X_test, y_train, y_test)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We download the data, dropping some columns and then splitting it up into train\n",
- "and test sets. The whole function is decorated with the `@step` decorator, which\n",
- "tells ZenML to track this function as a step in the pipeline. This means that\n",
- "ZenML will automatically version, track, and cache the data that is produced by\n",
- "this function. This is a very powerful feature, as it means that you can\n",
- "reproduce your data at any point in the future, even if the original data source\n",
- "changes or disappears.\n",
- "\n",
- "You'll also notice that we have included type hints for the outputs\n",
- "to the function. These are not only useful for anyone reading your code, but\n",
- "help ZenML process your data in a way appropriate to the specific data types."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "ZenML is built in a way that allows you to experiment with your data and build\n",
- "your pipelines as you work, so if you want to call this function to see how it\n",
- "works, you can just call it directly. Here we take a look at the first few rows\n",
- "of your training dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train, X_test, y_train, y_test = training_data_loader()\n",
- "X_train.head()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Everything looks as we'd expect and the values are all in the right format. We\n",
- "can shift to training some models now! 🥳"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 3. Train Models\n",
- "\n",
- "Now that we have our data it makes sense to train some models to get a sense of\n",
- "how difficult the task is. The Census Income\n",
- "dataset is sufficiently large and complex that it's unlikely we'll be able to\n",
- "train a model that behaves perfectly since the problem is inherently complex,\n",
- "but we can get a sense of what a reasonable baseline looks like.\n",
- "\n",
- "We'll start with two simple models, a SGD Classifier and a Random Forest\n",
- "Classifier, both batteries-included from `sklearn`. We'll train them both on the\n",
- "same data and then compare their performance.\n",
- "\n",
- "Since we're starting our work properly, it makes sense to start tracking the\n",
- "experimentation that we're doing. ZenML integrates with MLflow to make this\n",
- "easy. This happens out of the box when using our experiment tracker integration\n",
- "and stack components. We'll see how this works below, but first let's set up\n",
- "ZenML to know that it should use the MLFlow experiment tracker."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Register the MLflow experiment tracker\n",
- "!zenml experiment-tracker register mlflow --flavor=mlflow\n",
- "\n",
- "# Register a new stack with our experiment tracker\n",
- "!zenml stack register quickstart -a default\\\n",
- " -o default\\\n",
- " -e mlflow\n",
- "\n",
- "!zenml stack set quickstart"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can now write the steps where we'll\n",
- "train our models, making sure to specify the name of our experiment tracker in\n",
- "the `@step` decorator. We could specify this manually using a string, but\n",
- "instead we'll use the ZenML `Client` to access the name of our active stack's\n",
- "experiment tracker."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import mlflow\n",
- "\n",
- "from sklearn.base import ClassifierMixin\n",
- "from sklearn.ensemble import RandomForestClassifier\n",
- "from sklearn.linear_model import SGDClassifier\n",
- "\n",
- "from zenml.client import Client\n",
- "\n",
- "experiment_tracker = Client().active_stack.experiment_tracker\n",
- "\n",
- "\n",
- "@step(experiment_tracker=experiment_tracker.name)\n",
- "def random_forest_trainer_mlflow(\n",
- " X_train: pd.DataFrame,\n",
- " y_train: pd.Series,\n",
- ") -> ClassifierMixin:\n",
- " \"\"\"Train a sklearn Random Forest classifier and log to MLflow.\"\"\"\n",
- " mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow\n",
- " model = RandomForestClassifier()\n",
- " model.fit(X_train.to_numpy(), y_train.to_numpy())\n",
- " train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())\n",
- " print(f\"Train accuracy: {train_acc}\")\n",
- " return model\n",
- "\n",
- "\n",
- "@step(experiment_tracker=experiment_tracker.name)\n",
- "def sgd_trainer_mlflow(\n",
- " X_train: pd.DataFrame,\n",
- " y_train: pd.Series,\n",
- ") -> ClassifierMixin:\n",
- " \"\"\"Train a SGD classifier and log to MLflow.\"\"\"\n",
- " mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow\n",
- " model = SGDClassifier()\n",
- " model.fit(X_train.to_numpy(), y_train.to_numpy())\n",
- " train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())\n",
- " print(f\"Train accuracy: {train_acc}\")\n",
- " return model"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Our two training steps both return different kinds of `sklearn` classifier\n",
- "models, so we use the generic `ClassifierMixin` type hint for the return type."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The end goal of this quick baseline evaluation is to understand which of the two\n",
- "models performs better. We'll use the `evaluator` step to compare the two\n",
- "models. This step takes in the two models we trained above, and compares them on\n",
- "the test data we created earlier. It returns whichever model performs best along\n",
- "with the accuracy score for that model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from typing_extensions import Annotated\n",
- "\n",
- "\n",
- "@step\n",
- "def best_model_selector(\n",
- " X_test: pd.DataFrame,\n",
- " y_test: pd.Series,\n",
- " model1: ClassifierMixin,\n",
- " model2: ClassifierMixin,\n",
- ") -> Tuple[\n",
- " Annotated[ClassifierMixin, \"best_model\"],\n",
- " Annotated[float, \"best_model_test_acc\"],\n",
- "]:\n",
- " \"\"\"Calculate the accuracy on the test set and return the best model and its accuracy.\"\"\"\n",
- " test_acc1 = model1.score(X_test.to_numpy(), y_test.to_numpy())\n",
- " test_acc2 = model2.score(X_test.to_numpy(), y_test.to_numpy())\n",
- " print(f\"Test accuracy ({model1.__class__.__name__}): {test_acc1}\")\n",
- " print(f\"Test accuracy ({model2.__class__.__name__}): {test_acc2}\")\n",
- " if test_acc1 > test_acc2:\n",
- " best_model = model1\n",
- " best_model_test_acc = test_acc1\n",
- " else:\n",
- " best_model = model2\n",
- " best_model_test_acc = test_acc2\n",
- " return best_model, best_model_test_acc"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Note the use of the `typing` module's `Annotated` type hint in the output of the\n",
- "step. We're using this to give a name to the output of the step, which will make\n",
- "it possible to access it via a keyword later on."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We'll likely want to use our model in the future so instead of simply outputting\n",
- "the model we'll use the MLflow model registry to store it. This allows us to\n",
- "version the model for retrieval and use later on as well as to use other\n",
- "functionality made possible within the MLflow dashboard. This step is a bit\n",
- "different from the ones listed above in that we're using a pre-built ZenML step\n",
- "instead of just writing our own. You'll often come across these pre-built steps\n",
- "for common workflows."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.integrations.mlflow.steps.mlflow_registry import (\n",
- " mlflow_register_model_step,\n",
- ")\n",
- "\n",
- "model_name = \"zenml-quickstart-model\"\n",
- "\n",
- "register_model = mlflow_register_model_step.with_options(\n",
- " parameters=dict(\n",
- " name=model_name,\n",
- " description=\"The first run of the Quickstart pipeline.\",\n",
- " )\n",
- ")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We're now at the point where can bring all these steps together into a single\n",
- "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n",
- "as simple as adding a `@pipeline` decorator to a function. This specific\n",
- "pipeline doesn't return a value, but that option is available to you if you need."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml import pipeline\n",
- "\n",
- "\n",
- "@pipeline(enable_cache=True)\n",
- "def train_and_register_model_pipeline() -> None:\n",
- " \"\"\"Train a model.\"\"\"\n",
- " X_train, X_test, y_train, y_test = training_data_loader()\n",
- " model1 = random_forest_trainer_mlflow(X_train=X_train, y_train=y_train)\n",
- " model2 = sgd_trainer_mlflow(X_train=X_train, y_train=y_train)\n",
- " best_model, _ = best_model_selector(\n",
- " X_test=X_test, y_test=y_test, model1=model1, model2=model2\n",
- " )\n",
- " register_model(best_model)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We've used the built-in MLflow registry to store our model, but ZenML doesn't\n",
- "yet know that we want to use the MLflow flavor of the model registry stack\n",
- "component in our stack. Let's add that now and update our stack."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Register the MLflow model registry\n",
- "!zenml model-registry register mlflow --flavor=mlflow\n",
- "\n",
- "# Update our stack to include the model registry\n",
- "!zenml stack update quickstart -r mlflow"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "![](../_assets/local_stack_with_local_mlflow_tracker_and_registry.png)"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We're ready to run the pipeline now, which we can do just -- as with the step -- by calling the\n",
- "pipeline function itself."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_and_register_model_pipeline()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can see from the logs already how our model training went: the\n",
- "`RandomForestClassifier` performed considerably better than the `SGDClassifier`,\n",
- "so that will have been the model that was returned from the evaluation step and\n",
- "then registered with the MLflow model registry."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "At this point you might be interested to view your pipeline in the ZenML\n",
- "Dashboard. You can spin this up by executing the next cell. This will start a\n",
- "server which you can access by clicking on the link that appears in the output\n",
- "of the cell.\n",
- "\n",
- "Log into the Dashboard using default credentials (username 'default' and\n",
- "password left blank). From there you can inspect the pipeline or the specific\n",
- "pipeline run. You can also examine the stack and components that we've\n",
- "registered to run everything.\n",
- "\n",
- "![](../llm_quickstart/_assets/zenml-up.gif)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.environment import Environment\n",
- "\n",
- "if Environment.in_google_colab():\n",
- " # run ZenML through a cloudflare tunnel to get a public endpoint\n",
- " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n",
- "else:\n",
- " !zenml up"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We're using MLflow for our experiment tracking. If you'd like to inspect the\n",
- "MLflow dashboard to see your experiments and what's been logged so far, run the\n",
- "following cell. This cell will spin up a local server that you can access via\n",
- "the link mentioned after the \"Listening at:\" `INFO` log statement."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "from zenml.integrations.mlflow.mlflow_utils import get_tracking_uri\n",
- "\n",
- "os.environ[\"MLFLOW_TRACKING_URI\"] = get_tracking_uri()\n",
- "\n",
- "if Environment.in_google_colab():\n",
- " # run mlflow through a cloudflare tunnel to get a public endpoint\n",
- " !mlflow ui --backend-store-uri $MLFLOW_TRACKING_URI & cloudflared tunnel --url http://localhost:5000\n",
- "else:\n",
- " !mlflow ui --backend-store-uri $MLFLOW_TRACKING_URI"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Our pipeline above registered the best model with the MLflow model registry.\n",
- "Whenever you register a model it also versions the model since it's likely that\n",
- "you'll be iterating and improving your model over time.\n",
- "\n",
- "We'll now turn to actually deploying our model and serving some predictions, for\n",
- "which we'll need to specify the model version we want to use. You can specify\n",
- "the version number manually but below we'll use the ZenML `Client` to get the\n",
- "latest version number."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.client import Client\n",
- "\n",
- "most_recent_model_version_number = int(\n",
- " Client()\n",
- " .active_stack.model_registry.list_model_versions(metadata={})[0]\n",
- " .version\n",
- ")\n",
- "most_recent_model_version_number"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we've trained our model, and we've found the best one, we want to deploy it\n",
- "and run some inference on the deployed model. We'll use the local MLflow model\n",
- "deployer which once again comes with some pre-built ZenML steps to save you reinventing the wheel."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.integrations.mlflow.steps.mlflow_deployer import (\n",
- " mlflow_model_registry_deployer_step,\n",
- ")\n",
- "\n",
- "model_deployer = mlflow_model_registry_deployer_step.with_options(\n",
- " parameters=dict(\n",
- " registry_model_name=model_name,\n",
- " registry_model_version=most_recent_model_version_number,\n",
- " )\n",
- ")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "When you deploy a model this is usually something you want to remain available\n",
- "and running for a long time, so ZenML automatically creates a background service\n",
- "for your deployed model. We load the service (already created by the\n",
- "`model_deployer` step) and then use it to make some predictions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from zenml.services import BaseService\n",
- "from zenml.client import Client\n",
- "\n",
- "\n",
- "@step(enable_cache=False)\n",
- "def prediction_service_loader() -> BaseService:\n",
- " \"\"\"Load the model service of our train_and_register_model_pipeline.\"\"\"\n",
- " client = Client()\n",
- " model_deployer = client.active_stack.model_deployer\n",
- " services = model_deployer.find_model_server(\n",
- " pipeline_name=\"train_and_register_model_pipeline\",\n",
- " running=True,\n",
- " )\n",
- " return services[0]\n",
- "\n",
- "\n",
- "@step\n",
- "def predictor(\n",
- " service: BaseService,\n",
- " data: pd.DataFrame,\n",
- ") -> Annotated[list, \"predictions\"]:\n",
- " \"\"\"Run a inference request against a prediction service.\"\"\"\n",
- " service.start(timeout=10) # should be a NOP if already started\n",
- " print(f\"Running predictions on data (single individual): {data.to_numpy()[0]}\")\n",
- " prediction = service.predict(data.to_numpy())\n",
- " print(f\"Prediction (for single example slice) is: {bool(prediction.tolist()[0])}\")\n",
- " return prediction.tolist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Register the MLflow model deployer\n",
- "!zenml model-deployer register mlflow --flavor=mlflow\n",
- "\n",
- "# Register a new stack with the new stack components\n",
- "!zenml stack update quickstart -d mlflow"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once again there is one dependency in terms of how the step needs to run, so\n",
- "we specify it upfront: the prediction service needs to be loaded\n",
- "before we try to make predictions with it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "@pipeline\n",
- "def deploy_and_predict() -> None:\n",
- " \"\"\"Deploy the best model and run some predictions.\"\"\"\n",
- " prediction_service_loader.after(model_deployer)\n",
- "\n",
- " model_deployer()\n",
- " _, inference_data, _, _ = training_data_loader()\n",
- " model_deployment_service = prediction_service_loader()\n",
- " predictor(service=model_deployment_service, data=inference_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice how we specify that we want the `prediction_service_loader` step to run *after* the\n",
- "model_deployer step. This is because we won't have a model ready for prediction\n",
- "until the deployment has taken place. ZenML automatically tries to run steps in\n",
- "parallel, so sometimes if you have this kind of sequencing you need to do then\n",
- "you'll need to specify it explicitly."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "![](../_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png)\n",
- "\n",
- "Unlike in the previous case where we just ran the pipeline directly, we might\n",
- "not want to deploy the model every time. Consider the case where our models are\n",
- "returning values under 50% accuracy on the test data. In that case we might want\n",
- "to address the issues with accuracy and not spin up a deployment at all. We can\n",
- "access the artifacts associated with the previous pipeline run and check the\n",
- "test accuracy metric to see if it's above a certain threshold. Adding this to\n",
- "our workflow is as simple as adding a conditional step."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "best_model_test_accuracy = (\n",
- " Client().get_pipeline(\"train_and_register_model_pipeline\")\n",
- " .last_successful_run.steps[\"best_model_selector\"]\n",
- " .outputs[\"best_model_test_acc\"].load()\n",
- ")\n",
- "\n",
- "if best_model_test_accuracy > 0.7:\n",
- " deploy_and_predict()"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "If you recall, the purpose of this model was to predict whether or not someone\n",
- "earns more than \\$50,000 USD per year. You can see a single example in the output above.\n",
- "Given the features of a particular individual, the model predicts that they do\n",
- "not earn more than $50k per year.\n",
- "\n",
- "If we were interested in learning more about the model's predictions, we could\n",
- "separately load the predictor service and use it to pass in some other data or\n",
- "try things out. To load the predictor we can run:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predictor_service = deploy_and_predict.model.last_successful_run.steps[\n",
- " \"prediction_service_loader\"\n",
- "].output.load()\n",
- "\n",
- "predictor_service"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "At this point, passing in some data is as simple as calling the `predict` method\n",
- "on the predictor service. We can try this here:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\n",
- " f\"Model predictions: {predictor_service.predict(X_test.to_numpy()[25:35])}\"\n",
- ")\n",
- "print(f\"Ground truth: {y_test.to_numpy()[25:35]}\")"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We're passing in some of our test data into the model and getting back the\n",
- "predictions. You can already start to see some of the places where our\n",
- "predictions are not matching the ground truth labels. This is to be expected but\n",
- "we could potentially use this to now iterate on our models by adding more steps.\n",
- "\n",
- "To get an overview of the models and model versions that we have registered and\n",
- "deployed so\n",
- "far, we can use the CLI to list these out."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!zenml model-registry models list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!zenml model-registry models list-versions zenml-quickstart-model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!zenml model-deployer models list"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To view all this on the ZenML Dashboard, simply spin up the server again and\n",
- "view the steps via the DAG visualiser and also browse the artifacts."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "if Environment.in_google_colab():\n",
- " !zenml down # server needs restarting due to colab bug.\n",
- " # run ZenML through a cloudflare tunnel to get a public endpoint\n",
- " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n",
- "else:\n",
- " !zenml up"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Congratulations!\n",
- "\n",
- "You just built two ML pipelines! You trained two models, evaluated them against\n",
- "a test set, registered the best one with the MLflow model registry, deployed it\n",
- "and served some predictions. You also learned how to iterate on your models and\n",
- "data by using some of the ZenML utility abstractions. You saw how to view your\n",
- "artifacts and stacks via the CLI as well as the ZenML Dashboard.\n",
- "\n",
- "And that is just the tip of the iceberg of what ZenML can do; check out the [**Integrations**](https://zenml.io/integrations) page for a list of all the cool MLOps tools that ZenML supports!\n",
- "\n",
- "## What to do now\n",
- "\n",
- "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack-invite) and become part of the ZenML family!\n",
- "* If you want to try ZenML in a real-world setting... check out the [ZenML Cloud](https://cloud.zenml.io/), a free trial of\n",
- " ZenML's managed offering that runs on your Cloud platform. [**Sign up here**](https://sandbox.zenml.io/)."
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/examples/quickstart/pipelines/__init__.py b/examples/quickstart/pipelines/__init__.py
index e69de29bb2d..0630c39c2e2 100644
--- a/examples/quickstart/pipelines/__init__.py
+++ b/examples/quickstart/pipelines/__init__.py
@@ -0,0 +1,20 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .feature_engineering import feature_engineering
+from .inference import inference
+from .training import training
diff --git a/examples/quickstart/pipelines/deploy_and_predict.py b/examples/quickstart/pipelines/deploy_and_predict.py
deleted file mode 100644
index 6d643d56190..00000000000
--- a/examples/quickstart/pipelines/deploy_and_predict.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from steps.model_deployer import model_deployer
-from steps.prediction_service_loader import prediction_service_loader
-from steps.predictor import predictor
-from steps.training_data_loader import training_data_loader
-
-from zenml import pipeline
-from zenml.config import DockerSettings
-from zenml.integrations.constants import MLFLOW, SKLEARN
-
-docker_settings = DockerSettings(
- required_integrations=[MLFLOW, SKLEARN],
- requirements=[
- "numpy==1.24.3",
- "scipy==1.10.1",
- "typing-extensions==4.6.3",
- ],
-)
-
-
-@pipeline(enable_cache=True, settings={"docker": docker_settings})
-def deploy_and_predict() -> None:
- """Deploy the best model and run some predictions."""
- prediction_service_loader.after(model_deployer)
-
- model_deployer()
- _, inference_data, _, _ = training_data_loader()
- model_deployment_service = prediction_service_loader()
- predictor(service=model_deployment_service, data=inference_data)
diff --git a/examples/quickstart/pipelines/feature_engineering.py b/examples/quickstart/pipelines/feature_engineering.py
new file mode 100644
index 00000000000..1927f17f594
--- /dev/null
+++ b/examples/quickstart/pipelines/feature_engineering.py
@@ -0,0 +1,74 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Optional
+
+from steps import (
+ data_loader,
+ data_preprocessor,
+ data_splitter,
+)
+
+from zenml import pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def feature_engineering(
+ test_size: float = 0.2,
+ drop_na: Optional[bool] = None,
+ normalize: Optional[bool] = None,
+ drop_columns: Optional[List[str]] = None,
+ target: Optional[str] = "target",
+ random_state: int = 17,
+):
+ """
+ Feature engineering pipeline.
+
+ This is a pipeline that loads the data, processes it and splits
+ it into train and test sets.
+
+ Args:
+ test_size: Size of holdout set for training 0.0..1.0
+ drop_na: If `True` NA values will be removed from dataset
+ normalize: If `True` dataset will be normalized with MinMaxScaler
+ drop_columns: List of columns to drop from dataset
+ target: Name of target column in dataset
+ random_state: Random state to configure the data loader
+
+ Returns:
+ The processed datasets (dataset_trn, dataset_tst).
+ """
+ # Link all the steps together by calling them and passing the output
+ # of one step as the input of the next step.
+ raw_data = data_loader(random_state=random_state, target=target)
+ dataset_trn, dataset_tst = data_splitter(
+ dataset=raw_data,
+ test_size=test_size,
+ )
+ dataset_trn, dataset_tst, _ = data_preprocessor(
+ dataset_trn=dataset_trn,
+ dataset_tst=dataset_tst,
+ drop_na=drop_na,
+ normalize=normalize,
+ drop_columns=drop_columns,
+ target=target,
+ random_state=random_state,
+ )
+ return dataset_trn, dataset_tst
diff --git a/examples/quickstart/pipelines/inference.py b/examples/quickstart/pipelines/inference.py
new file mode 100644
index 00000000000..fee74371adc
--- /dev/null
+++ b/examples/quickstart/pipelines/inference.py
@@ -0,0 +1,62 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from steps import (
+ data_loader,
+ inference_predict,
+ inference_preprocessor,
+)
+
+from zenml import get_pipeline_context, pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def inference(random_state: str, target: str):
+ """
+ Model inference pipeline.
+
+ This is a pipeline that loads the inference data, processes it with
+ the same preprocessing pipeline used in training, and runs inference
+ with the trained model.
+
+ Args:
+ random_state: Random state for reproducibility.
+ target: Name of target column in dataset.
+ """
+ # Get the production model artifact
+ model = get_pipeline_context().model_version.get_artifact("model")
+
+ # Get the preprocess pipeline artifact associated with this version
+ preprocess_pipeline = get_pipeline_context().model_version.get_artifact(
+ "preprocess_pipeline"
+ )
+
+ # Link all the steps together by calling them and passing the output
+ # of one step as the input of the next step.
+ df_inference = data_loader(random_state=random_state, is_inference=True)
+ df_inference = inference_preprocessor(
+ dataset_inf=df_inference,
+ preprocess_pipeline=preprocess_pipeline,
+ target=target,
+ )
+ inference_predict(
+ model=model,
+ dataset_inf=df_inference,
+ )
diff --git a/examples/quickstart/pipelines/train_and_register_model.py b/examples/quickstart/pipelines/train_and_register_model.py
deleted file mode 100644
index b0d734d7003..00000000000
--- a/examples/quickstart/pipelines/train_and_register_model.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from steps.best_model_selector import best_model_selector
-from steps.register_model import register_model
-from steps.trainers import random_forest_trainer_mlflow, sgd_trainer_mlflow
-from steps.training_data_loader import training_data_loader
-
-from zenml import pipeline
-from zenml.config import DockerSettings
-from zenml.integrations.constants import MLFLOW, SKLEARN
-
-docker_settings = DockerSettings(required_integrations=[MLFLOW, SKLEARN])
-
-
-@pipeline(enable_cache=True, settings={"docker": docker_settings})
-def train_and_register_model_pipeline() -> None:
- """Train a model."""
- X_train, X_test, y_train, y_test = training_data_loader()
- model1 = random_forest_trainer_mlflow(X_train=X_train, y_train=y_train)
- model2 = sgd_trainer_mlflow(X_train=X_train, y_train=y_train)
- best_model, _ = best_model_selector(
- X_test=X_test, y_test=y_test, model1=model1, model2=model2
- )
- register_model(best_model)
diff --git a/examples/quickstart/pipelines/training.py b/examples/quickstart/pipelines/training.py
new file mode 100644
index 00000000000..39814bbb61c
--- /dev/null
+++ b/examples/quickstart/pipelines/training.py
@@ -0,0 +1,75 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Optional
+from uuid import UUID
+
+from steps import model_evaluator, model_promoter, model_trainer
+
+from pipelines import (
+ feature_engineering,
+)
+from zenml import ExternalArtifact, pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def training(
+ train_dataset_id: Optional[UUID] = None,
+ test_dataset_id: Optional[UUID] = None,
+ target: Optional[str] = "target",
+ model_type: Optional[str] = "sgd",
+):
+ """
+ Model training pipeline.
+
+ This is a pipeline that loads the data from a preprocessing pipeline,
+ trains a model on it and evaluates the model. If it is the first model
+ to be trained, it will be promoted to production. If not, it will be
+ promoted only if it has a higher accuracy than the current production
+ model version.
+
+ Args:
+ train_dataset_id: ID of the train dataset produced by feature engineering.
+ test_dataset_id: ID of the test dataset produced by feature engineering.
+ target: Name of target column in dataset.
+ model_type: The type of model to train.
+ """
+ # Link all the steps together by calling them and passing the output
+ # of one step as the input of the next step.
+
+ # Execute Feature Engineering Pipeline
+ if train_dataset_id is None or test_dataset_id is None:
+ dataset_trn, dataset_tst = feature_engineering()
+ else:
+ dataset_trn = ExternalArtifact(id=train_dataset_id)
+ dataset_tst = ExternalArtifact(id=test_dataset_id)
+
+ model = model_trainer(
+ dataset_trn=dataset_trn, target=target, model_type=model_type
+ )
+
+ acc = model_evaluator(
+ model=model,
+ dataset_trn=dataset_trn,
+ dataset_tst=dataset_tst,
+ target=target,
+ )
+
+ model_promoter(accuracy=acc)
diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb
new file mode 100644
index 00000000000..36e502d79ee
--- /dev/null
+++ b/examples/quickstart/quickstart.ipynb
@@ -0,0 +1,1117 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "63ab391a",
+ "metadata": {},
+ "source": [
+ "# Intro to MLOps using ZenML\n",
+ "\n",
+ "## 🌍 Overview\n",
+ "\n",
+ "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features: \n",
+ "\n",
+ "- A feature engineering pipeline that loads data and prepares it for training.\n",
+ "- A training pipeline that loads the preprocessed dataset and trains a model.\n",
+ "- A batch inference pipeline that runs predictions on the trained model with new data.\n",
+ "\n",
+ "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f466b16",
+ "metadata": {},
+ "source": [
+ "## Run on Colab\n",
+ "\n",
+ "You can use Google Colab to see ZenML in action, no signup / installation\n",
+ "required!\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n",
+ "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/quickstart.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "66b2977c",
+ "metadata": {},
+ "source": [
+ "# 👶 Step 0. Install Requirements\n",
+ "\n",
+ "Let's install ZenML to get started. First we'll install the latest version of\n",
+ "ZenML as well as the `sklearn` integration of ZenML:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce2f40eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install \"zenml[server]\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5aad397e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from zenml.environment import Environment\n",
+ "\n",
+ "if Environment.in_google_colab():\n",
+ " # Install Cloudflare Tunnel binary\n",
+ " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f76f562e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!zenml integration install sklearn -y\n",
+ "\n",
+ "import IPython\n",
+ "IPython.Application.instance().kernel.do_shutdown(restart=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b044374",
+ "metadata": {},
+ "source": [
+ "Please wait for the installation to complete before running subsequent cells. At\n",
+ "the end of the installation, the notebook kernel will automatically restart."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e3955ff1",
+ "metadata": {},
+ "source": [
+ "Optional: If you are using [ZenML Cloud](https://zenml.io/cloud), execute the following cell with your tenant URL. Otherwise ignore."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e2587315",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n",
+ "\n",
+ "!zenml connect --url $zenml_server_url"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "081d5616",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize ZenML and set the default stack\n",
+ "!zenml init\n",
+ "\n",
+ "!zenml stack set default"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79f775f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Do the imports at the top\n",
+ "from typing_extensions import Annotated\n",
+ "from sklearn.datasets import load_breast_cancer\n",
+ "\n",
+ "import random\n",
+ "import pandas as pd\n",
+ "from zenml import step, ExternalArtifact, pipeline, ModelVersion, get_step_context\n",
+ "from zenml.client import Client\n",
+ "from zenml.logger import get_logger\n",
+ "from uuid import UUID\n",
+ "\n",
+ "from typing import Optional, List\n",
+ "\n",
+ "from zenml import pipeline\n",
+ "\n",
+ "from steps import (\n",
+ " data_loader,\n",
+ " data_preprocessor,\n",
+ " data_splitter,\n",
+ " model_evaluator,\n",
+ " inference_preprocessor\n",
+ ")\n",
+ "\n",
+ "from zenml.logger import get_logger\n",
+ "\n",
+ "logger = get_logger(__name__)\n",
+ "\n",
+ "# Initialize the ZenML client to fetch objects from the ZenML Server\n",
+ "client = Client()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35e48460",
+ "metadata": {},
+ "source": [
+ "## 🥇 Step 1: Load your data and execute feature engineering\n",
+ "\n",
+ "We'll start off by importing our data. In this quickstart we'll be working with\n",
+ "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n",
+ "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n",
+ "problem, to predict whether a patient is diagnosed with breast cancer or not.\n",
+ "\n",
+ "When you're getting started with a machine learning problem you'll want to do\n",
+ "something similar to this: import your data and get it in the right shape for\n",
+ "your training. ZenML mostly gets out of your way when you're writing your Python\n",
+ "code, as you'll see from the following cell.\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3cd974d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@step\n",
+ "def data_loader_simplified(\n",
+ " random_state: int, is_inference: bool = False, target: str = \"target\"\n",
+ ") -> Annotated[pd.DataFrame, \"dataset\"]: # We name the dataset \n",
+ " \"\"\"Dataset reader step.\"\"\"\n",
+ " dataset = load_breast_cancer(as_frame=True)\n",
+ " inference_size = int(len(dataset.target) * 0.05)\n",
+ " dataset: pd.DataFrame = dataset.frame\n",
+ " inference_subset = dataset.sample(inference_size, random_state=random_state)\n",
+ " if is_inference:\n",
+ " dataset = inference_subset\n",
+ " dataset.drop(columns=target, inplace=True)\n",
+ " else:\n",
+ " dataset.drop(inference_subset.index, inplace=True)\n",
+ " dataset.reset_index(drop=True, inplace=True)\n",
+ " logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n",
+ " return dataset\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e8ba4c6",
+ "metadata": {},
+ "source": [
+ "The whole function is decorated with the `@step` decorator, which\n",
+ "tells ZenML to track this function as a step in the pipeline. This means that\n",
+ "ZenML will automatically version, track, and cache the data that is produced by\n",
+ "this function as an `artifact`. This is a very powerful feature, as it means that you can\n",
+ "reproduce your data at any point in the future, even if the original data source\n",
+ "changes or disappears. \n",
+ "\n",
+ "Note the use of the `typing` module's `Annotated` type hint in the output of the\n",
+ "step. We're using this to give a name to the output of the step, which will make\n",
+ "it possible to access it via a keyword later on.\n",
+ "\n",
+ "You'll also notice that we have included type hints for the outputs\n",
+ "to the function. These are not only useful for anyone reading your code, but\n",
+ "help ZenML process your data in a way appropriate to the specific data types."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b6286b67",
+ "metadata": {},
+ "source": [
+ "ZenML is built in a way that allows you to experiment with your data and build\n",
+ "your pipelines as you work, so if you want to call this function to see how it\n",
+ "works, you can just call it directly. Here we take a look at the first few rows\n",
+ "of your training dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d838e2ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = data_loader_simplified(random_state=42)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28c05291",
+ "metadata": {},
+ "source": [
+ "Everything looks as we'd expect and the values are all in the right format 🥳.\n",
+ "\n",
+ "We're now at the point where can bring this step (and some others) together into a single\n",
+ "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n",
+ "as simple as adding a `@pipeline` decorator to a function. This specific\n",
+ "pipeline doesn't return a value, but that option is available to you if you need."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b50a9537",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@pipeline\n",
+ "def feature_engineering(\n",
+ " test_size: float = 0.3,\n",
+ " drop_na: Optional[bool] = None,\n",
+ " normalize: Optional[bool] = None,\n",
+ " drop_columns: Optional[List[str]] = None,\n",
+ " target: Optional[str] = \"target\",\n",
+ " random_state: int = 17\n",
+ "):\n",
+ " \"\"\"Feature engineering pipeline.\"\"\"\n",
+ " # Link all the steps together by calling them and passing the output\n",
+ " # of one step as the input of the next step.\n",
+ " raw_data = data_loader(random_state=random_state, target=target)\n",
+ " dataset_trn, dataset_tst = data_splitter(\n",
+ " dataset=raw_data,\n",
+ " test_size=test_size,\n",
+ " )\n",
+ " dataset_trn, dataset_tst, _ = data_preprocessor(\n",
+ " dataset_trn=dataset_trn,\n",
+ " dataset_tst=dataset_tst,\n",
+ " drop_na=drop_na,\n",
+ " normalize=normalize,\n",
+ " drop_columns=drop_columns,\n",
+ " target=target,\n",
+ " random_state=random_state,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7cd73c23",
+ "metadata": {},
+ "source": [
+ "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n",
+ "pipeline function itself:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e0aa9af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_engineering()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1785c303",
+ "metadata": {},
+ "source": [
+ "Let's run this again with a slightly different test size, to create more datasets:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "658c0570-2607-4b97-a72d-d45c92633e48",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_engineering(test_size=0.25)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64bb7206",
+ "metadata": {},
+ "source": [
+ "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n",
+ "This is because ZenML automatically determined that nothing had changed in the data loader step, \n",
+ "so it didn't need to rerun it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf",
+ "metadata": {},
+ "source": [
+ "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e1d8546",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_engineering(test_size=0.25, random_state=104)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6c42078a",
+ "metadata": {},
+ "source": [
+ "At this point you might be interested to view your pipeline runs in the ZenML\n",
+ "Dashboard. In case you are not using a hosted instance of ZenML, you can spin this up by executing the next cell. This will start a\n",
+ "server which you can access by clicking on the link that appears in the output\n",
+ "of the cell.\n",
+ "\n",
+ "Log into the Dashboard using default credentials (username 'default' and\n",
+ "password left blank). From there you can inspect the pipeline or the specific\n",
+ "pipeline run.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8cd3cc8c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from zenml.environment import Environment\n",
+ "from zenml.zen_stores.rest_zen_store import RestZenStore\n",
+ "\n",
+ "\n",
+ "if not isinstance(client.zen_store, RestZenStore):\n",
+ " # Only spin up a local Dashboard in case you aren't already connected to a remote server\n",
+ " if Environment.in_google_colab():\n",
+ " # run ZenML through a cloudflare tunnel to get a public endpoint\n",
+ " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n",
+ " else:\n",
+ " !zenml up"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8471f93",
+ "metadata": {},
+ "source": [
+ "We can also fetch the pipeline from the server and view the results directly in the notebook:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f208b200",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "client = Client()\n",
+ "run = client.get_pipeline(\"feature_engineering\").last_run\n",
+ "print(run.name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a037f09d",
+ "metadata": {},
+ "source": [
+ "We can also see the data artifacts that were produced by the last step of the pipeline:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34283e89",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "run.steps[\"data_preprocessor\"].outputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bceb0312",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read one of the datasets. This is the one with a 0.25 test split\n",
+ "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26d26436",
+ "metadata": {},
+ "source": [
+ "We can also get the artifacts directly. Each time you create a new pipeline run, a new `artifact version` is created.\n",
+ "\n",
+ "You can fetch these artifact and their versions using the `client`: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8f90647",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get artifact version from our run\n",
+ "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"] \n",
+ "\n",
+ "# Get latest version from client directly\n",
+ "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n",
+ "\n",
+ "# This should be true if our run is the latest run and no artifact has been produced\n",
+ "# in the intervening time\n",
+ "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3f9d3dfd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fetch the rest of the artifacts\n",
+ "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n",
+ "preprocessing_pipeline_artifact_version = client.get_artifact_version(\"preprocess_pipeline\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a7d1b04",
+ "metadata": {},
+ "source": [
+ "If you started with a fresh install, then you would have two versions corresponding\n",
+ "to the two pipelines that we ran above. We can even load a artifact version in memory: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c82aca75",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load an artifact to verify you can fetch it\n",
+ "dataset_trn_artifact_version.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5963509e",
+ "metadata": {},
+ "source": [
+ "We'll use these artifacts from above in our next pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8c28b474",
+ "metadata": {},
+ "source": [
+ "# ⌚ Step 2: Training pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87909827",
+ "metadata": {},
+ "source": [
+ "Now that we have our data it makes sense to train some models to get a sense of\n",
+ "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n",
+ "that it's unlikely we'll be able to train a model that behaves perfectly since the problem \n",
+ "is inherently complex, but we can get a sense of what a reasonable baseline looks like.\n",
+ "\n",
+ "We'll start with two simple models, a SGD Classifier and a Random Forest\n",
+ "Classifier, both batteries-included from `sklearn`. We'll train them both on the\n",
+ "same data and then compare their performance.\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fccf1bd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.base import ClassifierMixin\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.linear_model import SGDClassifier\n",
+ "from typing_extensions import Annotated\n",
+ "from zenml import ArtifactConfig, step\n",
+ "from zenml.logger import get_logger\n",
+ "\n",
+ "logger = get_logger(__name__)\n",
+ "\n",
+ "\n",
+ "@step\n",
+ "def model_trainer(\n",
+ " dataset_trn: pd.DataFrame,\n",
+ " model_type: str = \"sgd\",\n",
+ ") -> Annotated[ClassifierMixin, ArtifactConfig(name=\"model\", is_model_artifact=True)]:\n",
+ " \"\"\"Configure and train a model on the training dataset.\"\"\"\n",
+ " target = \"target\"\n",
+ " if model_type == \"sgd\":\n",
+ " model = SGDClassifier()\n",
+ " elif model_type == \"rf\":\n",
+ " model = RandomForestClassifier()\n",
+ " else:\n",
+ " raise ValueError(f\"Unknown model type {model_type}\") \n",
+ "\n",
+ " logger.info(f\"Training model {model}...\")\n",
+ "\n",
+ " model.fit(\n",
+ " dataset_trn.drop(columns=[target]),\n",
+ " dataset_trn[target],\n",
+ " )\n",
+ " return model\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73a00008",
+ "metadata": {},
+ "source": [
+ "Our two training steps both return different kinds of `sklearn` classifier\n",
+ "models, so we use the generic `ClassifierMixin` type hint for the return type."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a5f22174",
+ "metadata": {},
+ "source": [
+ "ZenML allows you to load any version of any dataset that is tracked by the framework\n",
+ "directly into a pipeline using the `ExternalArtifact` interface. This is very convenient\n",
+ "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n",
+ "into the training pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1aa98f2f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@pipeline\n",
+ "def training(\n",
+ " train_dataset_id: Optional[UUID] = None,\n",
+ " test_dataset_id: Optional[UUID] = None,\n",
+ " model_type: str = \"sgd\",\n",
+ " min_train_accuracy: float = 0.0,\n",
+ " min_test_accuracy: float = 0.0,\n",
+ "):\n",
+ " \"\"\"Model training pipeline.\"\"\" \n",
+ " if train_dataset_id is None or test_dataset_id is None:\n",
+ " # If we dont pass the IDs, this will run the feature engineering pipeline \n",
+ " dataset_trn, dataset_tst = feature_engineering()\n",
+ " else:\n",
+ " # Load the datasets from an older pipeline\n",
+ " dataset_trn = ExternalArtifact(id=train_dataset_id)\n",
+ " dataset_tst = ExternalArtifact(id=test_dataset_id) \n",
+ "\n",
+ " trained_model = model_trainer(\n",
+ " dataset_trn=dataset_trn,\n",
+ " model_type=model_type,\n",
+ " )\n",
+ "\n",
+ " model_evaluator(\n",
+ " model=trained_model,\n",
+ " dataset_trn=dataset_trn,\n",
+ " dataset_tst=dataset_tst,\n",
+ " min_train_accuracy=min_train_accuracy,\n",
+ " min_test_accuracy=min_test_accuracy,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88b70fd3",
+ "metadata": {},
+ "source": [
+ "The end goal of this quick baseline evaluation is to understand which of the two\n",
+ "models performs better. We'll use the `evaluator` step to compare the two\n",
+ "models. This step takes in the model from the trainer step, and computes its score\n",
+ "over the testing set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c64885ac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use a random forest model with the chosen datasets.\n",
+ "# We need to pass the ID's of the datasets into the function\n",
+ "training(\n",
+ " model_type=\"rf\",\n",
+ " train_dataset_id=dataset_trn_artifact_version.id,\n",
+ " test_dataset_id=dataset_tst_artifact_version.id\n",
+ ")\n",
+ "\n",
+ "rf_run = client.get_pipeline(\"training\").last_run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4300c82f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use a SGD classifier\n",
+ "sgd_run = training(\n",
+ " model_type=\"sgd\",\n",
+ " train_dataset_id=dataset_trn_artifact_version.id,\n",
+ " test_dataset_id=dataset_tst_artifact_version.id\n",
+ ")\n",
+ "\n",
+ "sgd_run = client.get_pipeline(\"training\").last_run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43f1a68a",
+ "metadata": {},
+ "source": [
+ "You can see from the logs already how our model training went: the\n",
+ "`RandomForestClassifier` performed considerably better than the `SGDClassifier`.\n",
+ "We can use the ZenML `Client` to verify this:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d95810b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The evaluator returns a float value with the accuracy\n",
+ "rf_run.steps[\"model_evaluator\"].output.load() > sgd_run.steps[\"model_evaluator\"].output.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e256d145",
+ "metadata": {},
+ "source": [
+ "# 💯 Step 3: Associating a model with your pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "927978f3",
+ "metadata": {},
+ "source": [
+ "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n",
+ "all the models produced as you develop your experiments and use-cases. Luckily, ZenML offers a *Model Control Plane*,\n",
+ "which is a central register of all your ML models.\n",
+ "\n",
+ "You can easily create a ZenML `Model` and associate it with your pipelines using the `ModelVersion` object:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99ca00c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline_settings = {}\n",
+ "\n",
+ "# Lets add some metadata to the model to make it identifiable\n",
+ "pipeline_settings[\"model_version\"] = ModelVersion(\n",
+ " name=\"breast_cancer_classifier\",\n",
+ " license=\"Apache 2.0\",\n",
+ " description=\"A breast cancer classifier\",\n",
+ " tags=[\"breast_cancer\", \"classifier\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e78a520",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's train the SGD model and set the version name to \"sgd\"\n",
+ "pipeline_settings[\"model_version\"].version = \"sgd\"\n",
+ "\n",
+ "# the `with_options` method allows us to pass in pipeline settings\n",
+ "# and returns a configured pipeline\n",
+ "training_configured = training.with_options(**pipeline_settings)\n",
+ "\n",
+ "# We can now run this as usual\n",
+ "training_configured(\n",
+ " model_type=\"sgd\",\n",
+ " train_dataset_id=dataset_trn_artifact_version.id,\n",
+ " test_dataset_id=dataset_tst_artifact_version.id\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9b8e0002",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's train the RF model and set the version name to \"rf\"\n",
+ "pipeline_settings[\"model_version\"].version = \"rf\"\n",
+ "\n",
+ "# the `with_options` method allows us to pass in pipeline settings\n",
+ "# and returns a configured pipeline\n",
+ "training_configured = training.with_options(**pipeline_settings)\n",
+ "\n",
+ "# Let's run it again to make sure we have two versions\n",
+ "training_configured(\n",
+ " model_type=\"rf\",\n",
+ " train_dataset_id=dataset_trn_artifact_version.id,\n",
+ " test_dataset_id=dataset_tst_artifact_version.id\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "09597223",
+ "metadata": {},
+ "source": [
+ "This time, running both pipelines has created two associated **model versions**.\n",
+ "You can list your ZenML model and their versions as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbb25913",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zenml_model = client.get_model(\"breast_cancer_classifier\")\n",
+ "print(zenml_model)\n",
+ "\n",
+ "print(f\"Model {zenml_model.name} has {len(zenml_model.versions)} versions\")\n",
+ "\n",
+ "zenml_model.versions[0].version, zenml_model.versions[1].version"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e82cfac2",
+ "metadata": {},
+ "source": [
+ "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n",
+ "pipelines to that model version, including the two pickle files that represent our\n",
+ "SGD and RandomForest classifier. We can see all artifacts directly from the model\n",
+ "version object:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31211413",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's load the RF version\n",
+ "rf_zenml_model_version = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n",
+ "\n",
+ "# We can now load our classifier directly as well\n",
+ "random_forest_classifier = rf_zenml_model_version.get_artifact(\"model\").load()\n",
+ "\n",
+ "random_forest_classifier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53517a9a",
+ "metadata": {},
+ "source": [
+ "If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eb645dde",
+ "metadata": {},
+ "source": [
+ "There is a lot more you can do with ZenML models, including the ability to\n",
+ "track metrics by adding metadata to it, or having them persist in a model\n",
+ "registry. However, these topics can be explored more in the\n",
+ "[ZenML docs](https://docs.zenml.io).\n",
+ "\n",
+ "For now, we will use the ZenML model control plane to promote our best\n",
+ "model to `production`. You can do this by simply setting the `stage` of\n",
+ "your chosen model version to the `production` tag."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26b718f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set our best classifier to production\n",
+ "rf_zenml_model_version.set_stage(\"production\", force=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9fddf3d0",
+ "metadata": {},
+ "source": [
+ "Of course, normally one would only promote the model by comparing to all other model\n",
+ "versions and doing some other tests. But that's a bit more advanced use-case. See the\n",
+ "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n",
+ "more insight into that sort of flow!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ecbc8cf",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f1146db",
+ "metadata": {},
+ "source": [
+ "Once the model is promoted, we can now consume the right model version in our\n",
+ "batch inference pipeline directly. Let's see how that works."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6306f14",
+ "metadata": {},
+ "source": [
+ "# 🫅 Step 4: Consuming the model in production"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b51f3108",
+ "metadata": {},
+ "source": [
+ "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n",
+ "with `live data`. The critical step here is the `inference_predict` step, where we load the model in memory\n",
+ "and generate predictions:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "92c4c7dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@step\n",
+ "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n",
+ " \"\"\"Predictions step\"\"\"\n",
+ " # Get the model_version\n",
+ " model_version = get_step_context().model_version\n",
+ "\n",
+ " # run prediction from memory\n",
+ " predictor = model_version.load_artifact(\"model\")\n",
+ " predictions = predictor.predict(dataset_inf)\n",
+ "\n",
+ " predictions = pd.Series(predictions, name=\"predicted\")\n",
+ "\n",
+ " return predictions\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3aeb227b",
+ "metadata": {},
+ "source": [
+ "Apart from the loading the model, we must also load the preprocessing pipeline that we ran in feature engineering,\n",
+ "so that we can do the exact steps that we did on training time, in inference time. Let's bring it all together:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37c409bd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@pipeline\n",
+ "def inference(preprocess_pipeline_id: UUID):\n",
+ " \"\"\"Model batch inference pipeline\"\"\"\n",
+ " # random_state = client.get_artifact_version(id=preprocess_pipeline_id).metadata[\"random_state\"].value\n",
+ " # target = client.get_artifact_version(id=preprocess_pipeline_id).run_metadata['target'].value\n",
+ " random_state = 42\n",
+ " target = \"target\"\n",
+ "\n",
+ " df_inference = data_loader(\n",
+ " random_state=random_state, is_inference=True\n",
+ " )\n",
+ " df_inference = inference_preprocessor(\n",
+ " dataset_inf=df_inference,\n",
+ " # We use the preprocess pipeline from the feature engineering pipeline\n",
+ " preprocess_pipeline=ExternalArtifact(id=preprocess_pipeline_id),\n",
+ " target=target,\n",
+ " )\n",
+ " inference_predict(\n",
+ " dataset_inf=df_inference,\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7afe7be",
+ "metadata": {},
+ "source": [
+ "The way to load the right model is to pass in the `production` stage into the `ModelVersion` config this time.\n",
+ "This will ensure to always load the production model, decoupled from all other pipelines:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61bf5939",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline_settings = {\"enable_cache\": False}\n",
+ "\n",
+ "# Lets add some metadata to the model to make it identifiable\n",
+ "pipeline_settings[\"model_version\"] = ModelVersion(\n",
+ " name=\"breast_cancer_classifier\",\n",
+ " version=\"production\", # We can pass in the stage name here!\n",
+ " license=\"Apache 2.0\",\n",
+ " description=\"A breast cancer classifier\",\n",
+ " tags=[\"breast_cancer\", \"classifier\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff3402f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the `with_options` method allows us to pass in pipeline settings\n",
+ "# and returns a configured pipeline\n",
+ "inference_configured = inference.with_options(**pipeline_settings)\n",
+ "\n",
+ "# Let's run it again to make sure we have two versions\n",
+ "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n",
+ "# in order to avoid training-serving skew\n",
+ "inference_configured(\n",
+ " preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2935d1fa",
+ "metadata": {},
+ "source": [
+ "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n",
+ "that were returned in the pipeline. This completes the MLOps loop of training to inference:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e191d019",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fetch production model\n",
+ "production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n",
+ "\n",
+ "# Get the predictions artifact\n",
+ "production_model_version.get_artifact(\"predictions\").load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0a73cdf",
+ "metadata": {},
+ "source": [
+ "You can also see all predictions ever created as a complete history in the dashboard:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
+ "metadata": {},
+ "source": [
+ "## Congratulations!\n",
+ "\n",
+ "You're a legit MLOps engineer now! You trained two models, evaluated them against\n",
+ "a test set, registered the best one with the ZenML model control plane,\n",
+ "and served some predictions. You also learned how to iterate on your models and\n",
+ "data by using some of the ZenML utility abstractions. You saw how to view your\n",
+ "artifacts and models via the client as well as the ZenML Dashboard.\n",
+ "\n",
+ "## Further exploration\n",
+ "\n",
+ "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n",
+ "about the capabilities of ZenML. For example, you might want to:\n",
+ "\n",
+ "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n",
+ "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n",
+ "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n",
+ "\n",
+ "## What next?\n",
+ "\n",
+ "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n",
+ "* If you want to quickly get started with ZenML, check out the [ZenML Cloud](https://zenml.io/cloud)."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/quickstart/requirements.txt b/examples/quickstart/requirements.txt
new file mode 100644
index 00000000000..060cab40b59
--- /dev/null
+++ b/examples/quickstart/requirements.txt
@@ -0,0 +1,4 @@
+zenml[server]>=0.50.0
+notebook
+scikit-learn<1.3
+pyarrow
diff --git a/examples/quickstart/run.py b/examples/quickstart/run.py
index 0f93a1f7594..abc917e5576 100644
--- a/examples/quickstart/run.py
+++ b/examples/quickstart/run.py
@@ -1,19 +1,253 @@
-from pipelines.deploy_and_predict import deploy_and_predict
-from pipelines.train_and_register_model import (
- train_and_register_model_pipeline,
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+from typing import Optional
+
+import click
+import yaml
+from pipelines import (
+ feature_engineering,
+ inference,
+ training,
)
from zenml.client import Client
+from zenml.logger import get_logger
-if __name__ == "__main__":
- train_and_register_model_pipeline()
- best_model_test_accuracy = (
- Client()
- .get_pipeline("train_and_register_model_pipeline")
- .last_successful_run.steps["best_model_selector"]
- .outputs["best_model_test_acc"]
- .load()
+logger = get_logger(__name__)
+
+
+@click.command(
+ help="""
+ZenML Starter project.
+
+Run the ZenML starter project with basic options.
+
+Examples:
+
+ \b
+ # Run the feature engineering pipeline
+ python run.py --feature-pipeline
+
+ \b
+ # Run the training pipeline
+ python run.py --training-pipeline
+
+ \b
+ # Run the training pipeline with versioned artifacts
+ python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1
+
+ \b
+ # Run the inference pipeline
+ python run.py --inference-pipeline
+
+"""
+)
+@click.option(
+ "--train-dataset-name",
+ default="dataset_trn",
+ type=click.STRING,
+ help="The name of the train dataset produced by feature engineering.",
+)
+@click.option(
+ "--train-dataset-version-name",
+ default=None,
+ type=click.STRING,
+ help="Version of the train dataset produced by feature engineering. "
+ "If not specified, a new version will be created.",
+)
+@click.option(
+ "--test-dataset-name",
+ default="dataset_tst",
+ type=click.STRING,
+ help="The name of the test dataset produced by feature engineering.",
+)
+@click.option(
+ "--test-dataset-version-name",
+ default=None,
+ type=click.STRING,
+ help="Version of the test dataset produced by feature engineering. "
+ "If not specified, a new version will be created.",
+)
+@click.option(
+ "--feature-pipeline",
+ is_flag=True,
+ default=False,
+ help="Whether to run the pipeline that creates the dataset.",
+)
+@click.option(
+ "--training-pipeline",
+ is_flag=True,
+ default=False,
+ help="Whether to run the pipeline that trains the model.",
+)
+@click.option(
+ "--inference-pipeline",
+ is_flag=True,
+ default=False,
+ help="Whether to run the pipeline that performs inference.",
+)
+@click.option(
+ "--no-cache",
+ is_flag=True,
+ default=False,
+ help="Disable caching for the pipeline run.",
+)
+def main(
+ train_dataset_name: str = "dataset_trn",
+ train_dataset_version_name: Optional[str] = None,
+ test_dataset_name: str = "dataset_tst",
+ test_dataset_version_name: Optional[str] = None,
+ feature_pipeline: bool = False,
+ training_pipeline: bool = False,
+ inference_pipeline: bool = False,
+ no_cache: bool = False,
+):
+ """Main entry point for the pipeline execution.
+
+ This entrypoint is where everything comes together:
+
+ * configuring pipeline with the required parameters
+ (some of which may come from command line arguments, but most
+ of which comes from the YAML config files)
+ * launching the pipeline
+
+ Args:
+ train_dataset_name: The name of the train dataset produced by feature engineering.
+ train_dataset_version_name: Version of the train dataset produced by feature engineering.
+ If not specified, a new version will be created.
+ test_dataset_name: The name of the test dataset produced by feature engineering.
+ test_dataset_version_name: Version of the test dataset produced by feature engineering.
+ If not specified, a new version will be created.
+ feature_pipeline: Whether to run the pipeline that creates the dataset.
+ training_pipeline: Whether to run the pipeline that trains the model.
+ inference_pipeline: Whether to run the pipeline that performs inference.
+ no_cache: If `True` cache will be disabled.
+ """
+ client = Client()
+
+ config_folder = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)),
+ "configs",
)
- if best_model_test_accuracy > 0.7:
- deploy_and_predict()
+ # Execute Feature Engineering Pipeline
+ if feature_pipeline:
+ pipeline_args = {}
+ if no_cache:
+ pipeline_args["enable_cache"] = False
+ pipeline_args["config_path"] = os.path.join(
+ config_folder, "feature_engineering.yaml"
+ )
+ run_args_feature = {}
+ feature_engineering.with_options(**pipeline_args)(**run_args_feature)
+ logger.info("Feature Engineering pipeline finished successfully!\n")
+
+ train_dataset_artifact = client.get_artifact_version(
+ train_dataset_name
+ )
+ test_dataset_artifact = client.get_artifact_version(test_dataset_name)
+ logger.info(
+ "The latest feature engineering pipeline produced the following "
+ f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, "
+ f"Version Name: {train_dataset_artifact.version} \n2. Test Dataset: "
+ f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}"
+ )
+
+ # Execute Training Pipeline
+ if training_pipeline:
+ run_args_train = {}
+
+ # If train_dataset_version_name is specified, use versioned artifacts
+ if train_dataset_version_name or test_dataset_version_name:
+ # However, both train and test dataset versions must be specified
+ assert (
+ train_dataset_version_name is not None
+ and test_dataset_version_name is not None
+ )
+ train_dataset_artifact_version = client.get_artifact_version(
+ train_dataset_name, train_dataset_version_name
+ )
+ # If train dataset is specified, test dataset must be specified
+ test_dataset_artifact_version = client.get_artifact_version(
+ test_dataset_name, test_dataset_version_name
+ )
+ # Use versioned artifacts
+ run_args_train[
+ "train_dataset_id"
+ ] = train_dataset_artifact_version.id
+ run_args_train[
+ "test_dataset_id"
+ ] = test_dataset_artifact_version.id
+
+ # Run the SGD pipeline
+ pipeline_args = {}
+ if no_cache:
+ pipeline_args["enable_cache"] = False
+ pipeline_args["config_path"] = os.path.join(
+ config_folder, "training_sgd.yaml"
+ )
+ training.with_options(**pipeline_args)(**run_args_train)
+ logger.info("Training pipeline with SGD finished successfully!\n\n")
+
+ # Run the RF pipeline
+ pipeline_args = {}
+ if no_cache:
+ pipeline_args["enable_cache"] = False
+ pipeline_args["config_path"] = os.path.join(
+ config_folder, "training_rf.yaml"
+ )
+ training.with_options(**pipeline_args)(**run_args_train)
+ logger.info("Training pipeline with RF finished successfully!\n\n")
+
+ if inference_pipeline:
+ run_args_inference = {}
+ pipeline_args = {"enable_cache": False}
+ pipeline_args["config_path"] = os.path.join(
+ config_folder, "inference.yaml"
+ )
+
+ # Configure the pipeline
+ inference_configured = inference.with_options(**pipeline_args)
+
+ # Fetch the production model
+ with open(pipeline_args["config_path"], "r") as f:
+ config = yaml.load(f, Loader=yaml.SafeLoader)
+ zenml_model = client.get_model_version(
+ config["model_version"]["name"], config["model_version"]["version"]
+ )
+ preprocess_pipeline_artifact = zenml_model.get_artifact(
+ "preprocess_pipeline"
+ )
+
+ # Use the metadata of feature engineering pipeline artifact
+ # to get the random state and target column
+ random_state = preprocess_pipeline_artifact.run_metadata[
+ "random_state"
+ ].value
+ target = preprocess_pipeline_artifact.run_metadata["target"].value
+ run_args_inference["random_state"] = random_state
+ run_args_inference["target"] = target
+
+ # Run the pipeline
+ inference_configured(**run_args_inference)
+ logger.info("Inference pipeline finished successfully!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/quickstart/setup.sh b/examples/quickstart/setup.sh
deleted file mode 100644
index a3e454425b5..00000000000
--- a/examples/quickstart/setup.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-set -Eeo pipefail
-
-setup_stack () {
-
- zenml model-deployer register mlflow_deployer --flavor=mlflow || \
- msg "${WARNING}Reusing preexisting model deployer ${NOFORMAT}mlflow_deployer"
- zenml experiment-tracker register mlflow_tracker --flavor=mlflow || \
- msg "${WARNING}Reusing preexisting experiment tracker ${NOFORMAT}mlflow_tracker"
- zenml model-registry register mlflow_registry --flavor=mlflow || \
- msg "${WARNING}Reusing preexisting model registry ${NOFORMAT}mlflow_registry"
-
- zenml stack register quickstart_stack \
- -a default \
- -o default \
- -d mlflow_deployer \
- -r mlflow_registry \
- -e mlflow_tracker || \
- msg "${WARNING}Reusing preexisting stack ${NOFORMAT}quickstart"
-
- zenml stack set quickstart_stack
-}
-
-pre_run () {
- zenml integration install sklearn mlflow
-}
-
-pre_run_forced () {
- zenml integration install sklearn mlflow -y
-}
-
-post_run () {
- # cleanup the last local ZenML daemon started by the example
- pkill -n -f zenml.services.local.local_daemon_entrypoint
-}
diff --git a/examples/quickstart/steps/__init__.py b/examples/quickstart/steps/__init__.py
index e69de29bb2d..98124b2a40f 100644
--- a/examples/quickstart/steps/__init__.py
+++ b/examples/quickstart/steps/__init__.py
@@ -0,0 +1,41 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .data_loader import (
+ data_loader,
+)
+from .data_preprocessor import (
+ data_preprocessor,
+)
+from .data_splitter import (
+ data_splitter,
+)
+from .inference_predict import (
+ inference_predict,
+)
+from .inference_preprocessor import (
+ inference_preprocessor,
+)
+from .model_evaluator import (
+ model_evaluator,
+)
+from .model_promoter import (
+ model_promoter,
+)
+from .model_trainer import (
+ model_trainer,
+)
diff --git a/examples/quickstart/steps/best_model_selector.py b/examples/quickstart/steps/best_model_selector.py
deleted file mode 100644
index 884a7ec3eb3..00000000000
--- a/examples/quickstart/steps/best_model_selector.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from typing import Tuple
-
-import pandas as pd
-from sklearn.base import ClassifierMixin
-from typing_extensions import Annotated
-
-from zenml import step
-
-
-@step
-def best_model_selector(
- X_test: pd.DataFrame,
- y_test: pd.Series,
- model1: ClassifierMixin,
- model2: ClassifierMixin,
-) -> Tuple[
- Annotated[ClassifierMixin, "best_model"],
- Annotated[float, "best_model_test_acc"],
-]:
- """Calculate the accuracy on the test set and return the best model and its accuracy."""
- test_acc1 = model1.score(X_test.to_numpy(), y_test.to_numpy())
- test_acc2 = model2.score(X_test.to_numpy(), y_test.to_numpy())
- print(f"Test accuracy ({model1.__class__.__name__}): {test_acc1}")
- print(f"Test accuracy ({model2.__class__.__name__}): {test_acc2}")
- if test_acc1 > test_acc2:
- best_model = model1
- best_model_test_acc = test_acc1
- else:
- best_model = model2
- best_model_test_acc = test_acc2
- return best_model, best_model_test_acc
diff --git a/examples/quickstart/steps/data_loader.py b/examples/quickstart/steps/data_loader.py
new file mode 100644
index 00000000000..baf59c6433f
--- /dev/null
+++ b/examples/quickstart/steps/data_loader.py
@@ -0,0 +1,65 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+from sklearn.datasets import load_breast_cancer
+from typing_extensions import Annotated
+
+from zenml import step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def data_loader(
+ random_state: int, is_inference: bool = False, target: str = "target"
+) -> Annotated[pd.DataFrame, "dataset"]:
+ """Dataset reader step.
+
+ This is an example of a dataset reader step that load Breast Cancer dataset.
+
+ This step is parameterized, which allows you to configure the step
+ independently of the step code, before running it in a pipeline.
+ In this example, the step can be configured with number of rows and logic
+ to drop target column or not. See the documentation for more information:
+
+ https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+ Args:
+ random_state: Random state for sampling
+ is_inference: If `True` subset will be returned and target column
+ will be removed from dataset.
+ target: Name of target columns in dataset.
+
+ Returns:
+ The dataset artifact as Pandas DataFrame and name of target column.
+ """
+ dataset = load_breast_cancer(as_frame=True)
+ inference_size = int(len(dataset.target) * 0.05)
+ dataset: pd.DataFrame = dataset.frame
+ inference_subset = dataset.sample(
+ inference_size, random_state=random_state
+ )
+ if is_inference:
+ dataset = inference_subset
+ dataset.drop(columns=target, inplace=True)
+ else:
+ dataset.drop(inference_subset.index, inplace=True)
+ dataset.reset_index(drop=True, inplace=True)
+ logger.info(f"Dataset with {len(dataset)} records loaded!")
+ return dataset
diff --git a/examples/quickstart/steps/data_preprocessor.py b/examples/quickstart/steps/data_preprocessor.py
new file mode 100644
index 00000000000..2f4a11ccbaa
--- /dev/null
+++ b/examples/quickstart/steps/data_preprocessor.py
@@ -0,0 +1,94 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Optional, Tuple
+
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+from typing_extensions import Annotated
+from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper
+
+from zenml import log_artifact_metadata, step
+
+
+@step
+def data_preprocessor(
+ random_state: int,
+ dataset_trn: pd.DataFrame,
+ dataset_tst: pd.DataFrame,
+ drop_na: Optional[bool] = None,
+ normalize: Optional[bool] = None,
+ drop_columns: Optional[List[str]] = None,
+ target: Optional[str] = "target",
+) -> Tuple[
+ Annotated[pd.DataFrame, "dataset_trn"],
+ Annotated[pd.DataFrame, "dataset_tst"],
+ Annotated[Pipeline, "preprocess_pipeline"],
+]:
+ """Data preprocessor step.
+
+ This is an example of a data processor step that prepares the data so that
+ it is suitable for model training. It takes in a dataset as an input step
+ artifact and performs any necessary preprocessing steps like cleaning,
+ feature engineering, feature selection, etc. It then returns the processed
+ dataset as a step output artifact.
+
+ This step is parameterized, which allows you to configure the step
+ independently of the step code, before running it in a pipeline.
+ In this example, the step can be configured to drop NA values, drop some
+ columns and normalize numerical columns. See the documentation for more
+ information:
+
+ https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+ Args:
+ random_state: Random state for sampling.
+ dataset_trn: The train dataset.
+ dataset_tst: The test dataset.
+ drop_na: If `True` all NA rows will be dropped.
+ normalize: If `True` all numeric fields will be normalized.
+ drop_columns: List of column names to drop.
+ target: Name of target column in dataset.
+
+ Returns:
+ The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object.
+ """
+ # We use the sklearn pipeline to chain together multiple preprocessing steps
+ preprocess_pipeline = Pipeline([("passthrough", "passthrough")])
+ if drop_na:
+ preprocess_pipeline.steps.append(("drop_na", NADropper()))
+ if drop_columns:
+ # Drop columns
+ preprocess_pipeline.steps.append(
+ ("drop_columns", ColumnsDropper(drop_columns))
+ )
+ if normalize:
+ # Normalize the data
+ preprocess_pipeline.steps.append(("normalize", MinMaxScaler()))
+ preprocess_pipeline.steps.append(
+ ("cast", DataFrameCaster(dataset_trn.columns))
+ )
+ dataset_trn = preprocess_pipeline.fit_transform(dataset_trn)
+ dataset_tst = preprocess_pipeline.transform(dataset_tst)
+
+ # Log metadata so we can load it in the inference pipeline
+ log_artifact_metadata(
+ artifact_name="preprocess_pipeline",
+ metadata={"random_state": random_state, "target": target},
+ )
+ return dataset_trn, dataset_tst, preprocess_pipeline
diff --git a/examples/quickstart/steps/data_splitter.py b/examples/quickstart/steps/data_splitter.py
new file mode 100644
index 00000000000..13ec7e856e1
--- /dev/null
+++ b/examples/quickstart/steps/data_splitter.py
@@ -0,0 +1,61 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Tuple
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from typing_extensions import Annotated
+
+from zenml import step
+
+
+@step
+def data_splitter(
+ dataset: pd.DataFrame, test_size: float = 0.2
+) -> Tuple[
+ Annotated[pd.DataFrame, "raw_dataset_trn"],
+ Annotated[pd.DataFrame, "raw_dataset_tst"],
+]:
+ """Dataset splitter step.
+
+ This is an example of a dataset splitter step that splits the data
+ into train and test set before passing it to ML model.
+
+ This step is parameterized, which allows you to configure the step
+ independently of the step code, before running it in a pipeline.
+ In this example, the step can be configured to use different test
+ set sizes. See the documentation for more information:
+
+ https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+ Args:
+ dataset: Dataset read from source.
+ test_size: 0.0..1.0 defining portion of test set.
+
+ Returns:
+ The split dataset: dataset_trn, dataset_tst.
+ """
+ dataset_trn, dataset_tst = train_test_split(
+ dataset,
+ test_size=test_size,
+ random_state=42,
+ shuffle=True,
+ )
+ dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns)
+ dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns)
+ return dataset_trn, dataset_tst
diff --git a/examples/quickstart/steps/inference_predict.py b/examples/quickstart/steps/inference_predict.py
new file mode 100644
index 00000000000..cd1d2921003
--- /dev/null
+++ b/examples/quickstart/steps/inference_predict.py
@@ -0,0 +1,57 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any
+
+import pandas as pd
+from typing_extensions import Annotated
+
+from zenml import step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def inference_predict(
+ model: Any,
+ dataset_inf: pd.DataFrame,
+) -> Annotated[pd.Series, "predictions"]:
+ """Predictions step.
+
+ This is an example of a predictions step that takes the data and model in
+ and returns predicted values.
+
+ This step is parameterized, which allows you to configure the step
+ independently of the step code, before running it in a pipeline.
+ In this example, the step can be configured to use different input data.
+ See the documentation for more information:
+
+ https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+ Args:
+ model: Trained model.
+ dataset_inf: The inference dataset.
+
+ Returns:
+ The predictions as pandas series
+ """
+ # run prediction from memory
+ predictions = model.predict(dataset_inf)
+
+ predictions = pd.Series(predictions, name="predicted")
+ return predictions
diff --git a/examples/quickstart/steps/inference_preprocessor.py b/examples/quickstart/steps/inference_preprocessor.py
new file mode 100644
index 00000000000..d484433e51e
--- /dev/null
+++ b/examples/quickstart/steps/inference_preprocessor.py
@@ -0,0 +1,50 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from typing_extensions import Annotated
+
+from zenml import step
+
+
+@step
+def inference_preprocessor(
+ dataset_inf: pd.DataFrame,
+ preprocess_pipeline: Pipeline,
+ target: str,
+) -> Annotated[pd.DataFrame, "inference_dataset"]:
+ """Data preprocessor step.
+
+ This is an example of a data processor step that prepares the data so that
+ it is suitable for model inference. It takes in a dataset as an input step
+ artifact and performs any necessary preprocessing steps based on pretrained
+ preprocessing pipeline.
+
+ Args:
+ dataset_inf: The inference dataset.
+ preprocess_pipeline: Pretrained `Pipeline` to process dataset.
+ target: Name of target columns in dataset.
+
+ Returns:
+ The processed dataframe: dataset_inf.
+ """
+ # artificially adding `target` column to avoid Pipeline issues
+ dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0])
+ dataset_inf = preprocess_pipeline.transform(dataset_inf)
+ dataset_inf.drop(columns=[target], inplace=True)
+ return dataset_inf
diff --git a/examples/quickstart/steps/model_deployer.py b/examples/quickstart/steps/model_deployer.py
deleted file mode 100644
index 7cde5a21334..00000000000
--- a/examples/quickstart/steps/model_deployer.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from steps.register_model import model_name
-from zenml.integrations.mlflow.steps.mlflow_deployer import (
- mlflow_model_registry_deployer_step,
-)
-
-model_deployer = mlflow_model_registry_deployer_step.with_options(
- parameters=dict(
- registry_model_name=model_name,
- registry_model_version=1,
- timeout=300,
- )
-)
diff --git a/examples/quickstart/steps/model_evaluator.py b/examples/quickstart/steps/model_evaluator.py
new file mode 100644
index 00000000000..579a78afe13
--- /dev/null
+++ b/examples/quickstart/steps/model_evaluator.py
@@ -0,0 +1,105 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Optional
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+
+from zenml import log_artifact_metadata, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_evaluator(
+ model: ClassifierMixin,
+ dataset_trn: pd.DataFrame,
+ dataset_tst: pd.DataFrame,
+ min_train_accuracy: float = 0.0,
+ min_test_accuracy: float = 0.0,
+ target: Optional[str] = "target",
+) -> float:
+ """Evaluate a trained model.
+
+ This is an example of a model evaluation step that takes in a model artifact
+ previously trained by another step in your pipeline, and a training
+ and validation data set pair which it uses to evaluate the model's
+ performance. The model metrics are then returned as step output artifacts
+ (in this case, the model accuracy on the train and test set).
+
+ The suggested step implementation also outputs some warnings if the model
+ performance does not meet some minimum criteria. This is just an example of
+ how you can use steps to monitor your model performance and alert you if
+ something goes wrong. As an alternative, you can raise an exception in the
+ step to force the pipeline run to fail early and all subsequent steps to
+ be skipped.
+
+ This step is parameterized to configure the step independently of the step code,
+ before running it in a pipeline. In this example, the step can be configured
+ to use different values for the acceptable model performance thresholds and
+ to control whether the pipeline run should fail if the model performance
+ does not meet the minimum criteria. See the documentation for more
+ information:
+
+ https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+ Args:
+ model: The pre-trained model artifact.
+ dataset_trn: The train dataset.
+ dataset_tst: The test dataset.
+ min_train_accuracy: Minimal acceptable training accuracy value.
+ min_test_accuracy: Minimal acceptable testing accuracy value.
+ target: Name of target column in dataset.
+
+ Returns:
+ The model accuracy on the test set.
+ """
+ # Calculate the model accuracy on the train and test set
+ trn_acc = model.score(
+ dataset_trn.drop(columns=[target]),
+ dataset_trn[target],
+ )
+ tst_acc = model.score(
+ dataset_tst.drop(columns=[target]),
+ dataset_tst[target],
+ )
+ logger.info(f"Train accuracy={trn_acc*100:.2f}%")
+ logger.info(f"Test accuracy={tst_acc*100:.2f}%")
+
+ messages = []
+ if trn_acc < min_train_accuracy:
+ messages.append(
+ f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !"
+ )
+ if tst_acc < min_test_accuracy:
+ messages.append(
+ f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !"
+ )
+ else:
+ for message in messages:
+ logger.warning(message)
+
+ log_artifact_metadata(
+ metadata={
+ "train_accuracy": float(trn_acc),
+ "test_accuracy": float(tst_acc),
+ },
+ artifact_name="model",
+ )
+ return float(tst_acc)
diff --git a/examples/quickstart/steps/model_promoter.py b/examples/quickstart/steps/model_promoter.py
new file mode 100644
index 00000000000..c0ad76f7e64
--- /dev/null
+++ b/examples/quickstart/steps/model_promoter.py
@@ -0,0 +1,76 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from zenml import get_step_context, step
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_promoter(accuracy: float, stage: str = "production") -> bool:
+ """Model promoter step.
+
+ This is an example of a step that conditionally promotes a model. It takes
+ in the accuracy of the model and the stage to promote the model to. If the
+ accuracy is below 80%, the model is not promoted. If it is above 80%, the
+ model is promoted to the stage indicated in the parameters. If there is
+ already a model in the indicated stage, the model with the higher accuracy
+ is promoted.
+
+ Args:
+ accuracy: Accuracy of the model.
+ stage: Which stage to promote the model to.
+
+ Returns:
+ Whether the model was promoted or not.
+ """
+ is_promoted = False
+
+ if accuracy < 0.8:
+ logger.info(
+ f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model."
+ )
+ else:
+ logger.info(f"Model promoted to {stage}!")
+ is_promoted = True
+
+ # Get the model in the current context
+ current_model_version = get_step_context().model_version
+
+ # Get the model that is in the production stage
+ client = Client()
+ try:
+ stage_model_version = client.get_model_version(
+ current_model_version.name, stage
+ )
+ # We compare their metrics
+ prod_accuracy = (
+ stage_model_version.get_artifact("model")
+ .run_metadata["test_accuracy"]
+ .value
+ )
+ if float(accuracy) > float(prod_accuracy):
+ # If current model has better metrics, we promote it
+ is_promoted = True
+ current_model_version.set_stage(stage, force=True)
+ except KeyError:
+ # If no such model exists, current one is promoted
+ is_promoted = True
+ current_model_version.set_stage(stage, force=True)
+ return is_promoted
diff --git a/examples/quickstart/steps/model_trainer.py b/examples/quickstart/steps/model_trainer.py
new file mode 100644
index 00000000000..a9a1c2891c5
--- /dev/null
+++ b/examples/quickstart/steps/model_trainer.py
@@ -0,0 +1,72 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Optional
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import SGDClassifier
+from typing_extensions import Annotated
+
+from zenml import ArtifactConfig, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_trainer(
+ dataset_trn: pd.DataFrame,
+ model_type: str = "sgd",
+ target: Optional[str] = "target",
+) -> Annotated[
+ ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True)
+]:
+ """Configure and train a model on the training dataset.
+
+ This is an example of a model training step that takes in a dataset artifact
+ previously loaded and pre-processed by other steps in your pipeline, then
+ configures and trains a model on it. The model is then returned as a step
+ output artifact.
+
+ Args:
+ dataset_trn: The preprocessed train dataset.
+ model_type: The type of model to train.
+ target: The name of the target column in the dataset.
+
+ Returns:
+ The trained model artifact.
+
+ Raises:
+ ValueError: If the model type is not supported.
+ """
+ # Initialize the model with the hyperparameters indicated in the step
+ # parameters and train it on the training set.
+ if model_type == "sgd":
+ model = SGDClassifier()
+ elif model_type == "rf":
+ model = RandomForestClassifier()
+ else:
+ raise ValueError(f"Unknown model type {model_type}")
+ logger.info(f"Training model {model}...")
+
+ model.fit(
+ dataset_trn.drop(columns=[target]),
+ dataset_trn[target],
+ )
+ return model
diff --git a/examples/quickstart/steps/prediction_service_loader.py b/examples/quickstart/steps/prediction_service_loader.py
deleted file mode 100644
index 7bf8f10fae9..00000000000
--- a/examples/quickstart/steps/prediction_service_loader.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from zenml import step
-from zenml.client import Client
-from zenml.services import BaseService
-
-
-@step(enable_cache=False)
-def prediction_service_loader() -> BaseService:
- """Load the model service of our train_and_register_model_pipeline."""
- client = Client()
- model_deployer = client.active_stack.model_deployer
- services = model_deployer.find_model_server(
- pipeline_name="train_and_register_model_pipeline",
- )
- return services[0]
diff --git a/examples/quickstart/steps/predictor.py b/examples/quickstart/steps/predictor.py
deleted file mode 100644
index 54b0e569fe0..00000000000
--- a/examples/quickstart/steps/predictor.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import pandas as pd
-from typing_extensions import Annotated
-
-from zenml import step
-from zenml.services import BaseService
-
-
-@step
-def predictor(
- service: BaseService,
- data: pd.DataFrame,
-) -> Annotated[list, "predictions"]:
- """Run a inference request against a prediction service."""
- service.start(timeout=100)
- print(
- f"Running predictions on data (single individual): {data.to_numpy()[0]}"
- )
- prediction = service.predict(data.to_numpy())
- print(
- f"Prediction (for single example slice) is: {bool(prediction.tolist()[0])}"
- )
- return prediction.tolist()
diff --git a/examples/quickstart/steps/register_model.py b/examples/quickstart/steps/register_model.py
deleted file mode 100644
index 9dfb05e8c35..00000000000
--- a/examples/quickstart/steps/register_model.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from zenml.integrations.mlflow.steps.mlflow_registry import (
- mlflow_register_model_step,
-)
-
-model_name = "zenml-quickstart-model"
-
-register_model = mlflow_register_model_step.with_options(
- parameters=dict(
- name=model_name,
- description="The first run of the Quickstart pipeline.",
- )
-)
diff --git a/examples/quickstart/steps/trainers.py b/examples/quickstart/steps/trainers.py
deleted file mode 100644
index b01f9943622..00000000000
--- a/examples/quickstart/steps/trainers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import mlflow
-import pandas as pd
-from sklearn.base import ClassifierMixin
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import SGDClassifier
-
-from zenml import step
-from zenml.client import Client
-
-experiment_tracker = Client().active_stack.experiment_tracker
-
-
-@step(experiment_tracker=experiment_tracker.name)
-def random_forest_trainer_mlflow(
- X_train: pd.DataFrame,
- y_train: pd.Series,
-) -> ClassifierMixin:
- """Train a sklearn Random Forest classifier and log to MLflow."""
- mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow
- model = RandomForestClassifier()
- model.fit(X_train.to_numpy(), y_train.to_numpy())
- train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())
- print(f"Train accuracy: {train_acc}")
- return model
-
-
-@step(experiment_tracker=experiment_tracker.name)
-def sgd_trainer_mlflow(
- X_train: pd.DataFrame,
- y_train: pd.Series,
-) -> ClassifierMixin:
- """Train a SGD classifier and log to MLflow."""
- mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow
- model = SGDClassifier()
- model.fit(X_train.to_numpy(), y_train.to_numpy())
- train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())
- print(f"Train accuracy: {train_acc}")
- return model
diff --git a/examples/quickstart/steps/training_data_loader.py b/examples/quickstart/steps/training_data_loader.py
deleted file mode 100644
index 4775711da21..00000000000
--- a/examples/quickstart/steps/training_data_loader.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from typing import Tuple
-
-import pandas as pd
-from sklearn.model_selection import train_test_split
-
-from zenml import step
-
-
-@step
-def training_data_loader() -> (
- Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]
-):
- """Load the Census Income dataset as tuple of Pandas DataFrame / Series."""
- # Load the dataset
- url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
- column_names = [
- "age",
- "workclass",
- "fnlwgt",
- "education",
- "education-num",
- "marital-status",
- "occupation",
- "relationship",
- "race",
- "sex",
- "capital-gain",
- "capital-loss",
- "hours-per-week",
- "native-country",
- "income",
- ]
- data = pd.read_csv(
- url, names=column_names, na_values="?", skipinitialspace=True
- )
-
- # Drop rows with missing values
- data = data.dropna()
-
- # Encode categorical features and drop original columns
- categorical_cols = [
- "workclass",
- "education",
- "marital-status",
- "occupation",
- "relationship",
- "race",
- "sex",
- "native-country",
- ]
- data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
-
- # Encode target feature
- data["income"] = data["income"].apply(
- lambda x: 1 if x.strip() == ">50K" else 0
- )
-
- # Separate features and target
- X = data.drop("income", axis=1)
- y = data["income"]
-
- # Split the dataset into train and test sets
- X_train, X_test, y_train, y_test = train_test_split(
- X, y, test_size=0.2, random_state=42
- )
- return (X_train, X_test, y_train, y_test)
diff --git a/examples/quickstart/utils/__init__.py b/examples/quickstart/utils/__init__.py
new file mode 100644
index 00000000000..d7a9d7dc194
--- /dev/null
+++ b/examples/quickstart/utils/__init__.py
@@ -0,0 +1,16 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/examples/quickstart/utils/preprocess.py b/examples/quickstart/utils/preprocess.py
new file mode 100644
index 00000000000..cf6555f2929
--- /dev/null
+++ b/examples/quickstart/utils/preprocess.py
@@ -0,0 +1,56 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Union
+
+import pandas as pd
+
+
+class NADropper:
+ """Support class to drop NA values in sklearn Pipeline."""
+
+ def fit(self, *args, **kwargs):
+ return self
+
+ def transform(self, X: Union[pd.DataFrame, pd.Series]):
+ return X.dropna()
+
+
+class ColumnsDropper:
+ """Support class to drop specific columns in sklearn Pipeline."""
+
+ def __init__(self, columns):
+ self.columns = columns
+
+ def fit(self, *args, **kwargs):
+ return self
+
+ def transform(self, X: Union[pd.DataFrame, pd.Series]):
+ return X.drop(columns=self.columns)
+
+
+class DataFrameCaster:
+ """Support class to cast type back to pd.DataFrame in sklearn Pipeline."""
+
+ def __init__(self, columns):
+ self.columns = columns
+
+ def fit(self, *args, **kwargs):
+ return self
+
+ def transform(self, X):
+ return pd.DataFrame(X, columns=self.columns)
diff --git a/src/zenml/cli/base.py b/src/zenml/cli/base.py
index 30548253ca5..07e9cbf4ee7 100644
--- a/src/zenml/cli/base.py
+++ b/src/zenml/cli/base.py
@@ -77,11 +77,11 @@ def copier_github_url(self) -> str:
),
starter=ZenMLProjectTemplateLocation(
github_url="zenml-io/template-starter",
- github_tag="0.45.0",
+ github_tag="2023.12.18", # Make sure it is aligned with .github/workflows/update-templates-to-examples.yml
),
nlp=ZenMLProjectTemplateLocation(
github_url="zenml-io/template-nlp",
- github_tag="0.45.0",
+ github_tag="0.45.0", # Make sure it is aligned with .github/workflows/update-templates-to-examples.yml
),
)
diff --git a/tests/harness/cfg/tests.yaml b/tests/harness/cfg/tests.yaml
index 3dd057c7a68..f9a3014cd67 100644
--- a/tests/harness/cfg/tests.yaml
+++ b/tests/harness/cfg/tests.yaml
@@ -153,24 +153,6 @@ tests:
- capabilities:
synchronized: true
- - module: tests.integration.examples.test_quickstart
- requirements:
- - integrations:
- - sklearn
- - mlflow
- - evidently
- stacks:
- - type: experiment_tracker
- flavor: mlflow
- - type: model_deployer
- flavor: mlflow
- - type: model_registry
- flavor: mlflow
- - type: data_validator
- flavor: evidently
- - capabilities:
- synchronized: true
-
- module: tests.integration.examples.test_huggingface
requirements:
- integrations:
diff --git a/tests/integration/examples/test_quickstart.py b/tests/integration/examples/test_quickstart.py
deleted file mode 100644
index 07a0838b367..00000000000
--- a/tests/integration/examples/test_quickstart.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) ZenML GmbH 2023. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at:
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing
-# permissions and limitations under the License.
-
-
-import pytest
-
-from tests.integration.examples.utils import run_example
-from zenml.client import Client
-from zenml.integrations.mlflow.model_registries.mlflow_model_registry import (
- MLFlowModelRegistry,
-)
-
-
-def test_example(request: pytest.FixtureRequest) -> None:
- """Runs the quickstart example."""
-
- with run_example(
- request=request,
- name="quickstart",
- pipelines={
- "train_and_register_model_pipeline": (1, 5),
- "deploy_and_predict": (1, 4),
- },
- is_public_example=True,
- ):
- # activate the stack set up and used by the example
- client = Client()
- model_registry = client.active_stack.model_registry
- assert isinstance(model_registry, MLFlowModelRegistry)
-
- # fetch the MLflow registered model
- registered_model = model_registry.get_model(
- name="zenml-quickstart-model",
- )
- assert registered_model is not None
-
- # fetch the MLflow registered model version
- registered_model_version = model_registry.get_model_version(
- name="zenml-quickstart-model",
- version=1,
- )
- assert registered_model_version is not None
-
- # Check that the deployment service is running
- from zenml.integrations.mlflow.services import MLFlowDeploymentService
-
- training_run = client.get_pipeline("deploy_and_predict").last_run
-
- service = training_run.steps[
- "mlflow_model_registry_deployer_step"
- ].output.load()
- assert isinstance(service, MLFlowDeploymentService)
-
- if service.is_running:
- service.stop(timeout=180)
diff --git a/tests/integration/functional/cli/test_base.py b/tests/integration/functional/cli/test_base.py
index 6e615d6ef3a..7217a227712 100644
--- a/tests/integration/functional/cli/test_base.py
+++ b/tests/integration/functional/cli/test_base.py
@@ -56,11 +56,11 @@ def test_init_creates_from_templates(
],
)
assert (tmp_path / REPOSITORY_DIRECTORY_NAME).exists()
- files_in_top_level = set(os.listdir(str(tmp_path)))
+ files_in_top_level = set([f.lower() for f in os.listdir(str(tmp_path))])
must_have_files = {
".copier-answers.yml",
".dockerignore",
- "LICENSE",
+ "license",
"run.py",
}
assert not must_have_files - files_in_top_level