diff --git a/.github/workflows/update-templates-to-examples.yml b/.github/workflows/update-templates-to-examples.yml index 2fd1f858f30..4eb0fa90043 100644 --- a/.github/workflows/update-templates-to-examples.yml +++ b/.github/workflows/update-templates-to-examples.yml @@ -136,7 +136,7 @@ jobs: with: python-version: ${{ inputs.python-version }} stack-name: local - ref-zenml: 'main' + ref-zenml: ${{ github.ref }} ref-template: '0.45.0' # Make sure it is aligned with ZENML_PROJECT_TEMPLATES from src/zenml/cli/base.py - name: Clean-up @@ -192,4 +192,85 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, body: 'NLP template updates in `examples/nlp-case` have been pushed.' + }) + + + update-starter-template-to-examples: + name: update-starter-template-to-examples + runs-on: ${{ inputs.os }} + env: + ZENML_DEBUG: 1 + ZENML_ANALYTICS_OPT_IN: false + PYTHONIOENCODING: "utf-8" + OBJC_DISABLE_INITIALIZE_FORK_SAFETY: "YES" + if: github.event_name == 'pull_request' && ! startsWith(github.event.head_commit.message, 'GitBook:') + + defaults: + run: + shell: bash + + steps: + - name: Run template tests for zenml-io/template-starter + uses: zenml-io/template-starter/.github/actions/starter_template_test@main + with: + python-version: ${{ inputs.python-version }} + stack-name: local + ref-zenml: ${{ github.ref }} + ref-template: '2023.12.18' # Make sure it is aligned with ZENML_PROJECT_TEMPLATES from src/zenml/cli/base.py + + - name: Clean-up + run: | + rm -rf ./local_checkout + + - name: message-on-error + if: failure() + run: | + echo "::error title=zenml-io/template-starter project template testing failed with new version of + ZenML core!::\ + Breaking changes affecting templates have been introduced. To mitigate this issue,\ + please make the code in zenml-io/template-starter compatible with new version of\ + ZenML core, release it and update release tag in zenml.cli.base.ZENML_PROJECT_TEMPLATES" + + - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.ref }} + + - name: Check-out fresh Starter template + run: | + rm -rf examples/quickstart + mkdir -p examples/quickstart + printf 'info@zenml.io' | zenml init --path examples/quickstart --template starter --template-with-defaults + bash scripts/format.sh + + - name: Check for changes + id: check_changes + run: | + if git diff --quiet "origin/${{ github.event.pull_request.head.ref }}"; then + echo "No active Git changes found." + echo "changes=false" >> $GITHUB_OUTPUT + else + echo "Active Git changes found." + echo "changes=true" >> $GITHUB_OUTPUT + fi + + - name: Commit and push template + if: steps.check_changes.outputs.changes == 'true' + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + git add . + git commit -am "Auto-update of Starter template" + git push origin HEAD:${{ github.event.pull_request.head.ref }} + + - name: Create PR comment + if: steps.check_changes.outputs.changes == 'true' + uses: actions/github-script@v4 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + github.issues.createComment({ + issue_number: ${{ github.event.pull_request.number }}, + owner: context.repo.owner, + repo: context.repo.repo, + body: 'Quickstart template updates in `examples/quickstart` have been pushed.' }) \ No newline at end of file diff --git a/examples/e2e/.copier-answers.yml b/examples/e2e/.copier-answers.yml index 7a288d71456..04637068e52 100644 --- a/examples/e2e/.copier-answers.yml +++ b/examples/e2e/.copier-answers.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: 2023.12.06-4-g1e3edc6 +_commit: 2023.12.12 _src_path: gh:zenml-io/template-e2e-batch data_quality_checks: true email: '' diff --git a/examples/quickstart/.assets/cloud_mcp.png b/examples/quickstart/.assets/cloud_mcp.png new file mode 100644 index 00000000000..81197e9574f Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp.png differ diff --git a/examples/quickstart/.assets/cloud_mcp_predictions.png b/examples/quickstart/.assets/cloud_mcp_predictions.png new file mode 100644 index 00000000000..a6bf7c902d6 Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp_predictions.png differ diff --git a/examples/quickstart/.assets/cloud_mcp_screenshot.png b/examples/quickstart/.assets/cloud_mcp_screenshot.png new file mode 100644 index 00000000000..8f56defad50 Binary files /dev/null and b/examples/quickstart/.assets/cloud_mcp_screenshot.png differ diff --git a/examples/quickstart/.assets/feature_engineering_pipeline.png b/examples/quickstart/.assets/feature_engineering_pipeline.png new file mode 100644 index 00000000000..db3019132ee Binary files /dev/null and b/examples/quickstart/.assets/feature_engineering_pipeline.png differ diff --git a/examples/quickstart/.assets/inference_pipeline.png b/examples/quickstart/.assets/inference_pipeline.png new file mode 100644 index 00000000000..358d5537c8c Binary files /dev/null and b/examples/quickstart/.assets/inference_pipeline.png differ diff --git a/examples/quickstart/.assets/pipeline_overview.png b/examples/quickstart/.assets/pipeline_overview.png new file mode 100644 index 00000000000..609e97d2f6f Binary files /dev/null and b/examples/quickstart/.assets/pipeline_overview.png differ diff --git a/examples/quickstart/.assets/training_pipeline.png b/examples/quickstart/.assets/training_pipeline.png new file mode 100644 index 00000000000..a2e6a7d0499 Binary files /dev/null and b/examples/quickstart/.assets/training_pipeline.png differ diff --git a/examples/quickstart/.copier-answers.yml b/examples/quickstart/.copier-answers.yml new file mode 100644 index 00000000000..8662fc2da3f --- /dev/null +++ b/examples/quickstart/.copier-answers.yml @@ -0,0 +1,8 @@ +# Changes here will be overwritten by Copier +_commit: 0.43.0-48-g652a614 +_src_path: gh:zenml-io/template-starter +email: '' +full_name: ZenML GmbH +open_source_license: apache +project_name: ZenML Starter +version: 0.1.0 diff --git a/examples/quickstart/.dockerignore b/examples/quickstart/.dockerignore new file mode 100644 index 00000000000..455f4d7aed4 --- /dev/null +++ b/examples/quickstart/.dockerignore @@ -0,0 +1,2 @@ +.venv* +.requirements* \ No newline at end of file diff --git a/examples/quickstart/LICENSE b/examples/quickstart/LICENSE new file mode 100644 index 00000000000..0fcb970537e --- /dev/null +++ b/examples/quickstart/LICENSE @@ -0,0 +1,15 @@ +Apache Software License 2.0 + +Copyright (c) ZenML GmbH 2023. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md deleted file mode 100644 index 4be902633f2..00000000000 --- a/examples/quickstart/README.md +++ /dev/null @@ -1,111 +0,0 @@ -# :running: Get up and running quickly - -Build your first ML pipelines with ZenML. - -## :question: Coming from `zenml go` ? -Then open [notebooks/quickstart.ipynb](notebooks/quickstart.ipynb) to get -started. - -## :earth_americas: Overview - -This quickstart demonstrates some of ZenML's features. We will: - -- Import some data from a public dataset (Adult Census Income), then train two models (SGD and Random Forest) -- Compare and evaluate which model performs better, and deploy the best one. -- Run a prediction on the deployed model. - -Along the way we will also show you how to: - -- Automatically version, track, and cache data, models, and other artifacts, -- Track model hyperparameters and metrics in an experiment tracking tool - -## :cloud: Run on Colab -You can use Google Colab to see ZenML in action, no signup / installation required! - -Open In Colab - -## :computer: Run Locally - -To run locally, install ZenML and pull this quickstart: - -```shell -# Install ZenML -pip install "zenml[server]" - -# clone the ZenML repository -git clone https://github.com/zenml-io/zenml.git -cd zenml/examples/quickstart -``` - -### :arrow_forward: Run Locally -Now we're ready to start. You have two options for running the quickstart locally: - -#### Option 1 (*Recommended*) - Interactively explore the quickstart using Jupyter Notebook: -```bash -pip install notebook -jupyter notebook -# open notebooks/quickstart.ipynb -``` - -#### Option 2 - Execute the whole ML pipeline from a Python script: -```bash -# Install required zenml integrations -zenml integration install sklearn mlflow -y - -# Initialize ZenML -zenml init - -# Start the ZenServer to enable dashboard access -zenml up - -# Register required ZenML stack -zenml experiment-tracker register mlflow_tracker --flavor=mlflow -zenml model-deployer register mlflow_deployer --flavor=mlflow -zenml model-registry register mlflow_registry --flavor=mlflow - -# Register a new stack with the new stack components -zenml stack register quickstart_stack -a default\ - -o default\ - -d mlflow_deployer\ - -e mlflow_tracker\ - -r mlflow_registry\ - --set - -# Run the quickstart script -python run.py -``` - -## :dart: Dashboard - -You can also take a look at our **dashboard** where you can inspect the quickstart -pipeline run and much more. Simply execute: - -```shell -# only once you've already run `zenml up` -zenml show -``` - -## :sponge: Clean up - -To clean up, simply spin down the ZenML server. - -```shell -zenml down -``` - -## :bulb: Learn More - -If you want to learn more about ZenML as a tool, then the -[:page_facing_up: **ZenML Docs**](https://docs.zenml.io/) are the perfect place -to get started. - -Already have an MLOps stack in mind? ZenML most likely has -[**:link: Integrations**](https://docs.zenml.io/stacks-and-components/component-guide) -for whatever tools you plan to use. - -Also, make sure to join our - Slack - Slack Community - to become part of the ZenML family! - - diff --git a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png b/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png deleted file mode 100644 index d0ef5780e78..00000000000 Binary files a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry.png and /dev/null differ diff --git a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png b/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png deleted file mode 100644 index 360b49a504b..00000000000 Binary files a/examples/quickstart/_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png and /dev/null differ diff --git a/examples/quickstart/configs/feature_engineering.yaml b/examples/quickstart/configs/feature_engineering.yaml new file mode 100644 index 00000000000..d5ab212951e --- /dev/null +++ b/examples/quickstart/configs/feature_engineering.yaml @@ -0,0 +1,10 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# pipeline configuration +test_size: 0.35 \ No newline at end of file diff --git a/examples/quickstart/configs/inference.yaml b/examples/quickstart/configs/inference.yaml new file mode 100644 index 00000000000..0fd82f550c8 --- /dev/null +++ b/examples/quickstart/configs/inference.yaml @@ -0,0 +1,15 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: "breast_cancer_classifier" + version: "production" + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] \ No newline at end of file diff --git a/examples/quickstart/configs/training_rf.yaml b/examples/quickstart/configs/training_rf.yaml new file mode 100644 index 00000000000..c1418afdef3 --- /dev/null +++ b/examples/quickstart/configs/training_rf.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + version: rf + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "rf" # Choose between rf/sgd diff --git a/examples/quickstart/configs/training_sgd.yaml b/examples/quickstart/configs/training_sgd.yaml new file mode 100644 index 00000000000..6ca7c0dd02f --- /dev/null +++ b/examples/quickstart/configs/training_sgd.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + version: sgd + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between rf/sgd \ No newline at end of file diff --git a/examples/quickstart/license b/examples/quickstart/license new file mode 100644 index 00000000000..0fcb970537e --- /dev/null +++ b/examples/quickstart/license @@ -0,0 +1,15 @@ +Apache Software License 2.0 + +Copyright (c) ZenML GmbH 2023. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/examples/quickstart/license_header b/examples/quickstart/license_header new file mode 100644 index 00000000000..264b1f357fe --- /dev/null +++ b/examples/quickstart/license_header @@ -0,0 +1,16 @@ +Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file diff --git a/examples/quickstart/notebooks/quickstart.ipynb b/examples/quickstart/notebooks/quickstart.ipynb deleted file mode 100644 index b489ab131a5..00000000000 --- a/examples/quickstart/notebooks/quickstart.ipynb +++ /dev/null @@ -1,975 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Get Up and Running Quickly\n", - "\n", - "## 🌍 Overview\n", - "\n", - "This quickstart demonstrates some of ZenML's features. We will:\n", - "\n", - "- Import some data from a public dataset (Adult Census Income), then train two models (SGD and Random Forest)\n", - "- Compare and evaluate which model performs better, and deploy the best one.\n", - "- Run a prediction on the deployed model.\n", - "\n", - "Along the way we will also show you how to:\n", - "\n", - "- Automatically version, track, and cache data, models, and other artifacts,\n", - "- Track model hyperparameters and metrics in an experiment tracking tool\n", - "\n", - "This will give you enough to get started building your own ZenML Pipelines.\n", - "Let's dive in!\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run on Colab\n", - "\n", - "You can use Google Colab to see ZenML in action, no signup / installation\n", - "required!\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", - "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/notebooks/quickstart.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Install Requirements\n", - "\n", - "Let's install ZenML to get started. First we'll install the latest version of\n", - "ZenML as well as the two integrations we'll need for this quickstart: `sklearn`\n", - "and `mlflow`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install \"zenml[server]\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.environment import Environment\n", - "\n", - "if Environment.in_google_colab():\n", - " # Install Cloudflare Tunnel binary\n", - " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml integration install sklearn mlflow -y\n", - "\n", - "import IPython\n", - "IPython.Application.instance().kernel.do_shutdown(restart=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please wait for the installation to complete before running subsequent cells. At\n", - "the end of the installation, the notebook kernel will automatically restart." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Optional: If you are using ZenML Cloud, execute the following cell with your tenant URL. Otherwise ignore." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n", - "\n", - "!zenml connect --url $zenml_server_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml init" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please wait for the installation to complete before running subsequent cells. At the end of the installation, the notebook kernel will automatically restart." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Import Data\n", - "\n", - "We'll start off by importing our data. In this quickstart we'll be working with\n", - "[the Adult Census Income](https://archive.ics.uci.edu/dataset/2/adult) dataset\n", - "which is publicly available on the UCI Machine Learning Repository. The task is\n", - "to predict whether a person makes over $50k a year based on a number of\n", - "features. These features are things like age, work class, education level,\n", - "marital status, occupation, relationship, race, sex, capital gain, capital loss,\n", - "hours per week, and native country.\n", - "\n", - "When you're getting started with a machine learning problem you'll want to do\n", - "something similar to this: import your data and get it in the right shape for\n", - "your training. ZenML mostly gets out of your way when you're writing your Python\n", - "code, as you'll see from the following cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple\n", - "\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "from zenml import step\n", - "\n", - "\n", - "@step\n", - "def training_data_loader() -> (\n", - " Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]\n", - "):\n", - " \"\"\"Load the Census Income dataset as tuple of Pandas DataFrame / Series.\"\"\"\n", - " # Load the dataset\n", - " url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n", - " column_names = [\n", - " \"age\",\n", - " \"workclass\",\n", - " \"fnlwgt\",\n", - " \"education\",\n", - " \"education-num\",\n", - " \"marital-status\",\n", - " \"occupation\",\n", - " \"relationship\",\n", - " \"race\",\n", - " \"sex\",\n", - " \"capital-gain\",\n", - " \"capital-loss\",\n", - " \"hours-per-week\",\n", - " \"native-country\",\n", - " \"income\",\n", - " ]\n", - " data = pd.read_csv(\n", - " url, names=column_names, na_values=\"?\", skipinitialspace=True\n", - " )\n", - "\n", - " # Drop rows with missing values\n", - " data = data.dropna()\n", - "\n", - " # Encode categorical features and drop original columns\n", - " categorical_cols = [\n", - " \"workclass\",\n", - " \"education\",\n", - " \"marital-status\",\n", - " \"occupation\",\n", - " \"relationship\",\n", - " \"race\",\n", - " \"sex\",\n", - " \"native-country\",\n", - " ]\n", - " data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)\n", - "\n", - " # Encode target feature\n", - " data[\"income\"] = data[\"income\"].apply(\n", - " lambda x: 1 if x.strip() == \">50K\" else 0\n", - " )\n", - "\n", - " # Separate features and target\n", - " X = data.drop(\"income\", axis=1)\n", - " y = data[\"income\"]\n", - "\n", - " # Split the dataset into train and test sets\n", - " X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2, random_state=42\n", - " )\n", - " return (X_train, X_test, y_train, y_test)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We download the data, dropping some columns and then splitting it up into train\n", - "and test sets. The whole function is decorated with the `@step` decorator, which\n", - "tells ZenML to track this function as a step in the pipeline. This means that\n", - "ZenML will automatically version, track, and cache the data that is produced by\n", - "this function. This is a very powerful feature, as it means that you can\n", - "reproduce your data at any point in the future, even if the original data source\n", - "changes or disappears.\n", - "\n", - "You'll also notice that we have included type hints for the outputs\n", - "to the function. These are not only useful for anyone reading your code, but\n", - "help ZenML process your data in a way appropriate to the specific data types." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "ZenML is built in a way that allows you to experiment with your data and build\n", - "your pipelines as you work, so if you want to call this function to see how it\n", - "works, you can just call it directly. Here we take a look at the first few rows\n", - "of your training dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, X_test, y_train, y_test = training_data_loader()\n", - "X_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Everything looks as we'd expect and the values are all in the right format. We\n", - "can shift to training some models now! 🥳" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Train Models\n", - "\n", - "Now that we have our data it makes sense to train some models to get a sense of\n", - "how difficult the task is. The Census Income\n", - "dataset is sufficiently large and complex that it's unlikely we'll be able to\n", - "train a model that behaves perfectly since the problem is inherently complex,\n", - "but we can get a sense of what a reasonable baseline looks like.\n", - "\n", - "We'll start with two simple models, a SGD Classifier and a Random Forest\n", - "Classifier, both batteries-included from `sklearn`. We'll train them both on the\n", - "same data and then compare their performance.\n", - "\n", - "Since we're starting our work properly, it makes sense to start tracking the\n", - "experimentation that we're doing. ZenML integrates with MLflow to make this\n", - "easy. This happens out of the box when using our experiment tracker integration\n", - "and stack components. We'll see how this works below, but first let's set up\n", - "ZenML to know that it should use the MLFlow experiment tracker." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Register the MLflow experiment tracker\n", - "!zenml experiment-tracker register mlflow --flavor=mlflow\n", - "\n", - "# Register a new stack with our experiment tracker\n", - "!zenml stack register quickstart -a default\\\n", - " -o default\\\n", - " -e mlflow\n", - "\n", - "!zenml stack set quickstart" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now write the steps where we'll\n", - "train our models, making sure to specify the name of our experiment tracker in\n", - "the `@step` decorator. We could specify this manually using a string, but\n", - "instead we'll use the ZenML `Client` to access the name of our active stack's\n", - "experiment tracker." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import mlflow\n", - "\n", - "from sklearn.base import ClassifierMixin\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.linear_model import SGDClassifier\n", - "\n", - "from zenml.client import Client\n", - "\n", - "experiment_tracker = Client().active_stack.experiment_tracker\n", - "\n", - "\n", - "@step(experiment_tracker=experiment_tracker.name)\n", - "def random_forest_trainer_mlflow(\n", - " X_train: pd.DataFrame,\n", - " y_train: pd.Series,\n", - ") -> ClassifierMixin:\n", - " \"\"\"Train a sklearn Random Forest classifier and log to MLflow.\"\"\"\n", - " mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow\n", - " model = RandomForestClassifier()\n", - " model.fit(X_train.to_numpy(), y_train.to_numpy())\n", - " train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())\n", - " print(f\"Train accuracy: {train_acc}\")\n", - " return model\n", - "\n", - "\n", - "@step(experiment_tracker=experiment_tracker.name)\n", - "def sgd_trainer_mlflow(\n", - " X_train: pd.DataFrame,\n", - " y_train: pd.Series,\n", - ") -> ClassifierMixin:\n", - " \"\"\"Train a SGD classifier and log to MLflow.\"\"\"\n", - " mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow\n", - " model = SGDClassifier()\n", - " model.fit(X_train.to_numpy(), y_train.to_numpy())\n", - " train_acc = model.score(X_train.to_numpy(), y_train.to_numpy())\n", - " print(f\"Train accuracy: {train_acc}\")\n", - " return model" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our two training steps both return different kinds of `sklearn` classifier\n", - "models, so we use the generic `ClassifierMixin` type hint for the return type." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The end goal of this quick baseline evaluation is to understand which of the two\n", - "models performs better. We'll use the `evaluator` step to compare the two\n", - "models. This step takes in the two models we trained above, and compares them on\n", - "the test data we created earlier. It returns whichever model performs best along\n", - "with the accuracy score for that model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing_extensions import Annotated\n", - "\n", - "\n", - "@step\n", - "def best_model_selector(\n", - " X_test: pd.DataFrame,\n", - " y_test: pd.Series,\n", - " model1: ClassifierMixin,\n", - " model2: ClassifierMixin,\n", - ") -> Tuple[\n", - " Annotated[ClassifierMixin, \"best_model\"],\n", - " Annotated[float, \"best_model_test_acc\"],\n", - "]:\n", - " \"\"\"Calculate the accuracy on the test set and return the best model and its accuracy.\"\"\"\n", - " test_acc1 = model1.score(X_test.to_numpy(), y_test.to_numpy())\n", - " test_acc2 = model2.score(X_test.to_numpy(), y_test.to_numpy())\n", - " print(f\"Test accuracy ({model1.__class__.__name__}): {test_acc1}\")\n", - " print(f\"Test accuracy ({model2.__class__.__name__}): {test_acc2}\")\n", - " if test_acc1 > test_acc2:\n", - " best_model = model1\n", - " best_model_test_acc = test_acc1\n", - " else:\n", - " best_model = model2\n", - " best_model_test_acc = test_acc2\n", - " return best_model, best_model_test_acc" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note the use of the `typing` module's `Annotated` type hint in the output of the\n", - "step. We're using this to give a name to the output of the step, which will make\n", - "it possible to access it via a keyword later on." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll likely want to use our model in the future so instead of simply outputting\n", - "the model we'll use the MLflow model registry to store it. This allows us to\n", - "version the model for retrieval and use later on as well as to use other\n", - "functionality made possible within the MLflow dashboard. This step is a bit\n", - "different from the ones listed above in that we're using a pre-built ZenML step\n", - "instead of just writing our own. You'll often come across these pre-built steps\n", - "for common workflows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.integrations.mlflow.steps.mlflow_registry import (\n", - " mlflow_register_model_step,\n", - ")\n", - "\n", - "model_name = \"zenml-quickstart-model\"\n", - "\n", - "register_model = mlflow_register_model_step.with_options(\n", - " parameters=dict(\n", - " name=model_name,\n", - " description=\"The first run of the Quickstart pipeline.\",\n", - " )\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're now at the point where can bring all these steps together into a single\n", - "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n", - "as simple as adding a `@pipeline` decorator to a function. This specific\n", - "pipeline doesn't return a value, but that option is available to you if you need." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml import pipeline\n", - "\n", - "\n", - "@pipeline(enable_cache=True)\n", - "def train_and_register_model_pipeline() -> None:\n", - " \"\"\"Train a model.\"\"\"\n", - " X_train, X_test, y_train, y_test = training_data_loader()\n", - " model1 = random_forest_trainer_mlflow(X_train=X_train, y_train=y_train)\n", - " model2 = sgd_trainer_mlflow(X_train=X_train, y_train=y_train)\n", - " best_model, _ = best_model_selector(\n", - " X_test=X_test, y_test=y_test, model1=model1, model2=model2\n", - " )\n", - " register_model(best_model)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've used the built-in MLflow registry to store our model, but ZenML doesn't\n", - "yet know that we want to use the MLflow flavor of the model registry stack\n", - "component in our stack. Let's add that now and update our stack." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Register the MLflow model registry\n", - "!zenml model-registry register mlflow --flavor=mlflow\n", - "\n", - "# Update our stack to include the model registry\n", - "!zenml stack update quickstart -r mlflow" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](../_assets/local_stack_with_local_mlflow_tracker_and_registry.png)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're ready to run the pipeline now, which we can do just -- as with the step -- by calling the\n", - "pipeline function itself." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_and_register_model_pipeline()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see from the logs already how our model training went: the\n", - "`RandomForestClassifier` performed considerably better than the `SGDClassifier`,\n", - "so that will have been the model that was returned from the evaluation step and\n", - "then registered with the MLflow model registry." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point you might be interested to view your pipeline in the ZenML\n", - "Dashboard. You can spin this up by executing the next cell. This will start a\n", - "server which you can access by clicking on the link that appears in the output\n", - "of the cell.\n", - "\n", - "Log into the Dashboard using default credentials (username 'default' and\n", - "password left blank). From there you can inspect the pipeline or the specific\n", - "pipeline run. You can also examine the stack and components that we've\n", - "registered to run everything.\n", - "\n", - "![](../llm_quickstart/_assets/zenml-up.gif)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.environment import Environment\n", - "\n", - "if Environment.in_google_colab():\n", - " # run ZenML through a cloudflare tunnel to get a public endpoint\n", - " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n", - "else:\n", - " !zenml up" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're using MLflow for our experiment tracking. If you'd like to inspect the\n", - "MLflow dashboard to see your experiments and what's been logged so far, run the\n", - "following cell. This cell will spin up a local server that you can access via\n", - "the link mentioned after the \"Listening at:\" `INFO` log statement." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from zenml.integrations.mlflow.mlflow_utils import get_tracking_uri\n", - "\n", - "os.environ[\"MLFLOW_TRACKING_URI\"] = get_tracking_uri()\n", - "\n", - "if Environment.in_google_colab():\n", - " # run mlflow through a cloudflare tunnel to get a public endpoint\n", - " !mlflow ui --backend-store-uri $MLFLOW_TRACKING_URI & cloudflared tunnel --url http://localhost:5000\n", - "else:\n", - " !mlflow ui --backend-store-uri $MLFLOW_TRACKING_URI" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our pipeline above registered the best model with the MLflow model registry.\n", - "Whenever you register a model it also versions the model since it's likely that\n", - "you'll be iterating and improving your model over time.\n", - "\n", - "We'll now turn to actually deploying our model and serving some predictions, for\n", - "which we'll need to specify the model version we want to use. You can specify\n", - "the version number manually but below we'll use the ZenML `Client` to get the\n", - "latest version number." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.client import Client\n", - "\n", - "most_recent_model_version_number = int(\n", - " Client()\n", - " .active_stack.model_registry.list_model_versions(metadata={})[0]\n", - " .version\n", - ")\n", - "most_recent_model_version_number" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we've trained our model, and we've found the best one, we want to deploy it\n", - "and run some inference on the deployed model. We'll use the local MLflow model\n", - "deployer which once again comes with some pre-built ZenML steps to save you reinventing the wheel." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.integrations.mlflow.steps.mlflow_deployer import (\n", - " mlflow_model_registry_deployer_step,\n", - ")\n", - "\n", - "model_deployer = mlflow_model_registry_deployer_step.with_options(\n", - " parameters=dict(\n", - " registry_model_name=model_name,\n", - " registry_model_version=most_recent_model_version_number,\n", - " )\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you deploy a model this is usually something you want to remain available\n", - "and running for a long time, so ZenML automatically creates a background service\n", - "for your deployed model. We load the service (already created by the\n", - "`model_deployer` step) and then use it to make some predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.services import BaseService\n", - "from zenml.client import Client\n", - "\n", - "\n", - "@step(enable_cache=False)\n", - "def prediction_service_loader() -> BaseService:\n", - " \"\"\"Load the model service of our train_and_register_model_pipeline.\"\"\"\n", - " client = Client()\n", - " model_deployer = client.active_stack.model_deployer\n", - " services = model_deployer.find_model_server(\n", - " pipeline_name=\"train_and_register_model_pipeline\",\n", - " running=True,\n", - " )\n", - " return services[0]\n", - "\n", - "\n", - "@step\n", - "def predictor(\n", - " service: BaseService,\n", - " data: pd.DataFrame,\n", - ") -> Annotated[list, \"predictions\"]:\n", - " \"\"\"Run a inference request against a prediction service.\"\"\"\n", - " service.start(timeout=10) # should be a NOP if already started\n", - " print(f\"Running predictions on data (single individual): {data.to_numpy()[0]}\")\n", - " prediction = service.predict(data.to_numpy())\n", - " print(f\"Prediction (for single example slice) is: {bool(prediction.tolist()[0])}\")\n", - " return prediction.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Register the MLflow model deployer\n", - "!zenml model-deployer register mlflow --flavor=mlflow\n", - "\n", - "# Register a new stack with the new stack components\n", - "!zenml stack update quickstart -d mlflow" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once again there is one dependency in terms of how the step needs to run, so\n", - "we specify it upfront: the prediction service needs to be loaded\n", - "before we try to make predictions with it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@pipeline\n", - "def deploy_and_predict() -> None:\n", - " \"\"\"Deploy the best model and run some predictions.\"\"\"\n", - " prediction_service_loader.after(model_deployer)\n", - "\n", - " model_deployer()\n", - " _, inference_data, _, _ = training_data_loader()\n", - " model_deployment_service = prediction_service_loader()\n", - " predictor(service=model_deployment_service, data=inference_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice how we specify that we want the `prediction_service_loader` step to run *after* the\n", - "model_deployer step. This is because we won't have a model ready for prediction\n", - "until the deployment has taken place. ZenML automatically tries to run steps in\n", - "parallel, so sometimes if you have this kind of sequencing you need to do then\n", - "you'll need to specify it explicitly." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](../_assets/local_stack_with_local_mlflow_tracker_and_registry_and_deployer.png)\n", - "\n", - "Unlike in the previous case where we just ran the pipeline directly, we might\n", - "not want to deploy the model every time. Consider the case where our models are\n", - "returning values under 50% accuracy on the test data. In that case we might want\n", - "to address the issues with accuracy and not spin up a deployment at all. We can\n", - "access the artifacts associated with the previous pipeline run and check the\n", - "test accuracy metric to see if it's above a certain threshold. Adding this to\n", - "our workflow is as simple as adding a conditional step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_model_test_accuracy = (\n", - " Client().get_pipeline(\"train_and_register_model_pipeline\")\n", - " .last_successful_run.steps[\"best_model_selector\"]\n", - " .outputs[\"best_model_test_acc\"].load()\n", - ")\n", - "\n", - "if best_model_test_accuracy > 0.7:\n", - " deploy_and_predict()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you recall, the purpose of this model was to predict whether or not someone\n", - "earns more than \\$50,000 USD per year. You can see a single example in the output above.\n", - "Given the features of a particular individual, the model predicts that they do\n", - "not earn more than $50k per year.\n", - "\n", - "If we were interested in learning more about the model's predictions, we could\n", - "separately load the predictor service and use it to pass in some other data or\n", - "try things out. To load the predictor we can run:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictor_service = deploy_and_predict.model.last_successful_run.steps[\n", - " \"prediction_service_loader\"\n", - "].output.load()\n", - "\n", - "predictor_service" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point, passing in some data is as simple as calling the `predict` method\n", - "on the predictor service. We can try this here:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\n", - " f\"Model predictions: {predictor_service.predict(X_test.to_numpy()[25:35])}\"\n", - ")\n", - "print(f\"Ground truth: {y_test.to_numpy()[25:35]}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're passing in some of our test data into the model and getting back the\n", - "predictions. You can already start to see some of the places where our\n", - "predictions are not matching the ground truth labels. This is to be expected but\n", - "we could potentially use this to now iterate on our models by adding more steps.\n", - "\n", - "To get an overview of the models and model versions that we have registered and\n", - "deployed so\n", - "far, we can use the CLI to list these out." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml model-registry models list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml model-registry models list-versions zenml-quickstart-model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml model-deployer models list" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To view all this on the ZenML Dashboard, simply spin up the server again and\n", - "view the steps via the DAG visualiser and also browse the artifacts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if Environment.in_google_colab():\n", - " !zenml down # server needs restarting due to colab bug.\n", - " # run ZenML through a cloudflare tunnel to get a public endpoint\n", - " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n", - "else:\n", - " !zenml up" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Congratulations!\n", - "\n", - "You just built two ML pipelines! You trained two models, evaluated them against\n", - "a test set, registered the best one with the MLflow model registry, deployed it\n", - "and served some predictions. You also learned how to iterate on your models and\n", - "data by using some of the ZenML utility abstractions. You saw how to view your\n", - "artifacts and stacks via the CLI as well as the ZenML Dashboard.\n", - "\n", - "And that is just the tip of the iceberg of what ZenML can do; check out the [**Integrations**](https://zenml.io/integrations) page for a list of all the cool MLOps tools that ZenML supports!\n", - "\n", - "## What to do now\n", - "\n", - "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack-invite) and become part of the ZenML family!\n", - "* If you want to try ZenML in a real-world setting... check out the [ZenML Cloud](https://cloud.zenml.io/), a free trial of\n", - " ZenML's managed offering that runs on your Cloud platform. [**Sign up here**](https://sandbox.zenml.io/)." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/quickstart/pipelines/__init__.py b/examples/quickstart/pipelines/__init__.py index e69de29bb2d..0630c39c2e2 100644 --- a/examples/quickstart/pipelines/__init__.py +++ b/examples/quickstart/pipelines/__init__.py @@ -0,0 +1,20 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training diff --git a/examples/quickstart/pipelines/deploy_and_predict.py b/examples/quickstart/pipelines/deploy_and_predict.py deleted file mode 100644 index 6d643d56190..00000000000 --- a/examples/quickstart/pipelines/deploy_and_predict.py +++ /dev/null @@ -1,28 +0,0 @@ -from steps.model_deployer import model_deployer -from steps.prediction_service_loader import prediction_service_loader -from steps.predictor import predictor -from steps.training_data_loader import training_data_loader - -from zenml import pipeline -from zenml.config import DockerSettings -from zenml.integrations.constants import MLFLOW, SKLEARN - -docker_settings = DockerSettings( - required_integrations=[MLFLOW, SKLEARN], - requirements=[ - "numpy==1.24.3", - "scipy==1.10.1", - "typing-extensions==4.6.3", - ], -) - - -@pipeline(enable_cache=True, settings={"docker": docker_settings}) -def deploy_and_predict() -> None: - """Deploy the best model and run some predictions.""" - prediction_service_loader.after(model_deployer) - - model_deployer() - _, inference_data, _, _ = training_data_loader() - model_deployment_service = prediction_service_loader() - predictor(service=model_deployment_service, data=inference_data) diff --git a/examples/quickstart/pipelines/feature_engineering.py b/examples/quickstart/pipelines/feature_engineering.py new file mode 100644 index 00000000000..1927f17f594 --- /dev/null +++ b/examples/quickstart/pipelines/feature_engineering.py @@ -0,0 +1,74 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional + +from steps import ( + data_loader, + data_preprocessor, + data_splitter, +) + +from zenml import pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def feature_engineering( + test_size: float = 0.2, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", + random_state: int = 17, +): + """ + Feature engineering pipeline. + + This is a pipeline that loads the data, processes it and splits + it into train and test sets. + + Args: + test_size: Size of holdout set for training 0.0..1.0 + drop_na: If `True` NA values will be removed from dataset + normalize: If `True` dataset will be normalized with MinMaxScaler + drop_columns: List of columns to drop from dataset + target: Name of target column in dataset + random_state: Random state to configure the data loader + + Returns: + The processed datasets (dataset_trn, dataset_tst). + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + raw_data = data_loader(random_state=random_state, target=target) + dataset_trn, dataset_tst = data_splitter( + dataset=raw_data, + test_size=test_size, + ) + dataset_trn, dataset_tst, _ = data_preprocessor( + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + drop_na=drop_na, + normalize=normalize, + drop_columns=drop_columns, + target=target, + random_state=random_state, + ) + return dataset_trn, dataset_tst diff --git a/examples/quickstart/pipelines/inference.py b/examples/quickstart/pipelines/inference.py new file mode 100644 index 00000000000..fee74371adc --- /dev/null +++ b/examples/quickstart/pipelines/inference.py @@ -0,0 +1,62 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from steps import ( + data_loader, + inference_predict, + inference_preprocessor, +) + +from zenml import get_pipeline_context, pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def inference(random_state: str, target: str): + """ + Model inference pipeline. + + This is a pipeline that loads the inference data, processes it with + the same preprocessing pipeline used in training, and runs inference + with the trained model. + + Args: + random_state: Random state for reproducibility. + target: Name of target column in dataset. + """ + # Get the production model artifact + model = get_pipeline_context().model_version.get_artifact("model") + + # Get the preprocess pipeline artifact associated with this version + preprocess_pipeline = get_pipeline_context().model_version.get_artifact( + "preprocess_pipeline" + ) + + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + df_inference = data_loader(random_state=random_state, is_inference=True) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + preprocess_pipeline=preprocess_pipeline, + target=target, + ) + inference_predict( + model=model, + dataset_inf=df_inference, + ) diff --git a/examples/quickstart/pipelines/train_and_register_model.py b/examples/quickstart/pipelines/train_and_register_model.py deleted file mode 100644 index b0d734d7003..00000000000 --- a/examples/quickstart/pipelines/train_and_register_model.py +++ /dev/null @@ -1,22 +0,0 @@ -from steps.best_model_selector import best_model_selector -from steps.register_model import register_model -from steps.trainers import random_forest_trainer_mlflow, sgd_trainer_mlflow -from steps.training_data_loader import training_data_loader - -from zenml import pipeline -from zenml.config import DockerSettings -from zenml.integrations.constants import MLFLOW, SKLEARN - -docker_settings = DockerSettings(required_integrations=[MLFLOW, SKLEARN]) - - -@pipeline(enable_cache=True, settings={"docker": docker_settings}) -def train_and_register_model_pipeline() -> None: - """Train a model.""" - X_train, X_test, y_train, y_test = training_data_loader() - model1 = random_forest_trainer_mlflow(X_train=X_train, y_train=y_train) - model2 = sgd_trainer_mlflow(X_train=X_train, y_train=y_train) - best_model, _ = best_model_selector( - X_test=X_test, y_test=y_test, model1=model1, model2=model2 - ) - register_model(best_model) diff --git a/examples/quickstart/pipelines/training.py b/examples/quickstart/pipelines/training.py new file mode 100644 index 00000000000..39814bbb61c --- /dev/null +++ b/examples/quickstart/pipelines/training.py @@ -0,0 +1,75 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional +from uuid import UUID + +from steps import model_evaluator, model_promoter, model_trainer + +from pipelines import ( + feature_engineering, +) +from zenml import ExternalArtifact, pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def training( + train_dataset_id: Optional[UUID] = None, + test_dataset_id: Optional[UUID] = None, + target: Optional[str] = "target", + model_type: Optional[str] = "sgd", +): + """ + Model training pipeline. + + This is a pipeline that loads the data from a preprocessing pipeline, + trains a model on it and evaluates the model. If it is the first model + to be trained, it will be promoted to production. If not, it will be + promoted only if it has a higher accuracy than the current production + model version. + + Args: + train_dataset_id: ID of the train dataset produced by feature engineering. + test_dataset_id: ID of the test dataset produced by feature engineering. + target: Name of target column in dataset. + model_type: The type of model to train. + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + + # Execute Feature Engineering Pipeline + if train_dataset_id is None or test_dataset_id is None: + dataset_trn, dataset_tst = feature_engineering() + else: + dataset_trn = ExternalArtifact(id=train_dataset_id) + dataset_tst = ExternalArtifact(id=test_dataset_id) + + model = model_trainer( + dataset_trn=dataset_trn, target=target, model_type=model_type + ) + + acc = model_evaluator( + model=model, + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + target=target, + ) + + model_promoter(accuracy=acc) diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb new file mode 100644 index 00000000000..36e502d79ee --- /dev/null +++ b/examples/quickstart/quickstart.ipynb @@ -0,0 +1,1117 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63ab391a", + "metadata": {}, + "source": [ + "# Intro to MLOps using ZenML\n", + "\n", + "## 🌍 Overview\n", + "\n", + "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features: \n", + "\n", + "- A feature engineering pipeline that loads data and prepares it for training.\n", + "- A training pipeline that loads the preprocessed dataset and trains a model.\n", + "- A batch inference pipeline that runs predictions on the trained model with new data.\n", + "\n", + "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "8f466b16", + "metadata": {}, + "source": [ + "## Run on Colab\n", + "\n", + "You can use Google Colab to see ZenML in action, no signup / installation\n", + "required!\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", + "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/quickstart.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "66b2977c", + "metadata": {}, + "source": [ + "# 👶 Step 0. Install Requirements\n", + "\n", + "Let's install ZenML to get started. First we'll install the latest version of\n", + "ZenML as well as the `sklearn` integration of ZenML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce2f40eb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"zenml[server]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aad397e", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.environment import Environment\n", + "\n", + "if Environment.in_google_colab():\n", + " # Install Cloudflare Tunnel binary\n", + " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f76f562e", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml integration install sklearn -y\n", + "\n", + "import IPython\n", + "IPython.Application.instance().kernel.do_shutdown(restart=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3b044374", + "metadata": {}, + "source": [ + "Please wait for the installation to complete before running subsequent cells. At\n", + "the end of the installation, the notebook kernel will automatically restart." + ] + }, + { + "cell_type": "markdown", + "id": "e3955ff1", + "metadata": {}, + "source": [ + "Optional: If you are using [ZenML Cloud](https://zenml.io/cloud), execute the following cell with your tenant URL. Otherwise ignore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2587315", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n", + "\n", + "!zenml connect --url $zenml_server_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "081d5616", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize ZenML and set the default stack\n", + "!zenml init\n", + "\n", + "!zenml stack set default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f775f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do the imports at the top\n", + "from typing_extensions import Annotated\n", + "from sklearn.datasets import load_breast_cancer\n", + "\n", + "import random\n", + "import pandas as pd\n", + "from zenml import step, ExternalArtifact, pipeline, ModelVersion, get_step_context\n", + "from zenml.client import Client\n", + "from zenml.logger import get_logger\n", + "from uuid import UUID\n", + "\n", + "from typing import Optional, List\n", + "\n", + "from zenml import pipeline\n", + "\n", + "from steps import (\n", + " data_loader,\n", + " data_preprocessor,\n", + " data_splitter,\n", + " model_evaluator,\n", + " inference_preprocessor\n", + ")\n", + "\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "# Initialize the ZenML client to fetch objects from the ZenML Server\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "35e48460", + "metadata": {}, + "source": [ + "## 🥇 Step 1: Load your data and execute feature engineering\n", + "\n", + "We'll start off by importing our data. In this quickstart we'll be working with\n", + "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n", + "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n", + "problem, to predict whether a patient is diagnosed with breast cancer or not.\n", + "\n", + "When you're getting started with a machine learning problem you'll want to do\n", + "something similar to this: import your data and get it in the right shape for\n", + "your training. ZenML mostly gets out of your way when you're writing your Python\n", + "code, as you'll see from the following cell.\n", + "\n", + "\"Feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cd974d1", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def data_loader_simplified(\n", + " random_state: int, is_inference: bool = False, target: str = \"target\"\n", + ") -> Annotated[pd.DataFrame, \"dataset\"]: # We name the dataset \n", + " \"\"\"Dataset reader step.\"\"\"\n", + " dataset = load_breast_cancer(as_frame=True)\n", + " inference_size = int(len(dataset.target) * 0.05)\n", + " dataset: pd.DataFrame = dataset.frame\n", + " inference_subset = dataset.sample(inference_size, random_state=random_state)\n", + " if is_inference:\n", + " dataset = inference_subset\n", + " dataset.drop(columns=target, inplace=True)\n", + " else:\n", + " dataset.drop(inference_subset.index, inplace=True)\n", + " dataset.reset_index(drop=True, inplace=True)\n", + " logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n", + " return dataset\n" + ] + }, + { + "cell_type": "markdown", + "id": "1e8ba4c6", + "metadata": {}, + "source": [ + "The whole function is decorated with the `@step` decorator, which\n", + "tells ZenML to track this function as a step in the pipeline. This means that\n", + "ZenML will automatically version, track, and cache the data that is produced by\n", + "this function as an `artifact`. This is a very powerful feature, as it means that you can\n", + "reproduce your data at any point in the future, even if the original data source\n", + "changes or disappears. \n", + "\n", + "Note the use of the `typing` module's `Annotated` type hint in the output of the\n", + "step. We're using this to give a name to the output of the step, which will make\n", + "it possible to access it via a keyword later on.\n", + "\n", + "You'll also notice that we have included type hints for the outputs\n", + "to the function. These are not only useful for anyone reading your code, but\n", + "help ZenML process your data in a way appropriate to the specific data types." + ] + }, + { + "cell_type": "markdown", + "id": "b6286b67", + "metadata": {}, + "source": [ + "ZenML is built in a way that allows you to experiment with your data and build\n", + "your pipelines as you work, so if you want to call this function to see how it\n", + "works, you can just call it directly. Here we take a look at the first few rows\n", + "of your training dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d838e2ea", + "metadata": {}, + "outputs": [], + "source": [ + "df = data_loader_simplified(random_state=42)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "28c05291", + "metadata": {}, + "source": [ + "Everything looks as we'd expect and the values are all in the right format 🥳.\n", + "\n", + "We're now at the point where can bring this step (and some others) together into a single\n", + "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n", + "as simple as adding a `@pipeline` decorator to a function. This specific\n", + "pipeline doesn't return a value, but that option is available to you if you need." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b50a9537", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def feature_engineering(\n", + " test_size: float = 0.3,\n", + " drop_na: Optional[bool] = None,\n", + " normalize: Optional[bool] = None,\n", + " drop_columns: Optional[List[str]] = None,\n", + " target: Optional[str] = \"target\",\n", + " random_state: int = 17\n", + "):\n", + " \"\"\"Feature engineering pipeline.\"\"\"\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + " raw_data = data_loader(random_state=random_state, target=target)\n", + " dataset_trn, dataset_tst = data_splitter(\n", + " dataset=raw_data,\n", + " test_size=test_size,\n", + " )\n", + " dataset_trn, dataset_tst, _ = data_preprocessor(\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " drop_na=drop_na,\n", + " normalize=normalize,\n", + " drop_columns=drop_columns,\n", + " target=target,\n", + " random_state=random_state,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "7cd73c23", + "metadata": {}, + "source": [ + "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", + "pipeline function itself:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e0aa9af", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering()" + ] + }, + { + "cell_type": "markdown", + "id": "1785c303", + "metadata": {}, + "source": [ + "Let's run this again with a slightly different test size, to create more datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "658c0570-2607-4b97-a72d-d45c92633e48", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering(test_size=0.25)" + ] + }, + { + "cell_type": "markdown", + "id": "64bb7206", + "metadata": {}, + "source": [ + "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n", + "This is because ZenML automatically determined that nothing had changed in the data loader step, \n", + "so it didn't need to rerun it." + ] + }, + { + "cell_type": "markdown", + "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf", + "metadata": {}, + "source": [ + "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e1d8546", + "metadata": {}, + "outputs": [], + "source": [ + "feature_engineering(test_size=0.25, random_state=104)" + ] + }, + { + "cell_type": "markdown", + "id": "6c42078a", + "metadata": {}, + "source": [ + "At this point you might be interested to view your pipeline runs in the ZenML\n", + "Dashboard. In case you are not using a hosted instance of ZenML, you can spin this up by executing the next cell. This will start a\n", + "server which you can access by clicking on the link that appears in the output\n", + "of the cell.\n", + "\n", + "Log into the Dashboard using default credentials (username 'default' and\n", + "password left blank). From there you can inspect the pipeline or the specific\n", + "pipeline run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cd3cc8c", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.environment import Environment\n", + "from zenml.zen_stores.rest_zen_store import RestZenStore\n", + "\n", + "\n", + "if not isinstance(client.zen_store, RestZenStore):\n", + " # Only spin up a local Dashboard in case you aren't already connected to a remote server\n", + " if Environment.in_google_colab():\n", + " # run ZenML through a cloudflare tunnel to get a public endpoint\n", + " !zenml up --port 8237 & cloudflared tunnel --url http://localhost:8237\n", + " else:\n", + " !zenml up" + ] + }, + { + "cell_type": "markdown", + "id": "e8471f93", + "metadata": {}, + "source": [ + "We can also fetch the pipeline from the server and view the results directly in the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f208b200", + "metadata": {}, + "outputs": [], + "source": [ + "client = Client()\n", + "run = client.get_pipeline(\"feature_engineering\").last_run\n", + "print(run.name)" + ] + }, + { + "cell_type": "markdown", + "id": "a037f09d", + "metadata": {}, + "source": [ + "We can also see the data artifacts that were produced by the last step of the pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34283e89", + "metadata": {}, + "outputs": [], + "source": [ + "run.steps[\"data_preprocessor\"].outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bceb0312", + "metadata": {}, + "outputs": [], + "source": [ + "# Read one of the datasets. This is the one with a 0.25 test split\n", + "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()" + ] + }, + { + "cell_type": "markdown", + "id": "26d26436", + "metadata": {}, + "source": [ + "We can also get the artifacts directly. Each time you create a new pipeline run, a new `artifact version` is created.\n", + "\n", + "You can fetch these artifact and their versions using the `client`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8f90647", + "metadata": {}, + "outputs": [], + "source": [ + "# Get artifact version from our run\n", + "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"] \n", + "\n", + "# Get latest version from client directly\n", + "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n", + "\n", + "# This should be true if our run is the latest run and no artifact has been produced\n", + "# in the intervening time\n", + "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f9d3dfd", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch the rest of the artifacts\n", + "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n", + "preprocessing_pipeline_artifact_version = client.get_artifact_version(\"preprocess_pipeline\")" + ] + }, + { + "cell_type": "markdown", + "id": "7a7d1b04", + "metadata": {}, + "source": [ + "If you started with a fresh install, then you would have two versions corresponding\n", + "to the two pipelines that we ran above. We can even load a artifact version in memory: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c82aca75", + "metadata": {}, + "outputs": [], + "source": [ + "# Load an artifact to verify you can fetch it\n", + "dataset_trn_artifact_version.load()" + ] + }, + { + "cell_type": "markdown", + "id": "5963509e", + "metadata": {}, + "source": [ + "We'll use these artifacts from above in our next pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "8c28b474", + "metadata": {}, + "source": [ + "# ⌚ Step 2: Training pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "87909827", + "metadata": {}, + "source": [ + "Now that we have our data it makes sense to train some models to get a sense of\n", + "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n", + "that it's unlikely we'll be able to train a model that behaves perfectly since the problem \n", + "is inherently complex, but we can get a sense of what a reasonable baseline looks like.\n", + "\n", + "We'll start with two simple models, a SGD Classifier and a Random Forest\n", + "Classifier, both batteries-included from `sklearn`. We'll train them both on the\n", + "same data and then compare their performance.\n", + "\n", + "\"Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fccf1bd9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.base import ClassifierMixin\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import SGDClassifier\n", + "from typing_extensions import Annotated\n", + "from zenml import ArtifactConfig, step\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "\n", + "@step\n", + "def model_trainer(\n", + " dataset_trn: pd.DataFrame,\n", + " model_type: str = \"sgd\",\n", + ") -> Annotated[ClassifierMixin, ArtifactConfig(name=\"model\", is_model_artifact=True)]:\n", + " \"\"\"Configure and train a model on the training dataset.\"\"\"\n", + " target = \"target\"\n", + " if model_type == \"sgd\":\n", + " model = SGDClassifier()\n", + " elif model_type == \"rf\":\n", + " model = RandomForestClassifier()\n", + " else:\n", + " raise ValueError(f\"Unknown model type {model_type}\") \n", + "\n", + " logger.info(f\"Training model {model}...\")\n", + "\n", + " model.fit(\n", + " dataset_trn.drop(columns=[target]),\n", + " dataset_trn[target],\n", + " )\n", + " return model\n" + ] + }, + { + "cell_type": "markdown", + "id": "73a00008", + "metadata": {}, + "source": [ + "Our two training steps both return different kinds of `sklearn` classifier\n", + "models, so we use the generic `ClassifierMixin` type hint for the return type." + ] + }, + { + "cell_type": "markdown", + "id": "a5f22174", + "metadata": {}, + "source": [ + "ZenML allows you to load any version of any dataset that is tracked by the framework\n", + "directly into a pipeline using the `ExternalArtifact` interface. This is very convenient\n", + "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n", + "into the training pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa98f2f", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def training(\n", + " train_dataset_id: Optional[UUID] = None,\n", + " test_dataset_id: Optional[UUID] = None,\n", + " model_type: str = \"sgd\",\n", + " min_train_accuracy: float = 0.0,\n", + " min_test_accuracy: float = 0.0,\n", + "):\n", + " \"\"\"Model training pipeline.\"\"\" \n", + " if train_dataset_id is None or test_dataset_id is None:\n", + " # If we dont pass the IDs, this will run the feature engineering pipeline \n", + " dataset_trn, dataset_tst = feature_engineering()\n", + " else:\n", + " # Load the datasets from an older pipeline\n", + " dataset_trn = ExternalArtifact(id=train_dataset_id)\n", + " dataset_tst = ExternalArtifact(id=test_dataset_id) \n", + "\n", + " trained_model = model_trainer(\n", + " dataset_trn=dataset_trn,\n", + " model_type=model_type,\n", + " )\n", + "\n", + " model_evaluator(\n", + " model=trained_model,\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " min_train_accuracy=min_train_accuracy,\n", + " min_test_accuracy=min_test_accuracy,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "88b70fd3", + "metadata": {}, + "source": [ + "The end goal of this quick baseline evaluation is to understand which of the two\n", + "models performs better. We'll use the `evaluator` step to compare the two\n", + "models. This step takes in the model from the trainer step, and computes its score\n", + "over the testing set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c64885ac", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a random forest model with the chosen datasets.\n", + "# We need to pass the ID's of the datasets into the function\n", + "training(\n", + " model_type=\"rf\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")\n", + "\n", + "rf_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4300c82f", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a SGD classifier\n", + "sgd_run = training(\n", + " model_type=\"sgd\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")\n", + "\n", + "sgd_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "markdown", + "id": "43f1a68a", + "metadata": {}, + "source": [ + "You can see from the logs already how our model training went: the\n", + "`RandomForestClassifier` performed considerably better than the `SGDClassifier`.\n", + "We can use the ZenML `Client` to verify this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d95810b1", + "metadata": {}, + "outputs": [], + "source": [ + "# The evaluator returns a float value with the accuracy\n", + "rf_run.steps[\"model_evaluator\"].output.load() > sgd_run.steps[\"model_evaluator\"].output.load()" + ] + }, + { + "cell_type": "markdown", + "id": "e256d145", + "metadata": {}, + "source": [ + "# 💯 Step 3: Associating a model with your pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "927978f3", + "metadata": {}, + "source": [ + "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n", + "all the models produced as you develop your experiments and use-cases. Luckily, ZenML offers a *Model Control Plane*,\n", + "which is a central register of all your ML models.\n", + "\n", + "You can easily create a ZenML `Model` and associate it with your pipelines using the `ModelVersion` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ca00c0", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {}\n", + "\n", + "# Lets add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model_version\"] = ModelVersion(\n", + " name=\"breast_cancer_classifier\",\n", + " license=\"Apache 2.0\",\n", + " description=\"A breast cancer classifier\",\n", + " tags=[\"breast_cancer\", \"classifier\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e78a520", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the SGD model and set the version name to \"sgd\"\n", + "pipeline_settings[\"model_version\"].version = \"sgd\"\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# We can now run this as usual\n", + "training_configured(\n", + " model_type=\"sgd\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8e0002", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the RF model and set the version name to \"rf\"\n", + "pipeline_settings[\"model_version\"].version = \"rf\"\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# Let's run it again to make sure we have two versions\n", + "training_configured(\n", + " model_type=\"rf\",\n", + " train_dataset_id=dataset_trn_artifact_version.id,\n", + " test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "09597223", + "metadata": {}, + "source": [ + "This time, running both pipelines has created two associated **model versions**.\n", + "You can list your ZenML model and their versions as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb25913", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_model = client.get_model(\"breast_cancer_classifier\")\n", + "print(zenml_model)\n", + "\n", + "print(f\"Model {zenml_model.name} has {len(zenml_model.versions)} versions\")\n", + "\n", + "zenml_model.versions[0].version, zenml_model.versions[1].version" + ] + }, + { + "cell_type": "markdown", + "id": "e82cfac2", + "metadata": {}, + "source": [ + "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n", + "pipelines to that model version, including the two pickle files that represent our\n", + "SGD and RandomForest classifier. We can see all artifacts directly from the model\n", + "version object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31211413", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load the RF version\n", + "rf_zenml_model_version = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n", + "\n", + "# We can now load our classifier directly as well\n", + "random_forest_classifier = rf_zenml_model_version.get_artifact(\"model\").load()\n", + "\n", + "random_forest_classifier" + ] + }, + { + "cell_type": "markdown", + "id": "53517a9a", + "metadata": {}, + "source": [ + "If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "eb645dde", + "metadata": {}, + "source": [ + "There is a lot more you can do with ZenML models, including the ability to\n", + "track metrics by adding metadata to it, or having them persist in a model\n", + "registry. However, these topics can be explored more in the\n", + "[ZenML docs](https://docs.zenml.io).\n", + "\n", + "For now, we will use the ZenML model control plane to promote our best\n", + "model to `production`. You can do this by simply setting the `stage` of\n", + "your chosen model version to the `production` tag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26b718f8", + "metadata": {}, + "outputs": [], + "source": [ + "# Set our best classifier to production\n", + "rf_zenml_model_version.set_stage(\"production\", force=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9fddf3d0", + "metadata": {}, + "source": [ + "Of course, normally one would only promote the model by comparing to all other model\n", + "versions and doing some other tests. But that's a bit more advanced use-case. See the\n", + "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n", + "more insight into that sort of flow!" + ] + }, + { + "cell_type": "markdown", + "id": "2ecbc8cf", + "metadata": {}, + "source": [ + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "8f1146db", + "metadata": {}, + "source": [ + "Once the model is promoted, we can now consume the right model version in our\n", + "batch inference pipeline directly. Let's see how that works." + ] + }, + { + "cell_type": "markdown", + "id": "d6306f14", + "metadata": {}, + "source": [ + "# 🫅 Step 4: Consuming the model in production" + ] + }, + { + "cell_type": "markdown", + "id": "b51f3108", + "metadata": {}, + "source": [ + "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n", + "with `live data`. The critical step here is the `inference_predict` step, where we load the model in memory\n", + "and generate predictions:\n", + "\n", + "\"Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92c4c7dc", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n", + " \"\"\"Predictions step\"\"\"\n", + " # Get the model_version\n", + " model_version = get_step_context().model_version\n", + "\n", + " # run prediction from memory\n", + " predictor = model_version.load_artifact(\"model\")\n", + " predictions = predictor.predict(dataset_inf)\n", + "\n", + " predictions = pd.Series(predictions, name=\"predicted\")\n", + "\n", + " return predictions\n" + ] + }, + { + "cell_type": "markdown", + "id": "3aeb227b", + "metadata": {}, + "source": [ + "Apart from the loading the model, we must also load the preprocessing pipeline that we ran in feature engineering,\n", + "so that we can do the exact steps that we did on training time, in inference time. Let's bring it all together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37c409bd", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def inference(preprocess_pipeline_id: UUID):\n", + " \"\"\"Model batch inference pipeline\"\"\"\n", + " # random_state = client.get_artifact_version(id=preprocess_pipeline_id).metadata[\"random_state\"].value\n", + " # target = client.get_artifact_version(id=preprocess_pipeline_id).run_metadata['target'].value\n", + " random_state = 42\n", + " target = \"target\"\n", + "\n", + " df_inference = data_loader(\n", + " random_state=random_state, is_inference=True\n", + " )\n", + " df_inference = inference_preprocessor(\n", + " dataset_inf=df_inference,\n", + " # We use the preprocess pipeline from the feature engineering pipeline\n", + " preprocess_pipeline=ExternalArtifact(id=preprocess_pipeline_id),\n", + " target=target,\n", + " )\n", + " inference_predict(\n", + " dataset_inf=df_inference,\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7afe7be", + "metadata": {}, + "source": [ + "The way to load the right model is to pass in the `production` stage into the `ModelVersion` config this time.\n", + "This will ensure to always load the production model, decoupled from all other pipelines:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bf5939", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {\"enable_cache\": False}\n", + "\n", + "# Lets add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model_version\"] = ModelVersion(\n", + " name=\"breast_cancer_classifier\",\n", + " version=\"production\", # We can pass in the stage name here!\n", + " license=\"Apache 2.0\",\n", + " description=\"A breast cancer classifier\",\n", + " tags=[\"breast_cancer\", \"classifier\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff3402f1", + "metadata": {}, + "outputs": [], + "source": [ + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "inference_configured = inference.with_options(**pipeline_settings)\n", + "\n", + "# Let's run it again to make sure we have two versions\n", + "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n", + "# in order to avoid training-serving skew\n", + "inference_configured(\n", + " preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2935d1fa", + "metadata": {}, + "source": [ + "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n", + "that were returned in the pipeline. This completes the MLOps loop of training to inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e191d019", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch production model\n", + "production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n", + "\n", + "# Get the predictions artifact\n", + "production_model_version.get_artifact(\"predictions\").load()" + ] + }, + { + "cell_type": "markdown", + "id": "b0a73cdf", + "metadata": {}, + "source": [ + "You can also see all predictions ever created as a complete history in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", + "metadata": {}, + "source": [ + "## Congratulations!\n", + "\n", + "You're a legit MLOps engineer now! You trained two models, evaluated them against\n", + "a test set, registered the best one with the ZenML model control plane,\n", + "and served some predictions. You also learned how to iterate on your models and\n", + "data by using some of the ZenML utility abstractions. You saw how to view your\n", + "artifacts and models via the client as well as the ZenML Dashboard.\n", + "\n", + "## Further exploration\n", + "\n", + "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n", + "about the capabilities of ZenML. For example, you might want to:\n", + "\n", + "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n", + "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n", + "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n", + "\n", + "## What next?\n", + "\n", + "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n", + "* If you want to quickly get started with ZenML, check out the [ZenML Cloud](https://zenml.io/cloud)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/quickstart/requirements.txt b/examples/quickstart/requirements.txt new file mode 100644 index 00000000000..060cab40b59 --- /dev/null +++ b/examples/quickstart/requirements.txt @@ -0,0 +1,4 @@ +zenml[server]>=0.50.0 +notebook +scikit-learn<1.3 +pyarrow diff --git a/examples/quickstart/run.py b/examples/quickstart/run.py index 0f93a1f7594..abc917e5576 100644 --- a/examples/quickstart/run.py +++ b/examples/quickstart/run.py @@ -1,19 +1,253 @@ -from pipelines.deploy_and_predict import deploy_and_predict -from pipelines.train_and_register_model import ( - train_and_register_model_pipeline, +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import Optional + +import click +import yaml +from pipelines import ( + feature_engineering, + inference, + training, ) from zenml.client import Client +from zenml.logger import get_logger -if __name__ == "__main__": - train_and_register_model_pipeline() - best_model_test_accuracy = ( - Client() - .get_pipeline("train_and_register_model_pipeline") - .last_successful_run.steps["best_model_selector"] - .outputs["best_model_test_acc"] - .load() +logger = get_logger(__name__) + + +@click.command( + help=""" +ZenML Starter project. + +Run the ZenML starter project with basic options. + +Examples: + + \b + # Run the feature engineering pipeline + python run.py --feature-pipeline + + \b + # Run the training pipeline + python run.py --training-pipeline + + \b + # Run the training pipeline with versioned artifacts + python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 + + \b + # Run the inference pipeline + python run.py --inference-pipeline + +""" +) +@click.option( + "--train-dataset-name", + default="dataset_trn", + type=click.STRING, + help="The name of the train dataset produced by feature engineering.", +) +@click.option( + "--train-dataset-version-name", + default=None, + type=click.STRING, + help="Version of the train dataset produced by feature engineering. " + "If not specified, a new version will be created.", +) +@click.option( + "--test-dataset-name", + default="dataset_tst", + type=click.STRING, + help="The name of the test dataset produced by feature engineering.", +) +@click.option( + "--test-dataset-version-name", + default=None, + type=click.STRING, + help="Version of the test dataset produced by feature engineering. " + "If not specified, a new version will be created.", +) +@click.option( + "--feature-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that creates the dataset.", +) +@click.option( + "--training-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that trains the model.", +) +@click.option( + "--inference-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that performs inference.", +) +@click.option( + "--no-cache", + is_flag=True, + default=False, + help="Disable caching for the pipeline run.", +) +def main( + train_dataset_name: str = "dataset_trn", + train_dataset_version_name: Optional[str] = None, + test_dataset_name: str = "dataset_tst", + test_dataset_version_name: Optional[str] = None, + feature_pipeline: bool = False, + training_pipeline: bool = False, + inference_pipeline: bool = False, + no_cache: bool = False, +): + """Main entry point for the pipeline execution. + + This entrypoint is where everything comes together: + + * configuring pipeline with the required parameters + (some of which may come from command line arguments, but most + of which comes from the YAML config files) + * launching the pipeline + + Args: + train_dataset_name: The name of the train dataset produced by feature engineering. + train_dataset_version_name: Version of the train dataset produced by feature engineering. + If not specified, a new version will be created. + test_dataset_name: The name of the test dataset produced by feature engineering. + test_dataset_version_name: Version of the test dataset produced by feature engineering. + If not specified, a new version will be created. + feature_pipeline: Whether to run the pipeline that creates the dataset. + training_pipeline: Whether to run the pipeline that trains the model. + inference_pipeline: Whether to run the pipeline that performs inference. + no_cache: If `True` cache will be disabled. + """ + client = Client() + + config_folder = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "configs", ) - if best_model_test_accuracy > 0.7: - deploy_and_predict() + # Execute Feature Engineering Pipeline + if feature_pipeline: + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, "feature_engineering.yaml" + ) + run_args_feature = {} + feature_engineering.with_options(**pipeline_args)(**run_args_feature) + logger.info("Feature Engineering pipeline finished successfully!\n") + + train_dataset_artifact = client.get_artifact_version( + train_dataset_name + ) + test_dataset_artifact = client.get_artifact_version(test_dataset_name) + logger.info( + "The latest feature engineering pipeline produced the following " + f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, " + f"Version Name: {train_dataset_artifact.version} \n2. Test Dataset: " + f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}" + ) + + # Execute Training Pipeline + if training_pipeline: + run_args_train = {} + + # If train_dataset_version_name is specified, use versioned artifacts + if train_dataset_version_name or test_dataset_version_name: + # However, both train and test dataset versions must be specified + assert ( + train_dataset_version_name is not None + and test_dataset_version_name is not None + ) + train_dataset_artifact_version = client.get_artifact_version( + train_dataset_name, train_dataset_version_name + ) + # If train dataset is specified, test dataset must be specified + test_dataset_artifact_version = client.get_artifact_version( + test_dataset_name, test_dataset_version_name + ) + # Use versioned artifacts + run_args_train[ + "train_dataset_id" + ] = train_dataset_artifact_version.id + run_args_train[ + "test_dataset_id" + ] = test_dataset_artifact_version.id + + # Run the SGD pipeline + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, "training_sgd.yaml" + ) + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with SGD finished successfully!\n\n") + + # Run the RF pipeline + pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, "training_rf.yaml" + ) + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with RF finished successfully!\n\n") + + if inference_pipeline: + run_args_inference = {} + pipeline_args = {"enable_cache": False} + pipeline_args["config_path"] = os.path.join( + config_folder, "inference.yaml" + ) + + # Configure the pipeline + inference_configured = inference.with_options(**pipeline_args) + + # Fetch the production model + with open(pipeline_args["config_path"], "r") as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + zenml_model = client.get_model_version( + config["model_version"]["name"], config["model_version"]["version"] + ) + preprocess_pipeline_artifact = zenml_model.get_artifact( + "preprocess_pipeline" + ) + + # Use the metadata of feature engineering pipeline artifact + # to get the random state and target column + random_state = preprocess_pipeline_artifact.run_metadata[ + "random_state" + ].value + target = preprocess_pipeline_artifact.run_metadata["target"].value + run_args_inference["random_state"] = random_state + run_args_inference["target"] = target + + # Run the pipeline + inference_configured(**run_args_inference) + logger.info("Inference pipeline finished successfully!") + + +if __name__ == "__main__": + main() diff --git a/examples/quickstart/setup.sh b/examples/quickstart/setup.sh deleted file mode 100644 index a3e454425b5..00000000000 --- a/examples/quickstart/setup.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -set -Eeo pipefail - -setup_stack () { - - zenml model-deployer register mlflow_deployer --flavor=mlflow || \ - msg "${WARNING}Reusing preexisting model deployer ${NOFORMAT}mlflow_deployer" - zenml experiment-tracker register mlflow_tracker --flavor=mlflow || \ - msg "${WARNING}Reusing preexisting experiment tracker ${NOFORMAT}mlflow_tracker" - zenml model-registry register mlflow_registry --flavor=mlflow || \ - msg "${WARNING}Reusing preexisting model registry ${NOFORMAT}mlflow_registry" - - zenml stack register quickstart_stack \ - -a default \ - -o default \ - -d mlflow_deployer \ - -r mlflow_registry \ - -e mlflow_tracker || \ - msg "${WARNING}Reusing preexisting stack ${NOFORMAT}quickstart" - - zenml stack set quickstart_stack -} - -pre_run () { - zenml integration install sklearn mlflow -} - -pre_run_forced () { - zenml integration install sklearn mlflow -y -} - -post_run () { - # cleanup the last local ZenML daemon started by the example - pkill -n -f zenml.services.local.local_daemon_entrypoint -} diff --git a/examples/quickstart/steps/__init__.py b/examples/quickstart/steps/__init__.py index e69de29bb2d..98124b2a40f 100644 --- a/examples/quickstart/steps/__init__.py +++ b/examples/quickstart/steps/__init__.py @@ -0,0 +1,41 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .data_loader import ( + data_loader, +) +from .data_preprocessor import ( + data_preprocessor, +) +from .data_splitter import ( + data_splitter, +) +from .inference_predict import ( + inference_predict, +) +from .inference_preprocessor import ( + inference_preprocessor, +) +from .model_evaluator import ( + model_evaluator, +) +from .model_promoter import ( + model_promoter, +) +from .model_trainer import ( + model_trainer, +) diff --git a/examples/quickstart/steps/best_model_selector.py b/examples/quickstart/steps/best_model_selector.py deleted file mode 100644 index 884a7ec3eb3..00000000000 --- a/examples/quickstart/steps/best_model_selector.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Tuple - -import pandas as pd -from sklearn.base import ClassifierMixin -from typing_extensions import Annotated - -from zenml import step - - -@step -def best_model_selector( - X_test: pd.DataFrame, - y_test: pd.Series, - model1: ClassifierMixin, - model2: ClassifierMixin, -) -> Tuple[ - Annotated[ClassifierMixin, "best_model"], - Annotated[float, "best_model_test_acc"], -]: - """Calculate the accuracy on the test set and return the best model and its accuracy.""" - test_acc1 = model1.score(X_test.to_numpy(), y_test.to_numpy()) - test_acc2 = model2.score(X_test.to_numpy(), y_test.to_numpy()) - print(f"Test accuracy ({model1.__class__.__name__}): {test_acc1}") - print(f"Test accuracy ({model2.__class__.__name__}): {test_acc2}") - if test_acc1 > test_acc2: - best_model = model1 - best_model_test_acc = test_acc1 - else: - best_model = model2 - best_model_test_acc = test_acc2 - return best_model, best_model_test_acc diff --git a/examples/quickstart/steps/data_loader.py b/examples/quickstart/steps/data_loader.py new file mode 100644 index 00000000000..baf59c6433f --- /dev/null +++ b/examples/quickstart/steps/data_loader.py @@ -0,0 +1,65 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from sklearn.datasets import load_breast_cancer +from typing_extensions import Annotated + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def data_loader( + random_state: int, is_inference: bool = False, target: str = "target" +) -> Annotated[pd.DataFrame, "dataset"]: + """Dataset reader step. + + This is an example of a dataset reader step that load Breast Cancer dataset. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured with number of rows and logic + to drop target column or not. See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + random_state: Random state for sampling + is_inference: If `True` subset will be returned and target column + will be removed from dataset. + target: Name of target columns in dataset. + + Returns: + The dataset artifact as Pandas DataFrame and name of target column. + """ + dataset = load_breast_cancer(as_frame=True) + inference_size = int(len(dataset.target) * 0.05) + dataset: pd.DataFrame = dataset.frame + inference_subset = dataset.sample( + inference_size, random_state=random_state + ) + if is_inference: + dataset = inference_subset + dataset.drop(columns=target, inplace=True) + else: + dataset.drop(inference_subset.index, inplace=True) + dataset.reset_index(drop=True, inplace=True) + logger.info(f"Dataset with {len(dataset)} records loaded!") + return dataset diff --git a/examples/quickstart/steps/data_preprocessor.py b/examples/quickstart/steps/data_preprocessor.py new file mode 100644 index 00000000000..2f4a11ccbaa --- /dev/null +++ b/examples/quickstart/steps/data_preprocessor.py @@ -0,0 +1,94 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple + +import pandas as pd +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import MinMaxScaler +from typing_extensions import Annotated +from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper + +from zenml import log_artifact_metadata, step + + +@step +def data_preprocessor( + random_state: int, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", +) -> Tuple[ + Annotated[pd.DataFrame, "dataset_trn"], + Annotated[pd.DataFrame, "dataset_tst"], + Annotated[Pipeline, "preprocess_pipeline"], +]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model training. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps like cleaning, + feature engineering, feature selection, etc. It then returns the processed + dataset as a step output artifact. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to drop NA values, drop some + columns and normalize numerical columns. See the documentation for more + information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + random_state: Random state for sampling. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + drop_na: If `True` all NA rows will be dropped. + normalize: If `True` all numeric fields will be normalized. + drop_columns: List of column names to drop. + target: Name of target column in dataset. + + Returns: + The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. + """ + # We use the sklearn pipeline to chain together multiple preprocessing steps + preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) + if drop_na: + preprocess_pipeline.steps.append(("drop_na", NADropper())) + if drop_columns: + # Drop columns + preprocess_pipeline.steps.append( + ("drop_columns", ColumnsDropper(drop_columns)) + ) + if normalize: + # Normalize the data + preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) + preprocess_pipeline.steps.append( + ("cast", DataFrameCaster(dataset_trn.columns)) + ) + dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) + dataset_tst = preprocess_pipeline.transform(dataset_tst) + + # Log metadata so we can load it in the inference pipeline + log_artifact_metadata( + artifact_name="preprocess_pipeline", + metadata={"random_state": random_state, "target": target}, + ) + return dataset_trn, dataset_tst, preprocess_pipeline diff --git a/examples/quickstart/steps/data_splitter.py b/examples/quickstart/steps/data_splitter.py new file mode 100644 index 00000000000..13ec7e856e1 --- /dev/null +++ b/examples/quickstart/steps/data_splitter.py @@ -0,0 +1,61 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Tuple + +import pandas as pd +from sklearn.model_selection import train_test_split +from typing_extensions import Annotated + +from zenml import step + + +@step +def data_splitter( + dataset: pd.DataFrame, test_size: float = 0.2 +) -> Tuple[ + Annotated[pd.DataFrame, "raw_dataset_trn"], + Annotated[pd.DataFrame, "raw_dataset_tst"], +]: + """Dataset splitter step. + + This is an example of a dataset splitter step that splits the data + into train and test set before passing it to ML model. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different test + set sizes. See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + dataset: Dataset read from source. + test_size: 0.0..1.0 defining portion of test set. + + Returns: + The split dataset: dataset_trn, dataset_tst. + """ + dataset_trn, dataset_tst = train_test_split( + dataset, + test_size=test_size, + random_state=42, + shuffle=True, + ) + dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) + dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) + return dataset_trn, dataset_tst diff --git a/examples/quickstart/steps/inference_predict.py b/examples/quickstart/steps/inference_predict.py new file mode 100644 index 00000000000..cd1d2921003 --- /dev/null +++ b/examples/quickstart/steps/inference_predict.py @@ -0,0 +1,57 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any + +import pandas as pd +from typing_extensions import Annotated + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def inference_predict( + model: Any, + dataset_inf: pd.DataFrame, +) -> Annotated[pd.Series, "predictions"]: + """Predictions step. + + This is an example of a predictions step that takes the data and model in + and returns predicted values. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different input data. + See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + model: Trained model. + dataset_inf: The inference dataset. + + Returns: + The predictions as pandas series + """ + # run prediction from memory + predictions = model.predict(dataset_inf) + + predictions = pd.Series(predictions, name="predicted") + return predictions diff --git a/examples/quickstart/steps/inference_preprocessor.py b/examples/quickstart/steps/inference_preprocessor.py new file mode 100644 index 00000000000..d484433e51e --- /dev/null +++ b/examples/quickstart/steps/inference_preprocessor.py @@ -0,0 +1,50 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from sklearn.pipeline import Pipeline +from typing_extensions import Annotated + +from zenml import step + + +@step +def inference_preprocessor( + dataset_inf: pd.DataFrame, + preprocess_pipeline: Pipeline, + target: str, +) -> Annotated[pd.DataFrame, "inference_dataset"]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model inference. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps based on pretrained + preprocessing pipeline. + + Args: + dataset_inf: The inference dataset. + preprocess_pipeline: Pretrained `Pipeline` to process dataset. + target: Name of target columns in dataset. + + Returns: + The processed dataframe: dataset_inf. + """ + # artificially adding `target` column to avoid Pipeline issues + dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) + dataset_inf = preprocess_pipeline.transform(dataset_inf) + dataset_inf.drop(columns=[target], inplace=True) + return dataset_inf diff --git a/examples/quickstart/steps/model_deployer.py b/examples/quickstart/steps/model_deployer.py deleted file mode 100644 index 7cde5a21334..00000000000 --- a/examples/quickstart/steps/model_deployer.py +++ /dev/null @@ -1,12 +0,0 @@ -from steps.register_model import model_name -from zenml.integrations.mlflow.steps.mlflow_deployer import ( - mlflow_model_registry_deployer_step, -) - -model_deployer = mlflow_model_registry_deployer_step.with_options( - parameters=dict( - registry_model_name=model_name, - registry_model_version=1, - timeout=300, - ) -) diff --git a/examples/quickstart/steps/model_evaluator.py b/examples/quickstart/steps/model_evaluator.py new file mode 100644 index 00000000000..579a78afe13 --- /dev/null +++ b/examples/quickstart/steps/model_evaluator.py @@ -0,0 +1,105 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin + +from zenml import log_artifact_metadata, step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_evaluator( + model: ClassifierMixin, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + min_train_accuracy: float = 0.0, + min_test_accuracy: float = 0.0, + target: Optional[str] = "target", +) -> float: + """Evaluate a trained model. + + This is an example of a model evaluation step that takes in a model artifact + previously trained by another step in your pipeline, and a training + and validation data set pair which it uses to evaluate the model's + performance. The model metrics are then returned as step output artifacts + (in this case, the model accuracy on the train and test set). + + The suggested step implementation also outputs some warnings if the model + performance does not meet some minimum criteria. This is just an example of + how you can use steps to monitor your model performance and alert you if + something goes wrong. As an alternative, you can raise an exception in the + step to force the pipeline run to fail early and all subsequent steps to + be skipped. + + This step is parameterized to configure the step independently of the step code, + before running it in a pipeline. In this example, the step can be configured + to use different values for the acceptable model performance thresholds and + to control whether the pipeline run should fail if the model performance + does not meet the minimum criteria. See the documentation for more + information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + model: The pre-trained model artifact. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + min_train_accuracy: Minimal acceptable training accuracy value. + min_test_accuracy: Minimal acceptable testing accuracy value. + target: Name of target column in dataset. + + Returns: + The model accuracy on the test set. + """ + # Calculate the model accuracy on the train and test set + trn_acc = model.score( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + tst_acc = model.score( + dataset_tst.drop(columns=[target]), + dataset_tst[target], + ) + logger.info(f"Train accuracy={trn_acc*100:.2f}%") + logger.info(f"Test accuracy={tst_acc*100:.2f}%") + + messages = [] + if trn_acc < min_train_accuracy: + messages.append( + f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" + ) + if tst_acc < min_test_accuracy: + messages.append( + f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !" + ) + else: + for message in messages: + logger.warning(message) + + log_artifact_metadata( + metadata={ + "train_accuracy": float(trn_acc), + "test_accuracy": float(tst_acc), + }, + artifact_name="model", + ) + return float(tst_acc) diff --git a/examples/quickstart/steps/model_promoter.py b/examples/quickstart/steps/model_promoter.py new file mode 100644 index 00000000000..c0ad76f7e64 --- /dev/null +++ b/examples/quickstart/steps/model_promoter.py @@ -0,0 +1,76 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from zenml import get_step_context, step +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_promoter(accuracy: float, stage: str = "production") -> bool: + """Model promoter step. + + This is an example of a step that conditionally promotes a model. It takes + in the accuracy of the model and the stage to promote the model to. If the + accuracy is below 80%, the model is not promoted. If it is above 80%, the + model is promoted to the stage indicated in the parameters. If there is + already a model in the indicated stage, the model with the higher accuracy + is promoted. + + Args: + accuracy: Accuracy of the model. + stage: Which stage to promote the model to. + + Returns: + Whether the model was promoted or not. + """ + is_promoted = False + + if accuracy < 0.8: + logger.info( + f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model." + ) + else: + logger.info(f"Model promoted to {stage}!") + is_promoted = True + + # Get the model in the current context + current_model_version = get_step_context().model_version + + # Get the model that is in the production stage + client = Client() + try: + stage_model_version = client.get_model_version( + current_model_version.name, stage + ) + # We compare their metrics + prod_accuracy = ( + stage_model_version.get_artifact("model") + .run_metadata["test_accuracy"] + .value + ) + if float(accuracy) > float(prod_accuracy): + # If current model has better metrics, we promote it + is_promoted = True + current_model_version.set_stage(stage, force=True) + except KeyError: + # If no such model exists, current one is promoted + is_promoted = True + current_model_version.set_stage(stage, force=True) + return is_promoted diff --git a/examples/quickstart/steps/model_trainer.py b/examples/quickstart/steps/model_trainer.py new file mode 100644 index 00000000000..a9a1c2891c5 --- /dev/null +++ b/examples/quickstart/steps/model_trainer.py @@ -0,0 +1,72 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import SGDClassifier +from typing_extensions import Annotated + +from zenml import ArtifactConfig, step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_trainer( + dataset_trn: pd.DataFrame, + model_type: str = "sgd", + target: Optional[str] = "target", +) -> Annotated[ + ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True) +]: + """Configure and train a model on the training dataset. + + This is an example of a model training step that takes in a dataset artifact + previously loaded and pre-processed by other steps in your pipeline, then + configures and trains a model on it. The model is then returned as a step + output artifact. + + Args: + dataset_trn: The preprocessed train dataset. + model_type: The type of model to train. + target: The name of the target column in the dataset. + + Returns: + The trained model artifact. + + Raises: + ValueError: If the model type is not supported. + """ + # Initialize the model with the hyperparameters indicated in the step + # parameters and train it on the training set. + if model_type == "sgd": + model = SGDClassifier() + elif model_type == "rf": + model = RandomForestClassifier() + else: + raise ValueError(f"Unknown model type {model_type}") + logger.info(f"Training model {model}...") + + model.fit( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + return model diff --git a/examples/quickstart/steps/prediction_service_loader.py b/examples/quickstart/steps/prediction_service_loader.py deleted file mode 100644 index 7bf8f10fae9..00000000000 --- a/examples/quickstart/steps/prediction_service_loader.py +++ /dev/null @@ -1,14 +0,0 @@ -from zenml import step -from zenml.client import Client -from zenml.services import BaseService - - -@step(enable_cache=False) -def prediction_service_loader() -> BaseService: - """Load the model service of our train_and_register_model_pipeline.""" - client = Client() - model_deployer = client.active_stack.model_deployer - services = model_deployer.find_model_server( - pipeline_name="train_and_register_model_pipeline", - ) - return services[0] diff --git a/examples/quickstart/steps/predictor.py b/examples/quickstart/steps/predictor.py deleted file mode 100644 index 54b0e569fe0..00000000000 --- a/examples/quickstart/steps/predictor.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -from typing_extensions import Annotated - -from zenml import step -from zenml.services import BaseService - - -@step -def predictor( - service: BaseService, - data: pd.DataFrame, -) -> Annotated[list, "predictions"]: - """Run a inference request against a prediction service.""" - service.start(timeout=100) - print( - f"Running predictions on data (single individual): {data.to_numpy()[0]}" - ) - prediction = service.predict(data.to_numpy()) - print( - f"Prediction (for single example slice) is: {bool(prediction.tolist()[0])}" - ) - return prediction.tolist() diff --git a/examples/quickstart/steps/register_model.py b/examples/quickstart/steps/register_model.py deleted file mode 100644 index 9dfb05e8c35..00000000000 --- a/examples/quickstart/steps/register_model.py +++ /dev/null @@ -1,12 +0,0 @@ -from zenml.integrations.mlflow.steps.mlflow_registry import ( - mlflow_register_model_step, -) - -model_name = "zenml-quickstart-model" - -register_model = mlflow_register_model_step.with_options( - parameters=dict( - name=model_name, - description="The first run of the Quickstart pipeline.", - ) -) diff --git a/examples/quickstart/steps/trainers.py b/examples/quickstart/steps/trainers.py deleted file mode 100644 index b01f9943622..00000000000 --- a/examples/quickstart/steps/trainers.py +++ /dev/null @@ -1,38 +0,0 @@ -import mlflow -import pandas as pd -from sklearn.base import ClassifierMixin -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import SGDClassifier - -from zenml import step -from zenml.client import Client - -experiment_tracker = Client().active_stack.experiment_tracker - - -@step(experiment_tracker=experiment_tracker.name) -def random_forest_trainer_mlflow( - X_train: pd.DataFrame, - y_train: pd.Series, -) -> ClassifierMixin: - """Train a sklearn Random Forest classifier and log to MLflow.""" - mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow - model = RandomForestClassifier() - model.fit(X_train.to_numpy(), y_train.to_numpy()) - train_acc = model.score(X_train.to_numpy(), y_train.to_numpy()) - print(f"Train accuracy: {train_acc}") - return model - - -@step(experiment_tracker=experiment_tracker.name) -def sgd_trainer_mlflow( - X_train: pd.DataFrame, - y_train: pd.Series, -) -> ClassifierMixin: - """Train a SGD classifier and log to MLflow.""" - mlflow.sklearn.autolog() # log all model hyperparams and metrics to MLflow - model = SGDClassifier() - model.fit(X_train.to_numpy(), y_train.to_numpy()) - train_acc = model.score(X_train.to_numpy(), y_train.to_numpy()) - print(f"Train accuracy: {train_acc}") - return model diff --git a/examples/quickstart/steps/training_data_loader.py b/examples/quickstart/steps/training_data_loader.py deleted file mode 100644 index 4775711da21..00000000000 --- a/examples/quickstart/steps/training_data_loader.py +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Tuple - -import pandas as pd -from sklearn.model_selection import train_test_split - -from zenml import step - - -@step -def training_data_loader() -> ( - Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series] -): - """Load the Census Income dataset as tuple of Pandas DataFrame / Series.""" - # Load the dataset - url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" - column_names = [ - "age", - "workclass", - "fnlwgt", - "education", - "education-num", - "marital-status", - "occupation", - "relationship", - "race", - "sex", - "capital-gain", - "capital-loss", - "hours-per-week", - "native-country", - "income", - ] - data = pd.read_csv( - url, names=column_names, na_values="?", skipinitialspace=True - ) - - # Drop rows with missing values - data = data.dropna() - - # Encode categorical features and drop original columns - categorical_cols = [ - "workclass", - "education", - "marital-status", - "occupation", - "relationship", - "race", - "sex", - "native-country", - ] - data = pd.get_dummies(data, columns=categorical_cols, drop_first=True) - - # Encode target feature - data["income"] = data["income"].apply( - lambda x: 1 if x.strip() == ">50K" else 0 - ) - - # Separate features and target - X = data.drop("income", axis=1) - y = data["income"] - - # Split the dataset into train and test sets - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - return (X_train, X_test, y_train, y_test) diff --git a/examples/quickstart/utils/__init__.py b/examples/quickstart/utils/__init__.py new file mode 100644 index 00000000000..d7a9d7dc194 --- /dev/null +++ b/examples/quickstart/utils/__init__.py @@ -0,0 +1,16 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/examples/quickstart/utils/preprocess.py b/examples/quickstart/utils/preprocess.py new file mode 100644 index 00000000000..cf6555f2929 --- /dev/null +++ b/examples/quickstart/utils/preprocess.py @@ -0,0 +1,56 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Union + +import pandas as pd + + +class NADropper: + """Support class to drop NA values in sklearn Pipeline.""" + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.dropna() + + +class ColumnsDropper: + """Support class to drop specific columns in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.drop(columns=self.columns) + + +class DataFrameCaster: + """Support class to cast type back to pd.DataFrame in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X): + return pd.DataFrame(X, columns=self.columns) diff --git a/src/zenml/cli/base.py b/src/zenml/cli/base.py index 30548253ca5..07e9cbf4ee7 100644 --- a/src/zenml/cli/base.py +++ b/src/zenml/cli/base.py @@ -77,11 +77,11 @@ def copier_github_url(self) -> str: ), starter=ZenMLProjectTemplateLocation( github_url="zenml-io/template-starter", - github_tag="0.45.0", + github_tag="2023.12.18", # Make sure it is aligned with .github/workflows/update-templates-to-examples.yml ), nlp=ZenMLProjectTemplateLocation( github_url="zenml-io/template-nlp", - github_tag="0.45.0", + github_tag="0.45.0", # Make sure it is aligned with .github/workflows/update-templates-to-examples.yml ), ) diff --git a/tests/harness/cfg/tests.yaml b/tests/harness/cfg/tests.yaml index 3dd057c7a68..f9a3014cd67 100644 --- a/tests/harness/cfg/tests.yaml +++ b/tests/harness/cfg/tests.yaml @@ -153,24 +153,6 @@ tests: - capabilities: synchronized: true - - module: tests.integration.examples.test_quickstart - requirements: - - integrations: - - sklearn - - mlflow - - evidently - stacks: - - type: experiment_tracker - flavor: mlflow - - type: model_deployer - flavor: mlflow - - type: model_registry - flavor: mlflow - - type: data_validator - flavor: evidently - - capabilities: - synchronized: true - - module: tests.integration.examples.test_huggingface requirements: - integrations: diff --git a/tests/integration/examples/test_quickstart.py b/tests/integration/examples/test_quickstart.py deleted file mode 100644 index 07a0838b367..00000000000 --- a/tests/integration/examples/test_quickstart.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) ZenML GmbH 2023. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. - - -import pytest - -from tests.integration.examples.utils import run_example -from zenml.client import Client -from zenml.integrations.mlflow.model_registries.mlflow_model_registry import ( - MLFlowModelRegistry, -) - - -def test_example(request: pytest.FixtureRequest) -> None: - """Runs the quickstart example.""" - - with run_example( - request=request, - name="quickstart", - pipelines={ - "train_and_register_model_pipeline": (1, 5), - "deploy_and_predict": (1, 4), - }, - is_public_example=True, - ): - # activate the stack set up and used by the example - client = Client() - model_registry = client.active_stack.model_registry - assert isinstance(model_registry, MLFlowModelRegistry) - - # fetch the MLflow registered model - registered_model = model_registry.get_model( - name="zenml-quickstart-model", - ) - assert registered_model is not None - - # fetch the MLflow registered model version - registered_model_version = model_registry.get_model_version( - name="zenml-quickstart-model", - version=1, - ) - assert registered_model_version is not None - - # Check that the deployment service is running - from zenml.integrations.mlflow.services import MLFlowDeploymentService - - training_run = client.get_pipeline("deploy_and_predict").last_run - - service = training_run.steps[ - "mlflow_model_registry_deployer_step" - ].output.load() - assert isinstance(service, MLFlowDeploymentService) - - if service.is_running: - service.stop(timeout=180) diff --git a/tests/integration/functional/cli/test_base.py b/tests/integration/functional/cli/test_base.py index 6e615d6ef3a..7217a227712 100644 --- a/tests/integration/functional/cli/test_base.py +++ b/tests/integration/functional/cli/test_base.py @@ -56,11 +56,11 @@ def test_init_creates_from_templates( ], ) assert (tmp_path / REPOSITORY_DIRECTORY_NAME).exists() - files_in_top_level = set(os.listdir(str(tmp_path))) + files_in_top_level = set([f.lower() for f in os.listdir(str(tmp_path))]) must_have_files = { ".copier-answers.yml", ".dockerignore", - "LICENSE", + "license", "run.py", } assert not must_have_files - files_in_top_level