Skip to content

Commit

Permalink
Create ensembler web service (#165)
Browse files Browse the repository at this point in the history
* Add skeleton class for pyfunc ensembler

* Refactor ensembler class in pyfunc to accept both batch and live requests

* Add supporting classes for live pyfunc ensembler

* Add preprocessing methods for live ensembler

* Update PyFunc ensembler in SDK to utilise returned treatment_config

* Modify predict method in SDK PyFunc to allow backward compatibility with current batch ensemblers

* Set output from prediction to be a list-like object

* Remove redundant header names for features in PyFunc

* Rename PyFuncEnsembler to PyFuncEnsemblerRunner to remove overloaded classname

* Rename references to renamed PyFuncEnsemblerRunner

* Add docstrings to various methods

* Add README template

* Add base files for containerisation

* Make container use a multi-stage build that uses a venv derived from a conda env

* Rename preprocess method to make it appear private

* Add gitignore file

* Add test for preprocessing method for pyfunc_ensembler_runner

* Cleanup some testing configurations

* Rename test sample data to improve consistency in naming

* Remove test request

* Add additional tests for web service

* Add files for containerisation

* Rename live-ensembler to real-time-ensembler

* Add github workflow for real-time-ensembler

* Edit typo in workflow

* Edit typo in readme file

* Add changes missed out by rebasing

* Edit typo in exception message

* Separate dockerfiles into a base and app file

* Edit typo in dockerfile

* Rename real-time ensembler module and mentions to pyfunc-ensembler-service

* Rename batch-ensembler module and mentions with pyfunc-ensembler-job

* Rename remnants of ensemblers with old naming convention

* Add new pyfunc-ensembler-service engine to Turing CI

* Replace vanilla debian image with its slim version

* Clean up dockerfiles to utilise env variables

* Replace redundant run.sh script by running webservice from dockerfile

* Remove redundant entries in .gitignore

* Rename batch ensembler to pyfunc-ensembler-job

* Revamp pyfunc implementation to avoid dataframe manipulations for real-time ensembling

* Remove redundant imports

* Replace incorrect env variables in dockerfiles

* Refactor pyfunc predict method to use helper methods dependent on input type

* Rewrite help tags for arg parser
  • Loading branch information
deadlycoconuts authored Feb 16, 2022
1 parent 51d407e commit e108820
Show file tree
Hide file tree
Showing 55 changed files with 763 additions and 34 deletions.
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
name: engines/batch-ensembler
name: engines/pyfunc-ensembler-job

on:
# Automatically run CI on Release and Pre-Release tags and main branch
# (only if there are changes to relevant paths)
push:
tags:
- "batch-ensembler/v[0-9]+.[0-9]+.[0-9]+*"
- "pyfunc-ensembler-job/v[0-9]+.[0-9]+.[0-9]+*"
branches:
- main
paths:
- ".github/workflows/batch-ensembler.yaml"
- "engines/batch-ensembler/**"
- ".github/workflows/pyfunc-ensembler-job.yaml"
- "engines/pyfunc-ensembler-job/**"
- "sdk/**"

# Automatically run CI on branches, that have active PR opened
pull_request:
branches:
- main
paths:
- ".github/workflows/batch-ensembler.yaml"
- "engines/batch-ensembler/**"
- ".github/workflows/pyfunc-ensembler-job.yaml"
- "engines/pyfunc-ensembler-job/**"
- "sdk/**"

# To make it possible to trigger e2e CI workflow for any arbitrary git ref
Expand Down Expand Up @@ -50,13 +50,13 @@ jobs:
- name: Cache Conda environment
uses: actions/cache@v2
with:
path: engines/batch-ensembler/env
path: engines/pyfunc-ensembler-job/env
key: |
conda-${{ hashFiles('engines/batch-ensembler/environment.yaml') }}-${{ hashFiles('engines/batch-ensembler/requirements.txt') }}-${{ hashFiles('engines/batch-ensembler/requirements.dev.txt') }}
conda-${{ hashFiles('engines/pyfunc-ensembler-job/environment.yaml') }}-${{ hashFiles('engines/pyfunc-ensembler-job/requirements.txt') }}-${{ hashFiles('engines/pyfunc-ensembler-job/requirements.dev.txt') }}
restore-keys: conda-

- name: Run Tests
working-directory: engines/batch-ensembler
working-directory: engines/pyfunc-ensembler-job
run: |
make setup
make test
Expand All @@ -70,7 +70,7 @@ jobs:
- id: release-rules
uses: ./.github/actions/release-rules
with:
prefix: batch-ensembler/
prefix: pyfunc-ensembler-job/

publish:
# Automatically publish release and pre-release artifacts.
Expand Down Expand Up @@ -103,13 +103,13 @@ jobs:

- name: Build Docker Image
id: build
working-directory: engines/batch-ensembler
working-directory: engines/pyfunc-ensembler-job
env:
DOCKER_REGISTRY: ghcr.io/${{ github.repository }}
run: |
set -o pipefail
make build-image | tee output.log
echo "::set-output name=ensembler-image::$(sed -n 's%Building docker image: \(.*\)%\1%p' output.log)"
echo "::set-output name=pyfunc-ensembler-job::$(sed -n 's%Building docker image: \(.*\)%\1%p' output.log)"
- name: Publish Batch Ensembler Docker Image
run: docker push ${{ steps.build.outputs.ensembler-image }}
- name: Publish Pyfunc Ensembler Job Docker Image
run: docker push ${{ steps.build.outputs.pyfunc-ensembler-job }}
109 changes: 109 additions & 0 deletions .github/workflows/pyfunc-ensembler-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# CI workflow for the pyfunc-ensembler-service engine:
#   test          -> runs `make setup` and `make test` in the engine directory
#   release-rules -> resolves the release type from the git ref
#   publish       -> builds the Docker image and pushes it to ghcr.io
name: engines/pyfunc-ensembler-service

on:
  # Automatically run CI on Release and Pre-Release tags and main branch
  # (only if there are changes to relevant paths)
  push:
    tags:
      - "pyfunc-ensembler-service/v[0-9]+.[0-9]+.[0-9]+*"
    branches:
      - main
    paths:
      - ".github/workflows/pyfunc-ensembler-service.yaml"
      - "engines/pyfunc-ensembler-service/**"
      - "sdk/**"

  # Automatically run CI on branches, that have active PR opened
  pull_request:
    branches:
      - main
    paths:
      - ".github/workflows/pyfunc-ensembler-service.yaml"
      - "engines/pyfunc-ensembler-service/**"
      - "sdk/**"

  # To make it possible to trigger e2e CI workflow for any arbitrary git ref
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Setup Conda
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true

      # NOTE(review): the Makefile in engines/pyfunc-ensembler-service activates
      # a *named* conda env rather than one under ./env -- confirm that this
      # path is actually populated, otherwise the cache stores nothing.
      - name: Cache Conda environment
        uses: actions/cache@v2
        with:
          path: engines/pyfunc-ensembler-service/env
          key: |
            conda-${{ hashFiles('engines/pyfunc-ensembler-service/environment.yaml') }}-${{ hashFiles('engines/pyfunc-ensembler-service/requirements.txt') }}-${{ hashFiles('engines/pyfunc-ensembler-service/requirements.dev.txt') }}
          restore-keys: conda-

      - name: Run Tests
        working-directory: engines/pyfunc-ensembler-service
        run: |
          make setup
          make test

  release-rules:
    runs-on: ubuntu-latest
    outputs:
      release-type: ${{ steps.release-rules.outputs.release-type }}
    steps:
      - uses: actions/checkout@v2
      - id: release-rules
        uses: ./.github/actions/release-rules
        with:
          prefix: pyfunc-ensembler-service/

  publish:
    # Automatically publish release and pre-release artifacts.
    #
    # As for dev releases, make it possible to publish artifacts
    # manually by approving 'deployment' in the 'manual' environment.
    #
    # Dev build can be released either from the 'main' branch or
    # by running this workflow manually with `workflow_dispatch` event.
    if: >-
      contains('release,pre-release', needs.release-rules.outputs.release-type)
      || ( github.event_name != 'pull_request' )
      || ( github.event.pull_request.head.repo.full_name == github.repository )
    environment: ${{ needs.release-rules.outputs.release-type == 'dev' && 'manual' || '' }}
    runs-on: ubuntu-latest
    needs:
      - release-rules
      - test
    steps:
      - uses: actions/checkout@v2
        with:
          # Full history (all tags/commits) is fetched for the version step
          # that `make build-image` runs first.
          fetch-depth: 0

      - name: Log in to the Container registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build Docker Image
        id: build
        working-directory: engines/pyfunc-ensembler-service
        env:
          DOCKER_REGISTRY: ghcr.io/${{ github.repository }}
        run: |
          # pipefail: a failing `make` must fail the step even though its
          # output is piped through `tee`.
          set -o pipefail
          make build-image | tee output.log
          # The Makefile prints "Building docker image: <tag>"; capture the tag
          # and expose it as a step output. `::set-output` is deprecated, so the
          # value is appended to $GITHUB_OUTPUT instead.
          echo "pyfunc-ensembler-service-image=$(sed -n 's%Building docker image: \(.*\)%\1%p' output.log)" >> "$GITHUB_OUTPUT"

      - name: Publish Pyfunc Ensembler Service Docker Image
        run: docker push ${{ steps.build.outputs.pyfunc-ensembler-service-image }}
12 changes: 8 additions & 4 deletions .github/workflows/turing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ on:
- main
paths-ignore:
- "docs/**"
- "engines/batch-ensembler/**"
- "engines/pyfunc-ensembler-job/**"
- "engines/pyfunc-ensembler-service/**"
- "sdk/**"
- ".github/workflows/batch-ensembler.yaml"
- ".github/workflows/pyfunc-ensembler-job.yaml"
- ".github/workflows/pyfunc-ensembler-service.yaml"
- ".github/workflows/sdk.yaml"
- ".github/workflows/helm-chart.yaml"
- ".github/workflows/cluster-init.yaml"
Expand All @@ -23,9 +25,11 @@ on:
- main
paths-ignore:
- "docs/**"
- "engines/batch-ensembler/**"
- "engines/pyfunc-ensembler-job/**"
- "engines/pyfunc-ensembler-service/**"
- "sdk/**"
- ".github/workflows/batch-ensembler.yaml"
- ".github/workflows/pyfunc-ensembler-job.yaml"
- ".github/workflows/pyfunc-ensembler-service.yaml"
- ".github/workflows/sdk.yaml"
- ".github/workflows/helm-chart.yaml"

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,10 @@ BatchEnsemblingConfig:
BuildNamespace: default
BuildTimeoutDuration: 20m
DestinationRegistry: ghcr.io
BaseImageRef: ghcr.io/gojek/turing/batch-ensembler:latest
BaseImageRef: ghcr.io/gojek/turing/pyfunc-ensembler-job:latest
KanikoConfig:
BuildContextURI: git://github.com/gojek/turing.git#refs/heads/main
DockerfileFilePath: engines/batch-ensembler/app.Dockerfile
DockerfileFilePath: engines/pyfunc-ensembler-job/app.Dockerfile
Image: gcr.io/kaniko-project/executor
ImageVersion: v1.6.0
ResourceRequestsLimits:
Expand Down
4 changes: 2 additions & 2 deletions api/config-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ BatchEnsemblingConfig:
BuildNamespace: default
BuildTimeoutDuration: 20m
DestinationRegistry: ghcr.io
BaseImageRef: ghcr.io/gojek/turing/batch-ensembler:latest
BaseImageRef: ghcr.io/gojek/turing/pyfunc-ensembler-job:latest
KanikoConfig:
BuildContextURI: git://github.com/gojek/turing.git#refs/heads/main
DockerfileFilePath: engines/batch-ensembler/app.Dockerfile
DockerfileFilePath: engines/pyfunc-ensembler-job/app.Dockerfile
Image: gcr.io/kaniko-project/executor
ImageVersion: v1.6.0
ResourceRequestsLimits:
Expand Down
4 changes: 2 additions & 2 deletions api/turing/config/example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ BatchEnsemblingConfig:
BuildNamespace: default
BuildTimeoutDuration: 20m
DestinationRegistry: ghcr.io
BaseImageRef: ghcr.io/gojek/turing/batch-ensembler:latest
BaseImageRef: ghcr.io/gojek/turing/pyfunc-ensembler-job:latest
KanikoConfig:
BuildContextURI: git://github.com/gojek/turing.git#refs/heads/main
DockerfileFilePath: engines/batch-ensembler/app.Dockerfile
DockerfileFilePath: engines/pyfunc-ensembler-job/app.Dockerfile
Image: gcr.io/kaniko-project/executor
ImageVersion: v1.6.0
ResourceRequestsLimits:
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SHELL := /bin/bash

APP_NAME := batch-ensembler
APP_NAME := pyfunc-ensembler-job
CONDA_ENV_NAME ?= $(APP_NAME)
ACTIVATE_ENV = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ./env/$(CONDA_ENV_NAME)

Expand Down Expand Up @@ -39,4 +39,4 @@ build-image: version
.PHONY: version
version:
$(eval VERSION=$(if $(OVERWRITE_VERSION),$(OVERWRITE_VERSION),v$(shell ../../scripts/vertagen/vertagen.sh -p ${APP_NAME}/)))
@echo "turing-batch-ensembler version:" $(VERSION)
@echo "turing-pyfunc-ensembler-job version:" $(VERSION)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: batch-ensembler
name: pyfunc-ensembler-job
dependencies:
- python=3.8
- pip=21.0.1
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
]

setuptools.setup(
name='batch-ensembler',
name='pyfunc-ensembler-job',
packages=setuptools.find_packages(),
install_requires=requirements,
dev_requirements=dev_requirements,
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
8 changes: 8 additions & 0 deletions engines/pyfunc-ensembler-service/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Paths excluded from the Docker build context of pyfunc-ensembler-service.

# Repository/tooling metadata that the image does not need
.gitignore
.dockerignore

# Local environment directory and the test suite
env/
tests/

# Caches written by mypy and pytest
.mypy_cache/
.pytest_cache/
6 changes: 6 additions & 0 deletions engines/pyfunc-ensembler-service/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Local environment directory
env/
# Coverage data file (tests are run with --cov)
.coverage
# presumably MLflow's local run-tracking output -- confirm
**/mlruns/
# Python bytecode caches
**/__pycache__

# Ensembler artifacts downloaded locally before building the image (see README)
ensembler/*
16 changes: 16 additions & 0 deletions engines/pyfunc-ensembler-service/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Builder image for the pyfunc ensembler service: installs the Google Cloud SDK,
# creates the service's conda environment, layers the ensembler's own
# dependencies (conda.yaml) on top of it, and installs conda-pack.
FROM continuumio/miniconda3 AS builder

# Build-time parameters. They must be declared with ARG: an undeclared
# ${VAR} in an ENV instruction expands to the empty string, which would make
# `conda env update --name ${CONDA_ENV_NAME}` run without an environment name.
# The defaults mirror APP_NAME / CONDA_ENV_NAME in this directory's Makefile.
ARG APP_NAME=pyfunc-ensembler-service
ARG CONDA_ENV_NAME=pyfunc-ensembler-service

RUN wget -qO- https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-367.0.0-linux-x86_64.tar.gz | tar xzf -
ENV PATH=$PATH:/google-cloud-sdk/bin
# Persist the build args as environment variables inside the image.
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ENV APP_NAME=${APP_NAME}

COPY . .
# `make build-image` stages the SDK under ./temp-deps before the build; copy it
# to the relative location ../../sdk.
COPY ./temp-deps/sdk ./../../sdk

# Create the environment under the configured name (so the update below always
# targets the environment that was just created), then add the ensembler's
# dependencies and drop the download cache.
RUN conda env create --name ${CONDA_ENV_NAME} -f ./environment.yaml && \
    conda env update --name ${CONDA_ENV_NAME} --file /ensembler/conda.yaml && \
    rm -rf /root/.cache

# Install conda-pack (--yes: never wait for a confirmation prompt during a build):
RUN conda install --yes -c conda-forge conda-pack
33 changes: 33 additions & 0 deletions engines/pyfunc-ensembler-service/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Makefile for the pyfunc-ensembler-service engine.
#
# Targets:
#   setup        create/update the conda environment and install dev requirements
#   test         run the pytest suite with coverage
#   build-image  build the service Docker image (prints "Building docker image: <tag>")
#   version      compute and print the version used to tag the image
SHELL := /bin/bash

APP_NAME := pyfunc-ensembler-service
CONDA_ENV_NAME ?= $(APP_NAME)
# Recursive (=) on purpose: `conda info --base` is evaluated by the shell only
# when a recipe that uses the environment actually runs.
ACTIVATE_ENV = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate $(CONDA_ENV_NAME)

# Recursive (=) on purpose: VERSION is only set once the `version` target has
# run, which happens before any recipe that expands IMAGE_TAG.
IMAGE_TAG = $(if $(DOCKER_REGISTRY),$(DOCKER_REGISTRY)/,)$(APP_NAME):$(VERSION)

# The environment target names a conda environment, not a file, so it is
# declared phony alongside `setup`; a stray file or directory with the same
# name must not make `make setup` a silent no-op.
.PHONY: setup $(CONDA_ENV_NAME)
setup: $(CONDA_ENV_NAME)
$(CONDA_ENV_NAME):
	@# --name keeps the created environment in sync with the one ACTIVATE_ENV
	@# activates, including when CONDA_ENV_NAME is overridden.
	@conda env update --name $(CONDA_ENV_NAME) -f environment.yaml --prune
	$(ACTIVATE_ENV) && pip install -r requirements.dev.txt

.PHONY: test
test:
	@$(ACTIVATE_ENV) && \
		python -m pytest \
			--cov=pyfunc_ensembler_runner \
			--cov-report term-missing \
			-W ignore

# The SDK lives outside this directory (and therefore outside the Docker build
# context), so it is staged under temp-deps for the duration of the build.
# Everything runs in one shell so that the EXIT trap removes temp-deps whether
# the build succeeds or fails; the recipe's exit status is that of the first
# failing command (or of `docker build`).
.PHONY: build-image
build-image: version
	@trap 'rm -rf temp-deps' EXIT; \
		mkdir -p temp-deps && \
		cp -r ../../sdk temp-deps/ && \
		echo "Building docker image: $(IMAGE_TAG)" && \
		docker build . --tag $(IMAGE_TAG)

# VERSION is OVERWRITE_VERSION when provided, otherwise "v" followed by the
# output of vertagen.sh for this app's prefix. It is set via $(eval) inside
# the recipe so the script only runs when this target is actually built.
.PHONY: version
version:
	$(eval VERSION=$(if $(OVERWRITE_VERSION),$(OVERWRITE_VERSION),v$(shell ../../scripts/vertagen/vertagen.sh -p ${APP_NAME}/)))
	@echo "turing-pyfunc-ensembler-service version:" $(VERSION)
27 changes: 27 additions & 0 deletions engines/pyfunc-ensembler-service/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# PyFuncEnsembler Server for Real-Time Experiments

PyFuncEnsemblerRunner is a tool for deploying user-defined ensemblers (for use with Turing routers), written in
MLflow's `pyfunc` flavour.

## Usage
To run the ensembler as a webservice:
```bash
python -m pyfunc_ensembler_runner --mlflow_ensembler_dir $ENSEMBLER_DIR [-l {DEBUG,INFO,WARNING,ERROR,CRITICAL}]

arguments:
--mlflow_ensembler_dir <path/to/ensembler/dir/> Path to the ensembler folder containing the mlflow files
--log-level <DEBUG||INFO||WARNING||ERROR||CRITICAL> Set the logging level
-h, --help Show this help message and exit
```

## Docker Image Building

To build a Docker image locally, first download the ensembler artifacts from MLflow's model registry into this directory:
```bash
gsutil cp -r gs://[bucket-name]/mlflow/[project_id]/[run_id]/artifacts/ensembler .
```

To build the docker image, run the following:
```bash
make build-image
```
Loading

0 comments on commit e108820

Please sign in to comment.