Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to PyTorch 1.13 #1980

Merged
merged 23 commits into from
Nov 21, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
40ce815
Changes to support PyTorch 1.13
agunapal Nov 16, 2022
cc414e1
Merge branch 'master' into upgrade_pytorch_113
agunapal Nov 17, 2022
08b4274
review comments
agunapal Nov 17, 2022
a50335d
Updated default CUDA version for docker to cu116
agunapal Nov 17, 2022
9eac44d
Updated default CUDA version for docker to cu116
agunapal Nov 17, 2022
6e15536
Updated default ubuntu version to be 20.04 wherever applicable
agunapal Nov 17, 2022
97f29ca
Updated default CUDA version to CUDA 11.6
agunapal Nov 17, 2022
795c769
Updated docker to CUDA 11.7 as default
agunapal Nov 18, 2022
e8b59ad
Removed ubuntu arg from docker build
agunapal Nov 18, 2022
169665b
Added github action for cpu regression tests
agunapal Nov 18, 2022
483bb5b
Added github action for cpu regression tests
agunapal Nov 18, 2022
932e00c
Added github action for cpu regression tests
agunapal Nov 18, 2022
48211bb
Added github action for cpu regression tests
agunapal Nov 18, 2022
6d9c1d4
Added gpu regression tests action
agunapal Nov 18, 2022
6c2accf
Added gpu regression tests action
agunapal Nov 18, 2022
bfabe0f
Merge branch 'master' into upgrade_pytorch_113
msaroufim Nov 18, 2022
ab09c41
change runner
agunapal Nov 18, 2022
97413a0
Merge branch 'upgrade_pytorch_113' of https://github.com/pytorch/serv…
agunapal Nov 18, 2022
a6e30a2
added java 17 to github actions
agunapal Nov 18, 2022
cd1f098
update git version
agunapal Nov 18, 2022
c73137e
Verified GPU regression tests to be working
agunapal Nov 18, 2022
ba6b4d0
Skipping regression tests on windows
agunapal Nov 18, 2022
23fb33b
Skipping regression tests on windows
agunapal Nov 18, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Your contributions will fall into two categories:
```bash
python ts_scripts/install_dependencies.py --environment=dev --cuda=cu102
```
> Supported cuda versions as cu116, cu113, cu111, cu102, cu101, cu92
> Supported cuda versions are cu117, cu116, cu113, cu111, cu102, cu101, cu92
- Install `pre-commit` to your Git flow:
```bash
pre-commit install
Expand Down
132 changes: 83 additions & 49 deletions benchmarks/auto_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
import argparse
import datetime
import os
import ruamel.yaml
import shutil
from subprocess import Popen
from utils import gen_model_config_json
from utils import gen_md_report
from utils import gen_metrics_json

import ruamel.yaml
from utils import gen_md_report, gen_metrics_json, gen_model_config_json

CWD = os.getcwd()
MODEL_JSON_CONFIG_PATH = CWD + '/model_json_config'
BENCHMARK_TMP_PATH = '/tmp/benchmark'
BENCHMARK_REPORT_PATH = '/tmp/ts_benchmark'
TS_LOGS_PATH = CWD + '/logs'
MODEL_STORE = '/tmp/model_store'
WF_STORE = '/tmp/wf_store'
MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config"
BENCHMARK_TMP_PATH = "/tmp/benchmark"
BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark"
TS_LOGS_PATH = CWD + "/logs"
MODEL_STORE = "/tmp/model_store"
WF_STORE = "/tmp/wf_store"


class BenchmarkConfig:
def __init__(self, yaml_dict, skip_ts_install):
self.yaml_dict = yaml_dict
self.skip_ts_install = skip_ts_install
self.bm_config = {}
yesterday = datetime.date.today() - datetime.timedelta(days=1)
self.bm_config["version"] = \
"torchserve-nightly=={}.{}.{}".format(yesterday.year, yesterday.month, yesterday.day)
self.bm_config["hardware"] = 'cpu'
self.bm_config["version"] = "torchserve-nightly=={}.{}.{}".format(
yesterday.year, yesterday.month, yesterday.day
)
self.bm_config["hardware"] = "cpu"

def ts_version(self, version):
for k, v in version.items():
Expand All @@ -48,15 +49,15 @@ def metrics_cmd(self, cmd):
for k, v in key_value.items():
if k == "cmd":
cmd_options.append(v)
elif k == '--namespace':
elif k == "--namespace":
cmd_options.append(k)
cmd_options.append(''.join(v))
cmd_options.append("".join(v))
else:
cmd_options.append(k)
cmd_options.append(v)
break

self.bm_config["metrics_cmd"] = ' '.join(cmd_options)
self.bm_config["metrics_cmd"] = " ".join(cmd_options)

def report_cmd(self, cmd):
cmd_options = []
Expand All @@ -70,12 +71,14 @@ def report_cmd(self, cmd):
today = datetime.date.today()
v[i] = "{}-{}-{}".format(today.year, today.month, today.day)
break
cmd_options.append('{}/{}'.format('/'.join(v), self.bm_config["version"]))
cmd_options.append(
"{}/{}".format("/".join(v), self.bm_config["version"])
)
else:
cmd_options.append(v)
break

self.bm_config["report_cmd"] = ' '.join(cmd_options)
self.bm_config["report_cmd"] = " ".join(cmd_options)

def load_config(self):
report_cmd = None
Expand All @@ -91,10 +94,11 @@ def load_config(self):
elif k == "report_cmd":
report_cmd = v

self.bm_config["model_config_path"] = \
'{}/cpu'.format(MODEL_JSON_CONFIG_PATH) \
if self.bm_config["hardware"] == 'cpu' \
else '{}/gpu'.format(MODEL_JSON_CONFIG_PATH)
self.bm_config["model_config_path"] = (
"{}/cpu".format(MODEL_JSON_CONFIG_PATH)
if self.bm_config["hardware"] == "cpu"
else "{}/gpu".format(MODEL_JSON_CONFIG_PATH)
)

if self.skip_ts_install:
self.bm_config["version"] = get_torchserve_version()
Expand All @@ -105,67 +109,75 @@ def load_config(self):
for k, v in self.bm_config.items():
print("{}={}".format(k, v))


def load_benchmark_config(bm_config_path, skip_ts_install):
yaml = ruamel.yaml.YAML()
with open(bm_config_path, 'r') as f:
with open(bm_config_path, "r") as f:
yaml_dict = yaml.load(f)

benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install)
benchmark_config.load_config()

return benchmark_config.bm_config


def benchmark_env_setup(bm_config, skip_ts_install):
install_torchserve(skip_ts_install, bm_config["hardware"], bm_config["version"])
setup_benchmark_path(bm_config["model_config_path"])
build_model_json_config(bm_config["models"])


def install_torchserve(skip_ts_install, hw, ts_version):
if skip_ts_install:
return

# git checkout branch if it is needed
cmd = 'git checkout master && git reset --hard && git clean -dffx . && git pull --rebase'
cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase"
execute(cmd, wait=True)
print("successfully reset git")

ts_install_cmd = None
if ts_version.startswith("torchserve==") or ts_version.startswith("torchserve-nightly=="):
ts_install_cmd = 'pip install {}'.format(ts_version)
if ts_version.startswith("torchserve==") or ts_version.startswith(
"torchserve-nightly=="
):
ts_install_cmd = "pip install {}".format(ts_version)
else:
cmd = 'git checkout {}'.format(ts_version)
cmd = "git checkout {}".format(ts_version)
execute(cmd, wait=True)

# install_dependencies.py
if hw == 'gpu':
cmd = 'python ts_scripts/install_dependencies.py --environment dev --cuda cu102'
if hw == "gpu":
cmd = "python ts_scripts/install_dependencies.py --environment dev --cuda cu116"
else:
cmd = 'python ts_scripts/install_dependencies.py --environment dev'
cmd = "python ts_scripts/install_dependencies.py --environment dev"
execute(cmd, wait=True)
print("successfully install install_dependencies.py")

# install torchserve
if ts_install_cmd is None:
ts_install_cmd = 'python ts_scripts/install_from_src.py'
ts_install_cmd = "python ts_scripts/install_from_src.py"
execute(ts_install_cmd, wait=True)
print("successfully install torchserve")


def setup_benchmark_path(model_config_path):
benchmark_path_list = [BENCHMARK_TMP_PATH, BENCHMARK_REPORT_PATH, model_config_path]
for benchmark_path in benchmark_path_list:
shutil.rmtree(benchmark_path, ignore_errors=True)
os.makedirs(benchmark_path, exist_ok=True)

print('successfully setup benchmark_path={}'.format(benchmark_path))
print("successfully setup benchmark_path={}".format(benchmark_path))


def build_model_json_config(models):
for model in models:
if model.startswith('/'):
if model.startswith("/"):
input_file = model
else:
input_file = CWD + '/benchmarks/models_config/{}'.format(model)
input_file = CWD + "/benchmarks/models_config/{}".format(model)
gen_model_config_json.convert_yaml_to_json(input_file, MODEL_JSON_CONFIG_PATH)


def run_benchmark(bm_config):
files = os.listdir(bm_config["model_config_path"])
files.sort()
Expand All @@ -174,67 +186,84 @@ def run_benchmark(bm_config):
# call benchmark-ab.py
shutil.rmtree(TS_LOGS_PATH, ignore_errors=True)
shutil.rmtree(BENCHMARK_TMP_PATH, ignore_errors=True)
cmd = 'python ./benchmarks/benchmark-ab.py --tmp_dir /tmp --report_location /tmp --config_properties ' \
'./benchmarks/config.properties --config {}/{}'\
.format(bm_config["model_config_path"], model_json_config)
cmd = (
"python ./benchmarks/benchmark-ab.py --tmp_dir /tmp --report_location /tmp --config_properties "
"./benchmarks/config.properties --config {}/{}".format(
bm_config["model_config_path"], model_json_config
)
)
execute(cmd, wait=True)

# generate stats metrics from ab_report.csv
bm_model = model_json_config[0: -len('.json')]
bm_model = model_json_config[0 : -len(".json")]

gen_metrics_json.gen_metric(
'{}/ab_report.csv'.format(BENCHMARK_TMP_PATH),
'{}/logs/stats_metrics.json'.format(BENCHMARK_TMP_PATH)
"{}/ab_report.csv".format(BENCHMARK_TMP_PATH),
"{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH),
)

# load stats metrics to remote metrics storage
if "metrics_cmd" in bm_config:
execute(bm_config["metrics_cmd"], wait=True)

# cp benchmark logs to local
bm_model_log_path = '{}/{}'.format(BENCHMARK_REPORT_PATH, bm_model)
bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model)
os.makedirs(bm_model_log_path, exist_ok=True)
csv_file = '{}/ab_report.csv'.format(BENCHMARK_TMP_PATH)
csv_file = "{}/ab_report.csv".format(BENCHMARK_TMP_PATH)
if os.path.exists(csv_file):
shutil.move(csv_file, bm_model_log_path)
cmd = 'tar -cvzf {}/benchmark.tar.gz {}'.format(bm_model_log_path, BENCHMARK_TMP_PATH)
cmd = "tar -cvzf {}/benchmark.tar.gz {}".format(
bm_model_log_path, BENCHMARK_TMP_PATH
)
execute(cmd, wait=True)

cmd = 'tar -cvzf {}/logs.tar.gz {}'.format(bm_model_log_path, TS_LOGS_PATH)
cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH)
execute(cmd, wait=True)
print("finish benchmark {}".format(bm_model))

# generate final report
gen_md_report.iterate_subdir(
BENCHMARK_REPORT_PATH,
'{}/report.md'.format(BENCHMARK_REPORT_PATH),
"{}/report.md".format(BENCHMARK_REPORT_PATH),
bm_config["hardware"],
bm_config["version"])
bm_config["version"],
)
print("report.md is generated")

# load logs to remote storage
if "report_cmd" in bm_config:
execute(bm_config["report_cmd"], wait=True)


def clean_up_benchmark_env(bm_config):
shutil.rmtree(BENCHMARK_TMP_PATH, ignore_errors=True)
shutil.rmtree(MODEL_JSON_CONFIG_PATH, ignore_errors=True)
shutil.rmtree(MODEL_STORE, ignore_errors=True)
shutil.rmtree(WF_STORE, ignore_errors=True)


def execute(command, wait=False, stdout=None, stderr=None, shell=True):
print("execute: {}".format(command))
cmd = Popen(command, shell=shell, close_fds=True, stdout=stdout, stderr=stderr, universal_newlines=True)
cmd = Popen(
command,
shell=shell,
close_fds=True,
stdout=stdout,
stderr=stderr,
universal_newlines=True,
)
if wait:
cmd.wait()
return cmd


def get_torchserve_version():
# fetch the torchserve version from version.txt file
with open(os.path.join(CWD, 'ts', 'version.txt'), 'r') as file:
with open(os.path.join(CWD, "ts", "version.txt"), "r") as file:
version = file.readline().rstrip()
return version


def main():
parser = argparse.ArgumentParser()

Expand All @@ -250,12 +279,17 @@ def main():
)

arguments = parser.parse_args()
skip_ts_config = False if arguments.skip is not None and arguments.skip.lower() == "false" else True
skip_ts_config = (
False
if arguments.skip is not None and arguments.skip.lower() == "false"
else True
)
bm_config = load_benchmark_config(arguments.input, skip_ts_config)
benchmark_env_setup(bm_config, skip_ts_config)
run_benchmark(bm_config)
clean_up_benchmark_env(bm_config)
print("benchmark_serving.sh finished successfully.")


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Use `build_image.sh` script to build the docker images. The script builds the `p
|-g, --gpu|Build image with GPU based ubuntu base image|
|-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, codebuild|
|-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.|
|-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`. Default `cu102`|
|-cv, --cudaversion| Specify the cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`. Default `cu116`|
|-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.|
|--codebuild| Set if you need [AWS CodeBuild](https://aws.amazon.com/codebuild/)|

Expand All @@ -51,7 +51,7 @@ Creates a docker image with publicly available `torchserve` and `torch-model-arc
./build_image.sh
```

- To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`
- To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`

```bash
./build_image.sh -g -cv cu102
Expand Down
22 changes: 14 additions & 8 deletions docker/build_image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ do
-g|--gpu)
MACHINE=gpu
DOCKER_TAG="pytorch/torchserve:latest-gpu"
BASE_IMAGE="nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04"
CUDA_VERSION="cu102"
BASE_IMAGE="nvidia/cuda:11.6.0-cudnn8-runtime-ubuntu20.04"
CUDA_VERSION="cu116"
shift
;;
-bt|--buildtype)
Expand All @@ -65,18 +65,21 @@ do
BUILD_WITH_IPEX=true
shift
;;
# With default ubuntu version 18.04
# With default ubuntu version 20.04
-cv|--cudaversion)
CUDA_VERSION="$2"
if [ $CUDA_VERSION == "cu116" ];
if [ $CUDA_VERSION == "cu117" ];
then
BASE_IMAGE="nvidia/cuda:11.6.0-cudnn8-runtime-ubuntu18.04"
BASE_IMAGE="nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04"
elif [ $CUDA_VERSION == "cu116" ];
then
BASE_IMAGE="nvidia/cuda:11.6.0-cudnn8-runtime-ubuntu20.04"
elif [ $CUDA_VERSION == "cu113" ];
then
BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-runtime-ubuntu18.04"
BASE_IMAGE="nvidia/cuda:11.3.0-cudnn8-runtime-ubuntu20.04"
elif [ $CUDA_VERSION == "cu111" ];
then
BASE_IMAGE="nvidia/cuda:11.1.1-cudnn8-runtime-ubuntu18.04"
BASE_IMAGE="nvidia/cuda:11.1.1-cudnn8-runtime-ubuntu20.04"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NVIDIA doesn't always name their containers consistently, need to manually verify each one of these and make sure they exist

elif [ $CUDA_VERSION == "cu102" ];
then
BASE_IMAGE="nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04"
Expand All @@ -96,7 +99,10 @@ do
# CUDA 10 is not supported on Ubuntu 20.04
-ub|--ubuntu)
agunapal marked this conversation as resolved.
Show resolved Hide resolved
UBUNTU_VERSION="$2"
if [[ $CUDA_VERSION == "cu116" && $UBUNTU_VERSION == "ubuntu20.04" ]];
if [[ $CUDA_VERSION == "cu117" && $UBUNTU_VERSION == "ubuntu20.04" ]];
then
BASE_IMAGE="nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04"
elif [[ $CUDA_VERSION == "cu116" && $UBUNTU_VERSION == "ubuntu20.04" ]];
then
BASE_IMAGE="nvidia/cuda:11.6.0-cudnn8-runtime-ubuntu20.04"
elif [[ $CUDA_VERSION == "cu113" && $UBUNTU_VERSION == "ubuntu20.04" ]];
Expand Down
2 changes: 1 addition & 1 deletion docker/docker_nightly.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# Build Nightly images and append the date in the name
try_and_handle(f"./build_image.sh -bt dev -t {organization}/{cpu_version}", dry_run)
try_and_handle(
f"./build_image.sh -bt dev -g -cv cu102 -t {organization}/{gpu_version}",
f"./build_image.sh -bt dev -g -cv cu116 -t {organization}/{gpu_version}",
dry_run,
)

Expand Down
Loading