From 20f1862ebf18cf6438119f42992f1174ac7a7dc1 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 15:20:22 -0700 Subject: [PATCH 01/11] Fix SM Endpoint Error --- dlc_developer_config.toml | 16 ++++++++-------- pytorch/inference/buildspec.yml | 4 ++++ .../pytorch/inference/requirements.txt | 4 ++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index f0aa95ef8e55..ae13d02827f5 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,15 +34,15 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -53,12 +53,12 @@ notify_test_failures = false [test] ### On by default -sanity_tests = true +sanity_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true -ec2_tests = true +ecs_tests = false +eks_tests = false +ec2_tests = false # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -70,7 +70,7 @@ ec2_tests_on_heavy_instances = false ### SM specific tests ### Off by default -sagemaker_local_tests = false +sagemaker_local_tests = true # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = false diff --git a/pytorch/inference/buildspec.yml b/pytorch/inference/buildspec.yml index a45a9be23f45..169c83b290e3 100644 --- a/pytorch/inference/buildspec.yml +++ b/pytorch/inference/buildspec.yml @@ -43,6 +43,7 @@ images: torch_serve_version: &TORCHSERVE_VERSION 0.11.0 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -59,6 +60,7 @@ images: torch_serve_version: &TORCHSERVE_VERSION 0.11.0 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + build_tag_override: "beta:2.2.0-gpu-py310-cu121-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 @@ -76,6 +78,7 @@ images: tool_kit_version: &SM_TOOLKIT_VERSION 2.0.23 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -93,6 +96,7 @@ images: tool_kit_version: &SM_TOOLKIT_VERSION 2.0.23 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index f641a6e89b98..240e838a2f57 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -1,7 +1,7 @@ boto3 coverage # Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) -docker<=6.1.3 +docker<=7.0.0 docker-compose Flask==1.1.1 fabric @@ -20,7 +20,7 @@ pytest-xdist requests requests_mock retrying==1.3.3 -sagemaker>=2,<3 +sagemaker>=2,<2.220.0 sagemaker-inference six tenacity From baf8648c6abbb13bd521e2bbce978d120b6bf596 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 15:26:21 -0700 Subject: [PATCH 02/11] test inf --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index ae13d02827f5..9cad78dac580 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,8 +37,8 @@ deep_canary_mode = false build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true -build_inference = false +build_training = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" From 846f58dd436f353edf71fff702327c3772c25566 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 15:27:01 -0700 Subject: [PATCH 03/11] pin docker --- test/sagemaker_tests/pytorch/inference/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index 240e838a2f57..7a0e4ae0f6e0 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -1,7 +1,7 @@ boto3 coverage # Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) -docker<=7.0.0 +docker<7.0.0 docker-compose Flask==1.1.1 fabric From 4dfb408bfcb268eddacb2ad1cf97cbf7ed87f066 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 15:37:42 -0700 Subject: [PATCH 04/11] buil true --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 9cad78dac580..066125b1ecae 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true [notify] ### Notify on test failures From 1c5327474975dc9eb4feba167e5dc6825aefde1b Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 15:39:58 -0700 Subject: [PATCH 05/11] fix build override --- pytorch/inference/buildspec.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/inference/buildspec.yml b/pytorch/inference/buildspec.yml index 169c83b290e3..a7c746950efd 100644 --- a/pytorch/inference/buildspec.yml +++ b/pytorch/inference/buildspec.yml @@ -60,7 +60,7 @@ images: torch_serve_version: &TORCHSERVE_VERSION 0.11.0 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - build_tag_override: "beta:2.2.0-gpu-py310-cu121-ubuntu20.04-ec2" + build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 @@ -96,7 +96,7 @@ images: tool_kit_version: &SM_TOOLKIT_VERSION 2.0.23 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker" + build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-sagemaker" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker From 96d822a21e6f078d1b558cfe05d6643b9ec8c605 Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 16:18:29 -0700 Subject: [PATCH 06/11] use sdk 2.220 --- dlc_developer_config.toml | 2 +- test/sagemaker_tests/pytorch/inference/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 066125b1ecae..9cad78dac580 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -42,7 +42,7 @@ build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index 7a0e4ae0f6e0..56f623b5f1b7 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -20,7 +20,7 @@ pytest-xdist requests requests_mock retrying==1.3.3 -sagemaker>=2,<2.220.0 +sagemaker==2.220.0 sagemaker-inference six tenacity From 8021dd79325744497bfb5192ec68951be871155f Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 16:33:39 -0700 Subject: [PATCH 07/11] unpin docker --- test/sagemaker_tests/pytorch/inference/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index 56f623b5f1b7..843c09b84a72 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -1,7 +1,7 @@ boto3 coverage # Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) -docker<7.0.0 +docker docker-compose Flask==1.1.1 fabric @@ -20,7 +20,7 @@ pytest-xdist requests requests_mock retrying==1.3.3 -sagemaker==2.220.0 +sagemaker>=2,<3 sagemaker-inference six tenacity From ef44782932bfe73f36e6fcb021aa358abc1b63fe Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 16:48:48 -0700 Subject: [PATCH 08/11] pin docker --- test/sagemaker_tests/pytorch/inference/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index 843c09b84a72..a1158cb6948f 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -1,7 +1,7 @@ boto3 coverage # Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) -docker +docker<7.0.0 docker-compose Flask==1.1.1 fabric From 8a9bd01ece234aea47e84e0bc905db873afcdabd Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Mon, 20 May 2024 18:40:08 -0700 Subject: [PATCH 09/11] Update requirements.txt --- test/sagemaker_tests/pytorch/inference/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index a1158cb6948f..6e0b61b468f7 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -1,7 +1,7 @@ boto3 coverage # Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) -docker<7.0.0 +docker<=6.1.3 docker-compose Flask==1.1.1 fabric @@ -20,7 +20,7 @@ pytest-xdist requests requests_mock retrying==1.3.3 -sagemaker>=2,<3 +sagemaker<=2.220.0 sagemaker-inference six tenacity From e822f96a381c9b1eee60b1d53ebd0417c57dd869 Mon Sep 17 00:00:00 2001 From: arjkesh <33526713+arjkesh@users.noreply.github.com> Date: Mon, 20 May 2024 18:54:50 -0700 Subject: [PATCH 10/11] Update requirements.txt --- test/sagemaker_tests/pytorch/inference/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sagemaker_tests/pytorch/inference/requirements.txt b/test/sagemaker_tests/pytorch/inference/requirements.txt index 6e0b61b468f7..effe790ac855 100644 --- a/test/sagemaker_tests/pytorch/inference/requirements.txt +++ b/test/sagemaker_tests/pytorch/inference/requirements.txt @@ -17,10 +17,10 @@ pytest<8.1 pytest-cov pytest-rerunfailures pytest-xdist -requests +requests<2.32.0 requests_mock retrying==1.3.3 -sagemaker<=2.220.0 +sagemaker>=2,<3 sagemaker-inference six tenacity From 825f595a65a080b9fb9ec9d9aa7cdee0131c7a2a Mon Sep 17 00:00:00 2001 From: Sirut Buasai Date: Mon, 20 May 2024 19:16:45 -0700 Subject: [PATCH 11/11] revert toml --- dlc_developer_config.toml | 16 ++++++++-------- pytorch/inference/buildspec.yml | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 9cad78dac580..f0aa95ef8e55 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -34,15 +34,15 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = false +build_training = true build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = false +do_build = true [notify] ### Notify on test failures @@ -53,12 +53,12 @@ notify_test_failures = false [test] ### On by default -sanity_tests = false +sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = false -eks_tests = false -ec2_tests = false +ecs_tests = true +eks_tests = true +ec2_tests = true # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -70,7 +70,7 @@ ec2_tests_on_heavy_instances = false ### SM specific tests ### Off by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = false diff --git a/pytorch/inference/buildspec.yml b/pytorch/inference/buildspec.yml index a7c746950efd..dfbbb742e6ce 100644 --- a/pytorch/inference/buildspec.yml +++ b/pytorch/inference/buildspec.yml @@ -43,7 +43,7 @@ images: torch_serve_version: &TORCHSERVE_VERSION 0.11.0 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] - build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-ec2" + # build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 context: @@ -60,7 +60,7 @@ images: torch_serve_version: &TORCHSERVE_VERSION 0.11.0 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] - build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-ec2" + # build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-ec2" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: ec2 @@ -78,7 +78,7 @@ images: tool_kit_version: &SM_TOOLKIT_VERSION 2.0.23 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] - build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker" + # build_tag_override: "beta:2.2.0-cpu-py310-ubuntu20.04-sagemaker" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker context: @@ -96,7 +96,7 @@ images: tool_kit_version: &SM_TOOLKIT_VERSION 2.0.23 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-sagemaker" + # build_tag_override: "beta:2.2.0-gpu-py310-cu118-ubuntu20.04-sagemaker" docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] target: sagemaker